summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c5
-rw-r--r--fs/9p/v9fs.h34
-rw-r--r--fs/9p/v9fs_vfs.h2
-rw-r--r--fs/9p/vfs_inode.c129
-rw-r--r--fs/9p/vfs_inode_dotl.c112
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/adfs/super.c186
-rw-r--r--fs/affs/super.c374
-rw-r--r--fs/afs/dir.c25
-rw-r--r--fs/afs/dir_edit.c91
-rw-r--r--fs/afs/internal.h4
-rw-r--r--fs/afs/rxrpc.c83
-rw-r--r--fs/aio.c1
-rw-r--r--fs/attr.c61
-rw-r--r--fs/autofs/dev-ioctl.c5
-rw-r--r--fs/backing-file.c8
-rw-r--r--fs/bcachefs/alloc_background.c47
-rw-r--r--fs/bcachefs/alloc_background.h3
-rw-r--r--fs/bcachefs/alloc_foreground.c21
-rw-r--r--fs/bcachefs/backpointers.c17
-rw-r--r--fs/bcachefs/bcachefs.h1
-rw-r--r--fs/bcachefs/bcachefs_format.h3
-rw-r--r--fs/bcachefs/bkey.c7
-rw-r--r--fs/bcachefs/btree_cache.c107
-rw-r--r--fs/bcachefs/btree_cache.h2
-rw-r--r--fs/bcachefs/btree_gc.c29
-rw-r--r--fs/bcachefs/btree_io.c13
-rw-r--r--fs/bcachefs/btree_iter.c19
-rw-r--r--fs/bcachefs/btree_iter.h10
-rw-r--r--fs/bcachefs/btree_node_scan.c5
-rw-r--r--fs/bcachefs/btree_update.c4
-rw-r--r--fs/bcachefs/btree_update.h2
-rw-r--r--fs/bcachefs/btree_update_interior.c37
-rw-r--r--fs/bcachefs/btree_write_buffer.c30
-rw-r--r--fs/bcachefs/btree_write_buffer.h1
-rw-r--r--fs/bcachefs/buckets.c7
-rw-r--r--fs/bcachefs/buckets.h31
-rw-r--r--fs/bcachefs/chardev.c1
-rw-r--r--fs/bcachefs/darray.c15
-rw-r--r--fs/bcachefs/data_update.c22
-rw-r--r--fs/bcachefs/data_update.h3
-rw-r--r--fs/bcachefs/dirent.c7
-rw-r--r--fs/bcachefs/dirent.h7
-rw-r--r--fs/bcachefs/disk_accounting.c155
-rw-r--r--fs/bcachefs/ec.c112
-rw-r--r--fs/bcachefs/errcode.h6
-rw-r--r--fs/bcachefs/error.c5
-rw-r--r--fs/bcachefs/extents.c91
-rw-r--r--fs/bcachefs/extents.h15
-rw-r--r--fs/bcachefs/fs-io-buffered.c6
-rw-r--r--fs/bcachefs/fs-io-direct.c3
-rw-r--r--fs/bcachefs/fs-io-pagecache.c70
-rw-r--r--fs/bcachefs/fs-io.c19
-rw-r--r--fs/bcachefs/fs.c120
-rw-r--r--fs/bcachefs/fs.h9
-rw-r--r--fs/bcachefs/fsck.c645
-rw-r--r--fs/bcachefs/fsck.h1
-rw-r--r--fs/bcachefs/inode.c302
-rw-r--r--fs/bcachefs/inode.h11
-rw-r--r--fs/bcachefs/inode_format.h9
-rw-r--r--fs/bcachefs/io_misc.c2
-rw-r--r--fs/bcachefs/io_read.c18
-rw-r--r--fs/bcachefs/io_write.c11
-rw-r--r--fs/bcachefs/journal.c23
-rw-r--r--fs/bcachefs/journal.h2
-rw-r--r--fs/bcachefs/journal_io.c5
-rw-r--r--fs/bcachefs/move.c2
-rw-r--r--fs/bcachefs/opts.c12
-rw-r--r--fs/bcachefs/opts.h3
-rw-r--r--fs/bcachefs/quota.c2
-rw-r--r--fs/bcachefs/rebalance.c4
-rw-r--r--fs/bcachefs/recovery.c26
-rw-r--r--fs/bcachefs/recovery_passes.c12
-rw-r--r--fs/bcachefs/recovery_passes_types.h2
-rw-r--r--fs/bcachefs/replicas.c39
-rw-r--r--fs/bcachefs/sb-downgrade.c8
-rw-r--r--fs/bcachefs/sb-errors_format.h14
-rw-r--r--fs/bcachefs/sb-members.c12
-rw-r--r--fs/bcachefs/sb-members_format.h6
-rw-r--r--fs/bcachefs/snapshot.c129
-rw-r--r--fs/bcachefs/snapshot.h3
-rw-r--r--fs/bcachefs/str_hash.h60
-rw-r--r--fs/bcachefs/subvolume.c7
-rw-r--r--fs/bcachefs/super-io.c5
-rw-r--r--fs/bcachefs/super.c37
-rw-r--r--fs/bcachefs/tests.c9
-rw-r--r--fs/bcachefs/xattr.c2
-rw-r--r--fs/befs/linuxvfs.c199
-rw-r--r--fs/binfmt_elf.c6
-rw-r--r--fs/binfmt_elf_fdpic.c6
-rw-r--r--fs/btrfs/Kconfig26
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/backref.c3
-rw-r--r--fs/btrfs/bio.c39
-rw-r--r--fs/btrfs/bio.h3
-rw-r--r--fs/btrfs/block-group.c4
-rw-r--r--fs/btrfs/btrfs_inode.h15
-rw-r--r--fs/btrfs/compression.c14
-rw-r--r--fs/btrfs/compression.h2
-rw-r--r--fs/btrfs/ctree.c132
-rw-r--r--fs/btrfs/ctree.h13
-rw-r--r--fs/btrfs/defrag.c10
-rw-r--r--fs/btrfs/delayed-inode.h2
-rw-r--r--fs/btrfs/delayed-ref.c334
-rw-r--r--fs/btrfs/delayed-ref.h70
-rw-r--r--fs/btrfs/dev-replace.c4
-rw-r--r--fs/btrfs/dir-item.c15
-rw-r--r--fs/btrfs/dir-item.h3
-rw-r--r--fs/btrfs/direct-io.c2
-rw-r--r--fs/btrfs/disk-io.c95
-rw-r--r--fs/btrfs/disk-io.h6
-rw-r--r--fs/btrfs/extent-tree.c104
-rw-r--r--fs/btrfs/extent_io.c133
-rw-r--r--fs/btrfs/extent_map.c160
-rw-r--r--fs/btrfs/extent_map.h3
-rw-r--r--fs/btrfs/fiemap.c6
-rw-r--r--fs/btrfs/file.c374
-rw-r--r--fs/btrfs/file.h7
-rw-r--r--fs/btrfs/free-space-cache.c22
-rw-r--r--fs/btrfs/fs.h16
-rw-r--r--fs/btrfs/inode.c512
-rw-r--r--fs/btrfs/ioctl.c483
-rw-r--r--fs/btrfs/ioctl.h2
-rw-r--r--fs/btrfs/locking.c15
-rw-r--r--fs/btrfs/locking.h1
-rw-r--r--fs/btrfs/lzo.c2
-rw-r--r--fs/btrfs/ordered-data.c4
-rw-r--r--fs/btrfs/qgroup.c105
-rw-r--r--fs/btrfs/qgroup.h19
-rw-r--r--fs/btrfs/raid-stripe-tree.c92
-rw-r--r--fs/btrfs/raid-stripe-tree.h5
-rw-r--r--fs/btrfs/raid56.c3
-rw-r--r--fs/btrfs/relocation.c2
-rw-r--r--fs/btrfs/scrub.c37
-rw-r--r--fs/btrfs/send.c65
-rw-r--r--fs/btrfs/send.h2
-rw-r--r--fs/btrfs/space-info.c12
-rw-r--r--fs/btrfs/subpage.c204
-rw-r--r--fs/btrfs/subpage.h39
-rw-r--r--fs/btrfs/super.c73
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/tests/btrfs-tests.c4
-rw-r--r--fs/btrfs/tests/btrfs-tests.h2
-rw-r--r--fs/btrfs/tests/raid-stripe-tree-tests.c538
-rw-r--r--fs/btrfs/transaction.c8
-rw-r--r--fs/btrfs/transaction.h2
-rw-r--r--fs/btrfs/tree-checker.c16
-rw-r--r--fs/btrfs/tree-checker.h4
-rw-r--r--fs/btrfs/tree-log.c9
-rw-r--r--fs/btrfs/tree-mod-log.c1
-rw-r--r--fs/btrfs/tree-mod-log.h1
-rw-r--r--fs/btrfs/volumes.c164
-rw-r--r--fs/btrfs/volumes.h11
-rw-r--r--fs/btrfs/xattr.c5
-rw-r--r--fs/btrfs/zlib.c2
-rw-r--r--fs/btrfs/zoned.c17
-rw-r--r--fs/btrfs/zstd.c4
-rw-r--r--fs/buffer.c5
-rw-r--r--fs/cachefiles/interface.c14
-rw-r--r--fs/cachefiles/namei.c5
-rw-r--r--fs/cachefiles/ondemand.c38
-rw-r--r--fs/ceph/addr.c20
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/compat_binfmt_elf.c10
-rw-r--r--fs/coredump.c1
-rw-r--r--fs/crypto/keyring.c1
-rw-r--r--fs/dax.c45
-rw-r--r--fs/dcache.c16
-rw-r--r--fs/ecryptfs/crypto.c35
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h9
-rw-r--r--fs/ecryptfs/inode.c12
-rw-r--r--fs/ecryptfs/mmap.c136
-rw-r--r--fs/ecryptfs/read_write.c50
-rw-r--r--fs/efs/super.c43
-rw-r--r--fs/erofs/super.c17
-rw-r--r--fs/erofs/zdata.c29
-rw-r--r--fs/erofs/zmap.c32
-rw-r--r--fs/eventfd.c9
-rw-r--r--fs/eventpoll.c51
-rw-r--r--fs/ext4/balloc.c4
-rw-r--r--fs/ext4/dir.c7
-rw-r--r--fs/ext4/ext4.h22
-rw-r--r--fs/ext4/extents.c13
-rw-r--r--fs/ext4/extents_status.c8
-rw-r--r--fs/ext4/extents_status.h3
-rw-r--r--fs/ext4/fast_commit.c8
-rw-r--r--fs/ext4/file.c36
-rw-r--r--fs/ext4/fsmap.c54
-rw-r--r--fs/ext4/ialloc.c5
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inode.c109
-rw-r--r--fs/ext4/ioctl.c21
-rw-r--r--fs/ext4/mballoc.c22
-rw-r--r--fs/ext4/mballoc.h1
-rw-r--r--fs/ext4/mmp.c2
-rw-r--r--fs/ext4/move_extent.c2
-rw-r--r--fs/ext4/namei.c23
-rw-r--r--fs/ext4/page-io.c6
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c113
-rw-r--r--fs/f2fs/data.c9
-rw-r--r--fs/f2fs/file.c18
-rw-r--r--fs/fat/namei_vfat.c2
-rw-r--r--fs/fcntl.c46
-rw-r--r--fs/fhandle.c5
-rw-r--r--fs/file.c288
-rw-r--r--fs/file_table.c50
-rw-r--r--fs/freevxfs/vxfs_dir.h2
-rw-r--r--fs/fs-writeback.c40
-rw-r--r--fs/fs_parser.c21
-rw-r--r--fs/fsopen.c19
-rw-r--r--fs/fuse/dev.c6
-rw-r--r--fs/fuse/file.c18
-rw-r--r--fs/fuse/passthrough.c9
-rw-r--r--fs/gfs2/export.c1
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/glock.c12
-rw-r--r--fs/hfs/super.c342
-rw-r--r--fs/hfsplus/hfsplus_fs.h7
-rw-r--r--fs/hfsplus/options.c263
-rw-r--r--fs/hfsplus/super.c84
-rw-r--r--fs/hfsplus/wrapper.c2
-rw-r--r--fs/hpfs/super.c414
-rw-r--r--fs/hugetlbfs/inode.c19
-rw-r--r--fs/inode.c313
-rw-r--r--fs/internal.h18
-rw-r--r--fs/ioctl.c23
-rw-r--r--fs/iomap/buffered-io.c130
-rw-r--r--fs/iomap/direct-io.c43
-rw-r--r--fs/iomap/trace.h3
-rw-r--r--fs/jbd2/commit.c4
-rw-r--r--fs/jbd2/journal.c15
-rw-r--r--fs/jbd2/recovery.c311
-rw-r--r--fs/jfs/jfs_dmap.c2
-rw-r--r--fs/jfs/jfs_filsys.h1
-rw-r--r--fs/jfs/super.c469
-rw-r--r--fs/kernel_read_file.c12
-rw-r--r--fs/libfs.c23
-rw-r--r--fs/lockd/svclock.c7
-rw-r--r--fs/locks.c15
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/namei.c88
-rw-r--r--fs/namespace.c212
-rw-r--r--fs/netfs/buffered_read.c55
-rw-r--r--fs/netfs/buffered_write.c41
-rw-r--r--fs/netfs/fscache_volume.c3
-rw-r--r--fs/netfs/locking.c3
-rw-r--r--fs/netfs/read_collect.c2
-rw-r--r--fs/nfs/client.c3
-rw-r--r--fs/nfs/inode.c70
-rw-r--r--fs/nfs/localio.c3
-rw-r--r--fs/nfs/nfs4proc.c4
-rw-r--r--fs/nfs/super.c10
-rw-r--r--fs/nfs_common/nfslocalio.c23
-rw-r--r--fs/nfsd/nfs4proc.c8
-rw-r--r--fs/nfsd/nfs4state.c69
-rw-r--r--fs/nfsd/state.h2
-rw-r--r--fs/nfsd/vfs.c13
-rw-r--r--fs/nilfs2/btnode.c2
-rw-r--r--fs/nilfs2/dir.c48
-rw-r--r--fs/nilfs2/gcinode.c4
-rw-r--r--fs/nilfs2/mdt.c1
-rw-r--r--fs/nilfs2/namei.c42
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/nilfs2/page.c31
-rw-r--r--fs/notify/dnotify/dnotify.c5
-rw-r--r--fs/notify/fanotify/fanotify.c1
-rw-r--r--fs/notify/fanotify/fanotify_user.c45
-rw-r--r--fs/notify/inotify/inotify_user.c38
-rw-r--r--fs/ocfs2/cluster/heartbeat.c24
-rw-r--r--fs/ocfs2/export.c1
-rw-r--r--fs/ocfs2/file.c19
-rw-r--r--fs/ocfs2/resize.c2
-rw-r--r--fs/ocfs2/super.c13
-rw-r--r--fs/ocfs2/xattr.c3
-rw-r--r--fs/open.c80
-rw-r--r--fs/overlayfs/copy_up.c1
-rw-r--r--fs/overlayfs/file.c9
-rw-r--r--fs/overlayfs/inode.c10
-rw-r--r--fs/overlayfs/overlayfs.h8
-rw-r--r--fs/overlayfs/params.c116
-rw-r--r--fs/pidfs.c86
-rw-r--r--fs/posix_acl.c13
-rw-r--r--fs/proc/base.c1
-rw-r--r--fs/proc/fd.c14
-rw-r--r--fs/proc/softirqs.c2
-rw-r--r--fs/proc/task_mmu.c22
-rw-r--r--fs/proc/vmcore.c9
-rw-r--r--fs/quota/quota.c12
-rw-r--r--fs/read_write.c161
-rw-r--r--fs/readdir.c28
-rw-r--r--fs/remap_range.c11
-rw-r--r--fs/select.c48
-rw-r--r--fs/seq_file.c2
-rw-r--r--fs/signalfd.c9
-rw-r--r--fs/smb/client/cifsfs.c2
-rw-r--r--fs/smb/client/cifsproto.h9
-rw-r--r--fs/smb/client/compress.c4
-rw-r--r--fs/smb/client/connect.c26
-rw-r--r--fs/smb/client/fs_context.c7
-rw-r--r--fs/smb/client/ioctl.c11
-rw-r--r--fs/smb/client/sess.c32
-rw-r--r--fs/smb/client/smb2ops.c3
-rw-r--r--fs/smb/client/smb2pdu.c9
-rw-r--r--fs/smb/server/auth.c6
-rw-r--r--fs/smb/server/connection.c1
-rw-r--r--fs/smb/server/connection.h1
-rw-r--r--fs/smb/server/ksmbd_netlink.h17
-rw-r--r--fs/smb/server/mgmt/user_config.c45
-rw-r--r--fs/smb/server/mgmt/user_config.h5
-rw-r--r--fs/smb/server/mgmt/user_session.c41
-rw-r--r--fs/smb/server/mgmt/user_session.h4
-rw-r--r--fs/smb/server/server.c18
-rw-r--r--fs/smb/server/smb2pdu.c8
-rw-r--r--fs/smb/server/smb_common.c25
-rw-r--r--fs/smb/server/smb_common.h2
-rw-r--r--fs/smb/server/transport_ipc.c64
-rw-r--r--fs/smb/server/transport_ipc.h2
-rw-r--r--fs/splice.c78
-rw-r--r--fs/squashfs/file_direct.c9
-rw-r--r--fs/stat.c98
-rw-r--r--fs/statfs.c12
-rw-r--r--fs/super.c26
-rw-r--r--fs/sync.c29
-rw-r--r--fs/timerfd.c40
-rw-r--r--fs/tracefs/inode.c12
-rw-r--r--fs/ubifs/super.c399
-rw-r--r--fs/ufs/balloc.c107
-rw-r--r--fs/ufs/cylinder.c31
-rw-r--r--fs/ufs/dir.c29
-rw-r--r--fs/ufs/file.c1
-rw-r--r--fs/ufs/inode.c179
-rw-r--r--fs/ufs/namei.c39
-rw-r--r--fs/ufs/super.c49
-rw-r--r--fs/ufs/ufs.h12
-rw-r--r--fs/ufs/ufs_fs.h4
-rw-r--r--fs/ufs/util.c46
-rw-r--r--fs/ufs/util.h61
-rw-r--r--fs/unicode/utf8-core.c26
-rw-r--r--fs/unicode/utf8-selftest.c3
-rw-r--r--fs/userfaultfd.c28
-rw-r--r--fs/utimes.c11
-rw-r--r--fs/xattr.c446
-rw-r--r--fs/xfs/libxfs/xfs_ag.c75
-rw-r--r--fs/xfs/libxfs/xfs_ag.h11
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c6
-rw-r--r--fs/xfs/scrub/bmap_repair.c2
-rw-r--r--fs/xfs/scrub/repair.c8
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/xfs/xfs_bmap_util.c10
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c7
-rw-r--r--fs/xfs/xfs_buf.h4
-rw-r--r--fs/xfs/xfs_buf_item_recover.c70
-rw-r--r--fs/xfs/xfs_exchrange.c18
-rw-r--r--fs/xfs/xfs_file.c162
-rw-r--r--fs/xfs/xfs_filestream.c99
-rw-r--r--fs/xfs/xfs_fsops.c20
-rw-r--r--fs/xfs/xfs_handle.c16
-rw-r--r--fs/xfs/xfs_inode.c2
-rw-r--r--fs/xfs/xfs_inode.h20
-rw-r--r--fs/xfs/xfs_ioctl.c73
-rw-r--r--fs/xfs/xfs_iomap.c69
-rw-r--r--fs/xfs/xfs_iops.c32
-rw-r--r--fs/xfs/xfs_log_recover.c7
-rw-r--r--fs/xfs/xfs_mount.c9
-rw-r--r--fs/xfs/xfs_super.c2
-rw-r--r--fs/xfs/xfs_trace.h15
369 files changed, 9868 insertions, 6546 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index de009a33e0e2..f84412290a30 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -131,10 +131,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any)
}
}
spin_unlock(&dentry->d_lock);
- } else {
- if (dentry->d_inode)
- ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any);
}
+ if (!ret && dentry->d_inode)
+ ret = v9fs_fid_find_inode(dentry->d_inode, false, uid, any);
return ret;
}
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 1775fcc7f0e8..698c43dd5dc8 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -179,14 +179,16 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap,
struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry,
unsigned int flags);
-extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid,
- bool new);
+extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
+ struct p9_fid *fid,
+ struct super_block *sb, int new);
extern const struct inode_operations v9fs_dir_inode_operations_dotl;
extern const struct inode_operations v9fs_file_inode_operations_dotl;
extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
extern const struct netfs_request_ops v9fs_req_ops;
-extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb,
- struct p9_fid *fid, bool new);
+extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
+ struct p9_fid *fid,
+ struct super_block *sb, int new);
/* other default globals */
#define V9FS_PORT 564
@@ -225,12 +227,30 @@ static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses)
*/
static inline struct inode *
v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
- struct super_block *sb, bool new)
+ struct super_block *sb)
{
if (v9fs_proto_dotl(v9ses))
- return v9fs_fid_iget_dotl(sb, fid, new);
+ return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0);
else
- return v9fs_fid_iget(sb, fid, new);
+ return v9fs_inode_from_fid(v9ses, fid, sb, 0);
+}
+
+/**
+ * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by
+ * issuing an attribute request
+ * @v9ses: session information
+ * @fid: fid to issue attribute request for
+ * @sb: superblock on which to create inode
+ *
+ */
+static inline struct inode *
+v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+ struct super_block *sb)
+{
+ if (v9fs_proto_dotl(v9ses))
+ return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1);
+ else
+ return v9fs_inode_from_fid(v9ses, fid, sb, 1);
}
#endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 7923c3c347cb..d3aefbec4de6 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -42,7 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
void v9fs_set_netfs_context(struct inode *inode);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev);
+ struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
#if (BITS_PER_LONG == 32)
#define QID2INO(q) ((ino_t) (((q)->path+2) ^ (((q)->path) >> 32)))
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index fd72fc38c8f5..3e68521f4e2f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -256,12 +256,9 @@ void v9fs_set_netfs_context(struct inode *inode)
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
- struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev)
+ struct inode *inode, umode_t mode, dev_t rdev)
{
int err = 0;
- struct v9fs_inode *v9inode = V9FS_I(inode);
-
- memcpy(&v9inode->qid, qid, sizeof(struct p9_qid));
inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
inode->i_blocks = 0;
@@ -365,59 +362,105 @@ void v9fs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-struct inode *
-v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid, bool new)
+static int v9fs_test_inode(struct inode *inode, void *data)
+{
+ int umode;
+ dev_t rdev;
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_wstat *st = (struct p9_wstat *)data;
+ struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
+
+ umode = p9mode2unixmode(v9ses, st, &rdev);
+ /* don't match inode of different type */
+ if (inode_wrong_type(inode, umode))
+ return 0;
+
+ /* compare qid details */
+ if (memcmp(&v9inode->qid.version,
+ &st->qid.version, sizeof(v9inode->qid.version)))
+ return 0;
+
+ if (v9inode->qid.type != st->qid.type)
+ return 0;
+
+ if (v9inode->qid.path != st->qid.path)
+ return 0;
+ return 1;
+}
+
+static int v9fs_test_new_inode(struct inode *inode, void *data)
+{
+ return 0;
+}
+
+static int v9fs_set_inode(struct inode *inode, void *data)
+{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_wstat *st = (struct p9_wstat *)data;
+
+ memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
+ return 0;
+}
+
+static struct inode *v9fs_qid_iget(struct super_block *sb,
+ struct p9_qid *qid,
+ struct p9_wstat *st,
+ int new)
{
dev_t rdev;
int retval;
umode_t umode;
struct inode *inode;
- struct p9_wstat *st;
struct v9fs_session_info *v9ses = sb->s_fs_info;
+ int (*test)(struct inode *inode, void *data);
- inode = iget_locked(sb, QID2INO(&fid->qid));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
- if (!new) {
- goto done;
- } else {
- p9_debug(P9_DEBUG_VFS, "WARNING: Inode collision %ld\n",
- inode->i_ino);
- iput(inode);
- remove_inode_hash(inode);
- inode = iget_locked(sb, QID2INO(&fid->qid));
- WARN_ON(!(inode->i_state & I_NEW));
- }
- }
+ if (new)
+ test = v9fs_test_new_inode;
+ else
+ test = v9fs_test_inode;
+ inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode, st);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
/*
* initialize the inode with the stat info
* FIXME!! we may need support for stale inodes
* later.
*/
- st = p9_client_stat(fid);
- if (IS_ERR(st)) {
- retval = PTR_ERR(st);
- goto error;
- }
-
+ inode->i_ino = QID2INO(qid);
umode = p9mode2unixmode(v9ses, st, &rdev);
- retval = v9fs_init_inode(v9ses, inode, &fid->qid, umode, rdev);
- v9fs_stat2inode(st, inode, sb, 0);
- p9stat_free(st);
- kfree(st);
+ retval = v9fs_init_inode(v9ses, inode, umode, rdev);
if (retval)
goto error;
+ v9fs_stat2inode(st, inode, sb, 0);
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
-done:
return inode;
error:
iget_failed(inode);
return ERR_PTR(retval);
+
+}
+
+struct inode *
+v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+ struct super_block *sb, int new)
+{
+ struct p9_wstat *st;
+ struct inode *inode = NULL;
+
+ st = p9_client_stat(fid);
+ if (IS_ERR(st))
+ return ERR_CAST(st);
+
+ inode = v9fs_qid_iget(sb, &st->qid, st, new);
+ p9stat_free(st);
+ kfree(st);
+ return inode;
}
/**
@@ -449,15 +492,8 @@ static int v9fs_at_to_dotl_flags(int flags)
*/
static void v9fs_dec_count(struct inode *inode)
{
- if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2) {
- if (inode->i_nlink) {
- drop_nlink(inode);
- } else {
- p9_debug(P9_DEBUG_VFS,
- "WARNING: unexpected i_nlink zero %d inode %ld\n",
- inode->i_nlink, inode->i_ino);
- }
- }
+ if (!S_ISDIR(inode->i_mode) || inode->i_nlink > 2)
+ drop_nlink(inode);
}
/**
@@ -508,9 +544,6 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
} else
v9fs_dec_count(inode);
- if (inode->i_nlink <= 0) /* no more refs unhash it */
- remove_inode_hash(inode);
-
v9fs_invalidate_inode_attr(inode);
v9fs_invalidate_inode_attr(dir);
@@ -576,7 +609,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
/*
* instantiate inode and assign the unopened fid to the dentry
*/
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS,
@@ -704,8 +737,10 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
inode = NULL;
else if (IS_ERR(fid))
inode = ERR_CAST(fid);
+ else if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
+ inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
else
- inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb, false);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
/*
* If we had a rename on the server and a parallel lookup
* for the new name, then make sure we instantiate with
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c61b97bd13b9..143ac03b7425 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -52,50 +52,80 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
return current_fsgid();
}
+static int v9fs_test_inode_dotl(struct inode *inode, void *data)
+{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
+ /* don't match inode of different type */
+ if (inode_wrong_type(inode, st->st_mode))
+ return 0;
-struct inode *
-v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new)
+ if (inode->i_generation != st->st_gen)
+ return 0;
+
+ /* compare qid details */
+ if (memcmp(&v9inode->qid.version,
+ &st->qid.version, sizeof(v9inode->qid.version)))
+ return 0;
+
+ if (v9inode->qid.type != st->qid.type)
+ return 0;
+
+ if (v9inode->qid.path != st->qid.path)
+ return 0;
+ return 1;
+}
+
+/* Always get a new inode */
+static int v9fs_test_new_inode_dotl(struct inode *inode, void *data)
+{
+ return 0;
+}
+
+static int v9fs_set_inode_dotl(struct inode *inode, void *data)
+{
+ struct v9fs_inode *v9inode = V9FS_I(inode);
+ struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
+
+ memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
+ inode->i_generation = st->st_gen;
+ return 0;
+}
+
+static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
+ struct p9_qid *qid,
+ struct p9_fid *fid,
+ struct p9_stat_dotl *st,
+ int new)
{
int retval;
struct inode *inode;
- struct p9_stat_dotl *st;
struct v9fs_session_info *v9ses = sb->s_fs_info;
+ int (*test)(struct inode *inode, void *data);
- inode = iget_locked(sb, QID2INO(&fid->qid));
- if (unlikely(!inode))
- return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW)) {
- if (!new) {
- goto done;
- } else { /* deal with race condition in inode number reuse */
- p9_debug(P9_DEBUG_ERROR, "WARNING: Inode collision %lx\n",
- inode->i_ino);
- iput(inode);
- remove_inode_hash(inode);
- inode = iget_locked(sb, QID2INO(&fid->qid));
- WARN_ON(!(inode->i_state & I_NEW));
- }
- }
+ if (new)
+ test = v9fs_test_new_inode_dotl;
+ else
+ test = v9fs_test_inode_dotl;
+ inode = iget5_locked(sb, QID2INO(qid), test, v9fs_set_inode_dotl, st);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+ if (!(inode->i_state & I_NEW))
+ return inode;
/*
* initialize the inode with the stat info
* FIXME!! we may need support for stale inodes
* later.
*/
- st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
- if (IS_ERR(st)) {
- retval = PTR_ERR(st);
- goto error;
- }
-
- retval = v9fs_init_inode(v9ses, inode, &fid->qid,
+ inode->i_ino = QID2INO(qid);
+ retval = v9fs_init_inode(v9ses, inode,
st->st_mode, new_decode_dev(st->st_rdev));
- v9fs_stat2inode_dotl(st, inode, 0);
- kfree(st);
if (retval)
goto error;
+ v9fs_stat2inode_dotl(st, inode, 0);
v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
@@ -103,11 +133,27 @@ v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid, bool new)
goto error;
unlock_new_inode(inode);
-done:
return inode;
error:
iget_failed(inode);
return ERR_PTR(retval);
+
+}
+
+struct inode *
+v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
+ struct super_block *sb, int new)
+{
+ struct p9_stat_dotl *st;
+ struct inode *inode = NULL;
+
+ st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
+ if (IS_ERR(st))
+ return ERR_CAST(st);
+
+ inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new);
+ kfree(st);
+ return inode;
}
struct dotl_openflag_map {
@@ -259,7 +305,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
goto out;
}
- inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -309,6 +355,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
umode_t omode)
{
int err;
+ struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
kgid_t gid;
const unsigned char *name;
@@ -318,6 +365,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
struct posix_acl *dacl = NULL, *pacl = NULL;
p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
+ v9ses = v9fs_inode2v9ses(dir);
omode |= S_IFDIR;
if (dir->i_mode & S_ISGID)
@@ -352,7 +400,7 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
}
/* instantiate inode and assign the unopened fid to the dentry */
- inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
@@ -749,6 +797,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
kgid_t gid;
const unsigned char *name;
umode_t mode;
+ struct v9fs_session_info *v9ses;
struct p9_fid *fid = NULL, *dfid = NULL;
struct inode *inode;
struct p9_qid qid;
@@ -758,6 +807,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
+ v9ses = v9fs_inode2v9ses(dir);
dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
@@ -788,7 +838,7 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
err);
goto error;
}
- inode = v9fs_fid_iget_dotl(dir->i_sb, fid, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index f52fdf42945c..489db161abc9 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -139,7 +139,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
else
sb->s_d_op = &v9fs_dentry_operations;
- inode = v9fs_get_inode_from_fid(v9ses, fid, sb, true);
+ inode = v9fs_get_new_inode_from_fid(v9ses, fid, sb);
if (IS_ERR(inode)) {
retval = PTR_ERR(inode);
goto release_sb;
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index f0b999a4961b..017c48a80203 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -6,7 +6,8 @@
*/
#include <linux/module.h>
#include <linux/init.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
@@ -115,87 +116,61 @@ static int adfs_show_options(struct seq_file *seq, struct dentry *root)
return 0;
}
-enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err};
+enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix};
-static const match_table_t tokens = {
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_ownmask, "ownmask=%o"},
- {Opt_othmask, "othmask=%o"},
- {Opt_ftsuffix, "ftsuffix=%u"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec adfs_param_spec[] = {
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("ownmask", Opt_ownmask),
+ fsparam_u32oct ("othmask", Opt_othmask),
+ fsparam_u32 ("ftsuffix", Opt_ftsuffix),
+ {}
};
-static int parse_options(struct super_block *sb, struct adfs_sb_info *asb,
- char *options)
+static int adfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- int option;
-
- if (!options)
- return 0;
-
- while ((p = strsep(&options, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(args, &option))
- return -EINVAL;
- asb->s_uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(asb->s_uid))
- return -EINVAL;
- break;
- case Opt_gid:
- if (match_int(args, &option))
- return -EINVAL;
- asb->s_gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(asb->s_gid))
- return -EINVAL;
- break;
- case Opt_ownmask:
- if (match_octal(args, &option))
- return -EINVAL;
- asb->s_owner_mask = option;
- break;
- case Opt_othmask:
- if (match_octal(args, &option))
- return -EINVAL;
- asb->s_other_mask = option;
- break;
- case Opt_ftsuffix:
- if (match_int(args, &option))
- return -EINVAL;
- asb->s_ftsuffix = option;
- break;
- default:
- adfs_msg(sb, KERN_ERR,
- "unrecognised mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
+ struct adfs_sb_info *asb = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, adfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_uid:
+ asb->s_uid = result.uid;
+ break;
+ case Opt_gid:
+ asb->s_gid = result.gid;
+ break;
+ case Opt_ownmask:
+ asb->s_owner_mask = result.uint_32;
+ break;
+ case Opt_othmask:
+ asb->s_other_mask = result.uint_32;
+ break;
+ case Opt_ftsuffix:
+ asb->s_ftsuffix = result.uint_32;
+ break;
+ default:
+ return -EINVAL;
}
return 0;
}
-static int adfs_remount(struct super_block *sb, int *flags, char *data)
+static int adfs_reconfigure(struct fs_context *fc)
{
- struct adfs_sb_info temp_asb;
- int ret;
+ struct adfs_sb_info *new_asb = fc->s_fs_info;
+ struct adfs_sb_info *asb = ADFS_SB(fc->root->d_sb);
- sync_filesystem(sb);
- *flags |= ADFS_SB_FLAGS;
+ sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= ADFS_SB_FLAGS;
- temp_asb = *ADFS_SB(sb);
- ret = parse_options(sb, &temp_asb, data);
- if (ret == 0)
- *ADFS_SB(sb) = temp_asb;
+ /* Structure copy newly parsed options */
+ *asb = *new_asb;
- return ret;
+ return 0;
}
static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -273,7 +248,6 @@ static const struct super_operations adfs_sops = {
.write_inode = adfs_write_inode,
.put_super = adfs_put_super,
.statfs = adfs_statfs,
- .remount_fs = adfs_remount,
.show_options = adfs_show_options,
};
@@ -361,34 +335,21 @@ static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh,
return 0;
}
-static int adfs_fill_super(struct super_block *sb, void *data, int silent)
+static int adfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct adfs_discrecord *dr;
struct object_info root_obj;
- struct adfs_sb_info *asb;
+ struct adfs_sb_info *asb = sb->s_fs_info;
struct inode *root;
int ret = -EINVAL;
+ int silent = fc->sb_flags & SB_SILENT;
sb->s_flags |= ADFS_SB_FLAGS;
- asb = kzalloc(sizeof(*asb), GFP_KERNEL);
- if (!asb)
- return -ENOMEM;
-
sb->s_fs_info = asb;
sb->s_magic = ADFS_SUPER_MAGIC;
sb->s_time_gran = 10000000;
- /* set default options */
- asb->s_uid = GLOBAL_ROOT_UID;
- asb->s_gid = GLOBAL_ROOT_GID;
- asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
- asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
- asb->s_ftsuffix = 0;
-
- if (parse_options(sb, asb, data))
- goto error;
-
/* Try to probe the filesystem boot block */
ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk);
if (ret == -EILSEQ)
@@ -453,18 +414,61 @@ error:
return ret;
}
-static struct dentry *adfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int adfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, adfs_fill_super);
+}
+
+static void adfs_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super);
+ struct adfs_context *asb = fc->s_fs_info;
+
+ kfree(asb);
+}
+
+static const struct fs_context_operations adfs_context_ops = {
+ .parse_param = adfs_parse_param,
+ .get_tree = adfs_get_tree,
+ .reconfigure = adfs_reconfigure,
+ .free = adfs_free_fc,
+};
+
+static int adfs_init_fs_context(struct fs_context *fc)
+{
+ struct adfs_sb_info *asb;
+
+ asb = kzalloc(sizeof(struct adfs_sb_info), GFP_KERNEL);
+ if (!asb)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct adfs_sb_info *old_asb = ADFS_SB(sb);
+
+ /* structure copy existing options before parsing */
+ *asb = *old_asb;
+ } else {
+ /* set default options */
+ asb->s_uid = GLOBAL_ROOT_UID;
+ asb->s_gid = GLOBAL_ROOT_GID;
+ asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK;
+ asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK;
+ asb->s_ftsuffix = 0;
+ }
+
+ fc->ops = &adfs_context_ops;
+ fc->s_fs_info = asb;
+
+ return 0;
}
static struct file_system_type adfs_fs_type = {
.owner = THIS_MODULE,
.name = "adfs",
- .mount = adfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = adfs_init_fs_context,
+ .parameters = adfs_param_spec,
};
MODULE_ALIAS_FS("adfs");
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 3c5821339609..2fa40337776d 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -14,7 +14,8 @@
#include <linux/module.h>
#include <linux/init.h>
#include <linux/statfs.h>
-#include <linux/parser.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
#include <linux/magic.h>
#include <linux/sched.h>
#include <linux/cred.h>
@@ -27,7 +28,6 @@
static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
static int affs_show_options(struct seq_file *m, struct dentry *root);
-static int affs_remount (struct super_block *sb, int *flags, char *data);
static void
affs_commit_super(struct super_block *sb, int wait)
@@ -155,140 +155,114 @@ static const struct super_operations affs_sops = {
.put_super = affs_put_super,
.sync_fs = affs_sync_fs,
.statfs = affs_statfs,
- .remount_fs = affs_remount,
.show_options = affs_show_options,
};
enum {
Opt_bs, Opt_mode, Opt_mufs, Opt_notruncate, Opt_prefix, Opt_protect,
Opt_reserved, Opt_root, Opt_setgid, Opt_setuid,
- Opt_verbose, Opt_volume, Opt_ignore, Opt_err,
+ Opt_verbose, Opt_volume, Opt_ignore,
};
-static const match_table_t tokens = {
- {Opt_bs, "bs=%u"},
- {Opt_mode, "mode=%o"},
- {Opt_mufs, "mufs"},
- {Opt_notruncate, "nofilenametruncate"},
- {Opt_prefix, "prefix=%s"},
- {Opt_protect, "protect"},
- {Opt_reserved, "reserved=%u"},
- {Opt_root, "root=%u"},
- {Opt_setgid, "setgid=%u"},
- {Opt_setuid, "setuid=%u"},
- {Opt_verbose, "verbose"},
- {Opt_volume, "volume=%s"},
- {Opt_ignore, "grpquota"},
- {Opt_ignore, "noquota"},
- {Opt_ignore, "quota"},
- {Opt_ignore, "usrquota"},
- {Opt_err, NULL},
+struct affs_context {
+ kuid_t uid; /* uid to override */
+ kgid_t gid; /* gid to override */
+ unsigned int mode; /* mode to override */
+ unsigned int reserved; /* Number of reserved blocks */
+ int root_block; /* FFS root block number */
+ int blocksize; /* Initial device blksize */
+ char *prefix; /* Prefix for volumes and assigns */
+ char volume[32]; /* Vol. prefix for absolute symlinks */
+ unsigned long mount_flags; /* Options */
};
-static int
-parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, s32 *root,
- int *blocksize, char **prefix, char *volume, unsigned long *mount_opts)
+static const struct fs_parameter_spec affs_param_spec[] = {
+ fsparam_u32 ("bs", Opt_bs),
+ fsparam_u32oct ("mode", Opt_mode),
+ fsparam_flag ("mufs", Opt_mufs),
+ fsparam_flag ("nofilenametruncate", Opt_notruncate),
+ fsparam_string ("prefix", Opt_prefix),
+ fsparam_flag ("protect", Opt_protect),
+ fsparam_u32 ("reserved", Opt_reserved),
+ fsparam_u32 ("root", Opt_root),
+ fsparam_gid ("setgid", Opt_setgid),
+ fsparam_uid ("setuid", Opt_setuid),
+ fsparam_flag ("verbose", Opt_verbose),
+ fsparam_string ("volume", Opt_volume),
+ fsparam_flag ("grpquota", Opt_ignore),
+ fsparam_flag ("noquota", Opt_ignore),
+ fsparam_flag ("quota", Opt_ignore),
+ fsparam_flag ("usrquota", Opt_ignore),
+ {},
+};
+
+static int affs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
-
- /* Fill in defaults */
-
- *uid = current_uid();
- *gid = current_gid();
- *reserved = 2;
- *root = -1;
- *blocksize = -1;
- volume[0] = ':';
- volume[1] = 0;
- *mount_opts = 0;
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token, n, option;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_bs:
- if (match_int(&args[0], &n))
- return 0;
- if (n != 512 && n != 1024 && n != 2048
- && n != 4096) {
- pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
- return 0;
- }
- *blocksize = n;
- break;
- case Opt_mode:
- if (match_octal(&args[0], &option))
- return 0;
- *mode = option & 0777;
- affs_set_opt(*mount_opts, SF_SETMODE);
- break;
- case Opt_mufs:
- affs_set_opt(*mount_opts, SF_MUFS);
- break;
- case Opt_notruncate:
- affs_set_opt(*mount_opts, SF_NO_TRUNCATE);
- break;
- case Opt_prefix:
- kfree(*prefix);
- *prefix = match_strdup(&args[0]);
- if (!*prefix)
- return 0;
- affs_set_opt(*mount_opts, SF_PREFIX);
- break;
- case Opt_protect:
- affs_set_opt(*mount_opts, SF_IMMUTABLE);
- break;
- case Opt_reserved:
- if (match_int(&args[0], reserved))
- return 0;
- break;
- case Opt_root:
- if (match_int(&args[0], root))
- return 0;
- break;
- case Opt_setgid:
- if (match_int(&args[0], &option))
- return 0;
- *gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(*gid))
- return 0;
- affs_set_opt(*mount_opts, SF_SETGID);
- break;
- case Opt_setuid:
- if (match_int(&args[0], &option))
- return 0;
- *uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(*uid))
- return 0;
- affs_set_opt(*mount_opts, SF_SETUID);
- break;
- case Opt_verbose:
- affs_set_opt(*mount_opts, SF_VERBOSE);
- break;
- case Opt_volume: {
- char *vol = match_strdup(&args[0]);
- if (!vol)
- return 0;
- strscpy(volume, vol, 32);
- kfree(vol);
- break;
- }
- case Opt_ignore:
- /* Silently ignore the quota options */
- break;
- default:
- pr_warn("Unrecognized mount option \"%s\" or missing value\n",
- p);
- return 0;
+ struct affs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int n;
+ int opt;
+
+ opt = fs_parse(fc, affs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_bs:
+ n = result.uint_32;
+ if (n != 512 && n != 1024 && n != 2048
+ && n != 4096) {
+ pr_warn("Invalid blocksize (512, 1024, 2048, 4096 allowed)\n");
+ return -EINVAL;
}
+ ctx->blocksize = n;
+ break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 0777;
+ affs_set_opt(ctx->mount_flags, SF_SETMODE);
+ break;
+ case Opt_mufs:
+ affs_set_opt(ctx->mount_flags, SF_MUFS);
+ break;
+ case Opt_notruncate:
+ affs_set_opt(ctx->mount_flags, SF_NO_TRUNCATE);
+ break;
+ case Opt_prefix:
+ kfree(ctx->prefix);
+ ctx->prefix = param->string;
+ param->string = NULL;
+ affs_set_opt(ctx->mount_flags, SF_PREFIX);
+ break;
+ case Opt_protect:
+ affs_set_opt(ctx->mount_flags, SF_IMMUTABLE);
+ break;
+ case Opt_reserved:
+ ctx->reserved = result.uint_32;
+ break;
+ case Opt_root:
+ ctx->root_block = result.uint_32;
+ break;
+ case Opt_setgid:
+ ctx->gid = result.gid;
+ affs_set_opt(ctx->mount_flags, SF_SETGID);
+ break;
+ case Opt_setuid:
+ ctx->uid = result.uid;
+ affs_set_opt(ctx->mount_flags, SF_SETUID);
+ break;
+ case Opt_verbose:
+ affs_set_opt(ctx->mount_flags, SF_VERBOSE);
+ break;
+ case Opt_volume:
+ strscpy(ctx->volume, param->string, 32);
+ break;
+ case Opt_ignore:
+ /* Silently ignore the quota options */
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
static int affs_show_options(struct seq_file *m, struct dentry *root)
@@ -329,27 +303,22 @@ static int affs_show_options(struct seq_file *m, struct dentry *root)
* hopefully have the guts to do so. Until then: sorry for the mess.
*/
-static int affs_fill_super(struct super_block *sb, void *data, int silent)
+static int affs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct affs_sb_info *sbi;
+ struct affs_context *ctx = fc->fs_private;
struct buffer_head *root_bh = NULL;
struct buffer_head *boot_bh;
struct inode *root_inode = NULL;
- s32 root_block;
+ int silent = fc->sb_flags & SB_SILENT;
int size, blocksize;
u32 chksum;
int num_bm;
int i, j;
- kuid_t uid;
- kgid_t gid;
- int reserved;
- unsigned long mount_flags;
int tmp_flags; /* fix remount prototype... */
u8 sig[4];
int ret;
- pr_debug("read_super(%s)\n", data ? (const char *)data : "no options");
-
sb->s_magic = AFFS_SUPER_MAGIC;
sb->s_op = &affs_sops;
sb->s_flags |= SB_NODIRATIME;
@@ -369,19 +338,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sb_work, flush_superblock);
- if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block,
- &blocksize,&sbi->s_prefix,
- sbi->s_volume, &mount_flags)) {
- pr_err("Error parsing options\n");
- return -EINVAL;
- }
- /* N.B. after this point s_prefix must be released */
+ sbi->s_flags = ctx->mount_flags;
+ sbi->s_mode = ctx->mode;
+ sbi->s_uid = ctx->uid;
+ sbi->s_gid = ctx->gid;
+ sbi->s_reserved = ctx->reserved;
+ sbi->s_prefix = ctx->prefix;
+ ctx->prefix = NULL;
+ memcpy(sbi->s_volume, ctx->volume, 32);
- sbi->s_flags = mount_flags;
- sbi->s_mode = i;
- sbi->s_uid = uid;
- sbi->s_gid = gid;
- sbi->s_reserved= reserved;
+ /* N.B. after this point s_prefix must be released */
/* Get the size of the device in 512-byte blocks.
* If we later see that the partition uses bigger
@@ -396,15 +362,16 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
i = bdev_logical_block_size(sb->s_bdev);
j = PAGE_SIZE;
+ blocksize = ctx->blocksize;
if (blocksize > 0) {
i = j = blocksize;
size = size / (blocksize / 512);
}
for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) {
- sbi->s_root_block = root_block;
- if (root_block < 0)
- sbi->s_root_block = (reserved + size - 1) / 2;
+ sbi->s_root_block = ctx->root_block;
+ if (ctx->root_block < 0)
+ sbi->s_root_block = (ctx->reserved + size - 1) / 2;
pr_debug("setting blocksize to %d\n", blocksize);
affs_set_blocksize(sb, blocksize);
sbi->s_partition_size = size;
@@ -424,7 +391,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
"size=%d, reserved=%d\n",
sb->s_id,
sbi->s_root_block + num_bm,
- blocksize, size, reserved);
+ ctx->blocksize, size, ctx->reserved);
root_bh = affs_bread(sb, sbi->s_root_block + num_bm);
if (!root_bh)
continue;
@@ -447,7 +414,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
got_root:
/* Keep super block in cache */
sbi->s_root_bh = root_bh;
- root_block = sbi->s_root_block;
+ ctx->root_block = sbi->s_root_block;
/* Find out which kind of FS we have */
boot_bh = sb_bread(sb, 0);
@@ -506,7 +473,7 @@ got_root:
return -EINVAL;
}
- if (affs_test_opt(mount_flags, SF_VERBOSE)) {
+ if (affs_test_opt(ctx->mount_flags, SF_VERBOSE)) {
u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0];
pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n",
len > 31 ? 31 : len,
@@ -528,7 +495,7 @@ got_root:
/* set up enough so that it can read an inode */
- root_inode = affs_iget(sb, root_block);
+ root_inode = affs_iget(sb, ctx->root_block);
if (IS_ERR(root_inode))
return PTR_ERR(root_inode);
@@ -548,56 +515,43 @@ got_root:
return 0;
}
-static int
-affs_remount(struct super_block *sb, int *flags, char *data)
+static int affs_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+ struct affs_context *ctx = fc->fs_private;
struct affs_sb_info *sbi = AFFS_SB(sb);
- int blocksize;
- kuid_t uid;
- kgid_t gid;
- int mode;
- int reserved;
- int root_block;
- unsigned long mount_flags;
int res = 0;
- char volume[32];
- char *prefix = NULL;
-
- pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
sync_filesystem(sb);
- *flags |= SB_NODIRATIME;
-
- memcpy(volume, sbi->s_volume, 32);
- if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block,
- &blocksize, &prefix, volume,
- &mount_flags)) {
- kfree(prefix);
- return -EINVAL;
- }
+ fc->sb_flags |= SB_NODIRATIME;
flush_delayed_work(&sbi->sb_work);
- sbi->s_flags = mount_flags;
- sbi->s_mode = mode;
- sbi->s_uid = uid;
- sbi->s_gid = gid;
+ /*
+ * NB: Historically, only mount_flags, mode, uid, gid, prefix,
+ * and volume are accepted during remount.
+ */
+ sbi->s_flags = ctx->mount_flags;
+ sbi->s_mode = ctx->mode;
+ sbi->s_uid = ctx->uid;
+ sbi->s_gid = ctx->gid;
/* protect against readers */
spin_lock(&sbi->symlink_lock);
- if (prefix) {
+ if (ctx->prefix) {
kfree(sbi->s_prefix);
- sbi->s_prefix = prefix;
+ sbi->s_prefix = ctx->prefix;
+ ctx->prefix = NULL;
}
- memcpy(sbi->s_volume, volume, 32);
+ memcpy(sbi->s_volume, ctx->volume, 32);
spin_unlock(&sbi->symlink_lock);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (*flags & SB_RDONLY)
+ if (fc->sb_flags & SB_RDONLY)
affs_free_bitmap(sb);
else
- res = affs_init_bitmap(sb, flags);
+ res = affs_init_bitmap(sb, &fc->sb_flags);
return res;
}
@@ -624,10 +578,9 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct dentry *affs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int affs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, affs_fill_super);
+ return get_tree_bdev(fc, affs_fill_super);
}
static void affs_kill_sb(struct super_block *sb)
@@ -643,12 +596,61 @@ static void affs_kill_sb(struct super_block *sb)
}
}
+static void affs_free_fc(struct fs_context *fc)
+{
+ struct affs_context *ctx = fc->fs_private;
+
+ kfree(ctx->prefix);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations affs_context_ops = {
+ .parse_param = affs_parse_param,
+ .get_tree = affs_get_tree,
+ .reconfigure = affs_reconfigure,
+ .free = affs_free_fc,
+};
+
+static int affs_init_fs_context(struct fs_context *fc)
+{
+ struct affs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct affs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct affs_sb_info *sbi = AFFS_SB(sb);
+
+ /*
+ * NB: historically, no options other than volume were
+ * preserved across a remount unless they were explicitly
+ * passed in.
+ */
+ memcpy(ctx->volume, sbi->s_volume, 32);
+ } else {
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+ ctx->reserved = 2;
+ ctx->root_block = -1;
+ ctx->blocksize = -1;
+ ctx->volume[0] = ':';
+ }
+
+ fc->ops = &affs_context_ops;
+ fc->fs_private = ctx;
+
+ return 0;
+}
+
static struct file_system_type affs_fs_type = {
.owner = THIS_MODULE,
.name = "affs",
- .mount = affs_mount,
.kill_sb = affs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = affs_init_fs_context,
+ .parameters = affs_param_spec,
};
MODULE_ALIAS_FS("affs");
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index f8622ed72e08..ada363af5aab 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -12,6 +12,7 @@
#include <linux/swap.h>
#include <linux/ctype.h>
#include <linux/sched.h>
+#include <linux/iversion.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"
#include "afs_fs.h"
@@ -1823,6 +1824,8 @@ error:
static void afs_rename_success(struct afs_operation *op)
{
+ struct afs_vnode *vnode = AFS_FS_I(d_inode(op->dentry));
+
_enter("op=%08x", op->debug_id);
op->ctime = op->file[0].scb.status.mtime_client;
@@ -1832,6 +1835,22 @@ static void afs_rename_success(struct afs_operation *op)
op->ctime = op->file[1].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[1]);
}
+
+ /* If we're moving a subdir between dirs, we need to update
+ * its DV counter too as the ".." will be altered.
+ */
+ if (S_ISDIR(vnode->netfs.inode.i_mode) &&
+ op->file[0].vnode != op->file[1].vnode) {
+ u64 new_dv;
+
+ write_seqlock(&vnode->cb_lock);
+
+ new_dv = vnode->status.data_version + 1;
+ vnode->status.data_version = new_dv;
+ inode_set_iversion_raw(&vnode->netfs.inode, new_dv);
+
+ write_sequnlock(&vnode->cb_lock);
+ }
}
static void afs_rename_edit_dir(struct afs_operation *op)
@@ -1873,6 +1892,12 @@ static void afs_rename_edit_dir(struct afs_operation *op)
&vnode->fid, afs_edit_dir_for_rename_2);
}
+ if (S_ISDIR(vnode->netfs.inode.i_mode) &&
+ new_dvnode != orig_dvnode &&
+ test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+ afs_edit_dir_update_dotdot(vnode, new_dvnode,
+ afs_edit_dir_for_rename_sub);
+
new_inode = d_inode(new_dentry);
if (new_inode) {
spin_lock(&new_inode->i_lock);
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index a71bff10496b..fe223fb78111 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -127,10 +127,10 @@ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index)
/*
* Scan a directory block looking for a dirent of the right name.
*/
-static int afs_dir_scan_block(union afs_xdr_dir_block *block, struct qstr *name,
+static int afs_dir_scan_block(const union afs_xdr_dir_block *block, const struct qstr *name,
unsigned int blocknum)
{
- union afs_xdr_dirent *de;
+ const union afs_xdr_dirent *de;
u64 bitmap;
int d, len, n;
@@ -492,3 +492,90 @@ error:
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
goto out_unmap;
}
+
+/*
+ * Edit a subdirectory that has been moved between directories to update the
+ * ".." entry.
+ */
+void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
+ enum afs_edit_dir_reason why)
+{
+ union afs_xdr_dir_block *block;
+ union afs_xdr_dirent *de;
+ struct folio *folio;
+ unsigned int nr_blocks, b;
+ pgoff_t index;
+ loff_t i_size;
+ int slot;
+
+ _enter("");
+
+ i_size = i_size_read(&vnode->netfs.inode);
+ if (i_size < AFS_DIR_BLOCK_SIZE) {
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ return;
+ }
+ nr_blocks = i_size / AFS_DIR_BLOCK_SIZE;
+
+ /* Scan every directory block for the ".." dirent. Each folio
+ * contains two or more directory blocks.
+ */
+ for (b = 0; b < nr_blocks; b++) {
+ index = b / AFS_DIR_BLOCKS_PER_PAGE;
+ folio = afs_dir_get_folio(vnode, index);
+ if (!folio)
+ goto error;
+
+ block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_pos(folio));
+
+ /* Abandon the edit if we got a callback break. */
+ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
+ goto invalidated;
+
+ slot = afs_dir_scan_block(block, &dotdot_name, b);
+ if (slot >= 0)
+ goto found_dirent;
+
+ kunmap_local(block);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ /* Didn't find the dirent to clobber. Download the directory again. */
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_nodd,
+ 0, 0, 0, 0, "..");
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out;
+
+found_dirent:
+ de = &block->dirents[slot];
+ de->u.vnode = htonl(new_dvnode->fid.vnode);
+ de->u.unique = htonl(new_dvnode->fid.unique);
+
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_dd, b, slot,
+ ntohl(de->u.vnode), ntohl(de->u.unique), "..");
+
+ kunmap_local(block);
+ folio_unlock(folio);
+ folio_put(folio);
+ inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version);
+
+out:
+ _leave("");
+ return;
+
+invalidated:
+ kunmap_local(block);
+ folio_unlock(folio);
+ folio_put(folio);
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_inval,
+ 0, 0, 0, 0, "..");
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out;
+
+error:
+ trace_afs_edit_dir(vnode, why, afs_edit_dir_update_error,
+ 0, 0, 0, 0, "..");
+ clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
+ goto out;
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 6e1d3c4daf72..c9d620175e80 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -130,6 +130,7 @@ struct afs_call {
wait_queue_head_t waitq; /* processes awaiting completion */
struct work_struct async_work; /* async I/O processor */
struct work_struct work; /* actual work processor */
+ struct work_struct free_work; /* Deferred free processor */
struct rxrpc_call *rxcall; /* RxRPC call handle */
struct rxrpc_peer *peer; /* Remote endpoint */
struct key *key; /* security for this call */
@@ -1072,6 +1073,8 @@ extern void afs_check_for_remote_deletion(struct afs_operation *);
extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *,
enum afs_edit_dir_reason);
extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason);
+void afs_edit_dir_update_dotdot(struct afs_vnode *vnode, struct afs_vnode *new_dvnode,
+ enum afs_edit_dir_reason why);
/*
* dir_silly.c
@@ -1331,6 +1334,7 @@ extern int __net_init afs_open_socket(struct afs_net *);
extern void __net_exit afs_close_socket(struct afs_net *);
extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *);
+void afs_deferred_put_call(struct afs_call *call);
void afs_make_call(struct afs_call *call, gfp_t gfp);
void afs_wait_for_call_to_complete(struct afs_call *call);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index c453428f3c8b..9f2a3bb56ec6 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -18,6 +18,7 @@
struct workqueue_struct *afs_async_calls;
+static void afs_deferred_free_worker(struct work_struct *work);
static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_process_async_call(struct work_struct *);
@@ -149,6 +150,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
call->debug_id = atomic_inc_return(&rxrpc_debug_id);
refcount_set(&call->ref, 1);
INIT_WORK(&call->async_work, afs_process_async_call);
+ INIT_WORK(&call->free_work, afs_deferred_free_worker);
init_waitqueue_head(&call->waitq);
spin_lock_init(&call->state_lock);
call->iter = &call->def_iter;
@@ -159,6 +161,36 @@ static struct afs_call *afs_alloc_call(struct afs_net *net,
return call;
}
+static void afs_free_call(struct afs_call *call)
+{
+ struct afs_net *net = call->net;
+ int o;
+
+ ASSERT(!work_pending(&call->async_work));
+
+ rxrpc_kernel_put_peer(call->peer);
+
+ if (call->rxcall) {
+ rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
+ rxrpc_kernel_put_call(net->socket, call->rxcall);
+ call->rxcall = NULL;
+ }
+ if (call->type->destructor)
+ call->type->destructor(call);
+
+ afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
+ kfree(call->request);
+
+ o = atomic_read(&net->nr_outstanding_calls);
+ trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
+ __builtin_return_address(0));
+ kfree(call);
+
+ o = atomic_dec_return(&net->nr_outstanding_calls);
+ if (o == 0)
+ wake_up_var(&net->nr_outstanding_calls);
+}
+
/*
* Dispose of a reference on a call.
*/
@@ -173,32 +205,34 @@ void afs_put_call(struct afs_call *call)
o = atomic_read(&net->nr_outstanding_calls);
trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
__builtin_return_address(0));
+ if (zero)
+ afs_free_call(call);
+}
- if (zero) {
- ASSERT(!work_pending(&call->async_work));
- ASSERT(call->type->name != NULL);
-
- rxrpc_kernel_put_peer(call->peer);
-
- if (call->rxcall) {
- rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
- rxrpc_kernel_put_call(net->socket, call->rxcall);
- call->rxcall = NULL;
- }
- if (call->type->destructor)
- call->type->destructor(call);
+static void afs_deferred_free_worker(struct work_struct *work)
+{
+ struct afs_call *call = container_of(work, struct afs_call, free_work);
- afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
- kfree(call->request);
+ afs_free_call(call);
+}
- trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
- __builtin_return_address(0));
- kfree(call);
+/*
+ * Dispose of a reference on a call, deferring the cleanup to a workqueue
+ * to avoid lock recursion.
+ */
+void afs_deferred_put_call(struct afs_call *call)
+{
+ struct afs_net *net = call->net;
+ unsigned int debug_id = call->debug_id;
+ bool zero;
+ int r, o;
- o = atomic_dec_return(&net->nr_outstanding_calls);
- if (o == 0)
- wake_up_var(&net->nr_outstanding_calls);
- }
+ zero = __refcount_dec_and_test(&call->ref, &r);
+ o = atomic_read(&net->nr_outstanding_calls);
+ trace_afs_call(debug_id, afs_call_trace_put, r - 1, o,
+ __builtin_return_address(0));
+ if (zero)
+ schedule_work(&call->free_work);
}
static struct afs_call *afs_get_call(struct afs_call *call,
@@ -640,7 +674,8 @@ static void afs_wake_up_call_waiter(struct sock *sk, struct rxrpc_call *rxcall,
}
/*
- * wake up an asynchronous call
+ * Wake up an asynchronous call. The caller is holding the call notify
+ * spinlock around this, so we can't call afs_put_call().
*/
static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
unsigned long call_user_ID)
@@ -657,7 +692,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
__builtin_return_address(0));
if (!queue_work(afs_async_calls, &call->async_work))
- afs_put_call(call);
+ afs_deferred_put_call(call);
}
}
diff --git a/fs/aio.c b/fs/aio.c
index e8920178b50f..72e3970f4225 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -2191,7 +2191,6 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
return -EINVAL;
spin_lock_irq(&ctx->ctx_lock);
- /* TODO: use a hash or array, this sucks. */
list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) {
if (kiocb->ki_res.obj == obj) {
ret = kiocb->ki_cancel(&kiocb->rw);
diff --git a/fs/attr.c b/fs/attr.c
index c04d19b58f12..9caf63d20d03 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -272,6 +272,47 @@ out_big:
EXPORT_SYMBOL(inode_newsize_ok);
/**
+ * setattr_copy_mgtime - update timestamps for mgtime inodes
+ * @inode: inode timestamps to be updated
+ * @attr: attrs for the update
+ *
+ * With multigrain timestamps, take more care to prevent races when
+ * updating the ctime. Always update the ctime to the very latest using
+ * the standard mechanism, and use that to populate the atime and mtime
+ * appropriately (unless those are being set to specific values).
+ */
+static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr)
+{
+ unsigned int ia_valid = attr->ia_valid;
+ struct timespec64 now;
+
+ if (ia_valid & ATTR_CTIME) {
+ /*
+ * In the case of an update for a write delegation, we must respect
+ * the value in ia_ctime and not use the current time.
+ */
+ if (ia_valid & ATTR_DELEG)
+ now = inode_set_ctime_deleg(inode, attr->ia_ctime);
+ else
+ now = inode_set_ctime_current(inode);
+ } else {
+ /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */
+ WARN_ON_ONCE(ia_valid & ATTR_MTIME);
+ now = current_time(inode);
+ }
+
+ if (ia_valid & ATTR_ATIME_SET)
+ inode_set_atime_to_ts(inode, attr->ia_atime);
+ else if (ia_valid & ATTR_ATIME)
+ inode_set_atime_to_ts(inode, now);
+
+ if (ia_valid & ATTR_MTIME_SET)
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
+ else if (ia_valid & ATTR_MTIME)
+ inode_set_mtime_to_ts(inode, now);
+}
+
+/**
* setattr_copy - copy simple metadata updates into the generic inode
* @idmap: idmap of the mount the inode was found from
* @inode: the inode to be updated
@@ -303,12 +344,6 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
i_uid_update(idmap, attr, inode);
i_gid_update(idmap, attr, inode);
- if (ia_valid & ATTR_ATIME)
- inode_set_atime_to_ts(inode, attr->ia_atime);
- if (ia_valid & ATTR_MTIME)
- inode_set_mtime_to_ts(inode, attr->ia_mtime);
- if (ia_valid & ATTR_CTIME)
- inode_set_ctime_to_ts(inode, attr->ia_ctime);
if (ia_valid & ATTR_MODE) {
umode_t mode = attr->ia_mode;
if (!in_group_or_capable(idmap, inode,
@@ -316,6 +351,20 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode,
mode &= ~S_ISGID;
inode->i_mode = mode;
}
+
+ if (is_mgtime(inode))
+ return setattr_copy_mgtime(inode, attr);
+
+ if (ia_valid & ATTR_ATIME)
+ inode_set_atime_to_ts(inode, attr->ia_atime);
+ if (ia_valid & ATTR_MTIME)
+ inode_set_mtime_to_ts(inode, attr->ia_mtime);
+ if (ia_valid & ATTR_CTIME) {
+ if (ia_valid & ATTR_DELEG)
+ inode_set_ctime_deleg(inode, attr->ia_ctime);
+ else
+ inode_set_ctime_to_ts(inode, attr->ia_ctime);
+ }
}
EXPORT_SYMBOL(setattr_copy);
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index f011e026358e..6d57efbb8110 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -110,6 +110,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
*/
static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
{
+ unsigned int inr = _IOC_NR(cmd);
int err;
err = check_dev_ioctl_version(cmd, param);
@@ -133,7 +134,7 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
* check_name() return for AUTOFS_DEV_IOCTL_TIMEOUT_CMD.
*/
err = check_name(param->path);
- if (cmd == AUTOFS_DEV_IOCTL_TIMEOUT_CMD)
+ if (inr == AUTOFS_DEV_IOCTL_TIMEOUT_CMD)
err = err ? 0 : -EINVAL;
if (err) {
pr_warn("invalid path supplied for cmd(0x%08x)\n",
@@ -141,8 +142,6 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param)
goto out;
}
} else {
- unsigned int inr = _IOC_NR(cmd);
-
if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD ||
inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD ||
inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) {
diff --git a/fs/backing-file.c b/fs/backing-file.c
index 8860dac58c37..09a9be945d45 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -80,7 +80,7 @@ struct backing_aio {
refcount_t ref;
struct kiocb *orig_iocb;
/* used for aio completion */
- void (*end_write)(struct file *);
+ void (*end_write)(struct file *, loff_t, ssize_t);
struct work_struct work;
long res;
};
@@ -109,7 +109,7 @@ static void backing_aio_cleanup(struct backing_aio *aio, long res)
struct kiocb *orig_iocb = aio->orig_iocb;
if (aio->end_write)
- aio->end_write(orig_iocb->ki_filp);
+ aio->end_write(orig_iocb->ki_filp, iocb->ki_pos, res);
orig_iocb->ki_pos = iocb->ki_pos;
backing_aio_put(aio);
@@ -239,7 +239,7 @@ ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
if (ctx->end_write)
- ctx->end_write(ctx->user_file);
+ ctx->end_write(ctx->user_file, iocb->ki_pos, ret);
} else {
struct backing_aio *aio;
@@ -317,7 +317,7 @@ ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
revert_creds(old_cred);
if (ctx->end_write)
- ctx->end_write(ctx->user_file);
+ ctx->end_write(ctx->user_file, ppos ? *ppos : 0, ret);
return ret;
}
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 4e4a448f6931..c84a91572a1d 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -639,6 +639,16 @@ int bch2_alloc_read(struct bch_fs *c)
continue;
}
+ if (k.k->p.offset < ca->mi.first_bucket) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket));
+ continue;
+ }
+
+ if (k.k->p.offset >= ca->mi.nbuckets) {
+ bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0));
+ continue;
+ }
+
struct bch_alloc_v4 a;
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
0;
@@ -1967,7 +1977,7 @@ static void bch2_do_discards_fast_work(struct work_struct *work)
ca->mi.bucket_size,
GFP_KERNEL);
- int ret = bch2_trans_do(c, NULL, NULL,
+ int ret = bch2_trans_commit_do(c, NULL, NULL,
BCH_WATERMARK_btree|
BCH_TRANS_COMMIT_no_enospc,
bch2_clear_bucket_needs_discard(trans, POS(ca->dev_idx, bucket)));
@@ -2127,14 +2137,15 @@ static void bch2_do_invalidates_work(struct work_struct *work)
struct bkey_s_c k = next_lru_key(trans, &iter, ca, &wrapped);
ret = bkey_err(k);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- continue;
if (ret)
- break;
+ goto restart_err;
if (!k.k)
break;
ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate);
+restart_err:
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ continue;
if (ret)
break;
@@ -2340,24 +2351,19 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
/* Bucket IO clocks: */
-int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
- size_t bucket_nr, int rw)
+static int __bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- u64 now;
- int ret = 0;
-
- if (bch2_trans_relock(trans))
- bch2_trans_begin(trans);
- a = bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
- ret = PTR_ERR_OR_ZERO(a);
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a =
+ bch2_trans_start_alloc_update_noupdate(trans, &iter, POS(dev, bucket_nr));
+ int ret = PTR_ERR_OR_ZERO(a);
if (ret)
return ret;
- now = bch2_current_io_time(c, rw);
+ u64 now = bch2_current_io_time(c, rw);
if (a->v.io_time[rw] == now)
goto out;
@@ -2370,6 +2376,15 @@ out:
return ret;
}
+int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev,
+ size_t bucket_nr, int rw)
+{
+ if (bch2_trans_relock(trans))
+ bch2_trans_begin(trans);
+
+ return nested_lockrestart_do(trans, __bch2_bucket_io_time_reset(trans, dev, bucket_nr, rw));
+}
+
/* Startup/shutdown (ro/rw): */
void bch2_recalc_capacity(struct bch_fs *c)
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index f8e87c6721b1..163a67b97a40 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -168,6 +168,9 @@ static inline bool data_type_movable(enum bch_data_type type)
static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
struct bch_dev *ca)
{
+ if (a.data_type >= BCH_DATA_NR)
+ return 0;
+
if (!data_type_movable(a.data_type) ||
!bch2_bucket_sectors_fragmented(ca, a))
return 0;
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index d0e0b56892e3..372178c8d416 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -162,6 +162,10 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob)
ARRAY_SIZE(c->open_buckets_partial));
spin_lock(&c->freelist_lock);
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++;
+ rcu_read_unlock();
+
ob->on_partial_list = true;
c->open_buckets_partial[c->open_buckets_partial_nr++] =
ob - c->open_buckets;
@@ -684,7 +688,7 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca,
struct bch_dev_usage usage;
struct open_bucket *ob;
- bch2_trans_do(c, NULL, NULL, 0,
+ bch2_trans_do(c,
PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark,
data_type, cl, false, &usage)));
return ob;
@@ -972,7 +976,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
u64 avail;
bch2_dev_usage_read_fast(ca, &usage);
- avail = dev_buckets_free(ca, usage, watermark);
+ avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets;
if (!avail)
continue;
@@ -981,6 +985,10 @@ static int bucket_alloc_set_partial(struct bch_fs *c,
i);
ob->on_partial_list = false;
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+ rcu_read_unlock();
+
ret = add_new_bucket(c, ptrs, devs_may_alloc,
nr_replicas, nr_effective,
have_cache, ob);
@@ -1191,7 +1199,13 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca,
--c->open_buckets_partial_nr;
swap(c->open_buckets_partial[i],
c->open_buckets_partial[c->open_buckets_partial_nr]);
+
ob->on_partial_list = false;
+
+ rcu_read_lock();
+ bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--;
+ rcu_read_unlock();
+
spin_unlock(&c->freelist_lock);
bch2_open_bucket_put(c, ob);
spin_lock(&c->freelist_lock);
@@ -1610,8 +1624,7 @@ void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c,
ob < c->open_buckets + ARRAY_SIZE(c->open_buckets);
ob++) {
spin_lock(&ob->lock);
- if (ob->valid && !ob->on_partial_list &&
- (!ca || ob->dev == ca->dev_idx))
+ if (ob->valid && (!ca || ob->dev == ca->dev_idx))
bch2_open_bucket_to_text(out, c, ob);
spin_unlock(&ob->lock);
}
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 47455a85c909..654a58132a4d 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -52,6 +52,12 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
enum bch_validate_flags flags)
{
struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+ int ret = 0;
+
+ bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH,
+ c, backpointer_level_bad,
+ "backpointer level bad: %u >= %u",
+ bp.v->level, BTREE_MAX_DEPTH);
rcu_read_lock();
struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode);
@@ -64,7 +70,6 @@ int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k,
struct bpos bucket = bp_pos_to_bucket(ca, bp.k->p);
struct bpos bp_pos = bucket_pos_to_bp_noerror(ca, bucket, bp.v->bucket_offset);
rcu_read_unlock();
- int ret = 0;
bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
!bpos_eq(bp.k->p, bp_pos),
@@ -947,9 +952,13 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
static int check_one_backpointer(struct btree_trans *trans,
struct bbpos start,
struct bbpos end,
- struct bkey_s_c_backpointer bp,
+ struct bkey_s_c bp_k,
struct bkey_buf *last_flushed)
{
+ if (bp_k.k->type != KEY_TYPE_backpointer)
+ return 0;
+
+ struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k);
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bbpos pos = bp_to_bbpos(*bp.v);
@@ -1004,9 +1013,7 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
POS_MIN, BTREE_ITER_prefetch, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
progress_update_iter(trans, &progress, &iter, "backpointers_to_extents");
- check_one_backpointer(trans, start, end,
- bkey_s_c_to_backpointer(k),
- &last_flushed);
+ check_one_backpointer(trans, start, end, k, &last_flushed);
}));
bch2_bkey_buf_exit(&last_flushed, c);
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index f4151ee51b03..e94a83b8113e 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -555,6 +555,7 @@ struct bch_dev {
u64 alloc_cursor[3];
unsigned nr_open_buckets;
+ unsigned nr_partial_buckets;
unsigned nr_btree_reserve;
size_t inc_gen_needs_gc;
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 84832c2d4df9..5004f6ba997c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -678,7 +678,8 @@ struct bch_sb_field_ext {
x(disk_accounting_v2, BCH_VERSION(1, 9)) \
x(disk_accounting_v3, BCH_VERSION(1, 10)) \
x(disk_accounting_inum, BCH_VERSION(1, 11)) \
- x(rebalance_work_acct_fix, BCH_VERSION(1, 12))
+ x(rebalance_work_acct_fix, BCH_VERSION(1, 12)) \
+ x(inode_has_child_snapshots, BCH_VERSION(1, 13))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index 587d7318a2e8..995ba32e9b6e 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -643,7 +643,7 @@ int bch2_bkey_format_invalid(struct bch_fs *c,
enum bch_validate_flags flags,
struct printbuf *err)
{
- unsigned i, bits = KEY_PACKED_BITS_START;
+ unsigned bits = KEY_PACKED_BITS_START;
if (f->nr_fields != BKEY_NR_FIELDS) {
prt_printf(err, "incorrect number of fields: got %u, should be %u",
@@ -655,9 +655,8 @@ int bch2_bkey_format_invalid(struct bch_fs *c,
* Verify that the packed format can't represent fields larger than the
* unpacked format:
*/
- for (i = 0; i < f->nr_fields; i++) {
- if ((!c || c->sb.version_min >= bcachefs_metadata_version_snapshot) &&
- bch2_bkey_format_field_overflows(f, i)) {
+ for (unsigned i = 0; i < f->nr_fields; i++) {
+ if (bch2_bkey_format_field_overflows(f, i)) {
unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i];
u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1));
unsigned packed_bits = min(64, f->bits_per_field[i]);
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 6e4afb2b5441..7123019ab3bc 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -59,16 +59,38 @@ static inline size_t btree_cache_can_free(struct btree_cache_list *list)
static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b)
{
+ BUG_ON(!list_empty(&b->list));
+
if (b->c.lock.readers)
- list_move(&b->list, &bc->freed_pcpu);
+ list_add(&b->list, &bc->freed_pcpu);
else
- list_move(&b->list, &bc->freed_nonpcpu);
+ list_add(&b->list, &bc->freed_nonpcpu);
+}
+
+static void __bch2_btree_node_to_freelist(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(!list_empty(&b->list));
+ BUG_ON(!b->data);
+
+ bc->nr_freeable++;
+ list_add(&b->list, &bc->freeable);
}
-static void btree_node_data_free(struct bch_fs *c, struct btree *b)
+void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
{
struct btree_cache *bc = &c->btree_cache;
+ mutex_lock(&bc->lock);
+ __bch2_btree_node_to_freelist(bc, b);
+ mutex_unlock(&bc->lock);
+
+ six_unlock_write(&b->c.lock);
+ six_unlock_intent(&b->c.lock);
+}
+
+static void __btree_node_data_free(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(!list_empty(&b->list));
BUG_ON(btree_node_hashed(b));
/*
@@ -94,11 +116,17 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
#endif
b->aux_data = NULL;
- bc->nr_freeable--;
-
btree_node_to_freedlist(bc, b);
}
+static void btree_node_data_free(struct btree_cache *bc, struct btree *b)
+{
+ BUG_ON(list_empty(&b->list));
+ list_del_init(&b->list);
+ --bc->nr_freeable;
+ __btree_node_data_free(bc, b);
+}
+
static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
@@ -174,21 +202,10 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c)
bch2_btree_lock_init(&b->c, 0);
- bc->nr_freeable++;
- list_add(&b->list, &bc->freeable);
+ __bch2_btree_node_to_freelist(bc, b);
return b;
}
-void bch2_btree_node_to_freelist(struct bch_fs *c, struct btree *b)
-{
- mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
- mutex_unlock(&c->btree_cache.lock);
-
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
-}
-
static inline bool __btree_node_pinned(struct btree_cache *bc, struct btree *b)
{
struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
@@ -236,11 +253,11 @@ void bch2_btree_cache_unpin(struct bch_fs *c)
/* Btree in memory cache - hash table */
-void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+void __bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
{
lockdep_assert_held(&bc->lock);
- int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
+ int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params);
BUG_ON(ret);
/* Cause future lookups for this node to fail: */
@@ -248,17 +265,22 @@ void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
if (b->c.btree_id < BTREE_ID_NR)
--bc->nr_by_btree[b->c.btree_id];
+ --bc->live[btree_node_pinned(b)].nr;
+ list_del_init(&b->list);
+}
- bc->live[btree_node_pinned(b)].nr--;
- bc->nr_freeable++;
- list_move(&b->list, &bc->freeable);
+void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b)
+{
+ __bch2_btree_node_hash_remove(bc, b);
+ __bch2_btree_node_to_freelist(bc, b);
}
int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
{
+ BUG_ON(!list_empty(&b->list));
BUG_ON(b->hash_val);
- b->hash_val = btree_ptr_hash_val(&b->key);
+ b->hash_val = btree_ptr_hash_val(&b->key);
int ret = rhashtable_lookup_insert_fast(&bc->table, &b->hash,
bch_btree_cache_params);
if (ret)
@@ -270,10 +292,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b)
bool p = __btree_node_pinned(bc, b);
mod_bit(BTREE_NODE_pinned, &b->flags, p);
- list_move_tail(&b->list, &bc->live[p].list);
+ list_add_tail(&b->list, &bc->live[p].list);
bc->live[p].nr++;
-
- bc->nr_freeable--;
return 0;
}
@@ -485,7 +505,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink,
goto out;
if (!btree_node_reclaim(c, b, true)) {
- btree_node_data_free(c, b);
+ btree_node_data_free(bc, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
freed++;
@@ -501,10 +521,10 @@ restart:
bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++;
--touched;;
} else if (!btree_node_reclaim(c, b, true)) {
- bch2_btree_node_hash_remove(bc, b);
+ __bch2_btree_node_hash_remove(bc, b);
+ __btree_node_data_free(bc, b);
freed++;
- btree_node_data_free(c, b);
bc->nr_freed++;
six_unlock_write(&b->c.lock);
@@ -587,7 +607,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
BUG_ON(btree_node_read_in_flight(b) ||
btree_node_write_in_flight(b));
- btree_node_data_free(c, b);
+ btree_node_data_free(bc, b);
}
BUG_ON(!bch2_journal_error(&c->journal) &&
@@ -786,8 +806,8 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea
BUG_ON(!six_trylock_intent(&b->c.lock));
BUG_ON(!six_trylock_write(&b->c.lock));
-got_node:
+got_node:
/*
* btree_free() doesn't free memory; it sticks the node on the end of
* the list. Check if there's any freed nodes there:
@@ -796,7 +816,12 @@ got_node:
if (!btree_node_reclaim(c, b2, false)) {
swap(b->data, b2->data);
swap(b->aux_data, b2->aux_data);
+
+ list_del_init(&b2->list);
+ --bc->nr_freeable;
btree_node_to_freedlist(bc, b2);
+ mutex_unlock(&bc->lock);
+
six_unlock_write(&b2->c.lock);
six_unlock_intent(&b2->c.lock);
goto got_mem;
@@ -810,11 +835,8 @@ got_node:
goto err;
}
- mutex_lock(&bc->lock);
- bc->nr_freeable++;
got_mem:
- mutex_unlock(&bc->lock);
-
+ BUG_ON(!list_empty(&b->list));
BUG_ON(btree_node_hashed(b));
BUG_ON(btree_node_dirty(b));
BUG_ON(btree_node_write_in_flight(b));
@@ -845,7 +867,7 @@ err:
if (bc->alloc_lock == current) {
b2 = btree_node_cannibalize(c);
clear_btree_node_just_written(b2);
- bch2_btree_node_hash_remove(bc, b2);
+ __bch2_btree_node_hash_remove(bc, b2);
if (b) {
swap(b->data, b2->data);
@@ -855,9 +877,9 @@ err:
six_unlock_intent(&b2->c.lock);
} else {
b = b2;
- list_del_init(&b->list);
}
+ BUG_ON(!list_empty(&b->list));
mutex_unlock(&bc->lock);
trace_and_count(c, btree_cache_cannibalize, trans);
@@ -936,7 +958,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
b->hash_val = 0;
mutex_lock(&bc->lock);
- list_add(&b->list, &bc->freeable);
+ __bch2_btree_node_to_freelist(bc, b);
mutex_unlock(&bc->lock);
six_unlock_write(&b->c.lock);
@@ -1312,9 +1334,12 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
b = bch2_btree_node_fill(trans, path, k, btree_id,
level, SIX_LOCK_read, false);
- if (!IS_ERR_OR_NULL(b))
+ int ret = PTR_ERR_OR_ZERO(b);
+ if (ret)
+ return ret;
+ if (b)
six_unlock_read(&b->c.lock);
- return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
+ return 0;
}
void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
@@ -1353,7 +1378,7 @@ wait_on_io:
mutex_lock(&bc->lock);
bch2_btree_node_hash_remove(bc, b);
- btree_node_data_free(c, b);
+ btree_node_data_free(bc, b);
mutex_unlock(&bc->lock);
out:
six_unlock_write(&b->c.lock);
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index 367acd217c6a..66e86d1a178d 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -14,7 +14,9 @@ void bch2_recalc_btree_reserve(struct bch_fs *);
void bch2_btree_node_to_freelist(struct bch_fs *, struct btree *);
+void __bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *);
+
int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 771154e3a291..81dcf9e512c0 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -182,7 +182,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
bch2_btree_node_drop_keys_outside_node(b);
mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, &new->k_i);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
@@ -820,12 +820,22 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
* fix that here:
*/
alloc_data_type_set(&gc, gc.data_type);
-
if (gc.data_type != old_gc.data_type ||
gc.dirty_sectors != old_gc.dirty_sectors) {
ret = bch2_alloc_key_to_dev_counters(trans, ca, &old_gc, &gc, BTREE_TRIGGER_gc);
if (ret)
return ret;
+
+ /*
+ * Ugly: alloc_key_to_dev_counters(..., BTREE_TRIGGER_gc) is not
+ * safe w.r.t. transaction restarts, so fixup the gc_bucket so
+ * we don't run it twice:
+ */
+ percpu_down_read(&c->mark_lock);
+ struct bucket *gc_m = gc_bucket(ca, iter->pos.offset);
+ gc_m->data_type = gc.data_type;
+ gc_m->dirty_sectors = gc.dirty_sectors;
+ percpu_up_read(&c->mark_lock);
}
if (fsck_err_on(new.data_type != gc.data_type,
@@ -1224,17 +1234,20 @@ int bch2_gc_gens(struct bch_fs *c)
u64 b, start_time = local_clock();
int ret;
- /*
- * Ideally we would be using state_lock and not gc_gens_lock here, but that
- * introduces a deadlock in the RO path - we currently take the state
- * lock at the start of going RO, thus the gc thread may get stuck:
- */
if (!mutex_trylock(&c->gc_gens_lock))
return 0;
trace_and_count(c, gc_gens_start, c);
- down_read(&c->state_lock);
+ /*
+ * We have to use trylock here. Otherwise, we would
+ * introduce a deadlock in the RO path - we take the
+ * state lock at the start of going RO.
+ */
+ if (!down_read_trylock(&c->state_lock)) {
+ mutex_unlock(&c->gc_gens_lock);
+ return 0;
+ }
for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 1c1448b52207..839d68802e42 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -733,11 +733,8 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
c, ca, b, i, NULL,
bset_past_end_of_btree_node,
"bset past end of btree node (offset %u len %u but written %zu)",
- offset, sectors, ptr_written ?: btree_sectors(c))) {
+ offset, sectors, ptr_written ?: btree_sectors(c)))
i->u64s = 0;
- ret = 0;
- goto out;
- }
btree_err_on(offset && !i->u64s,
-BCH_ERR_btree_node_read_err_fixable,
@@ -829,7 +826,6 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca,
BSET_BIG_ENDIAN(i), write,
&bn->format);
}
-out:
fsck_err:
printbuf_exit(&buf2);
printbuf_exit(&buf1);
@@ -1838,10 +1834,11 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b)
struct btree_trans *trans = bch2_trans_get(c);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
- __btree_node_write_done(c, b);
- six_unlock_read(&b->c.lock);
+ /* we don't need transaction context anymore after we got the lock. */
bch2_trans_put(trans);
+ __btree_node_write_done(c, b);
+ six_unlock_read(&b->c.lock);
}
static void btree_node_write_work(struct work_struct *work)
@@ -1870,7 +1867,7 @@ static void btree_node_write_work(struct work_struct *work)
}
} else {
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_do(c,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_interior_updates|
BCH_TRANS_COMMIT_journal_reclaim|
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index bfe9f0c1e1be..eef9b89c561d 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -882,6 +882,18 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
k = bch2_btree_and_journal_iter_peek(&jiter);
+ if (!k.k) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "node not found at pos ");
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_str(&buf, " at btree ");
+ bch2_btree_pos_to_text(&buf, c, l->b);
+
+ ret = bch2_fs_topology_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ goto err;
+ }
bch2_bkey_buf_reassemble(out, c, k);
@@ -889,6 +901,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
c->opts.btree_node_prefetch)
ret = btree_path_prefetch_j(trans, path, &jiter);
+err:
bch2_btree_and_journal_iter_exit(&jiter);
return ret;
}
@@ -2381,9 +2394,9 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
else
iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k));
- if (unlikely(!(iter->flags & BTREE_ITER_is_extents)
- ? bkey_gt(iter_pos, end)
- : bkey_ge(iter_pos, end)))
+ if (unlikely(iter->flags & BTREE_ITER_all_snapshots ? bpos_gt(iter_pos, end) :
+ iter->flags & BTREE_ITER_is_extents ? bkey_ge(iter_pos, end) :
+ bkey_gt(iter_pos, end)))
goto end;
break;
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 78e63ad7d380..0bda054f80d7 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -857,6 +857,14 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
SPOS_MAX, _flags, _k, _ret)
+#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
+ (_start), (_flags)); \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \
+ !((_ret) = bkey_err(_k)) && (_k).k; \
+ bch2_btree_iter_rewind(&(_iter)))
+
#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
@@ -904,6 +912,8 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
_ret; \
})
+#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do))
+
struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned);
void bch2_trans_put(struct btree_trans *);
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
index 1e694fedc5da..30131c3bdd97 100644
--- a/fs/bcachefs/btree_node_scan.c
+++ b/fs/bcachefs/btree_node_scan.c
@@ -171,6 +171,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
return;
+ if (BTREE_NODE_ID(bn) >= BTREE_ID_NR_MAX)
+ return;
+
rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
@@ -183,7 +186,7 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
.ptrs[0].type = 1 << BCH_EXTENT_ENTRY_ptr,
.ptrs[0].offset = offset,
.ptrs[0].dev = ca->dev_idx,
- .ptrs[0].gen = *bucket_gen(ca, sector_to_bucket(ca, offset)),
+ .ptrs[0].gen = bucket_gen_get(ca, sector_to_bucket(ca, offset)),
};
rcu_read_unlock();
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 514df618548e..5d809e8bd170 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -668,7 +668,7 @@ int bch2_btree_insert(struct bch_fs *c, enum btree_id id, struct bkey_i *k,
struct disk_reservation *disk_res, int flags,
enum btree_iter_update_trigger_flags iter_flags)
{
- return bch2_trans_do(c, disk_res, NULL, flags,
+ return bch2_trans_commit_do(c, disk_res, NULL, flags,
bch2_btree_insert_trans(trans, id, k, iter_flags));
}
@@ -865,7 +865,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
memcpy(l->d, buf.buf, buf.pos);
c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
- ret = bch2_trans_do(c, NULL, NULL,
+ ret = bch2_trans_commit_do(c, NULL, NULL,
BCH_TRANS_COMMIT_lazy_rw|commit_flags,
__bch2_trans_log_msg(trans, &buf, u64s));
}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 6a454f2fa005..70b3c989fac2 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -192,7 +192,7 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_flags)))
-#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \
+#define bch2_trans_commit_do(_c, _disk_res, _journal_seq, _flags, _do) \
bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
#define trans_for_each_update(_trans, _i) \
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 190bc1e81756..d596ef93239f 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -237,10 +237,6 @@ static void __btree_node_free(struct btree_trans *trans, struct btree *b)
BUG_ON(b->will_make_reachable);
clear_btree_node_noevict(b);
-
- mutex_lock(&c->btree_cache.lock);
- list_move(&b->list, &c->btree_cache.freeable);
- mutex_unlock(&c->btree_cache.lock);
}
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
@@ -252,12 +248,12 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
+ __btree_node_free(trans, b);
+
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
- __btree_node_free(trans, b);
-
six_unlock_write(&b->c.lock);
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
@@ -289,7 +285,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
clear_btree_node_need_write(b);
mutex_lock(&c->btree_cache.lock);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
mutex_unlock(&c->btree_cache.lock);
BUG_ON(p->nr >= ARRAY_SIZE(p->b));
@@ -521,8 +517,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
__btree_node_free(trans, b);
- six_unlock_write(&b->c.lock);
- six_unlock_intent(&b->c.lock);
+ bch2_btree_node_to_freelist(c, b);
}
}
}
@@ -1434,6 +1429,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
}
}
+static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos)
+{
+ if (insert_keys)
+ for_each_keylist_key(insert_keys, k)
+ if (bkey_deleted(&k->k) && bpos_eq(k->k.p, pos))
+ return true;
+ return false;
+}
+
/*
* Move keys from n1 (original replacement node, now lower node) to n2 (higher
* node)
@@ -1441,7 +1445,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
static void __btree_split_node(struct btree_update *as,
struct btree_trans *trans,
struct btree *b,
- struct btree *n[2])
+ struct btree *n[2],
+ struct keylist *insert_keys)
{
struct bkey_packed *k;
struct bpos n1_pos = POS_MIN;
@@ -1476,7 +1481,8 @@ static void __btree_split_node(struct btree_update *as,
if (b->c.level &&
u64s < n1_u64s &&
u64s + k->u64s >= n1_u64s &&
- bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p))
+ (bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p) ||
+ key_deleted_in_insert(insert_keys, uk.p)))
n1_u64s += k->u64s;
i = u64s >= n1_u64s;
@@ -1603,7 +1609,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
- __btree_split_node(as, trans, b, n);
+ __btree_split_node(as, trans, b, n, keys);
if (keys) {
btree_split_insert_keys(as, trans, path, n1, keys);
@@ -2239,10 +2245,8 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
struct async_btree_rewrite *a =
container_of(work, struct async_btree_rewrite, work);
struct bch_fs *c = a->c;
- int ret;
- ret = bch2_trans_do(c, NULL, NULL, 0,
- async_btree_node_rewrite_trans(trans, a));
+ int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a));
bch_err_fn_ratelimited(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
@@ -2394,7 +2398,8 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
- bch2_btree_node_hash_remove(&c->btree_cache, b);
+
+ __bch2_btree_node_hash_remove(&c->btree_cache, b);
bkey_copy(&b->key, new_key);
ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 3f56b584f8ec..1639c60dffa0 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -277,6 +277,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags);
int ret = 0;
+ ret = bch2_journal_error(&c->journal);
+ if (ret)
+ return ret;
+
bch2_trans_unlock(trans);
bch2_trans_begin(trans);
@@ -491,7 +495,8 @@ static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
return ret;
}
-static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
+static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq,
+ bool *did_work)
{
struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
@@ -502,6 +507,8 @@ static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
+ *did_work |= wb->inc.keys.nr || wb->flushing.keys.nr;
+
/*
* On memory allocation failure, bch2_btree_write_buffer_flush_locked()
* is not guaranteed to empty wb->inc:
@@ -521,17 +528,34 @@ static int bch2_btree_write_buffer_journal_flush(struct journal *j,
struct journal_entry_pin *_pin, u64 seq)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool did_work = false;
- return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq));
+ return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq, &did_work));
}
int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ bool did_work = false;
trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
- return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal));
+ return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal), &did_work);
+}
+
+/*
+ * The write buffer requires flushing when going RO: keys in the journal for the
+ * write buffer don't have a journal pin yet
+ */
+bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *c)
+{
+ if (bch2_journal_error(&c->journal))
+ return false;
+
+ bool did_work = false;
+ bch2_trans_run(c, btree_write_buffer_flush_seq(trans,
+ journal_cur_seq(&c->journal), &did_work));
+ return did_work;
}
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index 725e79654216..d535cea28bde 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -21,6 +21,7 @@ static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
struct btree_trans;
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
+bool bch2_btree_write_buffer_flush_going_ro(struct bch_fs *);
int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
int bch2_btree_write_buffer_tryflush(struct btree_trans *);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 546cd01a72e3..ec7d9a59bea9 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -1160,11 +1160,11 @@ int bch2_trans_mark_dev_sbs(struct bch_fs *c)
#define SECTORS_CACHE 1024
int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, int flags)
+ u64 sectors, enum bch_reservation_flags flags)
{
struct bch_fs_pcpu *pcpu;
u64 old, get;
- s64 sectors_available;
+ u64 sectors_available;
int ret;
percpu_down_read(&c->mark_lock);
@@ -1202,6 +1202,9 @@ recalculate:
percpu_u64_set(&c->pcpu->sectors_available, 0);
sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free);
+ if (sectors_available && (flags & BCH_DISK_RESERVATION_PARTIAL))
+ sectors = min(sectors, sectors_available);
+
if (sectors <= sectors_available ||
(flags & BCH_DISK_RESERVATION_NOFAIL)) {
atomic64_set(&c->sectors_available,
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index e2cb7b24b220..ccc78bfe2fd4 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -103,12 +103,18 @@ static inline u8 *bucket_gen(struct bch_dev *ca, size_t b)
return gens->b + b;
}
-static inline u8 bucket_gen_get(struct bch_dev *ca, size_t b)
+static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b)
+{
+ u8 *gen = bucket_gen(ca, b);
+ return gen ? *gen : -1;
+}
+
+static inline int bucket_gen_get(struct bch_dev *ca, size_t b)
{
rcu_read_lock();
- u8 gen = *bucket_gen(ca, b);
+ int ret = bucket_gen_get_rcu(ca, b);
rcu_read_unlock();
- return gen;
+ return ret;
}
static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca,
@@ -169,10 +175,8 @@ static inline int gen_after(u8 a, u8 b)
static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr)
{
- u8 *gen = bucket_gen(ca, PTR_BUCKET_NR(ca, ptr));
- if (!gen)
- return -1;
- return gen_after(*gen, ptr->gen);
+ int gen = bucket_gen_get_rcu(ca, PTR_BUCKET_NR(ca, ptr));
+ return gen < 0 ? gen : gen_after(gen, ptr->gen);
}
/**
@@ -184,7 +188,6 @@ static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr
rcu_read_lock();
int ret = dev_ptr_stale_rcu(ca, ptr);
rcu_read_unlock();
-
return ret;
}
@@ -344,14 +347,16 @@ static inline void bch2_disk_reservation_put(struct bch_fs *c,
}
}
-#define BCH_DISK_RESERVATION_NOFAIL (1 << 0)
+/*
+ * Flags for bch2_disk_reservation_add()/__bch2_disk_reservation_add().
+ *
+ * In the recalculate path of __bch2_disk_reservation_add():
+ * - NOFAIL: the reservation is granted even when the requested sectors
+ *   exceed sectors_available.
+ * - PARTIAL: when sectors_available is nonzero, the request is clamped to
+ *   min(sectors, sectors_available) instead of being all-or-nothing.
+ */
+enum bch_reservation_flags {
+ BCH_DISK_RESERVATION_NOFAIL = 1 << 0,
+ BCH_DISK_RESERVATION_PARTIAL = 1 << 1,
+};
-int __bch2_disk_reservation_add(struct bch_fs *,
- struct disk_reservation *,
- u64, int);
+int __bch2_disk_reservation_add(struct bch_fs *, struct disk_reservation *,
+ u64, enum bch_reservation_flags);
static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res,
- u64 sectors, int flags)
+ u64 sectors, enum bch_reservation_flags flags)
{
#ifdef __KERNEL__
u64 old, new;
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index cbfd88f98472..2182b555c112 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -225,6 +225,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
opt_set(thr->opts, read_only, 1);
+ opt_set(thr->opts, ratelimit_errors, 0);
/* We need request_key() to be called before we punt to kthread: */
opt_set(thr->opts, nostart, true);
diff --git a/fs/bcachefs/darray.c b/fs/bcachefs/darray.c
index 4f06cd8bbbe1..e86d36d23e9e 100644
--- a/fs/bcachefs/darray.c
+++ b/fs/bcachefs/darray.c
@@ -2,6 +2,7 @@
#include <linux/log2.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include "darray.h"
int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_size, gfp_t gfp)
@@ -9,7 +10,19 @@ int __bch2_darray_resize_noprof(darray_char *d, size_t element_size, size_t new_
if (new_size > d->size) {
new_size = roundup_pow_of_two(new_size);
- void *data = kvmalloc_array_noprof(new_size, element_size, gfp);
+ /*
+ * This is a workaround: kvmalloc() doesn't support > INT_MAX
+ * allocations, but vmalloc() does.
+ * The limit needs to be lifted from kvmalloc, and when it is
+ * we'll go back to just using that.
+ */
+ size_t bytes;
+ if (unlikely(check_mul_overflow(new_size, element_size, &bytes)))
+ return -ENOMEM;
+
+ void *data = likely(bytes < INT_MAX)
+ ? kvmalloc_noprof(bytes, gfp)
+ : vmalloc_noprof(bytes);
if (!data)
return -ENOMEM;
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 462b1a2fe1ad..8e75a852b358 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -80,6 +80,7 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc
if (ptr2 == ptr)
break;
+ ca = bch2_dev_have_ref(c, ptr2->dev);
bucket = PTR_BUCKET_POS(ca, ptr2);
bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0);
}
@@ -235,7 +236,8 @@ static int __bch2_data_update_index_update(struct btree_trans *trans,
if (((1U << i) & m->data_opts.rewrite_ptrs) &&
(ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) &&
!ptr->cached) {
- bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr);
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), ptr);
rewrites_found |= 1U << i;
}
i++;
@@ -283,7 +285,8 @@ restart_drop_extra_replicas:
durability - ptr_durability >= m->op.opts.data_replicas) {
durability -= ptr_durability;
- bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr);
+ bch2_extent_ptr_set_cached(c, &m->op.opts,
+ bkey_i_to_s(insert), &entry->ptr);
goto restart_drop_extra_replicas;
}
}
@@ -294,7 +297,7 @@ restart_drop_extra_replicas:
bch2_extent_ptr_decoded_append(insert, &p);
bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 });
- bch2_extent_normalize(c, bkey_i_to_s(insert));
+ bch2_extent_normalize_by_opts(c, &m->op.opts, bkey_i_to_s(insert));
ret = bch2_sum_sector_overwrites(trans, &iter, insert,
&should_check_enospc,
@@ -557,7 +560,8 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m)
int bch2_extent_drop_ptrs(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
- struct data_update_opts data_opts)
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
struct bch_fs *c = trans->c;
struct bkey_i *n;
@@ -568,11 +572,11 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
if (ret)
return ret;
- while (data_opts.kill_ptrs) {
- unsigned i = 0, drop = __fls(data_opts.kill_ptrs);
+ while (data_opts->kill_ptrs) {
+ unsigned i = 0, drop = __fls(data_opts->kill_ptrs);
bch2_bkey_drop_ptrs_noerror(bkey_i_to_s(n), ptr, i++ == drop);
- data_opts.kill_ptrs ^= 1U << drop;
+ data_opts->kill_ptrs ^= 1U << drop;
}
/*
@@ -580,7 +584,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
* will do the appropriate thing with it (turning it into a
* KEY_TYPE_error key, or just a discard if it was a cached extent)
*/
- bch2_extent_normalize(c, bkey_i_to_s(n));
+ bch2_extent_normalize_by_opts(c, io_opts, bkey_i_to_s(n));
/*
* Since we're not inserting through an extent iterator
@@ -719,7 +723,7 @@ int bch2_data_update_init(struct btree_trans *trans,
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
if (iter)
- ret = bch2_extent_drop_ptrs(trans, iter, k, m->data_opts);
+ ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts);
goto out;
}
diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h
index 8d36365bdea8..e4b50723428e 100644
--- a/fs/bcachefs/data_update.h
+++ b/fs/bcachefs/data_update.h
@@ -40,7 +40,8 @@ void bch2_data_update_read_done(struct data_update *,
int bch2_extent_drop_ptrs(struct btree_trans *,
struct btree_iter *,
struct bkey_s_c,
- struct data_update_opts);
+ struct bch_io_opts *,
+ struct data_update_opts *);
void bch2_data_update_exit(struct data_update *);
int bch2_data_update_init(struct btree_trans *, struct btree_iter *,
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 84dd4a879d98..faffc98d5605 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -250,13 +250,6 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
}
-static void dirent_copy_target(struct bkey_i_dirent *dst,
- struct bkey_s_c_dirent src)
-{
- dst->v.d_inum = src.v->d_inum;
- dst->v.d_type = src.v->d_type;
-}
-
int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir,
struct bkey_s_c_dirent d, subvol_inum *target)
{
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 8945145865c5..53ad99666022 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -34,6 +34,13 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
+/*
+ * Copy the target of dirent @src into @dst: only the d_inum and d_type
+ * fields of the value are written, nothing else in @dst is touched.
+ */
+static inline void dirent_copy_target(struct bkey_i_dirent *dst,
+ struct bkey_s_c_dirent src)
+{
+ dst->v.d_inum = src.v->d_inum;
+ dst->v.d_type = src.v->d_type;
+}
+
int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
const struct bch_hash_info *, u8,
const struct qstr *, u64, u64 *,
diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c
index 9f3133e3e7e5..07eb8fa1b026 100644
--- a/fs/bcachefs/disk_accounting.c
+++ b/fs/bcachefs/disk_accounting.c
@@ -242,6 +242,14 @@ void bch2_accounting_swab(struct bkey_s k)
*p = swab64(*p);
}
+/*
+ * Copy the replicas entry embedded in accounting position @acc into @r.
+ *
+ * The number of bytes copied is replicas_entry_bytes(&acc.replicas), i.e. it
+ * depends on the entry's contents, hence unsafe_memcpy(). This helper does
+ * not look at acc.type; accounting_to_replicas() checks for
+ * BCH_DISK_ACCOUNTING_replicas before calling it.
+ *
+ * NOTE(review): @r must have room for the full variable-length entry - the
+ * caller visible in this patch passes the .e member of a
+ * struct bch_replicas_padded; confirm for any new caller.
+ */
+static inline void __accounting_to_replicas(struct bch_replicas_entry_v1 *r,
+ struct disk_accounting_pos acc)
+{
+ unsafe_memcpy(r, &acc.replicas,
+ replicas_entry_bytes(&acc.replicas),
+ "variable length struct");
+}
+
static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struct bpos p)
{
struct disk_accounting_pos acc_k;
@@ -249,9 +257,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc
switch (acc_k.type) {
case BCH_DISK_ACCOUNTING_replicas:
- unsafe_memcpy(r, &acc_k.replicas,
- replicas_entry_bytes(&acc_k.replicas),
- "variable length struct");
+ __accounting_to_replicas(r, acc_k);
return true;
default:
return false;
@@ -608,6 +614,81 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k)
return ret;
}
+/*
+ * Validate one in-memory accounting entry against filesystem state (device
+ * existence, superblock replicas table).
+ *
+ * @acc:	decoded accounting position of the entry
+ * @v:		the entry's counters; negated in place if the entry is removed
+ * @nr:		number of counters in @v
+ *
+ * - replicas entries: every device in the entry must either be
+ *   BCH_SB_MEMBER_INVALID or exist; the entry must also be marked in the
+ *   superblock replicas, and is marked via bch2_mark_replicas() if fsck
+ *   agrees to fix it.
+ * - dev_data_type entries: the device must exist.
+ *
+ * An entry pointing to a nonexistent device yields
+ * -BCH_ERR_remove_disk_accounting_entry; if fsck agrees to fix it, a btree
+ * update with the negated counters is committed first so the entry cancels
+ * out (a commit error takes precedence over the remove code).
+ *
+ * Locking: called from bch2_accounting_read() with c->mark_lock held for
+ * write. The lock is dropped temporarily around bch2_mark_replicas(), but is
+ * always held again on return - including on error - because the caller
+ * unconditionally does percpu_up_write() on its error path.
+ *
+ * Returns 0 if the entry is fine, -BCH_ERR_remove_disk_accounting_entry if
+ * the caller should drop it, or another negative error code.
+ */
+static int bch2_disk_accounting_validate_late(struct btree_trans *trans,
+					      struct disk_accounting_pos acc,
+					      u64 *v, unsigned nr)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0, invalid_dev = -1;
+
+	switch (acc.type) {
+	case BCH_DISK_ACCOUNTING_replicas: {
+		struct bch_replicas_padded r;
+		__accounting_to_replicas(&r.e, acc);
+
+		for (unsigned i = 0; i < r.e.nr_devs; i++)
+			if (r.e.devs[i] != BCH_SB_MEMBER_INVALID &&
+			    !bch2_dev_exists(c, r.e.devs[i])) {
+				invalid_dev = r.e.devs[i];
+				goto invalid_device;
+			}
+
+		/*
+		 * All replicas entry checks except for invalid device are done
+		 * in bch2_accounting_validate
+		 */
+		BUG_ON(bch2_replicas_entry_validate(&r.e, c, &buf));
+
+		if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
+				trans, accounting_replicas_not_marked,
+				"accounting not marked in superblock replicas\n %s",
+				(printbuf_reset(&buf),
+				 bch2_accounting_key_to_text(&buf, &acc),
+				 buf.buf))) {
+			/*
+			 * We're not RW yet and still single threaded, dropping
+			 * and retaking lock is ok:
+			 */
+			percpu_up_write(&c->mark_lock);
+			ret = bch2_mark_replicas(c, &r.e);
+			/*
+			 * Retake the lock before looking at the error: the
+			 * caller drops mark_lock on its error path, so we must
+			 * never return with it released.
+			 */
+			percpu_down_write(&c->mark_lock);
+			if (ret)
+				goto fsck_err;
+		}
+		break;
+	}
+
+	case BCH_DISK_ACCOUNTING_dev_data_type:
+		if (!bch2_dev_exists(c, acc.dev_data_type.dev)) {
+			invalid_dev = acc.dev_data_type.dev;
+			goto invalid_device;
+		}
+		break;
+	}
+
+fsck_err:
+	printbuf_exit(&buf);
+	return ret;
+invalid_device:
+	if (fsck_err(trans, accounting_to_invalid_device,
+		     "accounting entry points to invalid device %i\n %s",
+		     invalid_dev,
+		     (printbuf_reset(&buf),
+		      bch2_accounting_key_to_text(&buf, &acc),
+		      buf.buf))) {
+		/* Negate the counters so the committed delta cancels the entry */
+		for (unsigned i = 0; i < nr; i++)
+			v[i] = -v[i];
+
+		ret = commit_do(trans, NULL, NULL, 0,
+				bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?:
+			-BCH_ERR_remove_disk_accounting_entry;
+	} else {
+		ret = -BCH_ERR_remove_disk_accounting_entry;
+	}
+	goto fsck_err;
+}
+
/*
* At startup time, initialize the in memory accounting from the btree (and
* journal)
@@ -666,44 +747,42 @@ int bch2_accounting_read(struct bch_fs *c)
}
keys->gap = keys->nr = dst - keys->data;
- percpu_down_read(&c->mark_lock);
- for (unsigned i = 0; i < acc->k.nr; i++) {
- u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
- bch2_accounting_mem_read_counters(acc, i, v, ARRAY_SIZE(v), false);
+ percpu_down_write(&c->mark_lock);
+ unsigned i = 0;
+ while (i < acc->k.nr) {
+ unsigned idx = inorder_to_eytzinger0(i, acc->k.nr);
- if (bch2_is_zero(v, sizeof(v[0]) * acc->k.data[i].nr_counters))
- continue;
+ struct disk_accounting_pos acc_k;
+ bpos_to_disk_accounting_pos(&acc_k, acc->k.data[idx].pos);
- struct bch_replicas_padded r;
- if (!accounting_to_replicas(&r.e, acc->k.data[i].pos))
- continue;
+ u64 v[BCH_ACCOUNTING_MAX_COUNTERS];
+ bch2_accounting_mem_read_counters(acc, idx, v, ARRAY_SIZE(v), false);
/*
- * If the replicas entry is invalid it'll get cleaned up by
- * check_allocations:
+ * If the entry counters are zeroed, it should be treated as
+ * nonexistent - it might point to an invalid device.
+ *
+ * Remove it, so that if it's re-added it gets re-marked in the
+ * superblock:
*/
- if (bch2_replicas_entry_validate(&r.e, c, &buf))
+ ret = bch2_is_zero(v, sizeof(v[0]) * acc->k.data[idx].nr_counters)
+ ? -BCH_ERR_remove_disk_accounting_entry
+ : bch2_disk_accounting_validate_late(trans, acc_k,
+ v, acc->k.data[idx].nr_counters);
+
+ if (ret == -BCH_ERR_remove_disk_accounting_entry) {
+ free_percpu(acc->k.data[idx].v[0]);
+ free_percpu(acc->k.data[idx].v[1]);
+ darray_remove_item(&acc->k, &acc->k.data[idx]);
+ eytzinger0_sort(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
+ accounting_pos_cmp, NULL);
+ ret = 0;
continue;
-
- struct disk_accounting_pos k;
- bpos_to_disk_accounting_pos(&k, acc->k.data[i].pos);
-
- if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e),
- trans, accounting_replicas_not_marked,
- "accounting not marked in superblock replicas\n %s",
- (printbuf_reset(&buf),
- bch2_accounting_key_to_text(&buf, &k),
- buf.buf))) {
- /*
- * We're not RW yet and still single threaded, dropping
- * and retaking lock is ok:
- */
- percpu_up_read(&c->mark_lock);
- ret = bch2_mark_replicas(c, &r.e);
- if (ret)
- goto fsck_err;
- percpu_down_read(&c->mark_lock);
}
+
+ if (ret)
+ goto fsck_err;
+ i++;
}
preempt_disable();
@@ -742,7 +821,7 @@ int bch2_accounting_read(struct bch_fs *c)
}
preempt_enable();
fsck_err:
- percpu_up_read(&c->mark_lock);
+ percpu_up_write(&c->mark_lock);
err:
printbuf_exit(&buf);
bch2_trans_put(trans);
@@ -777,8 +856,10 @@ int bch2_dev_usage_init(struct bch_dev *ca, bool gc)
};
u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 };
- int ret = bch2_trans_do(c, NULL, NULL, 0,
- bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc));
+ int ret = bch2_trans_do(c, ({
+ bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?:
+ (!gc ? bch2_trans_commit(trans, NULL, NULL, 0) : 0);
+ }));
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 1587c6e1866a..749dcf368841 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -124,6 +124,11 @@ int bch2_stripe_validate(struct bch_fs *c, struct bkey_s_c k,
"incorrect value size (%zu < %u)",
bkey_val_u64s(k.k), stripe_val_u64s(s));
+ bkey_fsck_err_on(s->csum_granularity_bits >= 64,
+ c, stripe_csum_granularity_bad,
+ "invalid csum granularity (%u >= 64)",
+ s->csum_granularity_bits);
+
ret = bch2_bkey_ptrs_validate(c, k, flags);
fsck_err:
return ret;
@@ -145,7 +150,11 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
nr_data,
s.nr_redundant);
bch2_prt_csum_type(out, s.csum_type);
- prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+ prt_str(out, " gran ");
+ if (s.csum_granularity_bits < 64)
+ prt_printf(out, "%llu", 1ULL << s.csum_granularity_bits);
+ else
+ prt_printf(out, "(invalid shift %u)", s.csum_granularity_bits);
if (s.disk_label) {
prt_str(out, " label");
@@ -257,12 +266,12 @@ static int __mark_stripe_bucket(struct btree_trans *trans,
if (!deleting) {
a->stripe = s.k->p.offset;
a->stripe_redundancy = s.v->nr_redundant;
+ alloc_data_type_set(a, data_type);
} else {
a->stripe = 0;
a->stripe_redundancy = 0;
+ alloc_data_type_set(a, BCH_DATA_user);
}
-
- alloc_data_type_set(a, data_type);
err:
printbuf_exit(&buf);
return ret;
@@ -1177,7 +1186,7 @@ static void ec_stripe_delete_work(struct work_struct *work)
if (!idx)
break;
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
ec_stripe_delete(trans, idx));
bch_err_fn(c, ret);
if (ret)
@@ -1197,47 +1206,62 @@ void bch2_do_stripe_deletes(struct bch_fs *c)
/* stripe creation: */
static int ec_stripe_key_update(struct btree_trans *trans,
- struct bkey_i_stripe *new,
- bool create)
+ struct bkey_i_stripe *old,
+ struct bkey_i_stripe *new)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
+ bool create = !old;
- k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
- new->k.p, BTREE_ITER_intent);
- ret = bkey_err(k);
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes,
+ new->k.p, BTREE_ITER_intent);
+ int ret = bkey_err(k);
if (ret)
goto err;
- if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) {
- bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s",
- create ? "creating" : "updating",
- bch2_bkey_types[k.k->type]);
+ if (bch2_fs_inconsistent_on(k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe),
+ c, "error %s stripe: got existing key type %s",
+ create ? "creating" : "updating",
+ bch2_bkey_types[k.k->type])) {
ret = -EINVAL;
goto err;
}
if (k.k->type == KEY_TYPE_stripe) {
- const struct bch_stripe *old = bkey_s_c_to_stripe(k).v;
- unsigned i;
+ const struct bch_stripe *v = bkey_s_c_to_stripe(k).v;
- if (old->nr_blocks != new->v.nr_blocks) {
- bch_err(c, "error updating stripe: nr_blocks does not match");
- ret = -EINVAL;
- goto err;
- }
+ BUG_ON(old->v.nr_blocks != new->v.nr_blocks);
+ BUG_ON(old->v.nr_blocks != v->nr_blocks);
- for (i = 0; i < new->v.nr_blocks; i++) {
- unsigned v = stripe_blockcount_get(old, i);
+ for (unsigned i = 0; i < new->v.nr_blocks; i++) {
+ unsigned sectors = stripe_blockcount_get(v, i);
- BUG_ON(v &&
- (old->ptrs[i].dev != new->v.ptrs[i].dev ||
- old->ptrs[i].gen != new->v.ptrs[i].gen ||
- old->ptrs[i].offset != new->v.ptrs[i].offset));
+ if (!bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]) && sectors) {
+ struct printbuf buf = PRINTBUF;
- stripe_blockcount_set(&new->v, i, v);
+ prt_printf(&buf, "stripe changed nonempty block %u", i);
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new->k_i));
+ bch2_fs_inconsistent(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * If the stripe ptr changed underneath us, it must have
+ * been dev_remove_stripes() -> invalidate_stripe_to_dev()
+ */
+ if (!bch2_extent_ptr_eq(old->v.ptrs[i], v->ptrs[i])) {
+ BUG_ON(v->ptrs[i].dev != BCH_SB_MEMBER_INVALID);
+
+ if (bch2_extent_ptr_eq(old->v.ptrs[i], new->v.ptrs[i]))
+ new->v.ptrs[i].dev = BCH_SB_MEMBER_INVALID;
+ }
+
+ stripe_blockcount_set(&new->v, i, sectors);
}
}
@@ -1495,12 +1519,14 @@ static void ec_stripe_create(struct ec_stripe_new *s)
goto err;
}
- ret = bch2_trans_do(c, &s->res, NULL,
- BCH_TRANS_COMMIT_no_check_rw|
- BCH_TRANS_COMMIT_no_enospc,
- ec_stripe_key_update(trans,
- bkey_i_to_stripe(&s->new_stripe.key),
- !s->have_existing_stripe));
+ ret = bch2_trans_commit_do(c, &s->res, NULL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_key_update(trans,
+ s->have_existing_stripe
+ ? bkey_i_to_stripe(&s->existing_stripe.key)
+ : NULL,
+ bkey_i_to_stripe(&s->new_stripe.key)));
bch_err_msg(c, ret, "creating stripe key");
if (ret) {
goto err;
@@ -1844,6 +1870,10 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
}
h = ec_new_stripe_head_alloc(c, disk_label, algo, redundancy, watermark);
+ if (!h) {
+ h = ERR_PTR(-BCH_ERR_ENOMEM_stripe_head_alloc);
+ goto err;
+ }
found:
if (h->rw_devs_change_count != c->rw_devs_change_count)
ec_stripe_head_devs_update(c, h);
@@ -1876,7 +1906,15 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_
bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX);
for_each_set_bit(i, h->s->blocks_gotten, v->nr_blocks) {
- __clear_bit(v->ptrs[i].dev, devs.d);
+ /*
+ * Note: we don't yet repair invalid blocks (failed/removed
+ * devices) when reusing stripes - we still need a codepath to
+ * walk backpointers and update all extents that point to that
+ * block when updating the stripe
+ */
+ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID)
+ __clear_bit(v->ptrs[i].dev, devs.d);
+
if (i < h->s->nr_data)
nr_have_data++;
else
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 60b7875adada..9c4fe5cdbfb7 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -83,6 +83,8 @@
x(ENOMEM, ENOMEM_fs_other_alloc) \
x(ENOMEM, ENOMEM_dev_alloc) \
x(ENOMEM, ENOMEM_disk_accounting) \
+ x(ENOMEM, ENOMEM_stripe_head_alloc) \
+ x(ENOMEM, ENOMEM_journal_read_bucket) \
x(ENOSPC, ENOSPC_disk_reservation) \
x(ENOSPC, ENOSPC_bucket_alloc) \
x(ENOSPC, ENOSPC_disk_label_add) \
@@ -222,6 +224,7 @@
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \
x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \
+ x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_sb_max_size_bits) \
x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \
x(BCH_ERR_invalid_sb, invalid_sb_members) \
x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \
@@ -268,7 +271,8 @@
x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem) \
x(0, invalid_snapshot_node) \
- x(0, option_needs_open_fs)
+ x(0, option_needs_open_fs) \
+ x(0, remove_disk_accounting_entry)
enum bch_errcode {
BCH_ERR_START = 2048,
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 7a79f695ba2e..b679def8fb98 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -251,7 +251,10 @@ int __bch2_fsck_err(struct bch_fs *c,
* delete the key)
* - and we don't need to warn if we're not prompting
*/
- WARN_ON(!(flags & FSCK_AUTOFIX) && !trans && bch2_current_has_btree_trans(c));
+ WARN_ON((flags & FSCK_CAN_FIX) &&
+ !(flags & FSCK_AUTOFIX) &&
+ !trans &&
+ bch2_current_has_btree_trans(c));
if ((flags & FSCK_CAN_FIX) &&
test_bit(err, c->sb.errors_silent))
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index cc0d22085aef..37e3d69bec06 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -978,31 +978,54 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke
return NULL;
}
-void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr)
+/*
+ * Decide whether @ptr is worth keeping as a cached pointer.
+ *
+ * Returns true only if all of the following hold:
+ * - @opts has a promote target and @ptr's device is in that target,
+ * - the device can be looked up and is readable,
+ * - dev_ptr_stale_rcu() returns 0 for the pointer (any nonzero result,
+ *   including its negative "no gen" return, means we don't want it).
+ *
+ * Uses the _rcu device helpers; the callers in this file take
+ * rcu_read_lock() before calling.
+ */
+static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts,
+ struct bch_extent_ptr *ptr)
+{
+ if (!opts->promote_target ||
+ !bch2_dev_in_target(c, ptr->dev, opts->promote_target))
+ return false;
+
+ struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev);
+
+ return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr);
+}
+
+void bch2_extent_ptr_set_cached(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s k,
+ struct bch_extent_ptr *ptr)
{
struct bkey_ptrs ptrs = bch2_bkey_ptrs(k);
union bch_extent_entry *entry;
- union bch_extent_entry *ec = NULL;
+ struct extent_ptr_decoded p;
- bkey_extent_entry_for_each(ptrs, entry) {
+ rcu_read_lock();
+ if (!want_cached_ptr(c, opts, ptr)) {
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+ goto out;
+ }
+
+ /*
+ * Stripes can't contain cached data, for - reasons.
+ *
+ * Possibly something we can fix in the future?
+ */
+ bkey_for_each_ptr_decode(k.k, ptrs, p, entry)
if (&entry->ptr == ptr) {
- ptr->cached = true;
- if (ec)
- extent_entry_drop(k, ec);
- return;
+ if (p.has_ec)
+ bch2_bkey_drop_ptr_noerror(k, ptr);
+ else
+ ptr->cached = true;
+ goto out;
}
- if (extent_entry_is_stripe_ptr(entry))
- ec = entry;
- else if (extent_entry_is_ptr(entry))
- ec = NULL;
- }
-
BUG();
+out:
+ rcu_read_unlock();
}
/*
- * bch_extent_normalize - clean up an extent, dropping stale pointers etc.
+ * bch2_extent_normalize - clean up an extent, dropping stale pointers etc.
*
* Returns true if @k should be dropped entirely
*
@@ -1016,8 +1039,39 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
rcu_read_lock();
bch2_bkey_drop_ptrs(k, ptr,
ptr->cached &&
- (ca = bch2_dev_rcu(c, ptr->dev)) &&
- dev_ptr_stale_rcu(ca, ptr) > 0);
+ (!(ca = bch2_dev_rcu(c, ptr->dev)) ||
+ dev_ptr_stale_rcu(ca, ptr) > 0));
+ rcu_read_unlock();
+
+ return bkey_deleted(k.k);
+}
+
+/*
+ * bch2_extent_normalize_by_opts - clean up an extent, dropping stale pointers etc.
+ *
+ * Like bch2_extent_normalize(), but also only keeps a single cached pointer on
+ * the promote target.
+ */
+bool bch2_extent_normalize_by_opts(struct bch_fs *c,
+ struct bch_io_opts *opts,
+ struct bkey_s k)
+{
+ struct bkey_ptrs ptrs;
+ bool have_cached_ptr;
+
+ rcu_read_lock();
+restart_drop_ptrs:
+ ptrs = bch2_bkey_ptrs(k);
+ have_cached_ptr = false;
+
+ bkey_for_each_ptr(ptrs, ptr)
+ if (ptr->cached) {
+ if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) {
+ bch2_bkey_drop_ptr(k, ptr);
+ goto restart_drop_ptrs;
+ }
+ have_cached_ptr = true;
+ }
rcu_read_unlock();
return bkey_deleted(k.k);
@@ -1310,7 +1364,7 @@ void bch2_ptr_swab(struct bkey_s k)
for (entry = ptrs.start;
entry < ptrs.end;
entry = extent_entry_next(entry)) {
- switch (extent_entry_type(entry)) {
+ switch (__extent_entry_type(entry)) {
case BCH_EXTENT_ENTRY_ptr:
break;
case BCH_EXTENT_ENTRY_crc32:
@@ -1330,6 +1384,9 @@ void bch2_ptr_swab(struct bkey_s k)
break;
case BCH_EXTENT_ENTRY_rebalance:
break;
+ default:
+ /* Bad entry type: will be caught by validate() */
+ return;
}
}
}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index ed5001dd662e..bcffcf60aaaf 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -686,15 +686,28 @@ bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c);
struct bch_extent_ptr *
bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
-void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
+void bch2_extent_ptr_set_cached(struct bch_fs *, struct bch_io_opts *,
+ struct bkey_s, struct bch_extent_ptr *);
+bool bch2_extent_normalize_by_opts(struct bch_fs *, struct bch_io_opts *, struct bkey_s);
bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+
void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
int bch2_bkey_ptrs_validate(struct bch_fs *, struct bkey_s_c,
enum bch_validate_flags);
+/*
+ * Compare two extent pointers by value.
+ *
+ * Returns true only if cached, unwritten, offset, dev and gen all match.
+ * gen must be part of the comparison: a pointer to the same device/offset
+ * with a different bucket generation refers to different data.
+ */
+static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1,
+				      struct bch_extent_ptr ptr2)
+{
+	return (ptr1.cached	== ptr2.cached &&
+		ptr1.unwritten	== ptr2.unwritten &&
+		ptr1.offset	== ptr2.offset &&
+		ptr1.dev	== ptr2.dev &&
+		ptr1.gen	== ptr2.gen);
+}
+
void bch2_ptr_swab(struct bkey_s);
const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c);
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 48a1ab9a649b..95972809e76d 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -856,6 +856,12 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
folios_trunc(&fs, fi);
end = min(end, folio_end_pos(darray_last(fs)));
} else {
+ if (!folio_test_uptodate(f)) {
+ ret = bch2_read_single_folio(f, mapping);
+ if (ret)
+ goto out;
+ }
+
folios_trunc(&fs, fi + 1);
end = f_pos + f_reserved;
}
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index ee1c0325f313..6d3a05ae5da8 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -369,6 +369,7 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio)
static __always_inline long bch2_dio_write_done(struct dio_write *dio)
{
+ struct bch_fs *c = dio->op.c;
struct kiocb *req = dio->req;
struct bch_inode_info *inode = dio->inode;
bool sync = dio->sync;
@@ -387,7 +388,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
ret = dio->op.error ?: ((long) dio->written << 9);
bio_put(&dio->op.wbio.bio);
- bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+ bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
/* inode->i_dio_count is our ref on inode and thus bch_fs */
inode_dio_end(&inode->v);
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index af3a24546aa3..1d4910ea0f1d 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -399,14 +399,17 @@ void bch2_folio_reservation_put(struct bch_fs *c,
bch2_quota_reservation_put(c, inode, &res->quota);
}
-int bch2_folio_reservation_get(struct bch_fs *c,
+static int __bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
- size_t offset, size_t len)
+ size_t offset, size_t len,
+ bool partial)
{
struct bch_folio *s = bch2_folio_create(folio, 0);
unsigned i, disk_sectors = 0, quota_sectors = 0;
+ struct disk_reservation disk_res = {};
+ size_t reserved = len;
int ret;
if (!s)
@@ -422,48 +425,65 @@ int bch2_folio_reservation_get(struct bch_fs *c,
}
if (disk_sectors) {
- ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0);
+ ret = bch2_disk_reservation_add(c, &disk_res, disk_sectors,
+ partial ? BCH_DISK_RESERVATION_PARTIAL : 0);
if (unlikely(ret))
return ret;
+
+ if (unlikely(disk_res.sectors != disk_sectors)) {
+ disk_sectors = quota_sectors = 0;
+
+ for (i = round_down(offset, block_bytes(c)) >> 9;
+ i < round_up(offset + len, block_bytes(c)) >> 9;
+ i++) {
+ disk_sectors += sectors_to_reserve(&s->s[i], res->disk.nr_replicas);
+ if (disk_sectors > disk_res.sectors) {
+ /*
+ * Make sure to get a reservation that's
+ * aligned to the filesystem blocksize:
+ */
+ unsigned reserved_offset = round_down(i << 9, block_bytes(c));
+ reserved = clamp(reserved_offset, offset, offset + len) - offset;
+
+ if (!reserved) {
+ bch2_disk_reservation_put(c, &disk_res);
+ return -BCH_ERR_ENOSPC_disk_reservation;
+ }
+ break;
+ }
+ quota_sectors += s->s[i].state == SECTOR_unallocated;
+ }
+ }
}
if (quota_sectors) {
ret = bch2_quota_reservation_add(c, inode, &res->quota, quota_sectors, true);
if (unlikely(ret)) {
- struct disk_reservation tmp = { .sectors = disk_sectors };
-
- bch2_disk_reservation_put(c, &tmp);
- res->disk.sectors -= disk_sectors;
+ bch2_disk_reservation_put(c, &disk_res);
return ret;
}
}
- return 0;
+ res->disk.sectors += disk_res.sectors;
+ return partial ? reserved : 0;
}
-ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+int bch2_folio_reservation_get(struct bch_fs *c,
struct bch_inode_info *inode,
struct folio *folio,
struct bch2_folio_reservation *res,
size_t offset, size_t len)
{
- size_t l, reserved = 0;
- int ret;
-
- while ((l = len - reserved)) {
- while ((ret = bch2_folio_reservation_get(c, inode, folio, res, offset, l))) {
- if ((offset & (block_bytes(c) - 1)) + l <= block_bytes(c))
- return reserved ?: ret;
-
- len = reserved + l;
- l /= 2;
- }
-
- offset += l;
- reserved += l;
- }
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, false);
+}
- return reserved;
+ssize_t bch2_folio_reservation_get_partial(struct bch_fs *c,
+ struct bch_inode_info *inode,
+ struct folio *folio,
+ struct bch2_folio_reservation *res,
+ size_t offset, size_t len)
+{
+ return __bch2_folio_reservation_get(c, inode, folio, res, offset, len, true);
}
static void bch2_clear_folio_bits(struct folio *folio)
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 71d0fa387509..2456c41b215e 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -182,7 +182,7 @@ static int bch2_flush_inode(struct bch_fs *c,
struct bch_inode_unpacked u;
int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
- bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+ bch2_journal_flush_seq(&c->journal, u.bi_journal_seq, TASK_INTERRUPTIBLE) ?:
bch2_inode_flush_nocow_writes(c, inode);
bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
return ret;
@@ -587,7 +587,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
POS(inode->v.i_ino, start_sector),
BTREE_ITER_slots|BTREE_ITER_intent);
- while (!ret && bkey_lt(iter.pos, end_pos)) {
+ while (!ret) {
s64 i_sectors_delta = 0;
struct quota_res quota_res = { 0 };
struct bkey_s_c k;
@@ -598,6 +598,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_trans_begin(trans);
+ if (bkey_ge(iter.pos, end_pos))
+ break;
+
ret = bch2_subvolume_get_snapshot(trans,
inode->ei_inum.subvol, &snapshot);
if (ret)
@@ -634,12 +637,15 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
if (bch2_clamp_data_hole(&inode->v,
&hole_start,
&hole_end,
- opts.data_replicas, true))
+ opts.data_replicas, true)) {
ret = drop_locks_do(trans,
(bch2_clamp_data_hole(&inode->v,
&hole_start,
&hole_end,
opts.data_replicas, false), 0));
+ if (ret)
+ goto bkey_err;
+ }
bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start));
if (ret)
@@ -667,10 +673,13 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
if (bch2_mark_pagecache_reserved(inode, &hole_start,
- iter.pos.offset, true))
- drop_locks_do(trans,
+ iter.pos.offset, true)) {
+ ret = drop_locks_do(trans,
bch2_mark_pagecache_reserved(inode, &hole_start,
iter.pos.offset, false));
+ if (ret)
+ goto bkey_err;
+ }
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 02969dff165d..a41d0d8a2f7b 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -157,6 +157,20 @@ static bool subvol_inum_eq(subvol_inum a, subvol_inum b)
return a.subvol == b.subvol && a.inum == b.inum;
}
+static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed)
+{
+ const subvol_inum *inum = data;
+
+ return jhash(&inum->inum, sizeof(inum->inum), seed);
+}
+
+static u32 bch2_vfs_inode_obj_hash_fn(const void *data, u32 len, u32 seed)
+{
+ const struct bch_inode_info *inode = data;
+
+ return bch2_vfs_inode_hash_fn(&inode->ei_inum, sizeof(inode->ei_inum), seed);
+}
+
static int bch2_vfs_inode_cmp_fn(struct rhashtable_compare_arg *arg,
const void *obj)
{
@@ -170,11 +184,91 @@ static const struct rhashtable_params bch2_vfs_inodes_params = {
.head_offset = offsetof(struct bch_inode_info, hash),
.key_offset = offsetof(struct bch_inode_info, ei_inum),
.key_len = sizeof(subvol_inum),
+ .hashfn = bch2_vfs_inode_hash_fn,
+ .obj_hashfn = bch2_vfs_inode_obj_hash_fn,
.obj_cmpfn = bch2_vfs_inode_cmp_fn,
.automatic_shrinking = true,
};
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p)
+{
+ struct bch_fs *c = trans->c;
+ struct rhashtable *ht = &c->vfs_inodes_table;
+ subvol_inum inum = (subvol_inum) { .inum = p.offset };
+ DARRAY(u32) subvols;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return false;
+
+ darray_init(&subvols);
+restart_from_top:
+
+ /*
+ * Tweaked version of __rhashtable_lookup(); we need to get a list of
+ * subvolumes in which the given inode number is open.
+ *
+ * For this to work, we don't include the subvolume ID in the key that
+ * we hash - all inodes with the same inode number regardless of
+ * subvolume will hash to the same slot.
+ *
+ * This will be less than ideal if the same file is ever open
+ * simultaneously in many different snapshots:
+ */
+ rcu_read_lock();
+ struct rhash_lock_head __rcu *const *bkt;
+ struct rhash_head *he;
+ unsigned int hash;
+ struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
+restart:
+ hash = rht_key_hashfn(ht, tbl, &inum, bch2_vfs_inodes_params);
+ bkt = rht_bucket(tbl, hash);
+ do {
+ struct bch_inode_info *inode;
+
+ rht_for_each_entry_rcu_from(inode, he, rht_ptr_rcu(bkt), tbl, hash, hash) {
+ if (inode->ei_inum.inum == inum.inum) {
+ ret = darray_push_gfp(&subvols, inode->ei_inum.subvol,
+ GFP_NOWAIT|__GFP_NOWARN);
+ if (ret) {
+ rcu_read_unlock();
+ ret = darray_make_room(&subvols, 1);
+ if (ret)
+ goto err;
+ subvols.nr = 0;
+ goto restart_from_top;
+ }
+ }
+ }
+ /* An object might have been moved to a different hash chain,
+ * while we walk along it - better check and retry.
+ */
+ } while (he != RHT_NULLS_MARKER(bkt));
+
+ /* Ensure we see any new tables. */
+ smp_rmb();
+
+ tbl = rht_dereference_rcu(tbl->future_tbl, ht);
+ if (unlikely(tbl))
+ goto restart;
+ rcu_read_unlock();
+
+ darray_for_each(subvols, i) {
+ u32 snap;
+ ret = bch2_subvolume_get_snapshot(trans, *i, &snap);
+ if (ret)
+ goto err;
+
+ ret = bch2_snapshot_is_ancestor(c, snap, p.snapshot);
+ if (ret)
+ break;
+ }
+err:
+ darray_exit(&subvols);
+ return ret;
+}
+
+static struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
{
return rhashtable_lookup_fast(&c->vfs_inodes_table, &inum, bch2_vfs_inodes_params);
}
@@ -184,7 +278,8 @@ static void __wait_on_freeing_inode(struct bch_fs *c,
subvol_inum inum)
{
wait_queue_head_t *wq;
- DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW);
+ struct wait_bit_queue_entry wait;
+
wq = inode_bit_waitqueue(&wait, &inode->v, __I_NEW);
prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
spin_unlock(&inode->v.i_lock);
@@ -252,7 +347,8 @@ static struct bch_inode_info *bch2_inode_hash_insert(struct bch_fs *c,
set_bit(EI_INODE_HASHED, &inode->ei_flags);
retry:
- if (unlikely(rhashtable_lookup_insert_fast(&c->vfs_inodes_table,
+ if (unlikely(rhashtable_lookup_insert_key(&c->vfs_inodes_table,
+ &inode->ei_inum,
&inode->hash,
bch2_vfs_inodes_params))) {
old = bch2_inode_hash_find(c, trans, inode->ei_inum);
@@ -560,7 +656,7 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
struct bch_inode_info *inode;
- bch2_trans_do(c, NULL, NULL, 0,
+ bch2_trans_do(c,
PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
&hash, &dentry->d_name)));
if (IS_ERR(inode))
@@ -773,7 +869,7 @@ static int bch2_rename2(struct mnt_idmap *idmap,
ret = bch2_subvol_is_ro_trans(trans, src_dir->ei_inum.subvol) ?:
bch2_subvol_is_ro_trans(trans, dst_dir->ei_inum.subvol);
if (ret)
- goto err;
+ goto err_tx_restart;
if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) {
ret = bch2_fs_quota_transfer(c, src_inode,
@@ -1170,7 +1266,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_trans_iter_init(trans, &iter, BTREE_ID_extents,
POS(ei->v.i_ino, start), 0);
- while (true) {
+ while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
enum btree_id data_btree = BTREE_ID_extents;
bch2_trans_begin(trans);
@@ -1178,14 +1274,14 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
u32 snapshot;
ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot);
if (ret)
- goto err;
+ continue;
bch2_btree_iter_set_snapshot(&iter, snapshot);
k = bch2_btree_iter_peek_upto(&iter, end);
ret = bkey_err(k);
if (ret)
- goto err;
+ continue;
if (!k.k)
break;
@@ -1205,7 +1301,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
ret = bch2_read_indirect_extent(trans, &data_btree,
&offset_into_extent, &cur);
if (ret)
- break;
+ continue;
k = bkey_i_to_s_c(cur.k);
bch2_bkey_buf_realloc(&prev, c, k.k->u64s);
@@ -1233,10 +1329,6 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info,
bch2_btree_iter_set_pos(&iter,
POS(iter.pos.inode, iter.pos.offset + sectors));
-err:
- if (ret &&
- !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
}
bch2_trans_iter_exit(trans, &iter);
@@ -1944,7 +2036,7 @@ static int bch2_show_options(struct seq_file *seq, struct dentry *root)
bch2_opts_to_text(&buf, c->opts, c, c->disk_sb.sb,
OPT_MOUNT, OPT_HIDDEN, OPT_SHOW_MOUNT_STYLE);
printbuf_nul_terminate(&buf);
- seq_puts(seq, buf.buf);
+ seq_printf(seq, ",%s", buf.buf);
int ret = buf.allocation_failure ? -ENOMEM : 0;
printbuf_exit(&buf);
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index da74ecc236e7..59f9f7ae728d 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -54,8 +54,6 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode)
return inode->ei_inum;
}
-struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *, subvol_inum);
-
/*
* Set if we've gotten a btree error for this inode, and thus the vfs inode and
* btree inode may be inconsistent:
@@ -148,6 +146,8 @@ struct bch_inode_info *
__bch2_create(struct mnt_idmap *, struct bch_inode_info *,
struct dentry *, umode_t, dev_t, subvol_inum, unsigned);
+int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p);
+
int bch2_fs_quota_transfer(struct bch_fs *,
struct bch_inode_info *,
struct bch_qid,
@@ -198,10 +198,7 @@ int bch2_vfs_init(void);
#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); })
-static inline struct bch_inode_info *__bch2_inode_hash_find(struct bch_fs *c, subvol_inum inum)
-{
- return NULL;
-}
+static inline int bch2_inode_or_descendents_is_open(struct btree_trans *trans, struct bpos p) { return 0; }
static inline void bch2_evict_subvolume_inodes(struct bch_fs *c,
snapshot_id_list *s) {}
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index b8a6ceb0cc7a..75c8a97a6954 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -326,17 +326,54 @@ err:
return ret;
}
+static inline bool inode_should_reattach(struct bch_inode_unpacked *inode)
+{
+ if (inode->bi_inum == BCACHEFS_ROOT_INO &&
+ inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)
+ return false;
+
+ return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked);
+}
+
+static int maybe_delete_dirent(struct btree_trans *trans, struct bpos d_pos, u32 snapshot)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_dirents,
+ SPOS(d_pos.inode, d_pos.offset, snapshot),
+ BTREE_ITER_intent|
+ BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ if (bpos_eq(k.k->p, d_pos)) {
+ /*
+ * delete_at() doesn't work because the update path doesn't
+ * internally use BTREE_ITER_with_updates yet
+ */
+ struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ bkey_init(&k->k);
+ k->k.type = KEY_TYPE_whiteout;
+ k->k.p = iter.pos;
+ ret = bch2_trans_update(trans, &iter, k, BTREE_UPDATE_internal_snapshot_node);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *inode)
{
struct bch_fs *c = trans->c;
- struct bch_hash_info dir_hash;
struct bch_inode_unpacked lostfound;
char name_buf[20];
- struct qstr name;
- u64 dir_offset = 0;
- u32 dirent_snapshot = inode->bi_snapshot;
int ret;
+ u32 dirent_snapshot = inode->bi_snapshot;
if (inode->bi_subvol) {
inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
@@ -367,9 +404,10 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
if (ret)
return ret;
- dir_hash = bch2_hash_info_init(c, &lostfound);
+ struct bch_hash_info dir_hash = bch2_hash_info_init(c, &lostfound);
+ struct qstr name = (struct qstr) QSTR(name_buf);
- name = (struct qstr) QSTR(name_buf);
+ inode->bi_dir = lostfound.bi_inum;
ret = bch2_dirent_create_snapshot(trans,
inode->bi_parent_subvol, lostfound.bi_inum,
@@ -378,17 +416,70 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked *
inode_d_type(inode),
&name,
inode->bi_subvol ?: inode->bi_inum,
- &dir_offset,
+ &inode->bi_dir_offset,
STR_HASH_must_create);
if (ret) {
bch_err_msg(c, ret, "error creating dirent");
return ret;
}
- inode->bi_dir = lostfound.bi_inum;
- inode->bi_dir_offset = dir_offset;
+ ret = __bch2_fsck_write_inode(trans, inode);
+ if (ret)
+ return ret;
+
+ /*
+ * Fix up inodes in child snapshots: if they should also be reattached,
+ * update the backpointer field; if they should not be, we need to emit
+ * whiteouts for the dirent we just created.
+ */
+ if (!inode->bi_subvol && bch2_snapshot_is_leaf(c, inode->bi_snapshot) <= 0) {
+ snapshot_id_list whiteouts_done;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+
+ darray_init(&whiteouts_done);
+
+ for_each_btree_key_reverse_norestart(trans, iter,
+ BTREE_ID_inodes, SPOS(0, inode->bi_inum, inode->bi_snapshot - 1),
+ BTREE_ITER_all_snapshots|BTREE_ITER_intent, k, ret) {
+ if (k.k->p.offset != inode->bi_inum)
+ break;
+
+ if (!bkey_is_inode(k.k) ||
+ !bch2_snapshot_is_ancestor(c, k.k->p.snapshot, inode->bi_snapshot) ||
+ snapshot_list_has_ancestor(c, &whiteouts_done, k.k->p.snapshot))
+ continue;
+
+ struct bch_inode_unpacked child_inode;
+ bch2_inode_unpack(k, &child_inode);
- return __bch2_fsck_write_inode(trans, inode);
+ if (!inode_should_reattach(&child_inode)) {
+ ret = maybe_delete_dirent(trans,
+ SPOS(lostfound.bi_inum, inode->bi_dir_offset,
+ dirent_snapshot),
+ k.k->p.snapshot);
+ if (ret)
+ break;
+
+ ret = snapshot_list_add(c, &whiteouts_done, k.k->p.snapshot);
+ if (ret)
+ break;
+ } else {
+ iter.snapshot = k.k->p.snapshot;
+ child_inode.bi_dir = inode->bi_dir;
+ child_inode.bi_dir_offset = inode->bi_dir_offset;
+
+ ret = bch2_inode_write_flags(trans, &iter, &child_inode,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (ret)
+ break;
+ }
+ }
+ darray_exit(&whiteouts_done);
+ bch2_trans_iter_exit(trans, &iter);
+ }
+
+ return ret;
}
static int remove_backpointer(struct btree_trans *trans,
@@ -838,35 +929,138 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
-static int hash_redo_key(struct btree_trans *trans,
- const struct bch_hash_desc desc,
- struct bch_hash_info *hash_info,
- struct btree_iter *k_iter, struct bkey_s_c k)
+static int dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dirent d)
{
- struct bkey_i *delete;
- struct bkey_i *tmp;
+ if (d.v->d_type == DT_SUBVOL) {
+ u32 snap;
+ u64 inum;
+ int ret = subvol_lookup(trans, le32_to_cpu(d.v->d_child_subvol), &snap, &inum);
+ if (ret && !bch2_err_matches(ret, ENOENT))
+ return ret;
+ return !ret;
+ } else {
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+ SPOS(0, le64_to_cpu(d.v->d_inum), d.k->p.snapshot), 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
- delete = bch2_trans_kmalloc(trans, sizeof(*delete));
- if (IS_ERR(delete))
- return PTR_ERR(delete);
+ ret = bkey_is_inode(k.k);
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+ }
+}
- tmp = bch2_bkey_make_mut_noupdate(trans, k);
- if (IS_ERR(tmp))
- return PTR_ERR(tmp);
+/*
+ * Prefer to delete the first one, since that will be the one at the wrong
+ * offset:
+ * return value: 0 -> delete k1, 1 -> delete k2, 2 -> both dirents have valid targets (caller renames k1), < 0 -> error
+ */
+static int hash_pick_winner(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c k1,
+ struct bkey_s_c k2)
+{
+ if (bkey_val_bytes(k1.k) == bkey_val_bytes(k2.k) &&
+ !memcmp(k1.v, k2.v, bkey_val_bytes(k1.k)))
+ return 0;
- bkey_init(&delete->k);
- delete->k.p = k_iter->pos;
- return bch2_btree_iter_traverse(k_iter) ?:
- bch2_trans_update(trans, k_iter, delete, 0) ?:
- bch2_hash_set_in_snapshot(trans, desc, hash_info,
- (subvol_inum) { 0, k.k->p.inode },
- k.k->p.snapshot, tmp,
- STR_HASH_must_create|
- BTREE_UPDATE_internal_snapshot_node) ?:
- bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+ switch (desc.btree_id) {
+ case BTREE_ID_dirents: {
+ int ret = dirent_has_target(trans, bkey_s_c_to_dirent(k1));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 0;
+
+ ret = dirent_has_target(trans, bkey_s_c_to_dirent(k2));
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ return 1;
+ return 2;
+ }
+ default:
+ return 0;
+ }
+}
+
+static int fsck_update_backpointers(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_i *new)
+{
+ if (new->k.type != KEY_TYPE_dirent)
+ return 0;
+
+ struct bkey_i_dirent *d = bkey_i_to_dirent(new);
+ struct inode_walker target = inode_walker_init();
+ int ret = 0;
+
+ if (d->v.d_type == DT_SUBVOL) {
+ BUG();
+ } else {
+ ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum));
+ if (ret)
+ goto err;
+
+ darray_for_each(target.inodes, i) {
+ i->inode.bi_dir_offset = d->k.p.offset;
+ ret = __bch2_fsck_write_inode(trans, &i->inode);
+ if (ret)
+ goto err;
+ }
+ }
+err:
+ inode_walker_exit(&target);
+ return ret;
+}
+
+static int fsck_rename_dirent(struct btree_trans *trans,
+ struct snapshots_seen *s,
+ const struct bch_hash_desc desc,
+ struct bch_hash_info *hash_info,
+ struct bkey_s_c_dirent old)
+{
+ struct qstr old_name = bch2_dirent_get_name(old);
+ struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32);
+ int ret = PTR_ERR_OR_ZERO(new);
+ if (ret)
+ return ret;
+
+ bkey_dirent_init(&new->k_i);
+ dirent_copy_target(new, old);
+ new->k.p = old.k->p;
+
+ for (unsigned i = 0; i < 1000; i++) {
+ unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u",
+ old_name.len, old_name.name, i);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(len);
+
+ if (u64s > U8_MAX)
+ return -EINVAL;
+
+ new->k.u64s = u64s;
+
+ ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ (subvol_inum) { 0, old.k->p.inode },
+ old.k->p.snapshot, &new->k_i,
+ BTREE_UPDATE_internal_snapshot_node);
+ if (!bch2_err_matches(ret, EEXIST))
+ break;
+ }
+
+ if (ret)
+ return ret;
+
+ return fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i);
}
static int hash_check_key(struct btree_trans *trans,
+ struct snapshots_seen *s,
const struct bch_hash_desc desc,
struct bch_hash_info *hash_info,
struct btree_iter *k_iter, struct bkey_s_c hash_k)
@@ -895,16 +1089,9 @@ static int hash_check_key(struct btree_trans *trans,
if (bkey_eq(k.k->p, hash_k.k->p))
break;
- if (fsck_err_on(k.k->type == desc.key_type &&
- !desc.cmp_bkey(k, hash_k),
- trans, hash_table_key_duplicate,
- "duplicate hash table keys:\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, hash_k),
- buf.buf))) {
- ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1;
- break;
- }
+ if (k.k->type == desc.key_type &&
+ !desc.cmp_bkey(k, hash_k))
+ goto duplicate_entries;
if (bkey_deleted(k.k)) {
bch2_trans_iter_exit(trans, &iter);
@@ -917,18 +1104,66 @@ out:
return ret;
bad_hash:
if (fsck_err(trans, hash_table_key_wrong_offset,
- "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s",
+ "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s",
bch2_btree_id_str(desc.btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) {
- ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k);
- bch_err_fn(c, ret);
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, hash_info,
+ (subvol_inum) { 0, hash_k.k->p.inode },
+ hash_k.k->p.snapshot, new,
+ STR_HASH_must_create|
+ BTREE_ITER_with_updates|
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = bkey_err(k);
if (ret)
- return ret;
- ret = -BCH_ERR_transaction_restart_nested;
+ goto out;
+ if (k.k)
+ goto duplicate_entries;
+
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter,
+ BTREE_UPDATE_internal_snapshot_node) ?:
+ fsck_update_backpointers(trans, s, desc, hash_info, new) ?:
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
}
fsck_err:
goto out;
+duplicate_entries:
+ ret = hash_pick_winner(trans, desc, hash_info, hash_k, k);
+ if (ret < 0)
+ goto out;
+
+ if (!fsck_err(trans, hash_table_key_duplicate,
+ "duplicate hash table keys%s:\n%s",
+ ret != 2 ? "" : ", both point to valid inodes",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, hash_k),
+ prt_newline(&buf),
+ bch2_bkey_val_to_text(&buf, c, k),
+ buf.buf)))
+ goto out;
+
+ switch (ret) {
+ case 0:
+ ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ break;
+ case 1:
+ ret = bch2_hash_delete_at(trans, desc, hash_info, &iter, 0);
+ break;
+ case 2:
+ ret = fsck_rename_dirent(trans, s, desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?:
+ bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0);
+ goto out;
+ }
+
+ ret = bch2_trans_commit(trans, NULL, NULL, 0) ?:
+ -BCH_ERR_transaction_restart_nested;
+ goto out;
}
static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
@@ -994,7 +1229,6 @@ static int check_inode_dirent_inode(struct btree_trans *trans,
*/
inode->bi_dir = 0;
inode->bi_dir_offset = 0;
- inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
*write_inode = true;
}
@@ -1006,28 +1240,37 @@ fsck_err:
return ret;
}
-static bool bch2_inode_is_open(struct bch_fs *c, struct bpos p)
+static int get_snapshot_root_inode(struct btree_trans *trans,
+ struct bch_inode_unpacked *root,
+ u64 inum)
{
- subvol_inum inum = {
- .subvol = snapshot_t(c, p.snapshot)->subvol,
- .inum = p.offset,
- };
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
- /* snapshot tree corruption, can't safely delete */
- if (!inum.subvol) {
- bch_warn_ratelimited(c, "%s(): snapshot %u has no subvol, unlinked but can't safely delete", __func__, p.snapshot);
- return true;
+ for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes,
+ SPOS(0, inum, U32_MAX),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inum)
+ break;
+ if (bkey_is_inode(k.k))
+ goto found_root;
}
-
- return __bch2_inode_hash_find(c, inum) != NULL;
+ if (ret)
+ goto err;
+ BUG();
+found_root:
+ BUG_ON(bch2_inode_unpack(k, root));
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
- struct bch_inode_unpacked *prev,
- struct snapshots_seen *s,
- bool full)
+ struct bch_inode_unpacked *snapshot_root,
+ struct snapshots_seen *s)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
@@ -1050,22 +1293,19 @@ static int check_inode(struct btree_trans *trans,
BUG_ON(bch2_inode_unpack(k, &u));
- if (!full &&
- !(u.bi_flags & (BCH_INODE_i_size_dirty|
- BCH_INODE_i_sectors_dirty|
- BCH_INODE_unlinked)))
- return 0;
-
- if (prev->bi_inum != u.bi_inum)
- *prev = u;
+ if (snapshot_root->bi_inum != u.bi_inum) {
+ ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum);
+ if (ret)
+ goto err;
+ }
- if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed ||
- inode_d_type(prev) != inode_d_type(&u),
+ if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed ||
+ INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root),
trans, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
- bch_err(c, "repair not implemented yet");
- ret = -BCH_ERR_fsck_repair_unimplemented;
- goto err_noprint;
+ u.bi_hash_seed = snapshot_root->bi_hash_seed;
+ SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root));
+ do_update = true;
}
if (u.bi_dir || u.bi_dir_offset) {
@@ -1101,28 +1341,27 @@ static int check_inode(struct btree_trans *trans,
ret = 0;
}
- if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
- bch2_key_has_snapshot_overwrites(trans, BTREE_ID_inodes, k.k->p)) {
- struct bpos new_min_pos;
-
- ret = bch2_propagate_key_to_snapshot_leaves(trans, iter->btree_id, k, &new_min_pos);
- if (ret)
- goto err;
-
- u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
-
- ret = __bch2_fsck_write_inode(trans, &u);
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret < 0)
+ goto err;
- bch_err_msg(c, ret, "in fsck updating inode");
+ if (fsck_err_on(ret != !!(u.bi_flags & BCH_INODE_has_child_snapshot),
+ trans, inode_has_child_snapshots_wrong,
+ "inode has_child_snapshots flag wrong (should be %u)\n%s",
+ ret,
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &u),
+ buf.buf))) {
if (ret)
- goto err_noprint;
-
- if (!bpos_eq(new_min_pos, POS_MIN))
- bch2_btree_iter_set_pos(iter, bpos_predecessor(new_min_pos));
- goto err_noprint;
+ u.bi_flags |= BCH_INODE_has_child_snapshot;
+ else
+ u.bi_flags &= ~BCH_INODE_has_child_snapshot;
+ do_update = true;
}
+ ret = 0;
- if (u.bi_flags & BCH_INODE_unlinked) {
+ if ((u.bi_flags & BCH_INODE_unlinked) &&
+ !(u.bi_flags & BCH_INODE_has_child_snapshot)) {
if (!test_bit(BCH_FS_started, &c->flags)) {
/*
* If we're not in online fsck, don't delete unlinked
@@ -1147,7 +1386,11 @@ static int check_inode(struct btree_trans *trans,
if (ret)
goto err;
} else {
- if (fsck_err_on(!bch2_inode_is_open(c, k.k->p),
+ ret = bch2_inode_or_descendents_is_open(trans, k.k->p);
+ if (ret < 0)
+ goto err;
+
+ if (fsck_err_on(!ret,
trans, inode_unlinked_and_not_open,
"inode %llu%u unlinked and not open",
u.bi_inum, u.bi_snapshot)) {
@@ -1155,69 +1398,10 @@ static int check_inode(struct btree_trans *trans,
bch_err_msg(c, ret, "in fsck deleting inode");
goto err_noprint;
}
+ ret = 0;
}
}
- /* i_size_dirty is vestigal, since we now have logged ops for truncate * */
- if (u.bi_flags & BCH_INODE_i_size_dirty &&
- (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
- fsck_err(trans, inode_i_size_dirty_but_clean,
- "filesystem marked clean, but inode %llu has i_size dirty",
- u.bi_inum))) {
- bch_verbose(c, "truncating inode %llu", u.bi_inum);
-
- /*
- * XXX: need to truncate partial blocks too here - or ideally
- * just switch units to bytes and that issue goes away
- */
- ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents,
- SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9,
- iter->pos.snapshot),
- POS(u.bi_inum, U64_MAX),
- 0, NULL);
- bch_err_msg(c, ret, "in fsck truncating inode");
- if (ret)
- return ret;
-
- /*
- * We truncated without our normal sector accounting hook, just
- * make sure we recalculate it:
- */
- u.bi_flags |= BCH_INODE_i_sectors_dirty;
-
- u.bi_flags &= ~BCH_INODE_i_size_dirty;
- do_update = true;
- }
-
- /* i_sectors_dirty is vestigal, i_sectors is always updated transactionally */
- if (u.bi_flags & BCH_INODE_i_sectors_dirty &&
- (!test_bit(BCH_FS_clean_recovery, &c->flags) ||
- fsck_err(trans, inode_i_sectors_dirty_but_clean,
- "filesystem marked clean, but inode %llu has i_sectors dirty",
- u.bi_inum))) {
- s64 sectors;
-
- bch_verbose(c, "recounting sectors for inode %llu",
- u.bi_inum);
-
- sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot);
- if (sectors < 0) {
- bch_err_msg(c, sectors, "in fsck recounting inode sectors");
- return sectors;
- }
-
- u.bi_sectors = sectors;
- u.bi_flags &= ~BCH_INODE_i_sectors_dirty;
- do_update = true;
- }
-
- if (u.bi_flags & BCH_INODE_backptr_untrusted) {
- u.bi_dir = 0;
- u.bi_dir_offset = 0;
- u.bi_flags &= ~BCH_INODE_backptr_untrusted;
- do_update = true;
- }
-
if (fsck_err_on(u.bi_parent_subvol &&
(u.bi_subvol == 0 ||
u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
@@ -1274,8 +1458,7 @@ err_noprint:
int bch2_check_inodes(struct bch_fs *c)
{
- bool full = c->opts.fsck;
- struct bch_inode_unpacked prev = { 0 };
+ struct bch_inode_unpacked snapshot_root = {};
struct snapshots_seen s;
snapshots_seen_init(&s);
@@ -1285,13 +1468,104 @@ int bch2_check_inodes(struct bch_fs *c)
POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- check_inode(trans, &iter, k, &prev, &s, full)));
+ check_inode(trans, &iter, k, &snapshot_root, &s)));
snapshots_seen_exit(&s);
bch_err_fn(c, ret);
return ret;
}
+static int find_oldest_inode_needs_reattach(struct btree_trans *trans,
+ struct bch_inode_unpacked *inode)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ /*
+ * We look for inodes to reattach in natural key order, leaves first,
+ * but we should do the reattach at the oldest version that needs to be
+ * reattached:
+ */
+ for_each_btree_key_norestart(trans, iter,
+ BTREE_ID_inodes,
+ SPOS(0, inode->bi_inum, inode->bi_snapshot + 1),
+ BTREE_ITER_all_snapshots, k, ret) {
+ if (k.k->p.offset != inode->bi_inum)
+ break;
+
+ if (!bch2_snapshot_is_ancestor(c, inode->bi_snapshot, k.k->p.snapshot))
+ continue;
+
+ if (!bkey_is_inode(k.k))
+ break;
+
+ struct bch_inode_unpacked parent_inode;
+ bch2_inode_unpack(k, &parent_inode);
+
+ if (!inode_should_reattach(&parent_inode))
+ break;
+
+ *inode = parent_inode;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
+ return ret;
+}
+
+static int check_unreachable_inode(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct bkey_s_c k)
+{
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ if (!bkey_is_inode(k.k))
+ return 0;
+
+ struct bch_inode_unpacked inode;
+ BUG_ON(bch2_inode_unpack(k, &inode));
+
+ if (!inode_should_reattach(&inode))
+ return 0;
+
+ ret = find_oldest_inode_needs_reattach(trans, &inode);
+ if (ret)
+ return ret;
+
+ if (fsck_err(trans, inode_unreachable,
+ "unreachable inode:\n%s",
+ (bch2_inode_unpacked_to_text(&buf, &inode),
+ buf.buf)))
+ ret = reattach_inode(trans, &inode);
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+/*
+ * Reattach unreachable (but not unlinked) inodes
+ *
+ * Run after check_inodes() and check_dirents(), so we know that inode
+ * backpointer fields point to valid dirents, and every inode that has a dirent
+ * that points to it has its backpointer field set - so we're just looking for
+ * non-unlinked inodes without backpointers:
+ *
+ * XXX: this is racy w.r.t. hardlink removal in online fsck
+ */
+int bch2_check_unreachable_inodes(struct bch_fs *c)
+{
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_unreachable_inode(trans, &iter, k)));
+ bch_err_fn(c, ret);
+ return ret;
+}
+
static inline bool btree_matches_i_mode(enum btree_id btree, unsigned mode)
{
switch (btree) {
@@ -1694,8 +1968,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
!key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot))
continue;
- if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_size_dirty) &&
- k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
+ if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 &&
!bkey_extent_is_reservation(k),
trans, extent_past_end_of_inode,
"extent type past end of inode %llu:%u, i_size %llu\n %s",
@@ -2207,7 +2480,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
dir->first_this_inode = false;
- ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k);
+ ret = hash_check_key(trans, s, bch2_dirent_hash_desc, hash_info, iter, k);
if (ret < 0)
goto err;
if (ret) {
@@ -2321,7 +2594,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
*hash_info = bch2_hash_info_init(c, &i->inode);
inode->first_this_inode = false;
- ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k);
+ ret = hash_check_key(trans, NULL, bch2_xattr_hash_desc, hash_info, iter, k);
bch_err_fn(c, ret);
return ret;
}
@@ -2409,7 +2682,7 @@ fsck_err:
/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_root_trans(trans));
bch_err_fn(c, ret);
return ret;
@@ -2450,22 +2723,6 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter,
if (ret)
break;
- /*
- * We've checked that inode backpointers point to valid dirents;
- * here, it's sufficient to check that the subvolume root has a
- * dirent:
- */
- if (fsck_err_on(!subvol_root.bi_dir,
- trans, subvol_unreachable,
- "unreachable subvolume %s",
- (bch2_bkey_val_to_text(&buf, c, s.s_c),
- prt_newline(&buf),
- bch2_inode_unpacked_to_text(&buf, &subvol_root),
- buf.buf))) {
- ret = reattach_subvol(trans, s);
- break;
- }
-
u32 parent = le32_to_cpu(s.v->fs_path_parent);
if (darray_u32_has(&subvol_path, parent)) {
@@ -2526,12 +2783,6 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
return false;
}
-/*
- * Check that a given inode is reachable from its subvolume root - we already
- * verified subvolume connectivity:
- *
- * XXX: we should also be verifying that inodes are in the right subvolumes
- */
static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
{
struct bch_fs *c = trans->c;
@@ -2545,6 +2796,9 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
BUG_ON(bch2_inode_unpack(inode_k, &inode));
+ if (!S_ISDIR(inode.bi_mode))
+ return 0;
+
while (!inode.bi_subvol) {
struct btree_iter dirent_iter;
struct bkey_s_c_dirent d;
@@ -2559,21 +2813,15 @@ static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c ino
bch2_trans_iter_exit(trans, &dirent_iter);
if (bch2_err_matches(ret, ENOENT)) {
- ret = 0;
- if (fsck_err(trans, inode_unreachable,
- "unreachable inode\n%s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, inode_k),
- buf.buf)))
- ret = reattach_inode(trans, &inode);
+ printbuf_reset(&buf);
+ bch2_bkey_val_to_text(&buf, c, inode_k);
+ bch_err(c, "unreachable inode in check_directory_structure: %s\n%s",
+ bch2_err_str(ret), buf.buf);
goto out;
}
bch2_trans_iter_exit(trans, &dirent_iter);
- if (!S_ISDIR(inode.bi_mode))
- break;
-
ret = darray_push(p, ((struct pathbuf_entry) {
.inum = inode.bi_inum,
.snapshot = snapshot,
@@ -2626,9 +2874,8 @@ fsck_err:
}
/*
- * Check for unreachable inodes, as well as loops in the directory structure:
- * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's
- * unreachable:
+ * Check for loops in the directory structure: all other connectivity issues
+ * have been fixed by prior passes
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
@@ -2756,6 +3003,10 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
if (S_ISDIR(u.bi_mode))
continue;
+ /*
+ * Previous passes ensured that bi_nlink is nonzero if
+ * it had multiple hardlinks:
+ */
if (!u.bi_nlink)
continue;
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index a4ef94271784..1cca31011530 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -9,6 +9,7 @@ int bch2_check_dirents(struct bch_fs *);
int bch2_check_xattrs(struct bch_fs *);
int bch2_check_root(struct bch_fs *);
int bch2_check_subvolume_structure(struct bch_fs *);
+int bch2_check_unreachable_inodes(struct bch_fs *);
int bch2_check_directory_structure(struct bch_fs *);
int bch2_check_nlinks(struct bch_fs *);
int bch2_fix_reflink_p(struct bch_fs *);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 3e5bc01961b8..039cb7a22244 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -12,6 +12,7 @@
#include "error.h"
#include "extents.h"
#include "extent_update.h"
+#include "fs.h"
#include "inode.h"
#include "str_hash.h"
#include "snapshot.h"
@@ -34,6 +35,8 @@ static const char * const bch2_inode_flag_strs[] = {
};
#undef x
+static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos);
+
static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 };
static int inode_decode_field(const u8 *in, const u8 *end,
@@ -160,8 +163,8 @@ static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode,
unsigned fieldnr = 0, field_bits;
int ret;
-#define x(_name, _bits) \
- if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \
+#define x(_name, _bits) \
+ if (fieldnr++ == INODEv1_NR_FIELDS(inode.v)) { \
unsigned offset = offsetof(struct bch_inode_unpacked, _name);\
memset((void *) unpacked + offset, 0, \
sizeof(*unpacked) - offset); \
@@ -280,6 +283,8 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
{
memset(unpacked, 0, sizeof(*unpacked));
+ unpacked->bi_snapshot = k.k->p.snapshot;
+
switch (k.k->type) {
case KEY_TYPE_inode: {
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
@@ -290,10 +295,10 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k,
unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags);
unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode);
- if (INODE_NEW_VARINT(inode.v)) {
+ if (INODEv1_NEW_VARINT(inode.v)) {
return bch2_inode_unpack_v2(unpacked, inode.v->fields,
bkey_val_end(inode),
- INODE_NR_FIELDS(inode.v));
+ INODEv1_NR_FIELDS(inode.v));
} else {
return bch2_inode_unpack_v1(inode, unpacked);
}
@@ -468,10 +473,10 @@ int bch2_inode_validate(struct bch_fs *c, struct bkey_s_c k,
struct bkey_s_c_inode inode = bkey_s_c_to_inode(k);
int ret = 0;
- bkey_fsck_err_on(INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
+ bkey_fsck_err_on(INODEv1_STR_HASH(inode.v) >= BCH_STR_HASH_NR,
c, inode_str_hash_invalid,
"invalid str hash type (%llu >= %u)",
- INODE_STR_HASH(inode.v), BCH_STR_HASH_NR);
+ INODEv1_STR_HASH(inode.v), BCH_STR_HASH_NR);
ret = __bch2_inode_validate(c, k, flags);
fsck_err:
@@ -530,6 +535,10 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
prt_printf(out, "(%x)\n", inode->bi_flags);
prt_printf(out, "journal_seq=%llu\n", inode->bi_journal_seq);
+ prt_printf(out, "hash_seed=%llx\n", inode->bi_hash_seed);
+ prt_printf(out, "hash_type=");
+ bch2_prt_str_hash_type(out, INODE_STR_HASH(inode));
+ prt_newline(out);
prt_printf(out, "bi_size=%llu\n", inode->bi_size);
prt_printf(out, "bi_sectors=%llu\n", inode->bi_sectors);
prt_printf(out, "bi_version=%llu\n", inode->bi_version);
@@ -575,9 +584,137 @@ static inline u64 bkey_inode_flags(struct bkey_s_c k)
}
}
-static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
+static inline void bkey_inode_flags_set(struct bkey_s k, u64 f)
+{
+ switch (k.k->type) {
+ case KEY_TYPE_inode:
+ bkey_s_to_inode(k).v->bi_flags = cpu_to_le32(f);
+ return;
+ case KEY_TYPE_inode_v2:
+ bkey_s_to_inode_v2(k).v->bi_flags = cpu_to_le64(f);
+ return;
+ case KEY_TYPE_inode_v3:
+ bkey_s_to_inode_v3(k).v->bi_flags = cpu_to_le64(f);
+ return;
+ default:
+ BUG();
+ }
+}
+
+static inline bool bkey_is_unlinked_inode(struct bkey_s_c k)
{
- return bkey_inode_flags(k) & BCH_INODE_unlinked;
+ unsigned f = bkey_inode_flags(k) & BCH_INODE_unlinked;
+
+ return (f & BCH_INODE_unlinked) && !(f & BCH_INODE_has_child_snapshot);
+}
+
+static struct bkey_s_c
+bch2_bkey_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+ enum btree_id btree, struct bpos pos,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_upto_norestart(trans, *iter, btree,
+ bpos_successor(pos),
+ SPOS(pos.inode, pos.offset, U32_MAX),
+ flags|BTREE_ITER_all_snapshots, k, ret)
+ if (bch2_snapshot_is_ancestor(c, pos.snapshot, k.k->p.snapshot))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
+}
+
+static struct bkey_s_c
+bch2_inode_get_iter_snapshot_parent(struct btree_trans *trans, struct btree_iter *iter,
+ struct bpos pos, unsigned flags)
+{
+ struct bkey_s_c k;
+again:
+ k = bch2_bkey_get_iter_snapshot_parent(trans, iter, BTREE_ID_inodes, pos, flags);
+ if (!k.k ||
+ bkey_err(k) ||
+ bkey_is_inode(k.k))
+ return k;
+
+ bch2_trans_iter_exit(trans, iter);
+ pos = k.k->p;
+ goto again;
+}
+
+int __bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
+
+ for_each_btree_key_upto_norestart(trans, iter,
+ BTREE_ID_inodes, POS(0, pos.offset), bpos_predecessor(pos),
+ BTREE_ITER_all_snapshots|
+ BTREE_ITER_with_updates, k, ret)
+ if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) &&
+ bkey_is_inode(k.k)) {
+ ret = 1;
+ break;
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int update_inode_has_children(struct btree_trans *trans,
+ struct bkey_s k,
+ bool have_child)
+{
+ if (!have_child) {
+ int ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret)
+ return ret < 0 ? ret : 0;
+ }
+
+ u64 f = bkey_inode_flags(k.s_c);
+ if (have_child != !!(f & BCH_INODE_has_child_snapshot))
+ bkey_inode_flags_set(k, f ^ BCH_INODE_has_child_snapshot);
+
+ return 0;
+}
+
+static int update_parent_inode_has_children(struct btree_trans *trans, struct bpos pos,
+ bool have_child)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_inode_get_iter_snapshot_parent(trans,
+ &iter, pos, BTREE_ITER_with_updates);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (!k.k)
+ return 0;
+
+ if (!have_child) {
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret) {
+ ret = ret < 0 ? ret : 0;
+ goto err;
+ }
+ }
+
+ u64 f = bkey_inode_flags(k);
+ if (have_child != !!(f & BCH_INODE_has_child_snapshot)) {
+ struct bkey_i *update = bch2_bkey_make_mut(trans, &iter, &k,
+ BTREE_UPDATE_internal_snapshot_node);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ bkey_inode_flags_set(bkey_i_to_s(update), f ^ BCH_INODE_has_child_snapshot);
+ }
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
int bch2_trigger_inode(struct btree_trans *trans,
@@ -586,6 +723,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
struct bkey_s new,
enum btree_iter_update_trigger_flags flags)
{
+ struct bch_fs *c = trans->c;
+
if ((flags & BTREE_TRIGGER_atomic) && (flags & BTREE_TRIGGER_insert)) {
BUG_ON(!trans->journal_res.seq);
bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
@@ -599,13 +738,41 @@ int bch2_trigger_inode(struct btree_trans *trans,
return ret;
}
- int deleted_delta = (int) bkey_is_deleted_inode(new.s_c) -
- (int) bkey_is_deleted_inode(old);
- if ((flags & BTREE_TRIGGER_transactional) && deleted_delta) {
- int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
- new.k->p, deleted_delta > 0);
- if (ret)
- return ret;
+ if (flags & BTREE_TRIGGER_transactional) {
+ int unlinked_delta = (int) bkey_is_unlinked_inode(new.s_c) -
+ (int) bkey_is_unlinked_inode(old);
+ if (unlinked_delta) {
+ int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+ new.k->p, unlinked_delta > 0);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If we're creating or deleting an inode at this snapshot ID,
+ * and there might be an inode in a parent snapshot ID, we might
+ * need to set or clear the has_child_snapshot flag on the
+ * parent.
+ */
+ int deleted_delta = (int) bkey_is_inode(new.k) -
+ (int) bkey_is_inode(old.k);
+ if (deleted_delta &&
+ bch2_snapshot_parent(c, new.k->p.snapshot)) {
+ int ret = update_parent_inode_has_children(trans, new.k->p,
+ deleted_delta > 0);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * When an inode is first updated in a new snapshot, we may need
+ * to clear has_child_snapshot
+ */
+ if (deleted_delta > 0) {
+ int ret = update_inode_has_children(trans, new, false);
+ if (ret)
+ return ret;
+ }
}
return 0;
@@ -639,10 +806,8 @@ void bch2_inode_init_early(struct bch_fs *c,
memset(inode_u, 0, sizeof(*inode_u));
- /* ick */
- inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET;
- get_random_bytes(&inode_u->bi_hash_seed,
- sizeof(inode_u->bi_hash_seed));
+ SET_INODE_STR_HASH(inode_u, str_hash);
+ get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed));
}
void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now,
@@ -888,6 +1053,11 @@ err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
+ if (ret)
+ goto err2;
+
+ ret = delete_ancestor_snapshot_inodes(trans, SPOS(0, inum.inum, snapshot));
+err2:
bch2_trans_put(trans);
return ret;
}
@@ -921,8 +1091,7 @@ int bch2_inode_find_by_inum_trans(struct btree_trans *trans,
int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum,
struct bch_inode_unpacked *inode)
{
- return bch2_trans_do(c, NULL, NULL, 0,
- bch2_inode_find_by_inum_trans(trans, inum, inode));
+ return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode));
}
int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi)
@@ -992,7 +1161,7 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i
return 0;
}
-int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
{
struct bch_fs *c = trans->c;
struct btree_iter iter = { NULL };
@@ -1055,6 +1224,45 @@ err:
return ret ?: -BCH_ERR_transaction_restart_nested;
}
+/*
+ * After deleting an inode, there may be versions in older snapshots that should
+ * also be deleted - if they're not referenced by sibling snapshots and not open
+ * in other subvolumes:
+ */
+static int delete_ancestor_snapshot_inodes(struct btree_trans *trans, struct bpos pos)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret;
+next_parent:
+ ret = lockrestart_do(trans,
+ bkey_err(k = bch2_inode_get_iter_snapshot_parent(trans, &iter, pos, 0)));
+ if (ret || !k.k)
+ return ret;
+
+ bool unlinked = bkey_is_unlinked_inode(k);
+ pos = k.k->p;
+ bch2_trans_iter_exit(trans, &iter);
+
+ if (!unlinked)
+ return 0;
+
+ ret = lockrestart_do(trans, bch2_inode_or_descendents_is_open(trans, pos));
+ if (ret)
+ return ret < 0 ? ret : 0;
+
+ ret = __bch2_inode_rm_snapshot(trans, pos.offset, pos.snapshot);
+ if (ret)
+ return ret;
+ goto next_parent;
+}
+
+int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot)
+{
+ return __bch2_inode_rm_snapshot(trans, inum, snapshot) ?:
+ delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot));
+}
+
static int may_delete_deleted_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos pos,
@@ -1064,6 +1272,7 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
struct btree_iter inode_iter;
struct bkey_s_c k;
struct bch_inode_unpacked inode;
+ struct printbuf buf = PRINTBUF;
int ret;
k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes, pos, BTREE_ITER_cached);
@@ -1099,6 +1308,31 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
pos.offset, pos.snapshot))
goto delete;
+ if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot,
+ trans, deleted_inode_has_child_snapshots,
+ "inode with child snapshots %llu:%u in deleted_inodes btree",
+ pos.offset, pos.snapshot))
+ goto delete;
+
+ ret = bch2_inode_has_child_snapshots(trans, k.k->p);
+ if (ret < 0)
+ goto out;
+
+ if (ret) {
+ if (fsck_err(trans, inode_has_child_snapshots_wrong,
+ "inode has_child_snapshots flag wrong (should be set)\n%s",
+ (printbuf_reset(&buf),
+ bch2_inode_unpacked_to_text(&buf, &inode),
+ buf.buf))) {
+ inode.bi_flags |= BCH_INODE_has_child_snapshot;
+ ret = __bch2_fsck_write_inode(trans, &inode);
+ if (ret)
+ goto out;
+ }
+ goto delete;
+
+ }
+
if (test_bit(BCH_FS_clean_recovery, &c->flags) &&
!fsck_err(trans, deleted_inode_but_clean,
"filesystem marked as clean but have deleted inode %llu:%u",
@@ -1107,33 +1341,11 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
goto out;
}
- if (bch2_snapshot_is_internal_node(c, pos.snapshot)) {
- struct bpos new_min_pos;
-
- ret = bch2_propagate_key_to_snapshot_leaves(trans, inode_iter.btree_id, k, &new_min_pos);
- if (ret)
- goto out;
-
- inode.bi_flags &= ~BCH_INODE_unlinked;
-
- ret = bch2_inode_write_flags(trans, &inode_iter, &inode,
- BTREE_UPDATE_internal_snapshot_node);
- bch_err_msg(c, ret, "clearing inode unlinked flag");
- if (ret)
- goto out;
-
- /*
- * We'll need another write buffer flush to pick up the new
- * unlinked inodes in the snapshot leaves:
- */
- *need_another_pass = true;
- goto out;
- }
-
ret = 1;
out:
fsck_err:
bch2_trans_iter_exit(trans, &inode_iter);
+ printbuf_exit(&buf);
return ret;
delete:
ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 9c1f67705684..eab82b5eb897 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -5,6 +5,7 @@
#include "bkey.h"
#include "bkey_methods.h"
#include "opts.h"
+#include "snapshot.h"
enum bch_validate_flags;
extern const char * const bch2_inode_opts[];
@@ -17,6 +18,15 @@ int bch2_inode_v3_validate(struct bch_fs *, struct bkey_s_c,
enum bch_validate_flags);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int __bch2_inode_has_child_snapshots(struct btree_trans *, struct bpos);
+
+static inline int bch2_inode_has_child_snapshots(struct btree_trans *trans, struct bpos pos)
+{
+ return bch2_snapshot_is_leaf(trans->c, pos.snapshot) <= 0
+ ? __bch2_inode_has_child_snapshots(trans, pos)
+ : 0;
+}
+
int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
struct bkey_s_c, struct bkey_s,
enum btree_iter_update_trigger_flags);
@@ -82,6 +92,7 @@ struct bch_inode_unpacked {
BCH_INODE_FIELDS_v3()
#undef x
};
+BITMASK(INODE_STR_HASH, struct bch_inode_unpacked, bi_flags, 20, 24);
struct bkey_inode_buf {
struct bkey_i_inode_v3 inode;
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
index 83d107331edf..7928d0c6954f 100644
--- a/fs/bcachefs/inode_format.h
+++ b/fs/bcachefs/inode_format.h
@@ -133,7 +133,8 @@ enum inode_opt_id {
x(i_size_dirty, 5) \
x(i_sectors_dirty, 6) \
x(unlinked, 7) \
- x(backptr_untrusted, 8)
+ x(backptr_untrusted, 8) \
+ x(has_child_snapshot, 9)
/* bits 20+ reserved for packed fields below: */
@@ -149,9 +150,9 @@ enum __bch_inode_flags {
#undef x
};
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+LE32_BITMASK(INODEv1_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODEv1_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODEv1_NEW_VARINT,struct bch_inode, bi_flags, 31, 32);
LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 307ed0a45184..f283051758d6 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -377,7 +377,7 @@ static int __bch2_resume_logged_op_finsert(struct btree_trans *trans,
* check for missing subvolume before fpunch, as in resume we don't want
* it to be a fatal error
*/
- ret = __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors);
+ ret = lockrestart_do(trans, __bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot, warn_errors));
if (ret)
return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index e4fc17c548fd..b3b934a87c6d 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -262,7 +262,8 @@ err:
bio_free_pages(&(*rbio)->bio);
kfree(*rbio);
*rbio = NULL;
- kfree(op);
+ /* We may have added to the rhashtable and thus need rcu freeing: */
+ kfree_rcu(op, rcu);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
return ERR_PTR(ret);
}
@@ -409,8 +410,8 @@ retry:
bch2_trans_begin(trans);
rbio->bio.bi_status = 0;
- k = bch2_btree_iter_peek_slot(&iter);
- if (bkey_err(k))
+ ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
+ if (ret)
goto err;
bch2_bkey_buf_reassemble(&sk, c, k);
@@ -557,8 +558,8 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
- __bch2_rbio_narrow_crcs(trans, rbio));
+ bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __bch2_rbio_narrow_crcs(trans, rbio));
}
/* Inner part that may run in process context */
@@ -802,16 +803,15 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans,
PTR_BUCKET_POS(ca, &ptr),
BTREE_ITER_cached);
- u8 *gen = bucket_gen(ca, iter.pos.offset);
- if (gen) {
-
+ int gen = bucket_gen_get(ca, iter.pos.offset);
+ if (gen >= 0) {
prt_printf(&buf, "Attempting to read from stale dirty pointer:\n");
printbuf_indent_add(&buf, 2);
bch2_bkey_val_to_text(&buf, c, k);
prt_newline(&buf);
- prt_printf(&buf, "memory gen: %u", *gen);
+ prt_printf(&buf, "memory gen: %u", gen);
ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
if (!ret) {
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index b5fe9e0dc155..96720adcfee0 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -1300,11 +1300,8 @@ retry:
bucket_to_u64(i->b),
BUCKET_NOCOW_LOCK_UPDATE);
- rcu_read_lock();
- u8 *gen = bucket_gen(ca, i->b.offset);
- stale = !gen ? -1 : gen_after(*gen, i->gen);
- rcu_read_unlock();
-
+ int gen = bucket_gen_get(ca, i->b.offset);
+ stale = gen < 0 ? gen : gen_after(gen, i->gen);
if (unlikely(stale)) {
stale_at = i;
goto err_bucket_stale;
@@ -1437,7 +1434,7 @@ again:
* freeing up space on specific disks, which means that
* allocations for specific disks may hang arbitrarily long:
*/
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_run(c, lockrestart_do(trans,
bch2_alloc_sectors_start_trans(trans,
op->target,
op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED),
@@ -1447,7 +1444,7 @@ again:
op->nr_replicas_required,
op->watermark,
op->flags,
- &op->cl, &wp));
+ &op->cl, &wp)));
if (unlikely(ret)) {
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
break;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index f5f7db50ca31..2dc0d60c1745 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -603,6 +603,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res,
{
int ret;
+ if (closure_wait_event_timeout(&j->async_wait,
+ (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
+ (flags & JOURNAL_RES_GET_NONBLOCK),
+ HZ * 10))
+ return ret;
+
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct printbuf buf = PRINTBUF;
+ bch2_journal_debug_to_text(&buf, j);
+ bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s",
+ buf.buf);
+ printbuf_exit(&buf);
+
closure_wait_event(&j->async_wait,
(ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked ||
(flags & JOURNAL_RES_GET_NONBLOCK));
@@ -745,7 +758,7 @@ out:
return ret;
}
-int bch2_journal_flush_seq(struct journal *j, u64 seq)
+int bch2_journal_flush_seq(struct journal *j, u64 seq, unsigned task_state)
{
u64 start_time = local_clock();
int ret, ret2;
@@ -756,7 +769,9 @@ int bch2_journal_flush_seq(struct journal *j, u64 seq)
if (seq <= j->flushed_seq_ondisk)
return 0;
- ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)));
+ ret = wait_event_state(j->wait,
+ (ret2 = bch2_journal_flush_seq_async(j, seq, NULL)),
+ task_state);
if (!ret)
bch2_time_stats_update(j->flush_seq_time, start_time);
@@ -775,7 +790,7 @@ void bch2_journal_flush_async(struct journal *j, struct closure *parent)
int bch2_journal_flush(struct journal *j)
{
- return bch2_journal_flush_seq(j, atomic64_read(&j->seq));
+ return bch2_journal_flush_seq(j, atomic64_read(&j->seq), TASK_UNINTERRUPTIBLE);
}
/*
@@ -838,7 +853,7 @@ int bch2_journal_meta(struct journal *j)
bch2_journal_res_put(j, &res);
- return bch2_journal_flush_seq(j, res.seq);
+ return bch2_journal_flush_seq(j, res.seq, TASK_UNINTERRUPTIBLE);
}
/* block/unlock the journal: */
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 377a3750406e..2762be6f9814 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -401,7 +401,7 @@ void bch2_journal_entry_res_resize(struct journal *,
int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *);
void bch2_journal_flush_async(struct journal *, struct closure *);
-int bch2_journal_flush_seq(struct journal *, u64);
+int bch2_journal_flush_seq(struct journal *, u64, unsigned);
int bch2_journal_flush(struct journal *);
bool bch2_journal_noflush_seq(struct journal *, u64);
int bch2_journal_meta(struct journal *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 954f6a96e0f4..fb35dd336331 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -708,6 +708,9 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
container_of(entry, struct jset_entry_dev_usage, entry);
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
+ if (vstruct_bytes(entry) < sizeof(*u))
+ return;
+
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
printbuf_indent_add(out, 2);
@@ -1012,6 +1015,8 @@ reread:
nr_bvecs = buf_pages(buf->data, sectors_read << 9);
bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
+ if (!bio)
+ return -BCH_ERR_ENOMEM_journal_read_bucket;
bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ);
bio->bi_iter.bi_sector = offset;
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 8c456d8b8b99..0ef4a86850bb 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -266,7 +266,7 @@ int bch2_move_extent(struct moving_context *ctxt,
if (!data_opts.rewrite_ptrs &&
!data_opts.extra_replicas) {
if (data_opts.kill_ptrs)
- return bch2_extent_drop_ptrs(trans, iter, k, data_opts);
+ return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts);
return 0;
}
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 232be8a44051..0e2ee262fbd4 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -63,7 +63,7 @@ const char * const bch2_compression_opts[] = {
NULL
};
-const char * const bch2_str_hash_types[] = {
+const char * const __bch2_str_hash_types[] = {
BCH_STR_HASH_TYPES()
NULL
};
@@ -115,6 +115,7 @@ PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type, enum bch_fs_usage_type);
PRT_STR_OPT_BOUNDSCHECKED(data_type, enum bch_data_type);
PRT_STR_OPT_BOUNDSCHECKED(csum_type, enum bch_csum_type);
PRT_STR_OPT_BOUNDSCHECKED(compression_type, enum bch_compression_type);
+PRT_STR_OPT_BOUNDSCHECKED(str_hash_type, enum bch_str_hash_type);
static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
struct printbuf *err)
@@ -225,7 +226,7 @@ const struct bch_option bch2_opt_table[] = {
#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \
.min = _min, .max = _max
#define OPT_STR(_choices) .type = BCH_OPT_STR, \
- .min = 0, .max = ARRAY_SIZE(_choices), \
+ .min = 0, .max = ARRAY_SIZE(_choices) - 1, \
.choices = _choices
#define OPT_STR_NOLIMIT(_choices) .type = BCH_OPT_STR, \
.min = 0, .max = U64_MAX, \
@@ -427,7 +428,9 @@ void bch2_opt_to_text(struct printbuf *out,
prt_printf(out, "%lli", v);
break;
case BCH_OPT_STR:
- if (flags & OPT_SHOW_FULL_LIST)
+ if (v < opt->min || v >= opt->max)
+ prt_printf(out, "(invalid option %lli)", v);
+ else if (flags & OPT_SHOW_FULL_LIST)
prt_string_option(out, opt->choices, v);
else
prt_str(out, opt->choices[v]);
@@ -594,6 +597,9 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
copied_opts_start = copied_opts;
while ((opt = strsep(&copied_opts, ",")) != NULL) {
+ if (!*opt)
+ continue;
+
name = strsep(&opt, "=");
val = opt;
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index cb2e244a2429..23dda014e331 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -18,7 +18,7 @@ extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_opts[];
extern const char * const bch2_compression_opts[];
-extern const char * const bch2_str_hash_types[];
+extern const char * const __bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
@@ -29,6 +29,7 @@ void bch2_prt_fs_usage_type(struct printbuf *, enum bch_fs_usage_type);
void bch2_prt_data_type(struct printbuf *, enum bch_data_type);
void bch2_prt_csum_type(struct printbuf *, enum bch_csum_type);
void bch2_prt_compression_type(struct printbuf *, enum bch_compression_type);
+void bch2_prt_str_hash_type(struct printbuf *, enum bch_str_hash_type);
static inline const char *bch2_d_type_str(unsigned d_type)
{
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index c32a05e252e2..74f45a8162ad 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -869,7 +869,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid,
bkey_quota_init(&new_quota.k_i);
new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid));
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_set_quota_trans(trans, &new_quota, qdq)) ?:
__bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq);
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 2d299a37cf07..cd6647374353 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -70,7 +70,9 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
+ int ret = bch2_trans_commit_do(c, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_lazy_rw,
__bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c);
return ret;
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 6db72d3bad7d..3c7f941dde39 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -94,11 +94,10 @@ static void bch2_reconstruct_alloc(struct bch_fs *c)
__set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent);
c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
- bch2_write_super(c);
- mutex_unlock(&c->sb_lock);
-
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
+ bch2_write_super(c);
+ mutex_unlock(&c->sb_lock);
bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
@@ -287,7 +286,8 @@ int bch2_journal_replay(struct bch_fs *c)
BCH_TRANS_COMMIT_no_enospc|
BCH_TRANS_COMMIT_journal_reclaim|
BCH_TRANS_COMMIT_skip_accounting_apply|
- BCH_TRANS_COMMIT_no_journal_res,
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_WATERMARK_reclaim,
bch2_journal_replay_accounting_key(trans, k));
if (bch2_fs_fatal_err_on(ret, c, "error replaying accounting; %s", bch2_err_str(ret)))
goto err;
@@ -862,6 +862,13 @@ use_clean:
if (ret)
goto err;
+ /*
+ * Normally set by the appropriate recovery pass: when cleared, this
+ * indicates we're in early recovery and btree updates should be done by
+ * being applied to the journal replay keys. _Must_ be cleared before
+ * multithreaded use:
+ */
+ set_bit(BCH_FS_may_go_rw, &c->flags);
clear_bit(BCH_FS_fsck_running, &c->flags);
/* in case we don't run journal replay, i.e. norecovery mode */
@@ -1001,6 +1008,7 @@ int bch2_fs_initialize(struct bch_fs *c)
struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
struct qstr lostfound = QSTR("lost+found");
+ struct bch_member *m;
int ret;
bch_notice(c, "initializing new filesystem");
@@ -1017,6 +1025,14 @@ int bch2_fs_initialize(struct bch_fs *c)
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
bch2_write_super(c);
}
+
+ for_each_member_device(c, ca) {
+ m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
+ SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, false);
+ ca->mi = bch2_mi_to_cpu(m);
+ }
+
+ bch2_write_super(c);
mutex_unlock(&c->sb_lock);
c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
@@ -1090,7 +1106,7 @@ int bch2_fs_initialize(struct bch_fs *c)
bch2_inode_init_early(c, &lostfound_inode);
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_create_trans(trans,
BCACHEFS_ROOT_SUBVOL_INUM,
&root_inode, &lostfound_inode,
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
index 735b8adc8f9d..dff589ddc984 100644
--- a/fs/bcachefs/recovery_passes.c
+++ b/fs/bcachefs/recovery_passes.c
@@ -27,6 +27,12 @@ const char * const bch2_recovery_passes[] = {
NULL
};
+/* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */
+static int bch2_recovery_pass_empty(struct bch_fs *c)
+{
+ return 0;
+}
+
static int bch2_set_may_go_rw(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
@@ -221,6 +227,12 @@ int bch2_run_recovery_passes(struct bch_fs *c)
{
int ret = 0;
+ /*
+ * We can't allow set_may_go_rw to be excluded; that would cause us to
+ * use the journal replay keys for updates where it's not expected.
+ */
+ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw;
+
while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
if (c->opts.recovery_pass_last &&
c->curr_recovery_pass > c->opts.recovery_pass_last)
diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h
index 50406ce0e4ef..94dc20ca2065 100644
--- a/fs/bcachefs/recovery_passes_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -13,6 +13,7 @@
* must never change:
*/
#define BCH_RECOVERY_PASSES() \
+ x(recovery_pass_empty, 41, PASS_SILENT) \
x(scan_for_btree_nodes, 37, 0) \
x(check_topology, 4, 0) \
x(accounting_read, 39, PASS_ALWAYS) \
@@ -46,6 +47,7 @@
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_unreachable_inodes, 40, PASS_ONLINE|PASS_FSCK) \
x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \
x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index bcb3276747e0..477ef0997949 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -66,9 +66,9 @@ void bch2_replicas_entry_to_text(struct printbuf *out,
prt_printf(out, "]");
}
-static int bch2_replicas_entry_validate_locked(struct bch_replicas_entry_v1 *r,
- struct bch_sb *sb,
- struct printbuf *err)
+static int bch2_replicas_entry_sb_validate(struct bch_replicas_entry_v1 *r,
+ struct bch_sb *sb,
+ struct printbuf *err)
{
if (!r->nr_devs) {
prt_printf(err, "no devices in entry ");
@@ -98,10 +98,28 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
struct bch_fs *c,
struct printbuf *err)
{
- mutex_lock(&c->sb_lock);
- int ret = bch2_replicas_entry_validate_locked(r, c->disk_sb.sb, err);
- mutex_unlock(&c->sb_lock);
- return ret;
+ if (!r->nr_devs) {
+ prt_printf(err, "no devices in entry ");
+ goto bad;
+ }
+
+ if (r->nr_required > 1 &&
+ r->nr_required >= r->nr_devs) {
+ prt_printf(err, "bad nr_required in entry ");
+ goto bad;
+ }
+
+ for (unsigned i = 0; i < r->nr_devs; i++)
+ if (r->devs[i] != BCH_SB_MEMBER_INVALID &&
+ !bch2_dev_exists(c, r->devs[i])) {
+ prt_printf(err, "invalid device %u in entry ", r->devs[i]);
+ goto bad;
+ }
+
+ return 0;
+bad:
+ bch2_replicas_entry_to_text(err, r);
+ return -BCH_ERR_invalid_replicas_entry;
}
void bch2_cpu_replicas_to_text(struct printbuf *out,
@@ -686,7 +704,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
- int ret = bch2_replicas_entry_validate_locked(e, sb, err);
+ int ret = bch2_replicas_entry_sb_validate(e, sb, err);
if (ret)
return ret;
@@ -803,6 +821,11 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
rcu_read_lock();
for (unsigned i = 0; i < e->nr_devs; i++) {
+ if (e->devs[i] == BCH_SB_MEMBER_INVALID) {
+ nr_failed++;
+ continue;
+ }
+
nr_online += test_bit(e->devs[i], devs.d);
struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]);
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 5102059a0f1d..8767c33c2b51 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -78,7 +78,10 @@
BCH_FSCK_ERR_accounting_mismatch) \
x(rebalance_work_acct_fix, \
BIT_ULL(BCH_RECOVERY_PASS_check_allocations), \
- BCH_FSCK_ERR_accounting_mismatch)
+ BCH_FSCK_ERR_accounting_mismatch) \
+ x(inode_has_child_snapshots, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BCH_FSCK_ERR_inode_has_child_snapshots_wrong)
#define DOWNGRADE_TABLE() \
x(bucket_stripe_sectors, \
@@ -140,6 +143,9 @@ UPGRADE_TABLE()
static int have_stripes(struct bch_fs *c)
{
+ if (IS_ERR_OR_NULL(c->btree_roots_known[BTREE_ID_stripes].b))
+ return 0;
+
return !btree_node_fake(c->btree_roots_known[BTREE_ID_stripes].b);
}
diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h
index 4135b1ea2fec..9feb6739f77a 100644
--- a/fs/bcachefs/sb-errors_format.h
+++ b/fs/bcachefs/sb-errors_format.h
@@ -136,7 +136,9 @@ enum bch_fsck_flags {
x(bucket_gens_nonzero_for_invalid_buckets, 122, FSCK_AUTOFIX) \
x(need_discard_freespace_key_to_invalid_dev_bucket, 123, 0) \
x(need_discard_freespace_key_bad, 124, 0) \
+ x(discarding_bucket_not_in_need_discard_btree, 291, 0) \
x(backpointer_bucket_offset_wrong, 125, 0) \
+ x(backpointer_level_bad, 294, 0) \
x(backpointer_to_missing_device, 126, 0) \
x(backpointer_to_missing_alloc, 127, 0) \
x(backpointer_to_missing_ptr, 128, 0) \
@@ -177,9 +179,12 @@ enum bch_fsck_flags {
x(ptr_stripe_redundant, 163, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
x(reflink_v_refcount_wrong, 165, 0) \
+ x(reflink_v_pos_bad, 292, 0) \
x(reflink_p_to_missing_reflink_v, 166, 0) \
+ x(reflink_refcount_underflow, 293, 0) \
x(stripe_pos_bad, 167, 0) \
x(stripe_val_size_bad, 168, 0) \
+ x(stripe_csum_granularity_bad, 290, 0) \
x(stripe_sector_count_wrong, 169, 0) \
x(snapshot_tree_pos_bad, 170, 0) \
x(snapshot_tree_to_missing_snapshot, 171, 0) \
@@ -225,11 +230,13 @@ enum bch_fsck_flags {
x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \
x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \
x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \
+ x(inode_has_child_snapshots_wrong, 287, 0) \
x(inode_unreachable, 210, FSCK_AUTOFIX) \
x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \
x(deleted_inode_missing, 212, FSCK_AUTOFIX) \
x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \
x(deleted_inode_not_unlinked, 214, FSCK_AUTOFIX) \
+ x(deleted_inode_has_child_snapshots, 288, FSCK_AUTOFIX) \
x(extent_overlapping, 215, 0) \
x(key_in_missing_inode, 216, 0) \
x(key_in_wrong_inode_type, 217, 0) \
@@ -264,8 +271,8 @@ enum bch_fsck_flags {
x(journal_entry_dup_same_device, 246, 0) \
x(inode_bi_subvol_missing, 247, 0) \
x(inode_bi_subvol_wrong, 248, 0) \
- x(inode_points_to_missing_dirent, 249, 0) \
- x(inode_points_to_wrong_dirent, 250, 0) \
+ x(inode_points_to_missing_dirent, 249, FSCK_AUTOFIX) \
+ x(inode_points_to_wrong_dirent, 250, FSCK_AUTOFIX) \
x(inode_bi_parent_nonzero, 251, 0) \
x(dirent_to_missing_parent_subvol, 252, 0) \
x(dirent_not_visible_in_parent_subvol, 253, 0) \
@@ -289,6 +296,7 @@ enum bch_fsck_flags {
x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \
x(accounting_mismatch, 272, FSCK_AUTOFIX) \
x(accounting_replicas_not_marked, 273, 0) \
+ x(accounting_to_invalid_device, 289, 0) \
x(invalid_btree_id, 274, 0) \
x(alloc_key_io_time_bad, 275, 0) \
x(alloc_key_fragmentation_lru_wrong, 276, FSCK_AUTOFIX) \
@@ -298,7 +306,7 @@ enum bch_fsck_flags {
x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \
x(accounting_key_version_0, 282, FSCK_AUTOFIX) \
x(logged_op_but_clean, 283, FSCK_AUTOFIX) \
- x(MAX, 287, 0)
+ x(MAX, 295, 0)
enum bch_sb_error_id {
#define x(t, n, ...) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index 02bcde3c1b02..116131f95815 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -163,6 +163,11 @@ static int validate_member(struct printbuf *err,
return -BCH_ERR_invalid_sb_members;
}
+ if (m.btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX) {
+ prt_printf(err, "device %u: invalid btree_bitmap_shift %u", i, m.btree_bitmap_shift);
+ return -BCH_ERR_invalid_sb_members;
+ }
+
return 0;
}
@@ -247,7 +252,10 @@ static void member_to_text(struct printbuf *out,
prt_newline(out);
prt_printf(out, "Btree allocated bitmap blocksize:\t");
- prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
+ if (m.btree_bitmap_shift < 64)
+ prt_units_u64(out, 1ULL << m.btree_bitmap_shift);
+ else
+ prt_printf(out, "(invalid shift %u)", m.btree_bitmap_shift);
prt_newline(out);
prt_printf(out, "Btree allocated bitmap:\t");
@@ -442,7 +450,7 @@ static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, uns
m->btree_bitmap_shift += resize;
}
- BUG_ON(m->btree_bitmap_shift > 57);
+ BUG_ON(m->btree_bitmap_shift >= BCH_MI_BTREE_BITMAP_SHIFT_MAX);
BUG_ON(end > 64ULL << m->btree_bitmap_shift);
for (unsigned bit = start >> m->btree_bitmap_shift;
diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h
index d727d2dfda08..2adf1221a440 100644
--- a/fs/bcachefs/sb-members_format.h
+++ b/fs/bcachefs/sb-members_format.h
@@ -66,6 +66,12 @@ struct bch_member {
};
/*
+ * btree_allocated_bitmap can represent sector addresses of a u64: it itself has
+ * 64 elements, so 64 - ilog2(64)
+ */
+#define BCH_MI_BTREE_BITMAP_SHIFT_MAX 58
+
+/*
* This limit comes from the bucket_gens array - it's a single allocation, and
* kernel allocation are limited to INT_MAX
*/
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 1809442b00ee..ae57638506c3 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -905,12 +905,30 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
if (bch2_snapshot_equiv(c, id))
return 0;
- /* 0 is an invalid tree ID */
+ /* Do we need to reconstruct the snapshot_tree entry as well? */
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
u32 tree_id = 0;
- int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_snapshot_trees, POS_MIN,
+ 0, k, ret) {
+ if (le32_to_cpu(bkey_s_c_to_snapshot_tree(k).v->root_snapshot) == id) {
+ tree_id = k.k->p.offset;
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
if (ret)
return ret;
+ if (!tree_id) {
+ ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+ if (ret)
+ return ret;
+ }
+
struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
ret = PTR_ERR_OR_ZERO(snapshot);
if (ret)
@@ -921,6 +939,16 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id)
snapshot->v.tree = cpu_to_le32(tree_id);
snapshot->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
+ 0, k, ret) {
+ if (le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot) == id) {
+ snapshot->v.subvol = cpu_to_le32(k.k->p.offset);
+ SET_BCH_SNAPSHOT_SUBVOL(&snapshot->v, true);
+ break;
+ }
+ }
+ bch2_trans_iter_exit(trans, &iter);
+
return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
@@ -1732,103 +1760,6 @@ int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return ret;
}
-static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
-{
- const struct snapshot_t *s = snapshot_t(c, id);
-
- return s->children[1] ?: s->children[0];
-}
-
-static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
-{
- u32 child;
-
- while ((child = bch2_snapshot_smallest_child(c, id)))
- id = child;
- return id;
-}
-
-static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_s_c interior_k,
- u32 leaf_id, struct bpos *new_min_pos)
-{
- struct btree_iter iter;
- struct bpos pos = interior_k.k->p;
- struct bkey_s_c k;
- struct bkey_i *new;
- int ret;
-
- pos.snapshot = leaf_id;
-
- bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent);
- k = bch2_btree_iter_peek_slot(&iter);
- ret = bkey_err(k);
- if (ret)
- goto out;
-
- /* key already overwritten in this snapshot? */
- if (k.k->p.snapshot != interior_k.k->p.snapshot)
- goto out;
-
- if (bpos_eq(*new_min_pos, POS_MIN)) {
- *new_min_pos = k.k->p;
- new_min_pos->snapshot = leaf_id;
- }
-
- new = bch2_bkey_make_mut_noupdate(trans, interior_k);
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto out;
-
- new->k.p.snapshot = leaf_id;
- ret = bch2_trans_update(trans, &iter, new, 0);
-out:
- bch2_set_btree_iter_dontneed(&iter);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_s_c k,
- struct bpos *new_min_pos)
-{
- struct bch_fs *c = trans->c;
- struct bkey_buf sk;
- u32 restart_count = trans->restart_count;
- int ret = 0;
-
- bch2_bkey_buf_init(&sk);
- bch2_bkey_buf_reassemble(&sk, c, k);
- k = bkey_i_to_s_c(sk.k);
-
- *new_min_pos = POS_MIN;
-
- for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
- id < k.k->p.snapshot;
- id++) {
- if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
- !bch2_snapshot_is_leaf(c, id))
- continue;
-again:
- ret = btree_trans_too_many_iters(trans) ?:
- bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
- bch2_trans_commit(trans, NULL, NULL, 0);
- if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
- bch2_trans_begin(trans);
- goto again;
- }
-
- if (ret)
- break;
- }
-
- bch2_bkey_buf_exit(&sk, c);
-
- return ret ?: trans_was_restarted(trans, restart_count);
-}
-
static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index eb5ef64221d6..29c94716293e 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -259,9 +259,6 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
return __bch2_key_has_snapshot_overwrites(trans, id, pos);
}
-int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *, enum btree_id,
- struct bkey_s_c, struct bpos *);
-
int bch2_snapshots_read(struct bch_fs *);
void bch2_fs_snapshots_exit(struct bch_fs *);
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index 215eed4cce6d..ec2b1feea520 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -46,8 +46,7 @@ bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi)
{
/* XXX ick */
struct bch_hash_info info = {
- .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) &
- ~(~0U << INODE_STR_HASH_BITS),
+ .type = INODE_STR_HASH(bi),
.siphash_key = { .k0 = bi->bi_hash_seed }
};
@@ -253,19 +252,20 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
}
static __always_inline
-int bch2_hash_set_in_snapshot(struct btree_trans *trans,
+struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans,
+ struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
enum btree_iter_update_trigger_flags flags)
{
- struct btree_iter iter, slot = { NULL };
+ struct btree_iter slot = {};
struct bkey_s_c k;
bool found = false;
int ret;
- for_each_btree_key_upto_norestart(trans, iter, desc.btree_id,
+ for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(insert->k.p.inode,
desc.hash_bkey(info, bkey_i_to_s_c(insert)),
snapshot),
@@ -280,7 +280,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
}
if (!slot.path && !(flags & STR_HASH_must_replace))
- bch2_trans_copy_iter(&slot, &iter);
+ bch2_trans_copy_iter(&slot, iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
goto not_found;
@@ -290,29 +290,50 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans,
ret = -BCH_ERR_ENOSPC_str_hash_create;
out:
bch2_trans_iter_exit(trans, &slot);
- bch2_trans_iter_exit(trans, &iter);
-
- return ret;
+ bch2_trans_iter_exit(trans, iter);
+ return ret ? bkey_s_c_err(ret) : bkey_s_c_null;
found:
found = true;
not_found:
-
- if (!found && (flags & STR_HASH_must_replace)) {
+ if (found && (flags & STR_HASH_must_create)) {
+ bch2_trans_iter_exit(trans, &slot);
+ return k;
+ } else if (!found && (flags & STR_HASH_must_replace)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (flags & STR_HASH_must_create)) {
- ret = -BCH_ERR_EEXIST_str_hash_set;
} else {
if (!found && slot.path)
- swap(iter, slot);
+ swap(*iter, slot);
- insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, flags);
+ insert->k.p = iter->pos;
+ ret = bch2_trans_update(trans, iter, insert, flags);
}
goto out;
}
static __always_inline
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, u32 snapshot,
+ struct bkey_i *insert,
+ enum btree_iter_update_trigger_flags flags)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_hash_set_or_get_in_snapshot(trans, &iter, desc, info, inum,
+ snapshot, insert, flags);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+ if (k.k) {
+ bch2_trans_iter_exit(trans, &iter);
+ return -BCH_ERR_EEXIST_str_hash_set;
+ }
+
+ return 0;
+}
+
+static __always_inline
int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
@@ -363,8 +384,11 @@ int bch2_hash_delete(struct btree_trans *trans,
struct btree_iter iter;
struct bkey_s_c k = bch2_hash_lookup(trans, &iter, desc, info, inum, key,
BTREE_ITER_intent);
- int ret = bkey_err(k) ?:
- bch2_hash_delete_at(trans, desc, info, &iter, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ ret = bch2_hash_delete_at(trans, desc, info, &iter, 0);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 91d8187ee168..80e5efaff524 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -319,8 +319,7 @@ int bch2_subvol_is_ro_trans(struct btree_trans *trans, u32 subvol)
int bch2_subvol_is_ro(struct bch_fs *c, u32 subvol)
{
- return bch2_trans_do(c, NULL, NULL, 0,
- bch2_subvol_is_ro_trans(trans, subvol));
+ return bch2_trans_do(c, bch2_subvol_is_ro_trans(trans, subvol));
}
int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot,
@@ -676,8 +675,8 @@ err:
/* set bi_subvol on root inode */
int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
- __bch2_fs_upgrade_for_subvolumes(trans));
+ int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ __bch2_fs_upgrade_for_subvolumes(trans));
bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index ce7410d72089..7c71594f6a8b 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -287,6 +287,11 @@ static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out
return -BCH_ERR_invalid_sb_layout_nr_superblocks;
}
+ if (layout->sb_max_size_bits > BCH_SB_LAYOUT_SIZE_BITS_MAX) {
+ prt_printf(out, "Invalid superblock layout: max_size_bits too high");
+ return -BCH_ERR_invalid_sb_layout_sb_max_size_bits;
+ }
+
max_sectors = 1 << layout->sb_max_size_bits;
prev_offset = le64_to_cpu(layout->sb_offset[0]);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 873e4be7e1dc..a6ed9a0bf1c7 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -184,6 +184,7 @@ static DEFINE_MUTEX(bch_fs_list_lock);
DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait);
+static void bch2_dev_unlink(struct bch_dev *);
static void bch2_dev_free(struct bch_dev *);
static int bch2_dev_alloc(struct bch_fs *, unsigned);
static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *);
@@ -271,6 +272,7 @@ static void __bch2_fs_read_only(struct bch_fs *c)
clean_passes++;
if (bch2_btree_interior_updates_flush(c) ||
+ bch2_btree_write_buffer_flush_going_ro(c) ||
bch2_journal_flush_all_pins(&c->journal) ||
bch2_btree_flush_all_writes(c) ||
seq != atomic64_read(&c->journal.seq)) {
@@ -620,9 +622,7 @@ void __bch2_fs_stop(struct bch_fs *c)
up_write(&c->state_lock);
for_each_member_device(c, ca)
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+ bch2_dev_unlink(ca);
if (c->kobj.state_in_sysfs)
kobject_del(&c->kobj);
@@ -1187,9 +1187,7 @@ static void bch2_dev_free(struct bch_dev *ca)
{
cancel_work_sync(&ca->io_error_work);
- if (ca->kobj.state_in_sysfs &&
- ca->disk_sb.bdev)
- sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
+ bch2_dev_unlink(ca);
if (ca->kobj.state_in_sysfs)
kobject_del(&ca->kobj);
@@ -1226,10 +1224,7 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca)
percpu_ref_kill(&ca->io_ref);
wait_for_completion(&ca->io_ref_completion);
- if (ca->kobj.state_in_sysfs) {
- sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
- sysfs_remove_link(&ca->kobj, "block");
- }
+ bch2_dev_unlink(ca);
bch2_free_super(&ca->disk_sb);
bch2_dev_journal_exit(ca);
@@ -1251,6 +1246,26 @@ static void bch2_dev_io_ref_complete(struct percpu_ref *ref)
complete(&ca->io_ref_completion);
}
+static void bch2_dev_unlink(struct bch_dev *ca)
+{
+ struct kobject *b;
+
+ /*
+ * This is racy w.r.t. the underlying block device being hot-removed,
+ * which removes it from sysfs.
+ *
+ * It'd be lovely if we had a way to handle this race, but the sysfs
+ * code doesn't appear to provide a good method and block/holder.c is
+ * susceptible as well:
+ */
+ if (ca->kobj.state_in_sysfs &&
+ ca->disk_sb.bdev &&
+ (b = bdev_kobj(ca->disk_sb.bdev))->state_in_sysfs) {
+ sysfs_remove_link(b, "bcachefs");
+ sysfs_remove_link(&ca->kobj, "block");
+ }
+}
+
static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca)
{
int ret;
@@ -1958,7 +1973,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
};
u64 v[3] = { nbuckets - old_nbuckets, 0, 0 };
- ret = bch2_trans_do(ca->fs, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0,
bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?:
bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets);
if (ret)
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b2f209743afe..fb5c1543e52f 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -450,7 +450,7 @@ static int insert_test_overlapping_extent(struct bch_fs *c, u64 inum, u64 start,
k.k_i.k.p.snapshot = snapid;
k.k_i.k.size = len;
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_btree_insert_nonextent(trans, BTREE_ID_extents, &k.k_i,
BTREE_UPDATE_internal_snapshot_node));
bch_err_fn(c, ret);
@@ -510,7 +510,7 @@ static int test_snapshots(struct bch_fs *c, u64 nr)
if (ret)
return ret;
- ret = bch2_trans_do(c, NULL, NULL, 0,
+ ret = bch2_trans_commit_do(c, NULL, NULL, 0,
bch2_snapshot_node_create(trans, U32_MAX,
snapids,
snapid_subvols,
@@ -809,6 +809,11 @@ int bch2_btree_perf_test(struct bch_fs *c, const char *testname,
unsigned i;
u64 time;
+ if (nr == 0 || nr_threads == 0) {
+ pr_err("nr of iterations or threads is not allowed to be 0");
+ return -EINVAL;
+ }
+
atomic_set(&j.ready, nr_threads);
init_waitqueue_head(&j.ready_wait);
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 56c8d3fe55a4..952aca400faf 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -330,7 +330,7 @@ static int bch2_xattr_get_handler(const struct xattr_handler *handler,
{
struct bch_inode_info *inode = to_bch_ei(vinode);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret = bch2_trans_do(c, NULL, NULL, 0,
+ int ret = bch2_trans_do(c,
bch2_xattr_get_trans(trans, inode, name, buffer, size, handler->flags));
if (ret < 0 && bch2_err_matches(ret, ENOENT))
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index f92f108840f5..8f430ff8e445 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -11,12 +11,13 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/errno.h>
#include <linux/stat.h>
#include <linux/nls.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
-#include <linux/parser.h>
#include <linux/namei.h>
#include <linux/sched.h>
#include <linux/cred.h>
@@ -54,22 +55,20 @@ static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
char **out, int *out_len);
static void befs_put_super(struct super_block *);
-static int befs_remount(struct super_block *, int *, char *);
static int befs_statfs(struct dentry *, struct kstatfs *);
static int befs_show_options(struct seq_file *, struct dentry *);
-static int parse_options(char *, struct befs_mount_options *);
static struct dentry *befs_fh_to_dentry(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
static struct dentry *befs_fh_to_parent(struct super_block *sb,
struct fid *fid, int fh_len, int fh_type);
static struct dentry *befs_get_parent(struct dentry *child);
+static void befs_free_fc(struct fs_context *fc);
static const struct super_operations befs_sops = {
.alloc_inode = befs_alloc_inode, /* allocate a new inode */
.free_inode = befs_free_inode, /* deallocate an inode */
.put_super = befs_put_super, /* uninit super */
.statfs = befs_statfs, /* statfs */
- .remount_fs = befs_remount,
.show_options = befs_show_options,
};
@@ -672,92 +671,53 @@ static struct dentry *befs_get_parent(struct dentry *child)
}
enum {
- Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
+ Opt_uid, Opt_gid, Opt_charset, Opt_debug,
};
-static const match_table_t befs_tokens = {
- {Opt_uid, "uid=%d"},
- {Opt_gid, "gid=%d"},
- {Opt_charset, "iocharset=%s"},
- {Opt_debug, "debug"},
- {Opt_err, NULL}
+static const struct fs_parameter_spec befs_param_spec[] = {
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_string ("iocharset", Opt_charset),
+ fsparam_flag ("debug", Opt_debug),
+ {}
};
static int
-parse_options(char *options, struct befs_mount_options *opts)
+befs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int option;
- kuid_t uid;
- kgid_t gid;
-
- /* Initialize options */
- opts->uid = GLOBAL_ROOT_UID;
- opts->gid = GLOBAL_ROOT_GID;
- opts->use_uid = 0;
- opts->use_gid = 0;
- opts->iocharset = NULL;
- opts->debug = 0;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, befs_tokens, args);
- switch (token) {
- case Opt_uid:
- if (match_int(&args[0], &option))
- return 0;
- uid = INVALID_UID;
- if (option >= 0)
- uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(uid)) {
- pr_err("Invalid uid %d, "
- "using default\n", option);
- break;
- }
- opts->uid = uid;
- opts->use_uid = 1;
- break;
- case Opt_gid:
- if (match_int(&args[0], &option))
- return 0;
- gid = INVALID_GID;
- if (option >= 0)
- gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(gid)) {
- pr_err("Invalid gid %d, "
- "using default\n", option);
- break;
- }
- opts->gid = gid;
- opts->use_gid = 1;
- break;
- case Opt_charset:
- kfree(opts->iocharset);
- opts->iocharset = match_strdup(&args[0]);
- if (!opts->iocharset) {
- pr_err("allocation failure for "
- "iocharset string\n");
- return 0;
- }
- break;
- case Opt_debug:
- opts->debug = 1;
- break;
- default:
- pr_err("Unrecognized mount option \"%s\" "
- "or missing value\n", p);
- return 0;
- }
+ struct befs_mount_options *opts = fc->fs_private;
+ int token;
+ struct fs_parse_result result;
+
+ /* befs ignores all options on remount */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
+
+ token = fs_parse(fc, befs_param_spec, param, &result);
+ if (token < 0)
+ return token;
+
+ switch (token) {
+ case Opt_uid:
+ opts->uid = result.uid;
+ opts->use_uid = 1;
+ break;
+ case Opt_gid:
+ opts->gid = result.gid;
+ opts->use_gid = 1;
+ break;
+ case Opt_charset:
+ kfree(opts->iocharset);
+ opts->iocharset = param->string;
+ param->string = NULL;
+ break;
+ case Opt_debug:
+ opts->debug = 1;
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
static int befs_show_options(struct seq_file *m, struct dentry *root)
@@ -793,6 +753,21 @@ befs_put_super(struct super_block *sb)
sb->s_fs_info = NULL;
}
+/*
+ * Copy the parsed options into the sbi mount_options member
+ */
+static void
+befs_set_options(struct befs_sb_info *sbi, struct befs_mount_options *opts)
+{
+ sbi->mount_opts.uid = opts->uid;
+ sbi->mount_opts.gid = opts->gid;
+ sbi->mount_opts.use_uid = opts->use_uid;
+ sbi->mount_opts.use_gid = opts->use_gid;
+ sbi->mount_opts.debug = opts->debug;
+ sbi->mount_opts.iocharset = opts->iocharset;
+ opts->iocharset = NULL;
+}
+
/* Allocate private field of the superblock, fill it.
*
* Finish filling the public superblock fields
@@ -800,7 +775,7 @@ befs_put_super(struct super_block *sb)
* Load a set of NLS translations if needed.
*/
static int
-befs_fill_super(struct super_block *sb, void *data, int silent)
+befs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct buffer_head *bh;
struct befs_sb_info *befs_sb;
@@ -810,6 +785,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
const unsigned long sb_block = 0;
const off_t x86_sb_off = 512;
int blocksize;
+ struct befs_mount_options *parsed_opts = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
sb->s_fs_info = kzalloc(sizeof(*befs_sb), GFP_KERNEL);
if (sb->s_fs_info == NULL)
@@ -817,11 +794,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
befs_sb = BEFS_SB(sb);
- if (!parse_options((char *) data, &befs_sb->mount_opts)) {
- if (!silent)
- befs_error(sb, "cannot parse mount options");
- goto unacquire_priv_sbp;
- }
+ befs_set_options(befs_sb, parsed_opts);
befs_debug(sb, "---> %s", __func__);
@@ -934,10 +907,10 @@ unacquire_none:
}
static int
-befs_remount(struct super_block *sb, int *flags, char *data)
+befs_reconfigure(struct fs_context *fc)
{
- sync_filesystem(sb);
- if (!(*flags & SB_RDONLY))
+ sync_filesystem(fc->root->d_sb);
+ if (!(fc->sb_flags & SB_RDONLY))
return -EINVAL;
return 0;
}
@@ -965,19 +938,51 @@ befs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static struct dentry *
-befs_mount(struct file_system_type *fs_type, int flags, const char *dev_name,
- void *data)
+static int befs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, befs_fill_super);
+}
+
+static const struct fs_context_operations befs_context_ops = {
+ .parse_param = befs_parse_param,
+ .get_tree = befs_get_tree,
+ .reconfigure = befs_reconfigure,
+ .free = befs_free_fc,
+};
+
+static int befs_init_fs_context(struct fs_context *fc)
+{
+ struct befs_mount_options *opts;
+
+ opts = kzalloc(sizeof(*opts), GFP_KERNEL);
+ if (!opts)
+ return -ENOMEM;
+
+ /* Initialize options */
+ opts->uid = GLOBAL_ROOT_UID;
+ opts->gid = GLOBAL_ROOT_GID;
+
+ fc->fs_private = opts;
+ fc->ops = &befs_context_ops;
+
+ return 0;
+}
+
+static void befs_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, befs_fill_super);
+ struct befs_mount_options *opts = fc->fs_private;
+
+ kfree(opts->iocharset);
+ kfree(fc->fs_private);
}
static struct file_system_type befs_fs_type = {
.owner = THIS_MODULE,
.name = "befs",
- .mount = befs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = befs_init_fs_context,
+ .parameters = befs_param_spec,
};
MODULE_ALIAS_FS("befs");
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 06dc4a57ba78..3039a6b7aba4 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -258,6 +258,12 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
#ifdef ELF_HWCAP2
NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
#endif
+#ifdef ELF_HWCAP3
+ NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+#endif
+#ifdef ELF_HWCAP4
+ NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+#endif
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
NEW_AUX_ENT(AT_PLATFORM,
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 4fe5bb9f1b1f..31d253bd3961 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -624,6 +624,12 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
#ifdef ELF_HWCAP2
NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
#endif
+#ifdef ELF_HWCAP3
+ NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+#endif
+#ifdef ELF_HWCAP4
+ NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+#endif
NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, exec_params->ph_addr);
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 4fb925e8c981..fa8515598341 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -78,6 +78,32 @@ config BTRFS_ASSERT
If unsure, say N.
+config BTRFS_EXPERIMENTAL
+ bool "Btrfs experimental features"
+ depends on BTRFS_FS
+ default n
+ help
+ Enable experimental features. These features may not be stable enough
+ for end users. This is meant for btrfs developers or users who wish
+ to test the functionality and report problems.
+
+ Current list:
+
+ - extent map shrinker - performance problems with too frequent shrinks
+
+ - send stream protocol v3 - fs-verity support
+
+ - checksum offload mode - sysfs knob to affect when checksums are
+ calculated (at IO time, or in a thread)
+
+ - raid-stripe-tree - additional mapping of extents to devices to
+ support RAID1* profiles on zoned devices,
+ RAID56 not yet supported
+
+ - extent tree v2 - complex rework of extent tracking
+
+ If unsure, say N.
+
config BTRFS_FS_REF_VERIFY
bool "Btrfs with the ref verify tool compiled in"
depends on BTRFS_FS
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 87617f2968bc..3cfc440c636c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -43,4 +43,5 @@ btrfs-$(CONFIG_FS_VERITY) += verity.o
btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
tests/extent-buffer-tests.o tests/btrfs-tests.o \
tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
- tests/free-space-tree-tests.o tests/extent-map-tests.o
+ tests/free-space-tree-tests.o tests/extent-map-tests.o \
+ tests/raid-stripe-tree-tests.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f8e1d5b2c512..04f53ca548e1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1442,7 +1442,8 @@ again:
*/
delayed_refs = &ctx->trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr);
+ head = btrfs_find_delayed_ref_head(ctx->fs_info, delayed_refs,
+ ctx->bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index fec5c6cde0a7..1f216d07eff6 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -49,6 +49,7 @@ void btrfs_bio_init(struct btrfs_bio *bbio, struct btrfs_fs_info *fs_info,
bbio->end_io = end_io;
bbio->private = private;
atomic_set(&bbio->pending_ios, 1);
+ WRITE_ONCE(bbio->status, BLK_STS_OK);
}
/*
@@ -113,41 +114,29 @@ static void __btrfs_bio_end_io(struct btrfs_bio *bbio)
}
}
-static void btrfs_orig_write_end_io(struct bio *bio);
-
-static void btrfs_bbio_propagate_error(struct btrfs_bio *bbio,
- struct btrfs_bio *orig_bbio)
-{
- /*
- * For writes we tolerate nr_mirrors - 1 write failures, so we can't
- * just blindly propagate a write failure here. Instead increment the
- * error count in the original I/O context so that it is guaranteed to
- * be larger than the error tolerance.
- */
- if (bbio->bio.bi_end_io == &btrfs_orig_write_end_io) {
- struct btrfs_io_stripe *orig_stripe = orig_bbio->bio.bi_private;
- struct btrfs_io_context *orig_bioc = orig_stripe->bioc;
-
- atomic_add(orig_bioc->max_errors, &orig_bioc->error);
- } else {
- orig_bbio->bio.bi_status = bbio->bio.bi_status;
- }
-}
-
void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
if (bbio->bio.bi_pool == &btrfs_clone_bioset) {
struct btrfs_bio *orig_bbio = bbio->private;
- if (bbio->bio.bi_status)
- btrfs_bbio_propagate_error(bbio, orig_bbio);
btrfs_cleanup_bio(bbio);
bbio = orig_bbio;
}
- if (atomic_dec_and_test(&bbio->pending_ios))
+ /*
+ * At this point, bbio always points to the original btrfs_bio. Save
+ * the first error in it.
+ */
+ if (status != BLK_STS_OK)
+ cmpxchg(&bbio->status, BLK_STS_OK, status);
+
+ if (atomic_dec_and_test(&bbio->pending_ios)) {
+ /* Load the split bio's error, which might have been set above. */
+ if (status == BLK_STS_OK)
+ bbio->bio.bi_status = READ_ONCE(bbio->status);
__btrfs_bio_end_io(bbio);
+ }
}
static int next_repair_mirror(struct btrfs_failed_bio *fbio, int cur_mirror)
@@ -598,7 +587,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
{
bool auto_csum_mode = true;
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
struct btrfs_fs_devices *fs_devices = bbio->fs_info->fs_devices;
enum btrfs_offload_csum_mode csum_mode = READ_ONCE(fs_devices->offload_csum_mode);
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index e48612340745..e2fe16074ad6 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -79,6 +79,9 @@ struct btrfs_bio {
/* File system that this I/O operates on. */
struct btrfs_fs_info *fs_info;
+ /* Save the first error status of split bio. */
+ blk_status_t status;
+
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 7980b2e33a92..4427c1b835e8 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2797,7 +2797,7 @@ next:
* uncompressed data size, because the compression is only done
* when writeback triggered and we don't know how much space we
* are actually going to need, so we reserve the uncompressed
- * size because the data may be uncompressible in the worst case.
+ * size because the data may be incompressible in the worst case.
*/
if (ret == 0) {
bool used;
@@ -3819,6 +3819,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
spin_lock(&cache->lock);
if (cache->ro)
space_info->bytes_readonly += num_bytes;
+ else if (btrfs_is_zoned(cache->fs_info))
+ space_info->bytes_zone_unusable += num_bytes;
cache->reserved -= num_bytes;
space_info->bytes_reserved -= num_bytes;
space_info->max_extent_size = 0;
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index e152fde888fc..aa1f55cd81b7 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -577,7 +577,6 @@ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state
struct extent_state *other);
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *orig, u64 split);
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
void btrfs_evict_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
@@ -613,11 +612,17 @@ int btrfs_writepage_cow_fixup(struct folio *folio);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size,
- struct page **pages);
+ u64 disk_bytenr, u64 disk_io_size,
+ struct page **pages, void *uring_ctx);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded);
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size);
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed, bool *unlocked);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 90aef2627ca2..0c4d486c3048 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -453,7 +453,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
if (pg_index > end_index)
break;
- folio = __filemap_get_folio(mapping, pg_index, 0, 0);
+ folio = filemap_get_folio(mapping, pg_index);
if (!IS_ERR(folio)) {
u64 folio_sz = folio_size(folio);
u64 offset = offset_in_folio(folio, cur);
@@ -545,8 +545,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_subpage_start_reader(fs_info, folio, cur,
- add_size);
+ btrfs_folio_set_lock(fs_info, folio, cur, add_size);
folio_put(folio);
cur += add_size;
}
@@ -702,7 +701,7 @@ static void free_heuristic_ws(struct list_head *ws)
kfree(workspace);
}
-static struct list_head *alloc_heuristic_ws(unsigned int level)
+static struct list_head *alloc_heuristic_ws(void)
{
struct heuristic_ws *ws;
@@ -744,9 +743,9 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = {
static struct list_head *alloc_workspace(int type, unsigned int level)
{
switch (type) {
- case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
+ case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws();
case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
- case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace(level);
+ case BTRFS_COMPRESS_LZO: return lzo_alloc_workspace();
case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
default:
/*
@@ -1030,6 +1029,7 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping
{
int type = btrfs_compress_type(type_level);
int level = btrfs_compress_level(type_level);
+ const unsigned long orig_len = *total_out;
struct list_head *workspace;
int ret;
@@ -1037,6 +1037,8 @@ int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping
workspace = get_workspace(type, level);
ret = compression_compress_pages(type, workspace, mapping, start, folios,
out_folios, total_in, total_out);
+ /* The total read-in bytes should be no larger than the input. */
+ ASSERT(*total_in <= orig_len);
put_workspace(type, workspace);
return ret;
}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index b6563b6a333e..954034086d0d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -175,7 +175,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct folio *dest_folio, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
-struct list_head *lzo_alloc_workspace(unsigned int level);
+struct list_head *lzo_alloc_workspace(void);
void lzo_free_workspace(struct list_head *ws);
int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0cc919d15b14..148648ea1c8b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1508,26 +1508,26 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
- struct extent_buffer **eb_ret, int level, int slot,
+ struct extent_buffer **eb_ret, int slot,
const struct btrfs_key *key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_tree_parent_check check = { 0 };
u64 blocknr;
- u64 gen;
- struct extent_buffer *tmp;
- int ret;
+ struct extent_buffer *tmp = NULL;
+ int ret = 0;
int parent_level;
- bool unlock_up;
+ int err;
+ bool read_tmp = false;
+ bool tmp_locked = false;
+ bool path_released = false;
- unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
blocknr = btrfs_node_blockptr(*eb_ret, slot);
- gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
check.has_first_key = true;
check.level = parent_level - 1;
- check.transid = gen;
+ check.transid = btrfs_node_ptr_generation(*eb_ret, slot);
check.owner_root = btrfs_root_id(root);
/*
@@ -1540,79 +1540,115 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
if (p->reada == READA_FORWARD_ALWAYS)
- reada_for_search(fs_info, p, level, slot, key->objectid);
+ reada_for_search(fs_info, p, parent_level, slot, key->objectid);
/* first we do an atomic uptodate check */
- if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+ if (btrfs_buffer_uptodate(tmp, check.transid, 1) > 0) {
/*
* Do extra check for first_key, eb can be stale due to
* being cached, read from scrub, or have multiple
* parents (shared tree blocks).
*/
- if (btrfs_verify_level_key(tmp,
- parent_level - 1, &check.first_key, gen)) {
- free_extent_buffer(tmp);
- return -EUCLEAN;
+ if (btrfs_verify_level_key(tmp, &check)) {
+ ret = -EUCLEAN;
+ goto out;
}
*eb_ret = tmp;
- return 0;
+ tmp = NULL;
+ ret = 0;
+ goto out;
}
if (p->nowait) {
- free_extent_buffer(tmp);
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
- if (unlock_up)
- btrfs_unlock_up_safe(p, level + 1);
-
- /* now we're allowed to do a blocking uptodate check */
- ret = btrfs_read_extent_buffer(tmp, &check);
- if (ret) {
- free_extent_buffer(tmp);
+ if (!p->skip_locking) {
+ btrfs_unlock_up_safe(p, parent_level + 1);
+ tmp_locked = true;
+ btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
- return ret;
+ ret = -EAGAIN;
+ path_released = true;
}
- if (unlock_up)
- ret = -EAGAIN;
+ /* Now we're allowed to do a blocking uptodate check. */
+ err = btrfs_read_extent_buffer(tmp, &check);
+ if (err) {
+ ret = err;
+ goto out;
+ }
+ if (ret == 0) {
+ ASSERT(!tmp_locked);
+ *eb_ret = tmp;
+ tmp = NULL;
+ }
goto out;
} else if (p->nowait) {
- return -EAGAIN;
+ ret = -EAGAIN;
+ goto out;
}
- if (unlock_up) {
- btrfs_unlock_up_safe(p, level + 1);
+ if (!p->skip_locking) {
+ btrfs_unlock_up_safe(p, parent_level + 1);
ret = -EAGAIN;
- } else {
- ret = 0;
}
if (p->reada != READA_NONE)
- reada_for_search(fs_info, p, level, slot, key->objectid);
+ reada_for_search(fs_info, p, parent_level, slot, key->objectid);
- tmp = read_tree_block(fs_info, blocknr, &check);
+ tmp = btrfs_find_create_tree_block(fs_info, blocknr, check.owner_root, check.level);
if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ tmp = NULL;
+ goto out;
+ }
+ read_tmp = true;
+
+ if (!p->skip_locking) {
+ ASSERT(ret == -EAGAIN);
+ tmp_locked = true;
+ btrfs_tree_read_lock(tmp);
btrfs_release_path(p);
- return PTR_ERR(tmp);
+ path_released = true;
+ }
+
+ /* Now we're allowed to do a blocking uptodate check. */
+ err = btrfs_read_extent_buffer(tmp, &check);
+ if (err) {
+ ret = err;
+ goto out;
}
+
/*
* If the read above didn't mark this buffer up to date,
* it will never end up being up to date. Set ret to EIO now
* and give up so that our caller doesn't loop forever
* on our EAGAINs.
*/
- if (!extent_buffer_uptodate(tmp))
+ if (!extent_buffer_uptodate(tmp)) {
ret = -EIO;
+ goto out;
+ }
-out:
if (ret == 0) {
+ ASSERT(!tmp_locked);
*eb_ret = tmp;
- } else {
- free_extent_buffer(tmp);
- btrfs_release_path(p);
+ tmp = NULL;
+ }
+out:
+ if (tmp) {
+ if (tmp_locked)
+ btrfs_tree_read_unlock(tmp);
+ if (read_tmp && ret && ret != -EAGAIN)
+ free_extent_buffer_stale(tmp);
+ else
+ free_extent_buffer(tmp);
}
+ if (ret && !path_released)
+ btrfs_release_path(p);
return ret;
}
@@ -2197,8 +2233,8 @@ cow_done:
goto done;
}
- err = read_block_for_search(root, p, &b, level, slot, key);
- if (err == -EAGAIN)
+ err = read_block_for_search(root, p, &b, slot, key);
+ if (err == -EAGAIN && !p->nowait)
goto again;
if (err) {
ret = err;
@@ -2324,8 +2360,8 @@ again:
goto done;
}
- err = read_block_for_search(root, p, &b, level, slot, key);
- if (err == -EAGAIN)
+ err = read_block_for_search(root, p, &b, slot, key);
+ if (err == -EAGAIN && !p->nowait)
goto again;
if (err) {
ret = err;
@@ -2334,7 +2370,7 @@ again:
level = btrfs_header_level(b);
btrfs_tree_read_lock(b);
- b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
+ b = btrfs_tree_mod_log_rewind(fs_info, b, time_seq);
if (!b) {
ret = -ENOMEM;
goto done;
@@ -4930,8 +4966,7 @@ again:
}
next = c;
- ret = read_block_for_search(root, path, &next, level,
- slot, &key);
+ ret = read_block_for_search(root, path, &next, slot, &key);
if (ret == -EAGAIN && !path->nowait)
goto again;
@@ -4974,8 +5009,7 @@ again:
if (!level)
break;
- ret = read_block_for_search(root, path, &next, level,
- 0, &key);
+ ret = read_block_for_search(root, path, &next, 0, &key);
if (ret == -EAGAIN && !path->nowait)
goto again;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 317a3712270f..307dedf95c70 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -744,16 +744,11 @@ const char *btrfs_super_csum_driver(u16 csum_type);
size_t __attribute_const__ btrfs_get_num_csums(void);
/*
- * We use page status Private2 to indicate there is an ordered extent with
+ * We use folio flag owner_2 to indicate there is an ordered extent with
* unfinished IO.
- *
- * Rename the Private2 accessors to Ordered, to improve readability.
*/
-#define PageOrdered(page) PagePrivate2(page)
-#define SetPageOrdered(page) SetPagePrivate2(page)
-#define ClearPageOrdered(page) ClearPagePrivate2(page)
-#define folio_test_ordered(folio) folio_test_private_2(folio)
-#define folio_set_ordered(folio) folio_set_private_2(folio)
-#define folio_clear_ordered(folio) folio_clear_private_2(folio)
+#define folio_test_ordered(folio) folio_test_owner_2(folio)
+#define folio_set_ordered(folio) folio_set_owner_2(folio)
+#define folio_clear_ordered(folio) folio_clear_owner_2(folio)
#endif
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index b95ef44c326b..968dae953948 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -763,12 +763,12 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* We can get a merged extent, in that case, we need to re-search
* tree to get the original em for defrag.
*
- * If @newer_than is 0 or em::generation < newer_than, we can trust
- * this em, as either we don't care about the generation, or the
- * merged extent map will be rejected anyway.
+ * This is because even if we have adjacent extents that are contiguous
+ * and compatible (same type and flags), we still want to defrag them
+ * so that we use less metadata (extent items in the extent tree and
+ * file extent items in the inode's subvolume tree).
*/
- if (em && (em->flags & EXTENT_FLAG_MERGED) &&
- newer_than && em->generation >= newer_than) {
+ if (em && (em->flags & EXTENT_FLAG_MERGED)) {
free_extent_map(em);
em = NULL;
}
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 7cfefdfe54ea..f4d9feac0d0e 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -64,9 +64,9 @@ struct btrfs_delayed_node {
struct mutex mutex;
struct btrfs_inode_item inode_item;
refcount_t refs;
+ int count;
u64 index_cnt;
unsigned long flags;
- int count;
/*
* The size of the next batch of dir index items to insert (if this
* node is from a directory inode). Protected by @mutex.
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 32f719b9e661..0d878dbbabba 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -9,6 +9,7 @@
#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
+#include "extent-tree.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
@@ -298,7 +299,7 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
if (ref1->ref_root < ref2->ref_root)
return -1;
if (ref1->ref_root > ref2->ref_root)
- return -1;
+ return 1;
if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY)
ret = comp_data_refs(ref1, ref2);
}
@@ -313,39 +314,6 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
return 0;
}
-/* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
- struct rb_node *node)
-{
- struct rb_node **p = &root->rb_root.rb_node;
- struct rb_node *parent_node = NULL;
- struct btrfs_delayed_ref_head *entry;
- struct btrfs_delayed_ref_head *ins;
- u64 bytenr;
- bool leftmost = true;
-
- ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
- bytenr = ins->bytenr;
- while (*p) {
- parent_node = *p;
- entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
- href_node);
-
- if (bytenr < entry->bytenr) {
- p = &(*p)->rb_left;
- } else if (bytenr > entry->bytenr) {
- p = &(*p)->rb_right;
- leftmost = false;
- } else {
- return entry;
- }
- }
-
- rb_link_node(node, parent_node, p);
- rb_insert_color_cached(node, root, leftmost);
- return NULL;
-}
-
static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
@@ -380,75 +348,32 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
static struct btrfs_delayed_ref_head *find_first_ref_head(
struct btrfs_delayed_ref_root *dr)
{
- struct rb_node *n;
- struct btrfs_delayed_ref_head *entry;
-
- n = rb_first_cached(&dr->href_root);
- if (!n)
- return NULL;
+ unsigned long from = 0;
- entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
+ lockdep_assert_held(&dr->lock);
- return entry;
+ return xa_find(&dr->head_refs, &from, ULONG_MAX, XA_PRESENT);
}
-/*
- * Find a head entry based on bytenr. This returns the delayed ref head if it
- * was able to find one, or NULL if nothing was in that spot. If return_bigger
- * is given, the next bigger entry is returned if no exact match is found.
- */
-static struct btrfs_delayed_ref_head *find_ref_head(
- struct btrfs_delayed_ref_root *dr, u64 bytenr,
- bool return_bigger)
-{
- struct rb_root *root = &dr->href_root.rb_root;
- struct rb_node *n;
- struct btrfs_delayed_ref_head *entry;
-
- n = root->rb_node;
- entry = NULL;
- while (n) {
- entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
-
- if (bytenr < entry->bytenr)
- n = n->rb_left;
- else if (bytenr > entry->bytenr)
- n = n->rb_right;
- else
- return entry;
- }
- if (entry && return_bigger) {
- if (bytenr > entry->bytenr) {
- n = rb_next(&entry->href_node);
- if (!n)
- return NULL;
- entry = rb_entry(n, struct btrfs_delayed_ref_head,
- href_node);
- }
- return entry;
- }
- return NULL;
-}
-
-int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
+static bool btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
{
lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex))
- return 0;
+ return true;
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
mutex_lock(&head->mutex);
spin_lock(&delayed_refs->lock);
- if (RB_EMPTY_NODE(&head->href_node)) {
+ if (!head->tracked) {
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
- return -EAGAIN;
+ return false;
}
btrfs_put_delayed_ref_head(head);
- return 0;
+ return true;
}
static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
@@ -462,7 +387,6 @@ static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info,
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
btrfs_put_delayed_ref(ref);
- atomic_dec(&delayed_refs->num_entries);
btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
}
@@ -558,33 +482,31 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
}
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs)
{
struct btrfs_delayed_ref_head *head;
+ unsigned long start_index;
+ unsigned long found_index;
+ bool found_head = false;
+ bool locked;
- lockdep_assert_held(&delayed_refs->lock);
+ spin_lock(&delayed_refs->lock);
again:
- head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
- true);
- if (!head && delayed_refs->run_delayed_start != 0) {
- delayed_refs->run_delayed_start = 0;
- head = find_first_ref_head(delayed_refs);
+ start_index = (delayed_refs->run_delayed_start >> fs_info->sectorsize_bits);
+ xa_for_each_start(&delayed_refs->head_refs, found_index, head, start_index) {
+ if (!head->processing) {
+ found_head = true;
+ break;
+ }
}
- if (!head)
- return NULL;
-
- while (head->processing) {
- struct rb_node *node;
-
- node = rb_next(&head->href_node);
- if (!node) {
- if (delayed_refs->run_delayed_start == 0)
- return NULL;
- delayed_refs->run_delayed_start = 0;
- goto again;
+ if (!found_head) {
+ if (delayed_refs->run_delayed_start == 0) {
+ spin_unlock(&delayed_refs->lock);
+ return NULL;
}
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
+ delayed_refs->run_delayed_start = 0;
+ goto again;
}
head->processing = true;
@@ -592,18 +514,42 @@ again:
delayed_refs->num_heads_ready--;
delayed_refs->run_delayed_start = head->bytenr +
head->num_bytes;
+
+ locked = btrfs_delayed_ref_lock(delayed_refs, head);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * We may have dropped the spin lock to get the head mutex lock, and
+ * that might have given someone else time to free the head. If that's
+ * true, it has been removed from our list and we can move on.
+ */
+ if (!locked)
+ return ERR_PTR(-EAGAIN);
+
return head;
}
-void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ spin_lock(&delayed_refs->lock);
+ head->processing = false;
+ delayed_refs->num_heads_ready++;
+ spin_unlock(&delayed_refs->lock);
+ btrfs_delayed_ref_unlock(head);
+}
+
+void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
+ const unsigned long index = (head->bytenr >> fs_info->sectorsize_bits);
+
lockdep_assert_held(&delayed_refs->lock);
lockdep_assert_held(&head->lock);
- rb_erase_cached(&head->href_node, &delayed_refs->href_root);
- RB_CLEAR_NODE(&head->href_node);
- atomic_dec(&delayed_refs->num_entries);
+ xa_erase(&delayed_refs->head_refs, index);
+ head->tracked = false;
delayed_refs->num_heads--;
if (!head->processing)
delayed_refs->num_heads_ready--;
@@ -629,7 +575,6 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
if (!exist) {
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list);
- atomic_inc(&root->num_entries);
spin_unlock(&href->lock);
trans->delayed_ref_updates++;
return false;
@@ -649,7 +594,7 @@ static bool insert_delayed_ref(struct btrfs_trans_handle *trans,
&href->ref_add_list);
else if (ref->action == BTRFS_DROP_DELAYED_REF) {
ASSERT(!list_empty(&exist->add_list));
- list_del(&exist->add_list);
+ list_del_init(&exist->add_list);
} else {
ASSERT(0);
}
@@ -813,7 +758,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->is_system = (generic_ref->ref_root == BTRFS_CHUNK_TREE_OBJECTID);
head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
- RB_CLEAR_NODE(&head_ref->href_node);
+ head_ref->tracked = false;
head_ref->processing = false;
head_ref->total_ref_mod = count_mod;
spin_lock_init(&head_ref->lock);
@@ -830,7 +775,6 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
qrecord->data_rsv = reserved;
qrecord->data_rsv_refroot = generic_ref->ref_root;
}
- qrecord->bytenr = generic_ref->bytenr;
qrecord->num_bytes = generic_ref->num_bytes;
qrecord->old_roots = NULL;
}
@@ -849,21 +793,36 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord,
int action, bool *qrecord_inserted_ret)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_root *delayed_refs;
+ const unsigned long index = (head_ref->bytenr >> fs_info->sectorsize_bits);
bool qrecord_inserted = false;
delayed_refs = &trans->transaction->delayed_refs;
+ lockdep_assert_held(&delayed_refs->lock);
+
+#if BITS_PER_LONG == 32
+ if (head_ref->bytenr >= MAX_LFS_FILESIZE) {
+ if (qrecord)
+ xa_release(&delayed_refs->dirty_extents, index);
+ btrfs_err_rl(fs_info,
+"delayed ref head %llu is beyond 32bit page cache and xarray index limit",
+ head_ref->bytenr);
+ btrfs_err_32bit_limit(fs_info);
+ return ERR_PTR(-EOVERFLOW);
+ }
+#endif
/* Record qgroup extent info if provided */
if (qrecord) {
int ret;
- ret = btrfs_qgroup_trace_extent_nolock(trans->fs_info,
- delayed_refs, qrecord);
+ ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord,
+ head_ref->bytenr);
if (ret) {
/* Clean up if insertion fails or item exists. */
- xa_release(&delayed_refs->dirty_extents, qrecord->bytenr);
+ xa_release(&delayed_refs->dirty_extents, index);
/* Caller responsible for freeing qrecord on error. */
if (ret < 0)
return ERR_PTR(ret);
@@ -873,10 +832,9 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
}
}
- trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
+ trace_add_delayed_ref_head(fs_info, head_ref, action);
- existing = htree_insert(&delayed_refs->href_root,
- &head_ref->href_node);
+ existing = xa_load(&delayed_refs->head_refs, index);
if (existing) {
update_existing_head_ref(trans, existing, head_ref);
/*
@@ -886,6 +844,19 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
head_ref = existing;
} else {
+ existing = xa_store(&delayed_refs->head_refs, index, head_ref, GFP_ATOMIC);
+ if (xa_is_err(existing)) {
+ /* Memory was preallocated by the caller. */
+ ASSERT(xa_err(existing) != -ENOMEM);
+ return ERR_PTR(xa_err(existing));
+ } else if (WARN_ON(existing)) {
+ /*
+ * Shouldn't happen, since we just did a lookup before under
+ * delayed_refs->lock.
+ */
+ return ERR_PTR(-EEXIST);
+ }
+ head_ref->tracked = true;
/*
* We reserve the amount of bytes needed to delete csums when
* adding the ref head and not when adding individual drop refs
@@ -895,12 +866,10 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
if (head_ref->is_data && head_ref->ref_mod < 0) {
delayed_refs->pending_csums += head_ref->num_bytes;
trans->delayed_ref_csum_deletions +=
- btrfs_csum_bytes_to_leaves(trans->fs_info,
- head_ref->num_bytes);
+ btrfs_csum_bytes_to_leaves(fs_info, head_ref->num_bytes);
}
delayed_refs->num_heads++;
delayed_refs->num_heads_ready++;
- atomic_inc(&delayed_refs->num_entries);
}
if (qrecord_inserted_ret)
*qrecord_inserted_ret = qrecord_inserted;
@@ -1008,6 +977,8 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *new_head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
+ const unsigned long index = (generic_ref->bytenr >> fs_info->sectorsize_bits);
+ bool qrecord_reserved = false;
bool qrecord_inserted;
int action = generic_ref->action;
bool merged;
@@ -1023,24 +994,32 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
goto free_node;
}
+ delayed_refs = &trans->transaction->delayed_refs;
+
if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record) {
ret = -ENOMEM;
goto free_head_ref;
}
- if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents,
- generic_ref->bytenr, GFP_NOFS)) {
+ if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
ret = -ENOMEM;
goto free_record;
}
+ qrecord_reserved = true;
+ }
+
+ ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+ if (ret) {
+ if (qrecord_reserved)
+ xa_release(&delayed_refs->dirty_extents, index);
+ goto free_record;
}
init_delayed_ref_common(fs_info, node, generic_ref);
init_delayed_ref_head(head_ref, generic_ref, record, reserved);
head_ref->extent_op = extent_op;
- delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
/*
@@ -1050,6 +1029,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
new_head_ref = add_delayed_ref_head(trans, head_ref, record,
action, &qrecord_inserted);
if (IS_ERR(new_head_ref)) {
+ xa_release(&delayed_refs->head_refs, index);
spin_unlock(&delayed_refs->lock);
ret = PTR_ERR(new_head_ref);
goto free_record;
@@ -1073,7 +1053,7 @@ static int add_delayed_ref(struct btrfs_trans_handle *trans,
kmem_cache_free(btrfs_delayed_ref_node_cachep, node);
if (qrecord_inserted)
- return btrfs_qgroup_trace_extent_post(trans, record);
+ return btrfs_qgroup_trace_extent_post(trans, record, generic_ref->bytenr);
return 0;
free_record:
@@ -1112,6 +1092,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u8 level,
struct btrfs_delayed_extent_op *extent_op)
{
+ const unsigned long index = (bytenr >> trans->fs_info->sectorsize_bits);
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_head *head_ref_ret;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -1122,6 +1103,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
.num_bytes = num_bytes,
.tree_ref.level = level,
};
+ int ret;
head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
if (!head_ref)
@@ -1131,16 +1113,23 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;
- spin_lock(&delayed_refs->lock);
+ ret = xa_reserve(&delayed_refs->head_refs, index, GFP_NOFS);
+ if (ret) {
+ kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
+ return ret;
+ }
+
+ spin_lock(&delayed_refs->lock);
head_ref_ret = add_delayed_ref_head(trans, head_ref, NULL,
BTRFS_UPDATE_DELAYED_HEAD, NULL);
- spin_unlock(&delayed_refs->lock);
-
if (IS_ERR(head_ref_ret)) {
+ xa_release(&delayed_refs->head_refs, index);
+ spin_unlock(&delayed_refs->lock);
kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
return PTR_ERR(head_ref_ret);
}
+ spin_unlock(&delayed_refs->lock);
/*
* Need to update the delayed_refs_rsv with any changes we may have
@@ -1163,11 +1152,15 @@ void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
* head node if found, or NULL if not.
*/
struct btrfs_delayed_ref_head *
-btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
+btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ u64 bytenr)
{
+ const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
+
lockdep_assert_held(&delayed_refs->lock);
- return find_ref_head(delayed_refs, bytenr, false);
+ return xa_load(&delayed_refs->head_refs, index);
}
static int find_comp(struct btrfs_delayed_ref_node *entry, u64 root, u64 parent)
@@ -1237,6 +1230,81 @@ bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
return found;
}
+void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
+ spin_lock(&delayed_refs->lock);
+ while (true) {
+ struct btrfs_delayed_ref_head *head;
+ struct rb_node *n;
+ bool pin_bytes = false;
+
+ head = find_first_ref_head(delayed_refs);
+ if (!head)
+ break;
+
+ if (!btrfs_delayed_ref_lock(delayed_refs, head))
+ continue;
+
+ spin_lock(&head->lock);
+ while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
+ struct btrfs_delayed_ref_node *ref;
+
+ ref = rb_entry(n, struct btrfs_delayed_ref_node, ref_node);
+ drop_delayed_ref(fs_info, delayed_refs, head, ref);
+ }
+ if (head->must_insert_reserved)
+ pin_bytes = true;
+ btrfs_free_delayed_extent_op(head->extent_op);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
+ spin_unlock(&head->lock);
+ spin_unlock(&delayed_refs->lock);
+ mutex_unlock(&head->mutex);
+
+ if (pin_bytes) {
+ struct btrfs_block_group *bg;
+
+ bg = btrfs_lookup_block_group(fs_info, head->bytenr);
+ if (WARN_ON_ONCE(bg == NULL)) {
+ /*
+ * Unexpected and there's nothing we can do here
+ * because we are in a transaction abort path,
+ * so any errors can only be ignored or reported
+ * while attempting to cleanup all resources.
+ */
+ btrfs_err(fs_info,
+"block group for delayed ref at %llu was not found while destroying ref head",
+ head->bytenr);
+ } else {
+ spin_lock(&bg->space_info->lock);
+ spin_lock(&bg->lock);
+ bg->pinned += head->num_bytes;
+ btrfs_space_info_update_bytes_pinned(fs_info,
+ bg->space_info,
+ head->num_bytes);
+ bg->reserved -= head->num_bytes;
+ bg->space_info->bytes_reserved -= head->num_bytes;
+ spin_unlock(&bg->lock);
+ spin_unlock(&bg->space_info->lock);
+
+ btrfs_put_block_group(bg);
+ }
+
+ btrfs_error_unpin_extent_range(fs_info, head->bytenr,
+ head->bytenr + head->num_bytes - 1);
+ }
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
+ btrfs_put_delayed_ref_head(head);
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+ btrfs_qgroup_destroy_extent_records(trans);
+
+ spin_unlock(&delayed_refs->lock);
+}
+
void __cold btrfs_delayed_ref_exit(void)
{
kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 085f30968aba..611fb3388f82 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -61,7 +61,8 @@ struct btrfs_delayed_ref_node {
/*
* If action is BTRFS_ADD_DELAYED_REF, also link this node to
* ref_head->ref_add_list, then we do not need to iterate the
- * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
+ * refs rbtree in the corresponding delayed ref head
+ * (struct btrfs_delayed_ref_head::ref_tree).
*/
struct list_head add_list;
@@ -123,12 +124,6 @@ struct btrfs_delayed_ref_head {
u64 bytenr;
u64 num_bytes;
/*
- * For insertion into struct btrfs_delayed_ref_root::href_root.
- * Keep it in the same cache line as 'bytenr' for more efficient
- * searches in the rbtree.
- */
- struct rb_node href_node;
- /*
* the mutex is held while running the refs, and it is also
* held when checking the sum of reference modifications.
*/
@@ -191,6 +186,11 @@ struct btrfs_delayed_ref_head {
bool is_data;
bool is_system;
bool processing;
+ /*
+ * Indicate if it's currently in the data structure that tracks head
+ * refs (struct btrfs_delayed_ref_root::head_refs).
+ */
+ bool tracked;
};
enum btrfs_delayed_ref_flags {
@@ -199,30 +199,52 @@ enum btrfs_delayed_ref_flags {
};
struct btrfs_delayed_ref_root {
- /* head ref rbtree */
- struct rb_root_cached href_root;
+ /*
+ * Track head references.
+ * The keys correspond to the logical address of the extent ("bytenr")
+ * right shifted by fs_info->sectorsize_bits. This is both to get a more
+ * dense index space (optimizes xarray structure) and because indexes in
+ * xarrays are of "unsigned long" type, meaning they are 32 bits wide on
+ * 32 bits platforms, limiting the extent range to 4G which is too low
+ * and makes it unusable (truncated index values) on 32 bits platforms.
+ * Protected by the spinlock 'lock' defined below.
+ */
+ struct xarray head_refs;
- /* Track dirty extent records. */
+ /*
+ * Track dirty extent records.
+ * The keys correspond to the logical address of the extent ("bytenr")
+ * right shifted by fs_info->sectorsize_bits, for same reasons as above.
+ */
struct xarray dirty_extents;
- /* this spin lock protects the rbtree and the entries inside */
- spinlock_t lock;
-
- /* how many delayed ref updates we've queued, used by the
- * throttling code
+ /*
+ * Protects the xarray head_refs, its entries and the following fields:
+ * num_heads, num_heads_ready, pending_csums and run_delayed_start.
*/
- atomic_t num_entries;
+ spinlock_t lock;
- /* total number of head nodes in tree */
+ /* Total number of head refs, protected by the spinlock 'lock'. */
unsigned long num_heads;
- /* total number of head nodes ready for processing */
+ /*
+ * Total number of head refs ready for processing, protected by the
+ * spinlock 'lock'.
+ */
unsigned long num_heads_ready;
+ /*
+ * Track space reserved for deleting csums of data extents.
+ * Protected by the spinlock 'lock'.
+ */
u64 pending_csums;
unsigned long flags;
+ /*
+ * Track from which bytenr to start searching ref heads.
+ * Protected by the spinlock 'lock'.
+ */
u64 run_delayed_start;
/*
@@ -364,19 +386,22 @@ void btrfs_merge_delayed_refs(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *
-btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+btrfs_find_delayed_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head);
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
mutex_unlock(&head->mutex);
}
-void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+void btrfs_delete_ref_head(const struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ const struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs);
+void btrfs_unselect_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
@@ -391,6 +416,7 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
bool btrfs_find_delayed_tree_ref(struct btrfs_delayed_ref_head *head,
u64 root, u64 parent);
+void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans);
static inline u64 btrfs_delayed_ref_owner(struct btrfs_delayed_ref_node *node)
{
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 83d5cdd77f29..ac8e97ed13f7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -45,7 +45,7 @@
*
* - Copy existing extents
*
- * This happens by re-using scrub facility, as scrub also iterates through
+ * This happens by reusing scrub facility, as scrub also iterates through
* existing extents from commit root.
*
* Location: scrub_write_block_to_dev_replace() from
@@ -641,6 +641,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
down_write(&dev_replace->rwsem);
+ dev_replace->replace_task = current;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
@@ -994,6 +995,7 @@ error:
list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
fs_devices->rw_devices++;
+ dev_replace->replace_task = NULL;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 001c0c2f872c..1ea5d8fcfbf7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -27,7 +27,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
const char *name,
int name_len)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
char *ptr;
struct extent_buffer *leaf;
@@ -35,7 +34,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
if (ret == -EEXIST) {
struct btrfs_dir_item *di;
- di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ di = btrfs_match_dir_item_name(path, name, name_len);
if (di)
return ERR_PTR(-EEXIST);
btrfs_extend_item(trans, path, data_size);
@@ -190,7 +189,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
if (ret > 0)
return ERR_PTR(-ENOENT);
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return btrfs_match_dir_item_name(path, name, name_len);
}
/*
@@ -341,14 +340,13 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
- di = btrfs_match_dir_item_name(root->fs_info, path,
- name->name, name->len);
+ di = btrfs_match_dir_item_name(path, name->name, name->len);
if (di)
return di;
}
/* Adjust return code if the key was not found in the next leaf. */
- if (ret > 0)
- ret = 0;
+ if (ret >= 0)
+ ret = -ENOENT;
return ERR_PTR(ret);
}
@@ -378,8 +376,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
* this walks through all the entries in a dir item and finds one
* for a specific name.
*/
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- const struct btrfs_path *path,
+struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path,
const char *name, int name_len)
{
struct btrfs_dir_item *dir_item;
diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h
index 5f6dfafc91f1..28d69970bc70 100644
--- a/fs/btrfs/dir-item.h
+++ b/fs/btrfs/dir-item.h
@@ -44,8 +44,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
- const struct btrfs_path *path,
+struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path,
const char *name,
int name_len);
diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c
index bd38df5647e3..a7c3e221378d 100644
--- a/fs/btrfs/direct-io.c
+++ b/fs/btrfs/direct-io.c
@@ -834,7 +834,7 @@ relock:
return ret;
}
- ret = btrfs_write_check(iocb, from, ret);
+ ret = btrfs_write_check(iocb, ret);
if (ret < 0) {
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto out;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4ad5db619b00..814320948645 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -917,8 +917,7 @@ fail:
return ERR_PTR(ret);
}
-static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+static struct btrfs_root *alloc_log_tree(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root;
@@ -966,7 +965,7 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
{
struct btrfs_root *log_root;
- log_root = alloc_log_tree(trans, fs_info);
+ log_root = alloc_log_tree(fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
@@ -992,7 +991,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_inode_item *inode_item;
int ret;
- log_root = alloc_log_tree(trans, fs_info);
+ log_root = alloc_log_tree(fs_info);
if (IS_ERR(log_root))
return PTR_ERR(log_root);
@@ -1959,7 +1958,7 @@ static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
fs_info->qgroup_seq = 1;
fs_info->qgroup_ulist = NULL;
fs_info->qgroup_rescan_running = false;
- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
mutex_init(&fs_info->qgroup_rescan_lock);
}
@@ -2786,6 +2785,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_scrub(fs_info);
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
+ btrfs_init_extent_map_shrinker_work(fs_info);
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
@@ -2852,8 +2852,6 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
if (ret)
return ret;
- spin_lock_init(&fs_info->extent_map_shrinker_lock);
-
ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
if (ret)
return ret;
@@ -3202,8 +3200,7 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
return 0;
}
-int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
- const char *options)
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices)
{
u32 sectorsize;
u32 nodesize;
@@ -4186,7 +4183,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
btrfs_warn(fs_info,
"transaction %llu (with %llu dirty metadata bytes) is not committed",
trans->transid, dirty_bytes);
- btrfs_cleanup_one_transaction(trans, fs_info);
+ btrfs_cleanup_one_transaction(trans);
if (trans == fs_info->running_transaction)
fs_info->running_transaction = NULL;
@@ -4294,6 +4291,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
+ cancel_work_sync(&fs_info->em_shrinker_work);
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
@@ -4531,75 +4529,6 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
btrfs_wait_ordered_roots(fs_info, U64_MAX, NULL);
}
-static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info)
-{
- struct rb_node *node;
- struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs;
- struct btrfs_delayed_ref_node *ref;
-
- spin_lock(&delayed_refs->lock);
- while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
- struct btrfs_delayed_ref_head *head;
- struct rb_node *n;
- bool pin_bytes = false;
-
- head = rb_entry(node, struct btrfs_delayed_ref_head,
- href_node);
- if (btrfs_delayed_ref_lock(delayed_refs, head))
- continue;
-
- spin_lock(&head->lock);
- while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
- ref = rb_entry(n, struct btrfs_delayed_ref_node,
- ref_node);
- rb_erase_cached(&ref->ref_node, &head->ref_tree);
- RB_CLEAR_NODE(&ref->ref_node);
- if (!list_empty(&ref->add_list))
- list_del(&ref->add_list);
- atomic_dec(&delayed_refs->num_entries);
- btrfs_put_delayed_ref(ref);
- btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
- }
- if (head->must_insert_reserved)
- pin_bytes = true;
- btrfs_free_delayed_extent_op(head->extent_op);
- btrfs_delete_ref_head(delayed_refs, head);
- spin_unlock(&head->lock);
- spin_unlock(&delayed_refs->lock);
- mutex_unlock(&head->mutex);
-
- if (pin_bytes) {
- struct btrfs_block_group *cache;
-
- cache = btrfs_lookup_block_group(fs_info, head->bytenr);
- BUG_ON(!cache);
-
- spin_lock(&cache->space_info->lock);
- spin_lock(&cache->lock);
- cache->pinned += head->num_bytes;
- btrfs_space_info_update_bytes_pinned(fs_info,
- cache->space_info, head->num_bytes);
- cache->reserved -= head->num_bytes;
- cache->space_info->bytes_reserved -= head->num_bytes;
- spin_unlock(&cache->lock);
- spin_unlock(&cache->space_info->lock);
-
- btrfs_put_block_group(cache);
-
- btrfs_error_unpin_extent_range(fs_info, head->bytenr,
- head->bytenr + head->num_bytes - 1);
- }
- btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
- btrfs_put_delayed_ref_head(head);
- cond_resched();
- spin_lock(&delayed_refs->lock);
- }
- btrfs_qgroup_destroy_extent_records(trans);
-
- spin_unlock(&delayed_refs->lock);
-}
-
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
struct btrfs_inode *btrfs_inode;
@@ -4805,9 +4734,9 @@ static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->fs_roots_radix_lock);
}
-void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans)
{
+ struct btrfs_fs_info *fs_info = cur_trans->fs_info;
struct btrfs_device *dev, *tmp;
btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
@@ -4819,7 +4748,7 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
list_del_init(&dev->post_commit_list);
}
- btrfs_destroy_delayed_refs(cur_trans, fs_info);
+ btrfs_destroy_delayed_refs(cur_trans);
cur_trans->state = TRANS_STATE_COMMIT_START;
wake_up(&fs_info->transaction_blocked_wait);
@@ -4865,7 +4794,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
} else {
spin_unlock(&fs_info->trans_lock);
}
- btrfs_cleanup_one_transaction(t, fs_info);
+ btrfs_cleanup_one_transaction(t);
spin_lock(&fs_info->trans_lock);
if (t == fs_info->running_transaction)
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 99af64d3f277..a7051e2570c1 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,8 +52,7 @@ struct extent_buffer *btrfs_find_create_tree_block(
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
-int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
- const char *options);
+int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(const struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *sb, int mirror_num);
@@ -127,8 +126,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
-void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
- struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d9f511babd89..412e318e4a22 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -182,7 +182,7 @@ search_again:
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr);
if (head) {
if (!mutex_trylock(&head->mutex)) {
refcount_inc(&head->refs);
@@ -795,7 +795,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
if (insert) {
extra_size = btrfs_extent_inline_ref_size(want);
path->search_for_extension = 1;
- path->keep_locks = 1;
} else
extra_size = -1;
@@ -946,6 +945,25 @@ again:
ret = -EAGAIN;
goto out;
}
+
+ if (path->slots[0] + 1 < btrfs_header_nritems(path->nodes[0])) {
+ struct btrfs_key tmp_key;
+
+ btrfs_item_key_to_cpu(path->nodes[0], &tmp_key, path->slots[0] + 1);
+ if (tmp_key.objectid == bytenr &&
+ tmp_key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ goto out_no_entry;
+ }
+
+ if (!path->keep_locks) {
+ btrfs_release_path(path);
+ path->keep_locks = 1;
+ goto again;
+ }
+
/*
* To add new inline back ref, we have to make sure
* there is no corresponding back ref item.
@@ -959,13 +977,15 @@ again:
goto out;
}
}
+out_no_entry:
*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
out:
- if (insert) {
+ if (path->keep_locks) {
path->keep_locks = 0;
- path->search_for_extension = 0;
btrfs_unlock_up_safe(path, 1);
}
+ if (insert)
+ path->search_for_extension = 0;
return ret;
}
@@ -1807,16 +1827,6 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return ref;
}
-static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_delayed_ref_head *head)
-{
- spin_lock(&delayed_refs->lock);
- head->processing = false;
- delayed_refs->num_heads_ready++;
- spin_unlock(&delayed_refs->lock);
- btrfs_delayed_ref_unlock(head);
-}
-
static struct btrfs_delayed_extent_op *cleanup_extent_op(
struct btrfs_delayed_ref_head *head)
{
@@ -1891,7 +1901,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
ret = run_and_cleanup_extent_op(trans, head);
if (ret < 0) {
- unselect_delayed_ref_head(delayed_refs, head);
+ btrfs_unselect_ref_head(delayed_refs, head);
btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
return ret;
} else if (ret) {
@@ -1910,7 +1920,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&delayed_refs->lock);
return 1;
}
- btrfs_delete_ref_head(delayed_refs, head);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -1933,39 +1943,6 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
return ret;
}
-static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
- struct btrfs_trans_handle *trans)
-{
- struct btrfs_delayed_ref_root *delayed_refs =
- &trans->transaction->delayed_refs;
- struct btrfs_delayed_ref_head *head = NULL;
- int ret;
-
- spin_lock(&delayed_refs->lock);
- head = btrfs_select_ref_head(delayed_refs);
- if (!head) {
- spin_unlock(&delayed_refs->lock);
- return head;
- }
-
- /*
- * Grab the lock that says we are going to process all the refs for
- * this head
- */
- ret = btrfs_delayed_ref_lock(delayed_refs, head);
- spin_unlock(&delayed_refs->lock);
-
- /*
- * We may have dropped the spin lock to get the head mutex lock, and
- * that might have given someone else time to free the head. If that's
- * true, it has been removed from our list and we can move on.
- */
- if (ret == -EAGAIN)
- head = ERR_PTR(-EAGAIN);
-
- return head;
-}
-
static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *locked_ref,
u64 *bytes_released)
@@ -1986,7 +1963,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
- unselect_delayed_ref_head(delayed_refs, locked_ref);
+ btrfs_unselect_ref_head(delayed_refs, locked_ref);
return -EAGAIN;
}
@@ -2009,7 +1986,6 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
default:
WARN_ON(1);
}
- atomic_dec(&delayed_refs->num_entries);
/*
* Record the must_insert_reserved flag before we drop the
@@ -2035,7 +2011,7 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
btrfs_free_delayed_extent_op(extent_op);
if (ret) {
- unselect_delayed_ref_head(delayed_refs, locked_ref);
+ btrfs_unselect_ref_head(delayed_refs, locked_ref);
btrfs_put_delayed_ref(ref);
return ret;
}
@@ -2073,7 +2049,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
do {
if (!locked_ref) {
- locked_ref = btrfs_obtain_ref_head(trans);
+ locked_ref = btrfs_select_ref_head(fs_info, delayed_refs);
if (IS_ERR_OR_NULL(locked_ref)) {
if (PTR_ERR(locked_ref) == -EAGAIN) {
continue;
@@ -2220,7 +2196,7 @@ again:
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) {
+ if (xa_empty(&delayed_refs->head_refs)) {
spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -2275,7 +2251,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
delayed_refs = &cur_trans->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
if (!head) {
spin_unlock(&delayed_refs->lock);
btrfs_put_transaction(cur_trans);
@@ -3144,7 +3120,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
break;
}
- /* Quick path didn't find the EXTEMT/METADATA_ITEM */
+ /* Quick path didn't find the EXTENT/METADATA_ITEM */
if (path->slots[0] - extent_slot > 5)
break;
extent_slot--;
@@ -3377,13 +3353,14 @@ out:
static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
u64 bytenr)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_head *head;
struct btrfs_delayed_ref_root *delayed_refs;
int ret = 0;
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(fs_info, delayed_refs, bytenr);
if (!head)
goto out_delayed_unlock;
@@ -3401,7 +3378,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (!mutex_trylock(&head->mutex))
goto out;
- btrfs_delete_ref_head(delayed_refs, head);
+ btrfs_delete_ref_head(fs_info, delayed_refs, head);
head->processing = false;
spin_unlock(&head->lock);
@@ -3411,7 +3388,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
if (head->must_insert_reserved)
ret = 1;
- btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
+ btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
mutex_unlock(&head->mutex);
btrfs_put_delayed_ref_head(head);
return ret;
@@ -5270,7 +5247,7 @@ struct walk_control {
* corrupted file systems must have been caught before calling this function.
*/
static bool visit_node_for_delete(struct btrfs_root *root, struct walk_control *wc,
- struct extent_buffer *eb, u64 refs, u64 flags, int slot)
+ struct extent_buffer *eb, u64 flags, int slot)
{
struct btrfs_key key;
u64 generation;
@@ -5384,7 +5361,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
/* If we don't need to visit this node don't reada. */
- if (!visit_node_for_delete(root, wc, eb, refs, flags, slot))
+ if (!visit_node_for_delete(root, wc, eb, flags, slot))
continue;
reada:
btrfs_readahead_node_child(eb, slot);
@@ -5518,7 +5495,7 @@ again:
*/
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
+ head = btrfs_find_delayed_ref_head(root->fs_info, delayed_refs, bytenr);
if (!head)
goto out;
if (!mutex_trylock(&head->mutex)) {
@@ -5737,8 +5714,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
/* If we don't have to walk into this node skip it. */
if (!visit_node_for_delete(root, wc, path->nodes[level],
- wc->refs[level - 1], wc->flags[level - 1],
- path->slots[level]))
+ wc->flags[level - 1], path->slots[level]))
goto skip;
/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 309a8ae48434..b923d0cec61c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -190,7 +190,7 @@ static void process_one_folio(struct btrfs_fs_info *fs_info,
btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
if (folio != locked_folio && (page_ops & PAGE_UNLOCK))
- btrfs_folio_end_writer_lock(fs_info, folio, start, len);
+ btrfs_folio_end_lock(fs_info, folio, start, len);
}
static void __process_folios_contig(struct address_space *mapping,
@@ -262,22 +262,23 @@ static noinline int lock_delalloc_folios(struct inode *inode,
for (i = 0; i < found_folios; i++) {
struct folio *folio = fbatch.folios[i];
- u32 len = end + 1 - start;
+ u64 range_start;
+ u32 range_len;
if (folio == locked_folio)
continue;
- if (btrfs_folio_start_writer_lock(fs_info, folio, start,
- len))
- goto out;
-
+ folio_lock(folio);
if (!folio_test_dirty(folio) || folio->mapping != mapping) {
- btrfs_folio_end_writer_lock(fs_info, folio, start,
- len);
+ folio_unlock(folio);
goto out;
}
+ range_start = max_t(u64, folio_pos(folio), start);
+ range_len = min_t(u64, folio_pos(folio) + folio_size(folio),
+ end + 1) - range_start;
+ btrfs_folio_set_lock(fs_info, folio, range_start, range_len);
- processed_end = folio_pos(folio) + folio_size(folio) - 1;
+ processed_end = range_start + range_len - 1;
}
folio_batch_release(&fbatch);
cond_resched();
@@ -437,7 +438,7 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le
if (!btrfs_is_subpage(fs_info, folio->mapping))
folio_unlock(folio);
else
- btrfs_subpage_end_reader(fs_info, folio, start, len);
+ btrfs_folio_end_lock(fs_info, folio, start, len);
}
/*
@@ -494,7 +495,7 @@ static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio)
return;
ASSERT(folio_test_private(folio));
- btrfs_subpage_start_reader(fs_info, folio, folio_pos(folio), PAGE_SIZE);
+ btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE);
}
/*
@@ -785,7 +786,7 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl,
}
if (bio_ctrl->wbc)
- wbc_account_cgroup_owner(bio_ctrl->wbc, &folio->page,
+ wbc_account_cgroup_owner(bio_ctrl->wbc, folio,
len);
size -= len;
@@ -1101,6 +1102,45 @@ int btrfs_read_folio(struct file *file, struct folio *folio)
return ret;
}
+static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bitmap,
+ u64 start, u32 len)
+{
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ const u64 folio_start = folio_pos(folio);
+ unsigned int start_bit;
+ unsigned int nbits;
+
+ ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE);
+ start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
+ nbits = len >> fs_info->sectorsize_bits;
+ ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits));
+ bitmap_set(delalloc_bitmap, start_bit, nbits);
+}
+
+static bool find_next_delalloc_bitmap(struct folio *folio,
+ unsigned long *delalloc_bitmap, u64 start,
+ u64 *found_start, u32 *found_len)
+{
+ struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
+ const u64 folio_start = folio_pos(folio);
+ const unsigned int bitmap_size = fs_info->sectors_per_page;
+ unsigned int start_bit;
+ unsigned int first_zero;
+ unsigned int first_set;
+
+ ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE);
+
+ start_bit = (start - folio_start) >> fs_info->sectorsize_bits;
+ first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit);
+ if (first_set >= bitmap_size)
+ return false;
+
+ *found_start = folio_start + (first_set << fs_info->sectorsize_bits);
+ first_zero = find_next_zero_bit(delalloc_bitmap, bitmap_size, first_set);
+ *found_len = (first_zero - first_set) << fs_info->sectorsize_bits;
+ return true;
+}
+
/*
* helper for extent_writepage(), doing all of the delayed allocation setup.
*
@@ -1120,6 +1160,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping);
const u64 page_start = folio_pos(folio);
const u64 page_end = page_start + folio_size(folio) - 1;
+ unsigned long delalloc_bitmap = 0;
/*
* Save the last found delalloc end. As the delalloc end can go beyond
* page boundary, thus we cannot rely on subpage bitmap to locate the
@@ -1130,6 +1171,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
u64 delalloc_end = page_end;
u64 delalloc_to_write = 0;
int ret = 0;
+ int bit;
/* Save the dirty bitmap as our submission bitmap will be a subset of it. */
if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) {
@@ -1139,6 +1181,12 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
bio_ctrl->submit_bitmap = 1;
}
+ for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) {
+ u64 start = page_start + (bit << fs_info->sectorsize_bits);
+
+ btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize);
+ }
+
/* Lock all (subpage) delalloc ranges inside the folio first. */
while (delalloc_start < page_end) {
delalloc_end = page_end;
@@ -1147,9 +1195,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
delalloc_start = delalloc_end + 1;
continue;
}
- btrfs_folio_set_writer_lock(fs_info, folio, delalloc_start,
- min(delalloc_end, page_end) + 1 -
- delalloc_start);
+ set_delalloc_bitmap(folio, &delalloc_bitmap, delalloc_start,
+ min(delalloc_end, page_end) + 1 - delalloc_start);
last_delalloc_end = delalloc_end;
delalloc_start = delalloc_end + 1;
}
@@ -1174,7 +1221,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
found_len = last_delalloc_end + 1 - found_start;
found = true;
} else {
- found = btrfs_subpage_find_writer_locked(fs_info, folio,
+ found = find_next_delalloc_bitmap(folio, &delalloc_bitmap,
delalloc_start, &found_start, &found_len);
}
if (!found)
@@ -1313,7 +1360,7 @@ static int submit_one_sector(struct btrfs_inode *inode,
* a folio for a range already written to disk.
*/
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
- btrfs_set_range_writeback(inode, filepos, filepos + sectorsize - 1);
+ btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
/*
* Above call should set the whole folio with writeback flag, even
* just for a single subpage sector.
@@ -1390,8 +1437,6 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
goto out;
submitted_io = true;
}
-
- btrfs_folio_assert_not_dirty(fs_info, folio, start, len);
out:
/*
* If we didn't submitted any sector (>= i_size), folio dirty get
@@ -1475,7 +1520,7 @@ done:
* Only unlock ranges that are submitted. As there can be some async
* submitted ranges inside the folio.
*/
- btrfs_folio_end_writer_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
+ btrfs_folio_end_lock_bitmap(fs_info, folio, bio_ctrl->submit_bitmap);
ASSERT(ret <= 0);
return ret;
}
@@ -1707,7 +1752,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
ret = bio_add_folio(&bbio->bio, folio, eb->len,
eb->start - folio_pos(folio));
ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+ wbc_account_cgroup_owner(wbc, folio, eb->len);
folio_unlock(folio);
} else {
int num_folios = num_extent_folios(eb);
@@ -1721,8 +1766,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
folio_start_writeback(folio);
ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0);
ASSERT(ret);
- wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
- eb->folio_size);
+ wbc_account_cgroup_owner(wbc, folio, eb->folio_size);
wbc->nr_to_write -= folio_nr_pages(folio);
folio_unlock(folio);
}
@@ -2115,7 +2159,27 @@ retry:
continue;
}
- if (wbc->sync_mode != WB_SYNC_NONE) {
+ /*
+ * For subpage case, compression can lead to mixed
+ * writeback and dirty flags, e.g:
+ * 0 32K 64K 96K 128K
+ * | |//////||/////| |//|
+ *
+ * In above case, [32K, 96K) is asynchronously submitted
+ * for compression, and [124K, 128K) needs to be written back.
+ *
+ * If we didn't wait for writeback of page 64K, [124K, 128K)
+ * won't be submitted as the page still has the writeback flag
+ * and will be skipped in the next check.
+ *
+ * This mixed writeback and dirty case is only possible for
+ * subpage case.
+ *
+ * TODO: Remove this check after migrating compression to
+ * regular submission.
+ */
+ if (wbc->sync_mode != WB_SYNC_NONE ||
+ btrfs_is_subpage(inode_to_fs_info(inode), mapping)) {
if (folio_test_writeback(folio))
submit_write_bio(bio_ctrl, 0);
folio_wait_writeback(folio);
@@ -2200,7 +2264,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
u32 cur_len = cur_end + 1 - cur;
struct folio *folio;
- folio = __filemap_get_folio(mapping, cur >> PAGE_SHIFT, 0, 0);
+ folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT);
/*
* This shouldn't happen, the pages are pinned and locked, this
@@ -2233,7 +2297,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f
cur, cur_len, !ret);
mapping_set_error(mapping, ret);
}
- btrfs_folio_end_writer_lock(fs_info, folio, cur, cur_len);
+ btrfs_folio_end_lock(fs_info, folio, cur, cur_len);
if (ret < 0)
found_error = true;
next_page:
@@ -2317,7 +2381,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
* to drop the page.
*/
static bool try_release_extent_state(struct extent_io_tree *tree,
- struct folio *folio, gfp_t mask)
+ struct folio *folio)
{
u64 start = folio_pos(folio);
u64 end = start + PAGE_SIZE - 1;
@@ -2428,7 +2492,7 @@ next:
cond_resched();
}
}
- return try_release_extent_state(io_tree, folio, mask);
+ return try_release_extent_state(io_tree, folio);
}
static void __free_extent_buffer(struct extent_buffer *eb)
@@ -2442,7 +2506,7 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
+static bool folio_range_has_eb(struct folio *folio)
{
struct btrfs_subpage *subpage;
@@ -2452,12 +2516,6 @@ static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *foli
subpage = folio_get_private(folio);
if (atomic_read(&subpage->eb_refs))
return true;
- /*
- * Even there is no eb refs here, we may still have
- * end_folio_read() call relying on page::private.
- */
- if (atomic_read(&subpage->readers))
- return true;
}
return false;
}
@@ -2516,7 +2574,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo
* We can only detach the folio private if there are no other ebs in the
* page range and no unfinished IO.
*/
- if (!folio_range_has_eb(fs_info, folio))
+ if (!folio_range_has_eb(folio))
btrfs_detach_subpage(fs_info, folio);
spin_unlock(&folio->mapping->i_private_lock);
@@ -3121,7 +3179,7 @@ out:
}
/*
* Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
- * so it can be cleaned up without utlizing page->mapping.
+ * so it can be cleaned up without utilizing page->mapping.
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
@@ -4221,7 +4279,6 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level)
{
struct btrfs_tree_parent_check check = {
- .has_first_key = 0,
.level = level,
.transid = gen
};
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 25d191f1ac10..67ce85ff0ae2 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -77,10 +77,13 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-static void dec_evictable_extent_maps(struct btrfs_inode *inode)
+static void remove_em(struct btrfs_inode *inode, struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ rb_erase(&em->rb_node, &inode->extent_tree.root);
+ RB_CLEAR_NODE(&em->rb_node);
+
if (!btrfs_is_testing(fs_info) && is_fstree(btrfs_root_id(inode->root)))
percpu_counter_dec(&fs_info->evictable_extent_maps);
}
@@ -230,7 +233,12 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
if (extent_map_end(prev) != next->start)
return false;
- if (prev->flags != next->flags)
+ /*
+ * The merged flag is not an on-disk flag, it just indicates we had the
+ * extent maps of 2 (or more) adjacent extents merged, so factor it out.
+ */
+ if ((prev->flags & ~EXTENT_FLAG_MERGED) !=
+ (next->flags & ~EXTENT_FLAG_MERGED))
return false;
if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1)
@@ -243,13 +251,19 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma
/*
* Handle the on-disk data extents merge for @prev and @next.
*
+ * @prev: left extent to merge
+ * @next: right extent to merge
+ * @merged: the extent we will not discard after the merge; updated with new values
+ *
+ * After this, one of the two extents is the new merged extent and the other is
+ * removed from the tree and likely freed. Note that @merged is one of @prev/@next
+ * so there is const/non-const aliasing occurring here.
+ *
* Only touches disk_bytenr/disk_num_bytes/offset/ram_bytes.
* For now only uncompressed regular extent can be merged.
- *
- * @prev and @next will be both updated to point to the new merged range.
- * Thus one of them should be removed by the caller.
*/
-static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *next)
+static void merge_ondisk_extents(const struct extent_map *prev, const struct extent_map *next,
+ struct extent_map *merged)
{
u64 new_disk_bytenr;
u64 new_disk_num_bytes;
@@ -284,15 +298,10 @@ static void merge_ondisk_extents(struct extent_map *prev, struct extent_map *nex
new_disk_bytenr;
new_offset = prev->disk_bytenr + prev->offset - new_disk_bytenr;
- prev->disk_bytenr = new_disk_bytenr;
- prev->disk_num_bytes = new_disk_num_bytes;
- prev->ram_bytes = new_disk_num_bytes;
- prev->offset = new_offset;
-
- next->disk_bytenr = new_disk_bytenr;
- next->disk_num_bytes = new_disk_num_bytes;
- next->ram_bytes = new_disk_num_bytes;
- next->offset = new_offset;
+ merged->disk_bytenr = new_disk_bytenr;
+ merged->disk_num_bytes = new_disk_num_bytes;
+ merged->ram_bytes = new_disk_num_bytes;
+ merged->offset = new_offset;
}
static void dump_extent_map(struct btrfs_fs_info *fs_info, const char *prefix,
@@ -333,7 +342,6 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map
static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct extent_map_tree *tree = &inode->extent_tree;
struct extent_map *merge = NULL;
struct rb_node *rb;
@@ -361,14 +369,12 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
em->generation = max(em->generation, merge->generation);
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
- merge_ondisk_extents(merge, em);
+ merge_ondisk_extents(merge, em, em);
em->flags |= EXTENT_FLAG_MERGED;
validate_extent_map(fs_info, em);
- rb_erase(&merge->rb_node, &tree->root);
- RB_CLEAR_NODE(&merge->rb_node);
+ remove_em(inode, merge);
free_extent_map(merge);
- dec_evictable_extent_maps(inode);
}
}
@@ -378,14 +384,12 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em)
if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
- merge_ondisk_extents(em, merge);
+ merge_ondisk_extents(em, merge, em);
validate_extent_map(fs_info, em);
- rb_erase(&merge->rb_node, &tree->root);
- RB_CLEAR_NODE(&merge->rb_node);
em->generation = max(em->generation, merge->generation);
em->flags |= EXTENT_FLAG_MERGED;
+ remove_em(inode, merge);
free_extent_map(merge);
- dec_evictable_extent_maps(inode);
}
}
@@ -582,12 +586,10 @@ void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em)
lockdep_assert_held_write(&tree->lock);
WARN_ON(em->flags & EXTENT_FLAG_PINNED);
- rb_erase(&em->rb_node, &tree->root);
if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
- RB_CLEAR_NODE(&em->rb_node);
- dec_evictable_extent_maps(inode);
+ remove_em(inode, em);
}
static void replace_extent_mapping(struct btrfs_inode *inode,
@@ -1116,13 +1118,12 @@ out_free_pre:
struct btrfs_em_shrink_ctx {
long nr_to_scan;
long scanned;
- u64 last_ino;
- u64 last_root;
};
static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_ctx *ctx)
{
- const u64 cur_fs_gen = btrfs_get_fs_generation(inode->root->fs_info);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ const u64 cur_fs_gen = btrfs_get_fs_generation(fs_info);
struct extent_map_tree *tree = &inode->extent_tree;
long nr_dropped = 0;
struct rb_node *node;
@@ -1195,7 +1196,8 @@ next:
* lock. This is to avoid slowing other tasks trying to take the
* lock.
*/
- if (need_resched() || rwlock_needbreak(&tree->lock))
+ if (need_resched() || rwlock_needbreak(&tree->lock) ||
+ btrfs_fs_closing(fs_info))
break;
node = next;
}
@@ -1207,19 +1209,21 @@ next:
static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx)
{
+ struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_inode *inode;
long nr_dropped = 0;
- u64 min_ino = ctx->last_ino + 1;
+ u64 min_ino = fs_info->em_shrinker_last_ino + 1;
inode = btrfs_find_first_inode(root, min_ino);
while (inode) {
nr_dropped += btrfs_scan_inode(inode, ctx);
min_ino = btrfs_ino(inode) + 1;
- ctx->last_ino = btrfs_ino(inode);
+ fs_info->em_shrinker_last_ino = btrfs_ino(inode);
btrfs_add_delayed_iput(inode);
- if (ctx->scanned >= ctx->nr_to_scan)
+ if (ctx->scanned >= ctx->nr_to_scan ||
+ btrfs_fs_closing(inode->root->fs_info))
break;
cond_resched();
@@ -1235,52 +1239,43 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx
* inode if there is one or we will find out this was the last
* one and move to the next root.
*/
- ctx->last_root = btrfs_root_id(root);
+ fs_info->em_shrinker_last_root = btrfs_root_id(root);
} else {
/*
* No more inodes in this root, set extent_map_shrinker_last_ino to 0 so
* that when processing the next root we start from its first inode.
*/
- ctx->last_ino = 0;
- ctx->last_root = btrfs_root_id(root) + 1;
+ fs_info->em_shrinker_last_ino = 0;
+ fs_info->em_shrinker_last_root = btrfs_root_id(root) + 1;
}
return nr_dropped;
}
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+static void btrfs_extent_map_shrinker_worker(struct work_struct *work)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_em_shrink_ctx ctx;
u64 start_root_id;
u64 next_root_id;
bool cycled = false;
long nr_dropped = 0;
- ctx.scanned = 0;
- ctx.nr_to_scan = nr_to_scan;
+ fs_info = container_of(work, struct btrfs_fs_info, em_shrinker_work);
- /*
- * In case we have multiple tasks running this shrinker, make the next
- * one start from the next inode in case it starts before we finish.
- */
- spin_lock(&fs_info->extent_map_shrinker_lock);
- ctx.last_ino = fs_info->extent_map_shrinker_last_ino;
- fs_info->extent_map_shrinker_last_ino++;
- ctx.last_root = fs_info->extent_map_shrinker_last_root;
- spin_unlock(&fs_info->extent_map_shrinker_lock);
+ ctx.scanned = 0;
+ ctx.nr_to_scan = atomic64_read(&fs_info->em_shrinker_nr_to_scan);
- start_root_id = ctx.last_root;
- next_root_id = ctx.last_root;
+ start_root_id = fs_info->em_shrinker_last_root;
+ next_root_id = fs_info->em_shrinker_last_root;
if (trace_btrfs_extent_map_shrinker_scan_enter_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr_to_scan,
- nr, ctx.last_root,
- ctx.last_ino);
+ trace_btrfs_extent_map_shrinker_scan_enter(fs_info, nr);
}
- while (ctx.scanned < ctx.nr_to_scan) {
+ while (ctx.scanned < ctx.nr_to_scan && !btrfs_fs_closing(fs_info)) {
struct btrfs_root *root;
unsigned long count;
@@ -1294,8 +1289,8 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
spin_unlock(&fs_info->fs_roots_radix_lock);
if (start_root_id > 0 && !cycled) {
next_root_id = 0;
- ctx.last_root = 0;
- ctx.last_ino = 0;
+ fs_info->em_shrinker_last_root = 0;
+ fs_info->em_shrinker_last_ino = 0;
cycled = true;
continue;
}
@@ -1314,29 +1309,40 @@ long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
btrfs_put_root(root);
}
- /*
- * In case of multiple tasks running this extent map shrinking code this
- * isn't perfect but it's simple and silences things like KCSAN. It's
- * not possible to know which task made more progress because we can
- * cycle back to the first root and first inode if it's not the first
- * time the shrinker ran, see the above logic. Also a task that started
- * later may finish ealier than another task and made less progress. So
- * make this simple and update to the progress of the last task that
- * finished, with the occasional possiblity of having two consecutive
- * runs of the shrinker process the same inodes.
- */
- spin_lock(&fs_info->extent_map_shrinker_lock);
- fs_info->extent_map_shrinker_last_ino = ctx.last_ino;
- fs_info->extent_map_shrinker_last_root = ctx.last_root;
- spin_unlock(&fs_info->extent_map_shrinker_lock);
-
if (trace_btrfs_extent_map_shrinker_scan_exit_enabled()) {
s64 nr = percpu_counter_sum_positive(&fs_info->evictable_extent_maps);
- trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped,
- nr, ctx.last_root,
- ctx.last_ino);
+ trace_btrfs_extent_map_shrinker_scan_exit(fs_info, nr_dropped, nr);
}
- return nr_dropped;
+ atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
+}
+
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan)
+{
+ /*
+ * Do nothing if the shrinker is already running. In case of high memory
+ * pressure we can have a lot of tasks calling us and all passing the
+ * same nr_to_scan value, but in reality we may need only to free
+ * nr_to_scan extent maps (or less). In case we need to free more than
+ * that, we will be called again by the fs shrinker, so no worries about
+ * not doing enough work to reclaim memory from extent maps.
+ * We can also be repeatedly called with the same nr_to_scan value
+ * simply because the shrinker runs asynchronously and multiple calls
+ * to this function are made before the shrinker makes enough progress.
+ *
+ * That's why we set the atomic counter to nr_to_scan only if its
+ * current value is zero, instead of incrementing the counter by
+ * nr_to_scan.
+ */
+ if (atomic64_cmpxchg(&fs_info->em_shrinker_nr_to_scan, 0, nr_to_scan) != 0)
+ return;
+
+ queue_work(system_unbound_wq, &fs_info->em_shrinker_work);
+}
+
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info)
+{
+ atomic64_set(&fs_info->em_shrinker_nr_to_scan, 0);
+ INIT_WORK(&fs_info->em_shrinker_work, btrfs_extent_map_shrinker_worker);
}
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 5154a8f1d26c..cd123b266b64 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -189,6 +189,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em,
bool modified);
-long btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_free_extent_maps(struct btrfs_fs_info *fs_info, long nr_to_scan);
+void btrfs_init_extent_map_shrinker_work(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c
index df7f09f3b02e..b80c07ad8c5e 100644
--- a/fs/btrfs/fiemap.c
+++ b/fs/btrfs/fiemap.c
@@ -186,7 +186,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* we have in the cache is the last delalloc range we
* found while the file extent item we found can be
* either for a whole delalloc range we previously
- * emmitted or only a part of that range.
+ * emitted or only a part of that range.
*
* We have two cases here:
*
@@ -194,13 +194,13 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
* cached extent's end. In this case just ignore the
* current file extent item because we don't want to
* overlap with previous ranges that may have been
- * emmitted already;
+ * emitted already;
*
* 2) The file extent item starts behind the currently
* cached extent but its end offset goes beyond the
* end offset of the cached extent. We don't want to
* overlap with a previous range that may have been
- * emmitted already, so we emit the currently cached
+ * emitted already, so we emit the currently cached
* extent and then partially store the current file
* extent item's range in the cache, for the subrange
* going the cached extent's end to the end of the
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4fb521d91b06..588c353d2969 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -37,33 +37,30 @@
#include "file.h"
#include "super.h"
-/* simple helper to fault in pages and copy. This should go away
- * and be replaced with calls into generic code.
+/*
+ * Helper to fault in page and copy. This should go away and be replaced with
+ * calls into generic code.
*/
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
- struct page **prepared_pages,
- struct iov_iter *i)
+ struct folio *folio, struct iov_iter *i)
{
size_t copied = 0;
size_t total_copied = 0;
- int pg = 0;
int offset = offset_in_page(pos);
while (write_bytes > 0) {
- size_t count = min_t(size_t,
- PAGE_SIZE - offset, write_bytes);
- struct page *page = prepared_pages[pg];
+ size_t count = min_t(size_t, PAGE_SIZE - offset, write_bytes);
/*
* Copy data from userspace to the current page
*/
- copied = copy_page_from_iter_atomic(page, offset, count, i);
+ copied = copy_folio_from_iter_atomic(folio, offset, count, i);
/* Flush processor's dcache for this page */
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
/*
* if we get a partial write, we can end up with
- * partially up to date pages. These add
+ * partially up to date page. These add
* a lot of complexity, so make sure they don't
* happen by forcing this copy to be retried.
*
@@ -71,7 +68,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
* back to page at a time copies after we return 0.
*/
if (unlikely(copied < count)) {
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
iov_iter_revert(i, copied);
copied = 0;
}
@@ -82,54 +79,44 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
write_bytes -= copied;
total_copied += copied;
offset += copied;
- if (offset == PAGE_SIZE) {
- pg++;
- offset = 0;
- }
}
return total_copied;
}
/*
- * unlocks pages after btrfs_file_write is done with them
+ * Unlock folio after btrfs_file_write() is done with it.
*/
-static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
- struct page **pages, size_t num_pages,
+static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio,
u64 pos, u64 copied)
{
- size_t i;
u64 block_start = round_down(pos, fs_info->sectorsize);
u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
ASSERT(block_len <= U32_MAX);
- for (i = 0; i < num_pages; i++) {
- /* page checked is some magic around finding pages that
- * have been modified without going through btrfs_set_page_dirty
- * clear it here. There should be no need to mark the pages
- * accessed as prepare_pages should have marked them accessed
- * in prepare_pages via find_or_create_page()
- */
- btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
- block_start, block_len);
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ /*
+ * Folio checked is some magic around finding folios that have been
+ * modified without going through btrfs_dirty_folio(). Clear it here.
+ * There should be no need to mark the pages accessed as
+ * prepare_one_folio() should have marked them accessed in
+ * prepare_one_folio() via find_or_create_page()
+ */
+ btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len);
+ folio_unlock(folio);
+ folio_put(folio);
}
/*
* After btrfs_copy_from_user(), update the following things for delalloc:
- * - Mark newly dirtied pages as DELALLOC in the io tree.
+ * - Mark newly dirtied folio as DELALLOC in the io tree.
* Used to advise which range is to be written back.
- * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
+ * - Mark modified folio as Uptodate/Dirty and not needing COW fixup
* - Update inode size for past EOF write
*/
-int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached, bool noreserve)
+int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
+ size_t write_bytes, struct extent_state **cached, bool noreserve)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret = 0;
- int i;
u64 num_bytes;
u64 start_pos;
u64 end_of_last_block;
@@ -147,6 +134,8 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
num_bytes = round_up(write_bytes + pos - start_pos,
fs_info->sectorsize);
ASSERT(num_bytes <= U32_MAX);
+ ASSERT(folio_pos(folio) <= pos &&
+ folio_pos(folio) + folio_size(folio) >= pos + write_bytes);
end_of_last_block = start_pos + num_bytes - 1;
@@ -163,16 +152,9 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
if (ret)
return ret;
- for (i = 0; i < num_pages; i++) {
- struct page *p = pages[i];
-
- btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
- start_pos, num_bytes);
- btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
- start_pos, num_bytes);
- btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
- start_pos, num_bytes);
- }
+ btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes);
+ btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes);
+ btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes);
/*
* we've only changed i_size in ram, and we haven't updated
@@ -851,53 +833,47 @@ out:
}
/*
- * on error we return an unlocked page and the error value
- * on success we return a locked page and 0
+ * On error return an unlocked folio and the error value
+ * On success return a locked folio and 0
*/
-static int prepare_uptodate_page(struct inode *inode,
- struct page *page, u64 pos,
- bool force_uptodate)
+static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos,
+ u64 len, bool force_uptodate)
{
- struct folio *folio = page_folio(page);
+ u64 clamp_start = max_t(u64, pos, folio_pos(folio));
+ u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio));
int ret = 0;
- if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
- !PageUptodate(page)) {
- ret = btrfs_read_folio(NULL, folio);
- if (ret)
- return ret;
- lock_page(page);
- if (!PageUptodate(page)) {
- unlock_page(page);
- return -EIO;
- }
-
- /*
- * Since btrfs_read_folio() will unlock the folio before it
- * returns, there is a window where btrfs_release_folio() can be
- * called to release the page. Here we check both inode
- * mapping and PagePrivate() to make sure the page was not
- * released.
- *
- * The private flag check is essential for subpage as we need
- * to store extra bitmap using folio private.
- */
- if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
- unlock_page(page);
- return -EAGAIN;
- }
- }
- return 0;
-}
+ if (folio_test_uptodate(folio))
+ return 0;
-static fgf_t get_prepare_fgp_flags(bool nowait)
-{
- fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+ if (!force_uptodate &&
+ IS_ALIGNED(clamp_start, PAGE_SIZE) &&
+ IS_ALIGNED(clamp_end, PAGE_SIZE))
+ return 0;
- if (nowait)
- fgp_flags |= FGP_NOWAIT;
+ ret = btrfs_read_folio(NULL, folio);
+ if (ret)
+ return ret;
+ folio_lock(folio);
+ if (!folio_test_uptodate(folio)) {
+ folio_unlock(folio);
+ return -EIO;
+ }
- return fgp_flags;
+ /*
+ * Since btrfs_read_folio() will unlock the folio before it returns,
+ * there is a window where btrfs_release_folio() can be called to
+ * release the page. Here we check both inode mapping and page
+ * private to make sure the page was not released.
+ *
+ * The private flag check is essential for subpage as we need to store
+ * extra bitmap using folio private.
+ */
+ if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) {
+ folio_unlock(folio);
+ return -EAGAIN;
+ }
+ return 0;
}
static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
@@ -914,89 +890,67 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
}
/*
- * this just gets pages into the page cache and locks them down.
+ * Get folio into the page cache and lock it.
*/
-static noinline int prepare_pages(struct inode *inode, struct page **pages,
- size_t num_pages, loff_t pos,
- size_t write_bytes, bool force_uptodate,
- bool nowait)
+static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret,
+ loff_t pos, size_t write_bytes,
+ bool force_uptodate, bool nowait)
{
- int i;
unsigned long index = pos >> PAGE_SHIFT;
gfp_t mask = get_prepare_gfp_flags(inode, nowait);
- fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
+ fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN);
+ struct folio *folio;
int ret = 0;
- int faili;
- for (i = 0; i < num_pages; i++) {
again:
- pages[i] = pagecache_get_page(inode->i_mapping, index + i,
- fgp_flags, mask | __GFP_WRITE);
- if (!pages[i]) {
- faili = i - 1;
- if (nowait)
- ret = -EAGAIN;
- else
- ret = -ENOMEM;
- goto fail;
- }
-
- ret = set_page_extent_mapped(pages[i]);
- if (ret < 0) {
- faili = i;
- goto fail;
- }
-
- if (i == 0)
- ret = prepare_uptodate_page(inode, pages[i], pos,
- force_uptodate);
- if (!ret && i == num_pages - 1)
- ret = prepare_uptodate_page(inode, pages[i],
- pos + write_bytes, false);
- if (ret) {
- put_page(pages[i]);
- if (!nowait && ret == -EAGAIN) {
- ret = 0;
- goto again;
- }
- faili = i - 1;
- goto fail;
+ folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask);
+ if (IS_ERR(folio)) {
+ if (nowait)
+ ret = -EAGAIN;
+ else
+ ret = PTR_ERR(folio);
+ return ret;
+ }
+ /* Only page sized folios are supported for now. */
+ ASSERT(folio_order(folio) == 0);
+ ret = set_folio_extent_mapped(folio);
+ if (ret < 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ret;
+ }
+ ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate);
+ if (ret) {
+ /* The folio is already unlocked. */
+ folio_put(folio);
+ if (!nowait && ret == -EAGAIN) {
+ ret = 0;
+ goto again;
}
- wait_on_page_writeback(pages[i]);
+ return ret;
}
-
+ *folio_ret = folio;
return 0;
-fail:
- while (faili >= 0) {
- unlock_page(pages[faili]);
- put_page(pages[faili]);
- faili--;
- }
- return ret;
-
}
/*
- * This function locks the extent and properly waits for data=ordered extents
- * to finish before allowing the pages to be modified if need.
+ * Locks the extent and properly waits for data=ordered extents to finish
+ * before allowing the folios to be modified if needed.
*
- * The return value:
+ * Return:
* 1 - the extent is locked
* 0 - the extent is not locked, and everything is OK
- * -EAGAIN - need re-prepare the pages
- * the other < 0 number - Something wrong happens
+ * -EAGAIN - need to prepare the folios again
*/
static noinline int
-lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos,
- size_t write_bytes,
+lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio,
+ loff_t pos, size_t write_bytes,
u64 *lockstart, u64 *lockend, bool nowait,
struct extent_state **cached_state)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 start_pos;
u64 last_pos;
- int i;
int ret = 0;
start_pos = round_down(pos, fs_info->sectorsize);
@@ -1008,12 +962,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
if (nowait) {
if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
cached_state)) {
- for (i = 0; i < num_pages; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- pages[i] = NULL;
- }
-
+ folio_unlock(folio);
+ folio_put(folio);
return -EAGAIN;
}
} else {
@@ -1027,10 +977,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
ordered->file_offset <= last_pos) {
unlock_extent(&inode->io_tree, start_pos, last_pos,
cached_state);
- for (i = 0; i < num_pages; i++) {
- unlock_page(pages[i]);
- put_page(pages[i]);
- }
+ folio_unlock(folio);
+ folio_put(folio);
btrfs_start_ordered_extent(ordered);
btrfs_put_ordered_extent(ordered);
return -EAGAIN;
@@ -1044,11 +992,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
/*
- * We should be called after prepare_pages() which should have locked
+ * We should be called after prepare_one_folio() which should have locked
* all pages in the range.
*/
- for (i = 0; i < num_pages; i++)
- WARN_ON(!PageLocked(pages[i]));
+ WARN_ON(!folio_test_locked(folio));
return ret;
}
@@ -1120,27 +1067,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
}
-static void update_time_for_write(struct inode *inode)
-{
- struct timespec64 now, ts;
-
- if (IS_NOCMTIME(inode))
- return;
-
- now = current_time(inode);
- ts = inode_get_mtime(inode);
- if (!timespec64_equal(&ts, &now))
- inode_set_mtime_to_ts(inode, now);
-
- ts = inode_get_ctime(inode);
- if (!timespec64_equal(&ts, &now))
- inode_set_ctime_to_ts(inode, now);
-
- if (IS_I_VERSION(inode))
- inode_inc_iversion(inode);
-}
-
-int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
+int btrfs_write_check(struct kiocb *iocb, size_t count)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
@@ -1170,7 +1097,10 @@ int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count)
* need to start yet another transaction to update the inode as we will
* update the inode when we finish writing whatever data we write.
*/
- update_time_for_write(inode);
+ if (!IS_NOCMTIME(inode)) {
+ inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
+ inode_inc_iversion(inode);
+ }
start_pos = round_down(pos, fs_info->sectorsize);
oldsize = i_size_read(inode);
@@ -1192,20 +1122,17 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
loff_t pos;
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct page **pages = NULL;
struct extent_changeset *data_reserved = NULL;
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
size_t num_written = 0;
- int nrptrs;
ssize_t ret;
- bool only_release_metadata = false;
- bool force_page_uptodate = false;
loff_t old_isize = i_size_read(inode);
unsigned int ilock_flags = 0;
const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
+ bool only_release_metadata = false;
if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY;
@@ -1218,38 +1145,26 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
if (ret <= 0)
goto out;
- ret = btrfs_write_check(iocb, i, ret);
+ ret = btrfs_write_check(iocb, ret);
if (ret < 0)
goto out;
pos = iocb->ki_pos;
- nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
- PAGE_SIZE / (sizeof(struct page *)));
- nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
- nrptrs = max(nrptrs, 8);
- pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
- if (!pages) {
- ret = -ENOMEM;
- goto out;
- }
-
while (iov_iter_count(i) > 0) {
struct extent_state *cached_state = NULL;
size_t offset = offset_in_page(pos);
size_t sector_offset;
- size_t write_bytes = min(iov_iter_count(i),
- nrptrs * (size_t)PAGE_SIZE -
- offset);
- size_t num_pages;
+ size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset);
size_t reserve_bytes;
- size_t dirty_pages;
size_t copied;
size_t dirty_sectors;
size_t num_sectors;
+ struct folio *folio = NULL;
int extents_locked;
+ bool force_page_uptodate = false;
/*
- * Fault pages before locking them in prepare_pages
+ * Fault pages before locking them in prepare_one_folio()
* to avoid recursive lock
*/
if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
@@ -1288,8 +1203,6 @@ ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i)
only_release_metadata = true;
}
- num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
- WARN_ON(num_pages > nrptrs);
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
WARN_ON(reserve_bytes == 0);
@@ -1317,23 +1230,17 @@ again:
break;
}
- /*
- * This is going to setup the pages array with the number of
- * pages we want, so we don't really need to worry about the
- * contents of pages from loop to loop
- */
- ret = prepare_pages(inode, pages, num_pages,
- pos, write_bytes, force_page_uptodate, false);
+ ret = prepare_one_folio(inode, &folio, pos, write_bytes,
+ force_page_uptodate, false);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
reserve_bytes);
break;
}
- extents_locked = lock_and_cleanup_extent_if_need(
- BTRFS_I(inode), pages,
- num_pages, pos, write_bytes, &lockstart,
- &lockend, nowait, &cached_state);
+ extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode),
+ folio, pos, write_bytes, &lockstart,
+ &lockend, nowait, &cached_state);
if (extents_locked < 0) {
if (!nowait && extents_locked == -EAGAIN)
goto again;
@@ -1344,28 +1251,18 @@ again:
break;
}
- copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
+ copied = btrfs_copy_from_user(pos, write_bytes, folio, i);
num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
dirty_sectors = round_up(copied + sector_offset,
fs_info->sectorsize);
dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
- /*
- * if we have trouble faulting in the pages, fall
- * back to one page at a time
- */
- if (copied < write_bytes)
- nrptrs = 1;
-
if (copied == 0) {
force_page_uptodate = true;
dirty_sectors = 0;
- dirty_pages = 0;
} else {
force_page_uptodate = false;
- dirty_pages = DIV_ROUND_UP(copied + offset,
- PAGE_SIZE);
}
if (num_sectors > dirty_sectors) {
@@ -1375,13 +1272,10 @@ again:
btrfs_delalloc_release_metadata(BTRFS_I(inode),
release_bytes, true);
} else {
- u64 __pos;
-
- __pos = round_down(pos,
- fs_info->sectorsize) +
- (dirty_pages << PAGE_SHIFT);
+ u64 release_start = round_up(pos + copied,
+ fs_info->sectorsize);
btrfs_delalloc_release_space(BTRFS_I(inode),
- data_reserved, __pos,
+ data_reserved, release_start,
release_bytes, true);
}
}
@@ -1389,15 +1283,14 @@ again:
release_bytes = round_up(copied + sector_offset,
fs_info->sectorsize);
- ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
- dirty_pages, pos, copied,
+ ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied,
&cached_state, only_release_metadata);
/*
* If we have not locked the extent range, because the range's
* start offset is >= i_size, we might still have a non-NULL
* cached extent state, acquired while marking the extent range
- * as delalloc through btrfs_dirty_pages(). Therefore free any
+ * as delalloc through btrfs_dirty_folio(). Therefore free any
* possible cached extent state to avoid a memory leak.
*/
if (extents_locked)
@@ -1408,7 +1301,7 @@ again:
btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
- btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+ btrfs_drop_folio(fs_info, folio, pos, copied);
break;
}
@@ -1416,7 +1309,7 @@ again:
if (only_release_metadata)
btrfs_check_nocow_unlock(BTRFS_I(inode));
- btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
+ btrfs_drop_folio(fs_info, folio, pos, copied);
cond_resched();
@@ -1424,8 +1317,6 @@ again:
num_written += copied;
}
- kfree(pages);
-
if (release_bytes) {
if (only_release_metadata) {
btrfs_check_nocow_unlock(BTRFS_I(inode));
@@ -1470,7 +1361,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (ret || encoded->len == 0)
goto out;
- ret = btrfs_write_check(iocb, from, encoded->len);
+ ret = btrfs_write_check(iocb, encoded->len);
if (ret < 0)
goto out;
@@ -3802,6 +3693,7 @@ const struct file_operations btrfs_file_operations = {
.compat_ioctl = btrfs_compat_ioctl,
#endif
.remap_file_range = btrfs_remap_file_range,
+ .uring_cmd = btrfs_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC,
};
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h
index 912254e653cf..de89e644be29 100644
--- a/fs/btrfs/file.h
+++ b/fs/btrfs/file.h
@@ -34,9 +34,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
-int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
- size_t num_pages, loff_t pos, size_t write_bytes,
- struct extent_state **cached, bool noreserve);
+int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos,
+ size_t write_bytes, struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait);
@@ -44,7 +43,7 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret);
-int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count);
+int btrfs_write_check(struct kiocb *iocb, size_t count);
ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i);
#endif
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f4bcb2530660..cfa52ef40b06 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -11,6 +11,7 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
+#include <linux/string_choices.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
@@ -1387,6 +1388,7 @@ static int __btrfs_write_out_cache(struct inode *inode,
int bitmaps = 0;
int ret;
int must_iput = 0;
+ int i_size;
if (!i_size_read(inode))
return -EIO;
@@ -1457,11 +1459,16 @@ static int __btrfs_write_out_cache(struct inode *inode,
io_ctl_zero_remaining_pages(io_ctl);
/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(BTRFS_I(inode), io_ctl->pages,
- io_ctl->num_pages, 0, i_size_read(inode),
- &cached_state, false);
- if (ret)
- goto out_nospc;
+ i_size = i_size_read(inode);
+ for (int i = 0; i < round_up(i_size, PAGE_SIZE) / PAGE_SIZE; i++) {
+ u64 dirty_start = i * PAGE_SIZE;
+ u64 dirty_len = min_t(u64, dirty_start + PAGE_SIZE, i_size) - dirty_start;
+
+ ret = btrfs_dirty_folio(BTRFS_I(inode), page_folio(io_ctl->pages[i]),
+ dirty_start, dirty_len, &cached_state, false);
+ if (ret < 0)
+ goto out_nospc;
+ }
if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
@@ -2936,12 +2943,11 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group,
if (info->bytes >= bytes && !block_group->ro)
count++;
btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s",
- info->offset, info->bytes,
- (info->bitmap) ? "yes" : "no");
+ info->offset, info->bytes, str_yes_no(info->bitmap));
}
spin_unlock(&ctl->tree_lock);
btrfs_info(fs_info, "block group has cluster?: %s",
- list_empty(&block_group->cluster_list) ? "no" : "yes");
+ str_no_yes(list_empty(&block_group->cluster_list)));
btrfs_info(fs_info,
"%d free space entries at or bigger than %llu bytes",
count, bytes);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 79f64e383edd..79a1a3d6f04d 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -263,10 +263,10 @@ enum {
BTRFS_FEATURE_INCOMPAT_ZONED | \
BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA)
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
* Features under development like Extent tree v2 support are enabled
- * only under CONFIG_BTRFS_DEBUG.
+ * only under CONFIG_BTRFS_EXPERIMENTAL
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \
@@ -317,6 +317,8 @@ struct btrfs_dev_replace {
struct percpu_counter bio_counter;
wait_queue_head_t replace_wait;
+
+ struct task_struct *replace_task;
};
/*
@@ -633,9 +635,10 @@ struct btrfs_fs_info {
s32 delalloc_batch;
struct percpu_counter evictable_extent_maps;
- spinlock_t extent_map_shrinker_lock;
- u64 extent_map_shrinker_last_root;
- u64 extent_map_shrinker_last_ino;
+ u64 em_shrinker_last_root;
+ u64 em_shrinker_last_ino;
+ atomic64_t em_shrinker_nr_to_scan;
+ struct work_struct em_shrinker_work;
/* Protected by 'trans_lock'. */
struct list_head dirty_cowonly_roots;
@@ -876,12 +879,9 @@ struct btrfs_fs_info {
#endif
};
-#define page_to_inode(_page) (BTRFS_I(_Generic((_page), \
- struct page *: (_page))->mapping->host))
#define folio_to_inode(_folio) (BTRFS_I(_Generic((_folio), \
struct folio *: (_folio))->mapping->host))
-#define page_to_fs_info(_page) (page_to_inode(_page)->root->fs_info)
#define folio_to_fs_info(_folio) (folio_to_inode(_folio)->root->fs_info)
#define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5618ca02934a..03fe0de2cd0d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -421,7 +421,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
index++;
continue;
}
- folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
index++;
if (IS_ERR(folio))
continue;
@@ -556,8 +556,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
} else {
struct folio *folio;
- folio = __filemap_get_folio(inode->vfs_inode.i_mapping,
- 0, 0, 0);
+ folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
ASSERT(!IS_ERR(folio));
btrfs_set_file_extent_compression(leaf, ei, 0);
kaddr = kmap_local_folio(folio, 0);
@@ -646,7 +645,7 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode,
* If being used directly, you must have already checked we're allowed to cow
* the range by getting true from can_cow_file_range_inline().
*/
-static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 offset,
+static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
u64 size, size_t compressed_size,
int compress_type,
struct folio *compressed_folio,
@@ -736,7 +735,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode,
return 1;
lock_extent(&inode->io_tree, offset, end, &cached);
- ret = __cow_file_range_inline(inode, offset, size, compressed_size,
+ ret = __cow_file_range_inline(inode, size, compressed_size,
compress_type, compressed_folio,
update_i_size);
if (ret > 0) {
@@ -832,32 +831,16 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
return 0;
}
/*
- * Special check for subpage.
+ * Only enable sector perfect compression for experimental builds.
*
- * We lock the full page then run each delalloc range in the page, thus
- * for the following case, we will hit some subpage specific corner case:
+ * This is a big feature change for subpage cases, and can hit
+ * different corner cases, so limit this feature to
+ * experimental builds for now.
*
- * 0 32K 64K
- * | |///////| |///////|
- * \- A \- B
- *
- * In above case, both range A and range B will try to unlock the full
- * page [0, 64K), causing the one finished later will have page
- * unlocked already, triggering various page lock requirement BUG_ON()s.
- *
- * So here we add an artificial limit that subpage compression can only
- * if the range is fully page aligned.
- *
- * In theory we only need to ensure the first page is fully covered, but
- * the tailing partial page will be locked until the full compression
- * finishes, delaying the write of other range.
- *
- * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
- * first to prevent any submitted async extent to unlock the full page.
- * By this, we can ensure for subpage case that only the last async_cow
- * will unlock the full page.
+ * ETA for moving this out of experimental builds is 6.15.
*/
- if (fs_info->sectorsize < PAGE_SIZE) {
+ if (fs_info->sectorsize < PAGE_SIZE &&
+ !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
if (!PAGE_ALIGNED(start) ||
!PAGE_ALIGNED(end + 1))
return 0;
@@ -896,13 +879,14 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e
for (unsigned long index = start >> PAGE_SHIFT;
index <= end_index; index++) {
- folio = __filemap_get_folio(inode->i_mapping, index, 0, 0);
+ folio = filemap_get_folio(inode->i_mapping, index);
if (IS_ERR(folio)) {
if (!ret)
ret = PTR_ERR(folio);
continue;
}
- folio_clear_dirty_for_io(folio);
+ btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start,
+ end + 1 - start);
folio_put(folio);
}
return ret;
@@ -1001,17 +985,6 @@ again:
(start > 0 || end + 1 < inode->disk_i_size))
goto cleanup_and_bail_uncompressed;
- /*
- * For subpage case, we require full page alignment for the sector
- * aligned range.
- * Thus we must also check against @actual_end, not just @end.
- */
- if (blocksize < PAGE_SIZE) {
- if (!PAGE_ALIGNED(start) ||
- !PAGE_ALIGNED(round_up(actual_end, blocksize)))
- goto cleanup_and_bail_uncompressed;
- }
-
total_compressed = min_t(unsigned long, total_compressed,
BTRFS_MAX_UNCOMPRESSED);
total_in = 0;
@@ -1359,7 +1332,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
u64 alloc_hint = 0;
u64 orig_start = start;
u64 num_bytes;
- unsigned long ram_size;
u64 cur_alloc_size = 0;
u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
@@ -1367,7 +1339,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct extent_map *em;
unsigned clear_bits;
unsigned long page_ops;
- bool extent_reserved = false;
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
@@ -1421,8 +1392,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
struct btrfs_ordered_extent *ordered;
struct btrfs_file_extent file_extent;
- cur_alloc_size = num_bytes;
- ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
+ ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret == -EAGAIN) {
@@ -1453,9 +1423,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
if (ret < 0)
goto out_unlock;
cur_alloc_size = ins.offset;
- extent_reserved = true;
- ram_size = ins.offset;
file_extent.disk_bytenr = ins.objectid;
file_extent.disk_num_bytes = ins.offset;
file_extent.num_bytes = ins.offset;
@@ -1463,14 +1431,14 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
file_extent.offset = 0;
file_extent.compression = BTRFS_COMPRESS_NONE;
- lock_extent(&inode->io_tree, start, start + ram_size - 1,
+ lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
&cached);
em = btrfs_create_io_em(inode, start, &file_extent,
BTRFS_ORDERED_REGULAR);
if (IS_ERR(em)) {
unlock_extent(&inode->io_tree, start,
- start + ram_size - 1, &cached);
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(em);
goto out_reserve;
}
@@ -1480,7 +1448,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
1 << BTRFS_ORDERED_REGULAR);
if (IS_ERR(ordered)) {
unlock_extent(&inode->io_tree, start,
- start + ram_size - 1, &cached);
+ start + cur_alloc_size - 1, &cached);
ret = PTR_ERR(ordered);
goto out_drop_extent_cache;
}
@@ -1501,7 +1469,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
*/
if (ret)
btrfs_drop_extent_map_range(inode, start,
- start + ram_size - 1,
+ start + cur_alloc_size - 1,
false);
}
btrfs_put_ordered_extent(ordered);
@@ -1513,13 +1481,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
* (which the caller expects to stay locked), don't clear any
* dirty bits and don't set any writeback bits
*
- * Do set the Ordered (Private2) bit so we know this page was
+ * Do set the Ordered flag so we know this page was
* properly setup for writepage.
*/
page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
page_ops |= PAGE_SET_ORDERED;
- extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
+ extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1,
locked_folio, &cached,
EXTENT_LOCKED | EXTENT_DELALLOC,
page_ops);
@@ -1529,7 +1497,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
num_bytes -= cur_alloc_size;
alloc_hint = ins.objectid + ins.offset;
start += cur_alloc_size;
- extent_reserved = false;
+ cur_alloc_size = 0;
/*
* btrfs_reloc_clone_csums() error, since start is increased
@@ -1545,7 +1513,7 @@ done:
return ret;
out_drop_extent_cache:
- btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
+ btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
out_reserve:
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
@@ -1599,13 +1567,12 @@ out_unlock:
* to decrement again the data space_info's bytes_may_use counter,
* therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
*/
- if (extent_reserved) {
+ if (cur_alloc_size) {
extent_clear_unlock_delalloc(inode, start,
start + cur_alloc_size - 1,
locked_folio, &cached, clear_bits,
page_ops);
btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
- start += cur_alloc_size;
}
/*
@@ -1614,11 +1581,13 @@ out_unlock:
* space_info's bytes_may_use counter, reserved in
* btrfs_check_data_free_space().
*/
- if (start < end) {
+ if (start + cur_alloc_size < end) {
clear_bits |= EXTENT_CLEAR_DATA_RESV;
- extent_clear_unlock_delalloc(inode, start, end, locked_folio,
+ extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
+ end, locked_folio,
&cached, clear_bits, page_ops);
- btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
+ btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
+ end - start - cur_alloc_size + 1, NULL);
}
return ret;
}
@@ -1729,7 +1698,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode,
* need full accuracy. Just account the whole thing
* against the first page.
*/
- wbc_account_cgroup_owner(wbc, &locked_folio->page,
+ wbc_account_cgroup_owner(wbc, locked_folio,
cur_end - start);
async_chunk[i].locked_folio = locked_folio;
locked_folio = NULL;
@@ -3094,34 +3063,6 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
goto out;
}
- if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
- BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
-
- btrfs_inode_safe_disk_i_size_write(inode, 0);
- if (freespace_inode)
- trans = btrfs_join_transaction_spacecache(root);
- else
- trans = btrfs_join_transaction(root);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- trans = NULL;
- goto out;
- }
- trans->block_rsv = &inode->block_rsv;
- ret = btrfs_update_inode_fallback(trans, inode);
- if (ret) /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, ret);
-
- ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret)
- btrfs_abort_transaction(trans, ret);
-
- goto out;
- }
-
- clear_bits |= EXTENT_LOCKED;
- lock_extent(io_tree, start, end, &cached_state);
-
if (freespace_inode)
trans = btrfs_join_transaction_spacecache(root);
else
@@ -3135,8 +3076,31 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &inode->block_rsv;
ret = btrfs_insert_raid_extent(trans, ordered_extent);
- if (ret)
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
+ /* Logic error */
+ ASSERT(list_empty(&ordered_extent->list));
+ if (!list_empty(&ordered_extent->list)) {
+ ret = -EINVAL;
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
+
+ btrfs_inode_safe_disk_i_size_write(inode, 0);
+ ret = btrfs_update_inode_fallback(trans, inode);
+ if (ret) {
+ /* -ENOMEM or corruption */
+ btrfs_abort_transaction(trans, ret);
+ }
goto out;
+ }
+
+ clear_bits |= EXTENT_LOCKED;
+ lock_extent(io_tree, start, end, &cached_state);
if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
compress_type = ordered_extent->compress_type;
@@ -3791,14 +3755,45 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
return 0;
}
+static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
+{
+ struct btrfs_root *root = inode->root;
+ struct btrfs_inode *existing;
+ const u64 ino = btrfs_ino(inode);
+ int ret;
+
+ if (inode_unhashed(&inode->vfs_inode))
+ return 0;
+
+ if (prealloc) {
+ ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
+ if (ret)
+ return ret;
+ }
+
+ existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
+
+ if (xa_is_err(existing)) {
+ ret = xa_err(existing);
+ ASSERT(ret != -EINVAL);
+ ASSERT(ret != -ENOMEM);
+ return ret;
+ } else if (existing) {
+ WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
+ }
+
+ return 0;
+}
+
/*
- * read an inode from the btree into the in-memory inode
+ * Read a locked inode from the btree into the in-memory inode and add it to
+ * its root list/tree.
+ *
+ * On failure clean up the inode.
*/
-static int btrfs_read_locked_inode(struct inode *inode,
- struct btrfs_path *in_path)
+static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
- struct btrfs_path *path = in_path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3812,25 +3807,25 @@ static int btrfs_read_locked_inode(struct inode *inode,
ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
if (ret)
- return ret;
+ goto out;
ret = btrfs_fill_inode(inode, &rdev);
if (!ret)
filled = true;
- if (!path) {
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- }
+ ASSERT(path);
btrfs_get_inode_key(BTRFS_I(inode), &location);
ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
if (ret) {
- if (path != in_path)
- btrfs_free_path(path);
- return ret;
+ /*
+ * ret > 0 can come from btrfs_search_slot called by
+ * btrfs_lookup_inode(), this means the inode was not found.
+ */
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
}
leaf = path->nodes[0];
@@ -3965,8 +3960,6 @@ cache_acl:
btrfs_ino(BTRFS_I(inode)),
btrfs_root_id(root), ret);
}
- if (path != in_path)
- btrfs_free_path(path);
if (!maybe_acls)
cache_no_acl(inode);
@@ -3993,7 +3986,15 @@ cache_acl:
}
btrfs_sync_inode_flags_to_i_flags(inode);
+
+ ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
+ if (ret)
+ goto out;
+
return 0;
+out:
+ iget_failed(inode);
+ return ret;
}
/*
@@ -4368,11 +4369,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
*/
if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
- if (IS_ERR_OR_NULL(di)) {
- if (!di)
- ret = -ENOENT;
- else
- ret = PTR_ERR(di);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -5505,35 +5503,7 @@ out:
return err;
}
-static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
-{
- struct btrfs_root *root = inode->root;
- struct btrfs_inode *existing;
- const u64 ino = btrfs_ino(inode);
- int ret;
- if (inode_unhashed(&inode->vfs_inode))
- return 0;
-
- if (prealloc) {
- ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
- if (ret)
- return ret;
- }
-
- existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
-
- if (xa_is_err(existing)) {
- ret = xa_err(existing);
- ASSERT(ret != -EINVAL);
- ASSERT(ret != -ENOMEM);
- return ret;
- } else if (existing) {
- WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
- }
-
- return 0;
-}
static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
{
@@ -5595,10 +5565,8 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
}
/*
- * Get an inode object given its inode number and corresponding root.
- * Path can be preallocated to prevent recursing back to iget through
- * allocator. NULL is also valid but may require an additional allocation
- * later.
+ * Get an inode object given its inode number and corresponding root. Path is
+ * preallocated to prevent recursing back to iget through allocator.
*/
struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
struct btrfs_path *path)
@@ -5614,30 +5582,40 @@ struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
return inode;
ret = btrfs_read_locked_inode(inode, path);
- /*
- * ret > 0 can come from btrfs_search_slot called by
- * btrfs_read_locked_inode(), this means the inode item was not found.
- */
- if (ret > 0)
- ret = -ENOENT;
- if (ret < 0)
- goto error;
-
- ret = btrfs_add_inode_to_root(BTRFS_I(inode), true);
- if (ret < 0)
- goto error;
+ if (ret)
+ return ERR_PTR(ret);
unlock_new_inode(inode);
-
return inode;
-error:
- iget_failed(inode);
- return ERR_PTR(ret);
}
+/*
+ * Get an inode object given its inode number and corresponding root.
+ */
struct inode *btrfs_iget(u64 ino, struct btrfs_root *root)
{
- return btrfs_iget_path(ino, root, NULL);
+ struct inode *inode;
+ struct btrfs_path *path;
+ int ret;
+
+ inode = btrfs_iget_locked(ino, root);
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!(inode->i_state & I_NEW))
+ return inode;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return ERR_PTR(-ENOMEM);
+
+ ret = btrfs_read_locked_inode(inode, path);
+ btrfs_free_path(path);
+ if (ret)
+ return ERR_PTR(ret);
+
+ unlock_new_inode(inode);
+ return inode;
}
static struct inode *new_simple_dir(struct inode *dir,
@@ -6026,7 +6004,7 @@ again:
* offset. This means that new entries created during readdir
* are *guaranteed* to be seen in the future by that readdir.
* This has broken buggy programs which operate on names as
- * they're returned by readdir. Until we re-use freed offsets
+ * they're returned by readdir. Until we reuse freed offsets
* we have this hack to stop new entries from being returned
* under the assumption that they'll never reach this huge
* offset.
@@ -6768,8 +6746,7 @@ static noinline int uncompress_inline(struct btrfs_path *path,
return ret;
}
-static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
- struct folio *folio)
+static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
{
struct btrfs_file_extent_item *fi;
void *kaddr;
@@ -6967,7 +6944,7 @@ next:
ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
ASSERT(em->len == fs_info->sectorsize);
- ret = read_inline_extent(inode, path, folio);
+ ret = read_inline_extent(path, folio);
if (ret < 0)
goto out;
goto insert;
@@ -7297,7 +7274,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*
* But already submitted bio can still be finished on this folio.
* Furthermore, endio function won't skip folio which has Ordered
- * (Private2) already cleared, so it's possible for endio and
+ * already cleared, so it's possible for endio and
* invalidate_folio to do the same ordered extent accounting twice
* on one folio.
*
@@ -7363,7 +7340,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
range_len = range_end + 1 - cur;
if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
- * If Ordered (Private2) is cleared, it means endio has
+ * If Ordered is cleared, it means endio has
* already been executed for the range.
* We can't delete the extent states as
* btrfs_finish_ordered_io() may still use some of them.
@@ -7436,7 +7413,7 @@ next:
}
/*
* We have iterated through all ordered extents of the page, the page
- * should not have Ordered (Private2) anymore, or the above iteration
+ * should not have Ordered anymore, or the above iteration
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
@@ -8975,28 +8952,6 @@ out_inode:
return finish_open_simple(file, ret);
}
-void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- unsigned long index = start >> PAGE_SHIFT;
- unsigned long end_index = end >> PAGE_SHIFT;
- struct folio *folio;
- u32 len;
-
- ASSERT(end + 1 - start <= U32_MAX);
- len = end + 1 - start;
- while (index <= end_index) {
- folio = __filemap_get_folio(inode->vfs_inode.i_mapping, index, 0, 0);
- ASSERT(!IS_ERR(folio)); /* folios should be in the extent_io_tree */
-
- /* This is for data, which doesn't yet support larger folio. */
- ASSERT(folio_order(folio) == 0);
- btrfs_folio_set_writeback(fs_info, folio, start, len);
- folio_put(folio);
- index++;
- }
-}
-
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type)
{
@@ -9041,12 +8996,16 @@ static ssize_t btrfs_encoded_read_inline(
unsigned long ptr;
void *tmp;
ssize_t ret;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
+
+ path->nowait = nowait;
+
ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
extent_start, 0);
if (ret) {
@@ -9110,6 +9069,7 @@ out:
struct btrfs_encoded_read_private {
wait_queue_head_t wait;
+ void *uring_ctx;
atomic_t pending;
blk_status_t status;
};
@@ -9129,26 +9089,40 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
*/
WRITE_ONCE(priv->status, bbio->bio.bi_status);
}
- if (!atomic_dec_return(&priv->pending))
- wake_up(&priv->wait);
+ if (atomic_dec_return(&priv->pending) == 0) {
+ int err = blk_status_to_errno(READ_ONCE(priv->status));
+
+ if (priv->uring_ctx) {
+ btrfs_uring_read_extent_endio(priv->uring_ctx, err);
+ kfree(priv);
+ } else {
+ wake_up(&priv->wait);
+ }
+ }
bio_put(&bbio->bio);
}
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
- u64 file_offset, u64 disk_bytenr,
- u64 disk_io_size, struct page **pages)
+ u64 disk_bytenr, u64 disk_io_size,
+ struct page **pages, void *uring_ctx)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
- struct btrfs_encoded_read_private priv = {
- .pending = ATOMIC_INIT(1),
- };
+ struct btrfs_encoded_read_private *priv;
unsigned long i = 0;
struct btrfs_bio *bbio;
+ int ret;
+
+ priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
+ if (!priv)
+ return -ENOMEM;
- init_waitqueue_head(&priv.wait);
+ init_waitqueue_head(&priv->wait);
+ atomic_set(&priv->pending, 1);
+ priv->status = 0;
+ priv->uring_ctx = uring_ctx;
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
@@ -9156,11 +9130,11 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
- atomic_inc(&priv.pending);
+ atomic_inc(&priv->pending);
btrfs_submit_bbio(bbio, 0);
bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
- btrfs_encoded_read_endio, &priv);
+ btrfs_encoded_read_endio, priv);
bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bbio->inode = inode;
continue;
@@ -9171,22 +9145,33 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
disk_io_size -= bytes;
} while (disk_io_size);
- atomic_inc(&priv.pending);
+ atomic_inc(&priv->pending);
btrfs_submit_bbio(bbio, 0);
- if (atomic_dec_return(&priv.pending))
- io_wait_event(priv.wait, !atomic_read(&priv.pending));
- /* See btrfs_encoded_read_endio() for ordering. */
- return blk_status_to_errno(READ_ONCE(priv.status));
+ if (uring_ctx) {
+ if (atomic_dec_return(&priv->pending) == 0) {
+ ret = blk_status_to_errno(READ_ONCE(priv->status));
+ btrfs_uring_read_extent_endio(uring_ctx, ret);
+ kfree(priv);
+ return ret;
+ }
+
+ return -EIOCBQUEUED;
+ } else {
+ if (atomic_dec_return(&priv->pending) != 0)
+ io_wait_event(priv->wait, !atomic_read(&priv->pending));
+ /* See btrfs_encoded_read_endio() for ordering. */
+ ret = blk_status_to_errno(READ_ONCE(priv->status));
+ kfree(priv);
+ return ret;
+ }
}
-static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
- struct iov_iter *iter,
- u64 start, u64 lockend,
- struct extent_state **cached_state,
- u64 disk_bytenr, u64 disk_io_size,
- size_t count, bool compressed,
- bool *unlocked)
+ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state **cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed, bool *unlocked)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct extent_io_tree *io_tree = &inode->io_tree;
@@ -9206,8 +9191,8 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
goto out;
}
- ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
- disk_io_size, pages);
+ ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
+ disk_io_size, pages, NULL);
if (ret)
goto out;
@@ -9247,21 +9232,26 @@ out:
}
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
- struct btrfs_ioctl_encoded_io_args *encoded)
+ struct btrfs_ioctl_encoded_io_args *encoded,
+ struct extent_state **cached_state,
+ u64 *disk_bytenr, u64 *disk_io_size)
{
struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_io_tree *io_tree = &inode->io_tree;
ssize_t ret;
size_t count = iov_iter_count(iter);
- u64 start, lockend, disk_bytenr, disk_io_size;
- struct extent_state *cached_state = NULL;
+ u64 start, lockend;
struct extent_map *em;
+ const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
bool unlocked = false;
file_accessed(iocb->ki_filp);
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
+ ret = btrfs_inode_lock(inode,
+ BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
+ if (ret)
+ return ret;
if (iocb->ki_pos >= inode->vfs_inode.i_size) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
@@ -9274,21 +9264,46 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
*/
lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
- for (;;) {
+ if (nowait) {
struct btrfs_ordered_extent *ordered;
- ret = btrfs_wait_ordered_range(inode, start,
- lockend - start + 1);
- if (ret)
+ if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
+ start, lockend)) {
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+
+ if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
+ ret = -EAGAIN;
goto out_unlock_inode;
- lock_extent(io_tree, start, lockend, &cached_state);
+ }
+
ordered = btrfs_lookup_ordered_range(inode, start,
lockend - start + 1);
- if (!ordered)
- break;
- btrfs_put_ordered_extent(ordered);
- unlock_extent(io_tree, start, lockend, &cached_state);
- cond_resched();
+ if (ordered) {
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ ret = -EAGAIN;
+ goto out_unlock_inode;
+ }
+ } else {
+ for (;;) {
+ struct btrfs_ordered_extent *ordered;
+
+ ret = btrfs_wait_ordered_range(inode, start,
+ lockend - start + 1);
+ if (ret)
+ goto out_unlock_inode;
+
+ lock_extent(io_tree, start, lockend, cached_state);
+ ordered = btrfs_lookup_ordered_range(inode, start,
+ lockend - start + 1);
+ if (!ordered)
+ break;
+ btrfs_put_ordered_extent(ordered);
+ unlock_extent(io_tree, start, lockend, cached_state);
+ cond_resched();
+ }
}
em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
@@ -9307,9 +9322,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
free_extent_map(em);
em = NULL;
ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
- &cached_state, extent_start,
+ cached_state, extent_start,
count, encoded, &unlocked);
- goto out;
+ goto out_unlock_extent;
}
/*
@@ -9320,12 +9335,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->disk_bytenr == EXTENT_MAP_HOLE ||
(em->flags & EXTENT_FLAG_PREALLOC)) {
- disk_bytenr = EXTENT_MAP_HOLE;
+ *disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
} else if (extent_map_is_compressed(em)) {
- disk_bytenr = em->disk_bytenr;
+ *disk_bytenr = em->disk_bytenr;
/*
* Bail if the buffer isn't large enough to return the whole
* compressed extent.
@@ -9334,7 +9349,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ret = -ENOBUFS;
goto out_em;
}
- disk_io_size = em->disk_num_bytes;
+ *disk_io_size = em->disk_num_bytes;
count = em->disk_num_bytes;
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
@@ -9344,47 +9359,42 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
goto out_em;
encoded->compression = ret;
} else {
- disk_bytenr = extent_map_block_start(em) + (start - em->start);
+ *disk_bytenr = extent_map_block_start(em) + (start - em->start);
if (encoded->len > count)
encoded->len = count;
/*
* Don't read beyond what we locked. This also limits the page
* allocations that we'll do.
*/
- disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
- count = start + disk_io_size - iocb->ki_pos;
+ *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
+ count = start + *disk_io_size - iocb->ki_pos;
encoded->len = count;
encoded->unencoded_len = count;
- disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
+ *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
}
free_extent_map(em);
em = NULL;
- if (disk_bytenr == EXTENT_MAP_HOLE) {
- unlock_extent(io_tree, start, lockend, &cached_state);
+ if (*disk_bytenr == EXTENT_MAP_HOLE) {
+ unlock_extent(io_tree, start, lockend, cached_state);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
unlocked = true;
ret = iov_iter_zero(count, iter);
if (ret != count)
ret = -EFAULT;
} else {
- ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
- &cached_state, disk_bytenr,
- disk_io_size, count,
- encoded->compression,
- &unlocked);
+ ret = -EIOCBQUEUED;
+ goto out_unlock_extent;
}
-out:
- if (ret >= 0)
- iocb->ki_pos += encoded->len;
out_em:
free_extent_map(em);
out_unlock_extent:
- if (!unlocked)
- unlock_extent(io_tree, start, lockend, &cached_state);
+ /* Leave inode and extent locked if we need to do a read. */
+ if (!unlocked && ret != -EIOCBQUEUED)
+ unlock_extent(io_tree, start, lockend, cached_state);
out_unlock_inode:
- if (!unlocked)
+ if (!unlocked && ret != -EIOCBQUEUED)
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
return ret;
}
@@ -9495,7 +9505,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
*/
disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
- folios = kvcalloc(nr_folios, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
+ folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
if (!folios)
return -ENOMEM;
for (i = 0; i < nr_folios; i++) {
@@ -9559,7 +9569,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->unencoded_len == encoded->len &&
encoded->unencoded_offset == 0 &&
can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
- ret = __cow_file_range_inline(inode, start, encoded->len,
+ ret = __cow_file_range_inline(inode, encoded->len,
orig_count, compression, folios[0],
true);
if (ret <= 0) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 226c91fe31a7..c9302d193187 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -29,6 +29,7 @@
#include <linux/fileattr.h>
#include <linux/fsverity.h>
#include <linux/sched/xacct.h>
+#include <linux/io_uring/cmd.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -1048,7 +1049,6 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
struct btrfs_qgroup_inherit *inherit)
{
int ret;
- bool snapshot_force_cow = false;
/*
* Force new buffered writes to reserve space even when NOCOW is
@@ -1067,15 +1067,13 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
* creation.
*/
atomic_inc(&root->snapshot_force_cow);
- snapshot_force_cow = true;
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
ret = btrfs_mksubvol(parent, idmap, name, namelen,
root, readonly, inherit);
+ atomic_dec(&root->snapshot_force_cow);
out:
- if (snapshot_force_cow)
- atomic_dec(&root->snapshot_force_cow);
btrfs_drew_read_unlock(&root->snapshot_lock);
return ret;
}
@@ -1308,9 +1306,9 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
ret = btrfs_mksubvol(&file->f_path, idmap, name,
namelen, NULL, readonly, inherit);
} else {
- struct fd src = fdget(fd);
+ CLASS(fd, src)(fd);
struct inode *src_inode;
- if (!fd_file(src)) {
+ if (fd_empty(src)) {
ret = -EINVAL;
goto out_drop_write;
}
@@ -1341,7 +1339,6 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file,
BTRFS_I(src_inode)->root,
readonly, inherit);
}
- fdput(src);
}
out_drop_write:
mnt_drop_write_file(file);
@@ -4058,8 +4055,7 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
return 0;
}
-static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
- void __user *arg)
+static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -4514,12 +4510,17 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
flags);
size_t copy_end;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(file));
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_io_tree *io_tree = &inode->io_tree;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
loff_t pos;
struct kiocb kiocb;
ssize_t ret;
+ u64 disk_bytenr, disk_io_size;
+ struct extent_state *cached_state = NULL;
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
@@ -4572,7 +4573,32 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
init_sync_kiocb(&kiocb, file);
kiocb.ki_pos = pos;
- ret = btrfs_encoded_read(&kiocb, &iter, &args);
+ ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ &disk_bytenr, &disk_io_size);
+
+ if (ret == -EIOCBQUEUED) {
+ bool unlocked = false;
+ u64 start, lockend, count;
+
+ start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize);
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ if (args.compression)
+ count = disk_io_size;
+ else
+ count = args.len;
+
+ ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend,
+ &cached_state, disk_bytenr,
+ disk_io_size, count,
+ args.compression, &unlocked);
+
+ if (!unlocked) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ }
+ }
+
if (ret >= 0) {
fsnotify_access(file);
if (copy_to_user(argp + copy_end,
@@ -4690,6 +4716,439 @@ out_acct:
return ret;
}
+/*
+ * Context that's attached to an encoded read io_uring command, in cmd->pdu. It
+ * contains the fields in btrfs_uring_read_extent that are necessary to finish
+ * off and cleanup the I/O in btrfs_uring_read_finished.
+ */
+struct btrfs_uring_priv {
+ struct io_uring_cmd *cmd;
+ struct page **pages;
+ unsigned long nr_pages;
+ struct kiocb iocb;
+ struct iovec *iov;
+ struct iov_iter iter;
+ struct extent_state *cached_state;
+ u64 count;
+ u64 start;
+ u64 lockend;
+ int err;
+ bool compressed;
+};
+
+struct io_btrfs_cmd {
+ struct btrfs_uring_priv *priv;
+};
+
+static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
+ struct btrfs_uring_priv *priv = bc->priv;
+ struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ unsigned long index;
+ u64 cur;
+ size_t page_offset;
+ ssize_t ret;
+
+ if (priv->err) {
+ ret = priv->err;
+ goto out;
+ }
+
+ if (priv->compressed) {
+ index = 0;
+ page_offset = 0;
+ } else {
+ index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
+ page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
+ }
+ cur = 0;
+ while (cur < priv->count) {
+ size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset);
+
+ if (copy_page_to_iter(priv->pages[index], page_offset, bytes,
+ &priv->iter) != bytes) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ index++;
+ cur += bytes;
+ page_offset = 0;
+ }
+ ret = priv->count;
+
+out:
+ unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+
+ io_uring_cmd_done(cmd, ret, 0, issue_flags);
+ add_rchar(current, ret);
+
+ for (index = 0; index < priv->nr_pages; index++)
+ __free_page(priv->pages[index]);
+
+ kfree(priv->pages);
+ kfree(priv->iov);
+ kfree(priv);
+}
+
+void btrfs_uring_read_extent_endio(void *ctx, int err)
+{
+ struct btrfs_uring_priv *priv = ctx;
+ struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd);
+
+ priv->err = err;
+ bc->priv = priv;
+
+ io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
+}
+
+static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
+ u64 start, u64 lockend,
+ struct extent_state *cached_state,
+ u64 disk_bytenr, u64 disk_io_size,
+ size_t count, bool compressed,
+ struct iovec *iov, struct io_uring_cmd *cmd)
+{
+ struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
+ struct extent_io_tree *io_tree = &inode->io_tree;
+ struct page **pages;
+ struct btrfs_uring_priv *priv = NULL;
+ unsigned long nr_pages;
+ int ret;
+
+ nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
+ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
+ if (!pages)
+ return -ENOMEM;
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
+ if (ret) {
+ ret = -ENOMEM;
+ goto out_fail;
+ }
+
+ priv = kmalloc(sizeof(*priv), GFP_NOFS);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto out_fail;
+ }
+
+ priv->iocb = *iocb;
+ priv->iov = iov;
+ priv->iter = *iter;
+ priv->count = count;
+ priv->cmd = cmd;
+ priv->cached_state = cached_state;
+ priv->compressed = compressed;
+ priv->nr_pages = nr_pages;
+ priv->pages = pages;
+ priv->start = start;
+ priv->lockend = lockend;
+ priv->err = 0;
+
+ ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
+ disk_io_size, pages, priv);
+ if (ret && ret != -EIOCBQUEUED)
+ goto out_fail;
+
+ /*
+ * If we return -EIOCBQUEUED, we're deferring the cleanup to
+ * btrfs_uring_read_finished(), which will handle unlocking the extent
+ * and inode and freeing the allocations.
+ */
+
+ return -EIOCBQUEUED;
+
+out_fail:
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ kfree(priv);
+ return ret;
+}
+
+static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
+ size_t copy_end;
+ struct btrfs_ioctl_encoded_io_args args = { 0 };
+ int ret;
+ u64 disk_bytenr, disk_io_size;
+ struct file *file;
+ struct btrfs_inode *inode;
+ struct btrfs_fs_info *fs_info;
+ struct extent_io_tree *io_tree;
+ struct iovec iovstack[UIO_FASTIOV];
+ struct iovec *iov = iovstack;
+ struct iov_iter iter;
+ loff_t pos;
+ struct kiocb kiocb;
+ struct extent_state *cached_state = NULL;
+ u64 start, lockend;
+ void __user *sqe_addr;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ ret = -EPERM;
+ goto out_acct;
+ }
+ file = cmd->file;
+ inode = BTRFS_I(file->f_inode);
+ fs_info = inode->root->fs_info;
+ io_tree = &inode->io_tree;
+ sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
+
+ if (issue_flags & IO_URING_F_COMPAT) {
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ struct btrfs_ioctl_encoded_io_args_32 args32;
+
+ copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
+ if (copy_from_user(&args32, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ args.iov = compat_ptr(args32.iov);
+ args.iovcnt = args32.iovcnt;
+ args.offset = args32.offset;
+ args.flags = args32.flags;
+#else
+ return -ENOTTY;
+#endif
+ } else {
+ copy_end = copy_end_kernel;
+ if (copy_from_user(&args, sqe_addr, copy_end)) {
+ ret = -EFAULT;
+ goto out_acct;
+ }
+ }
+
+ if (args.flags != 0)
+ return -EINVAL;
+
+ ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
+ &iov, &iter);
+ if (ret < 0)
+ goto out_acct;
+
+ if (iov_iter_count(&iter) == 0) {
+ ret = 0;
+ goto out_free;
+ }
+
+ pos = args.offset;
+ ret = rw_verify_area(READ, file, &pos, args.len);
+ if (ret < 0)
+ goto out_free;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = pos;
+
+ if (issue_flags & IO_URING_F_NONBLOCK)
+ kiocb.ki_flags |= IOCB_NOWAIT;
+
+ start = ALIGN_DOWN(pos, fs_info->sectorsize);
+ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
+
+ ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
+ &disk_bytenr, &disk_io_size);
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ goto out_free;
+
+ file_accessed(file);
+
+ if (copy_to_user(sqe_addr + copy_end, (const char *)&args + copy_end_kernel,
+ sizeof(args) - copy_end_kernel)) {
+ if (ret == -EIOCBQUEUED) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ }
+ ret = -EFAULT;
+ goto out_free;
+ }
+
+ if (ret == -EIOCBQUEUED) {
+ u64 count;
+
+ /*
+ * If we've optimized things by storing the iovecs on the stack,
+ * undo this.
+ */
+ if (!iov) {
+ iov = kmalloc(sizeof(struct iovec) * args.iovcnt, GFP_NOFS);
+ if (!iov) {
+ unlock_extent(io_tree, start, lockend, &cached_state);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ ret = -ENOMEM;
+ goto out_acct;
+ }
+
+ memcpy(iov, iovstack, sizeof(struct iovec) * args.iovcnt);
+ }
+
+ count = min_t(u64, iov_iter_count(&iter), disk_io_size);
+
+ /* Match ioctl by not returning past EOF if uncompressed. */
+ if (!args.compression)
+ count = min_t(u64, count, args.len);
+
+ ret = btrfs_uring_read_extent(&kiocb, &iter, start, lockend,
+ cached_state, disk_bytenr,
+ disk_io_size, count,
+ args.compression, iov, cmd);
+
+ goto out_acct;
+ }
+
+out_free:
+ kfree(iov);
+
+out_acct:
+ if (ret > 0)
+ add_rchar(current, ret);
+ inc_syscr(current);
+
+ return ret;
+}
+
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+ switch (cmd->cmd_op) {
+ case BTRFS_IOC_ENCODED_READ:
+#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+ case BTRFS_IOC_ENCODED_READ_32:
+#endif
+ return btrfs_uring_encoded_read(cmd, issue_flags);
+ }
+
+ return -EINVAL;
+}
+
+static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp)
+{
+ struct btrfs_root *root;
+ struct btrfs_ioctl_subvol_wait args = { 0 };
+ signed long sched_ret;
+ int refs;
+ u64 root_flags;
+ bool wait_for_deletion = false;
+ bool found = false;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ switch (args.mode) {
+ case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED:
+ /*
+ * Wait for the first one deleted that waits until all previous
+ * are cleaned.
+ */
+ spin_lock(&fs_info->trans_lock);
+ if (!list_empty(&fs_info->dead_roots)) {
+ root = list_last_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ args.subvolid = btrfs_root_id(root);
+ found = true;
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (!found)
+ return -ENOENT;
+
+ fallthrough;
+ case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE:
+ if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) ||
+ BTRFS_LAST_FREE_OBJECTID < args.subvolid)
+ return -EINVAL;
+ break;
+ case BTRFS_SUBVOL_SYNC_COUNT:
+ spin_lock(&fs_info->trans_lock);
+ args.count = list_count_nodes(&fs_info->dead_roots);
+ spin_unlock(&fs_info->trans_lock);
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ case BTRFS_SUBVOL_SYNC_PEEK_FIRST:
+ spin_lock(&fs_info->trans_lock);
+ /* Last in the list was deleted first. */
+ if (!list_empty(&fs_info->dead_roots)) {
+ root = list_last_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ args.subvolid = btrfs_root_id(root);
+ } else {
+ args.subvolid = 0;
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ case BTRFS_SUBVOL_SYNC_PEEK_LAST:
+ spin_lock(&fs_info->trans_lock);
+ /* First in the list was deleted last. */
+ if (!list_empty(&fs_info->dead_roots)) {
+ root = list_first_entry(&fs_info->dead_roots,
+ struct btrfs_root, root_list);
+ args.subvolid = btrfs_root_id(root);
+ } else {
+ args.subvolid = 0;
+ }
+ spin_unlock(&fs_info->trans_lock);
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+
+ /* 32bit limitation: fs_roots_radix key is not wide enough. */
+ if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX)
+ return -EOVERFLOW;
+
+ while (1) {
+ /* Wait for the specific one. */
+ if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR)
+ return -EINTR;
+ refs = -1;
+ spin_lock(&fs_info->fs_roots_radix_lock);
+ root = radix_tree_lookup(&fs_info->fs_roots_radix,
+ (unsigned long)args.subvolid);
+ if (root) {
+ spin_lock(&root->root_item_lock);
+ refs = btrfs_root_refs(&root->root_item);
+ root_flags = btrfs_root_flags(&root->root_item);
+ spin_unlock(&root->root_item_lock);
+ }
+ spin_unlock(&fs_info->fs_roots_radix_lock);
+ up_read(&fs_info->subvol_sem);
+
+ /* Subvolume does not exist. */
+ if (!root)
+ return -ENOENT;
+
+ /* Subvolume not deleted at all. */
+ if (refs > 0)
+ return -EEXIST;
+ /* We've waited and now the subvolume is gone. */
+ if (wait_for_deletion && refs == -1) {
+ /* Return the one we waited for as the last one. */
+ if (copy_to_user(argp, &args, sizeof(args)))
+ return -EFAULT;
+ return 0;
+ }
+
+ /* Subvolume not found on the first try (deleted or never existed). */
+ if (refs == -1)
+ return -ENOENT;
+
+ wait_for_deletion = true;
+ ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD);
+ sched_ret = schedule_timeout_interruptible(HZ);
+ /* Early wake up or error. */
+ if (sched_ret != 0)
+ return -EINTR;
+ }
+
+ return 0;
+}
+
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
@@ -4812,7 +5271,7 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_QUOTA_RESCAN_STATUS:
return btrfs_ioctl_quota_rescan_status(fs_info, argp);
case BTRFS_IOC_QUOTA_RESCAN_WAIT:
- return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
+ return btrfs_ioctl_quota_rescan_wait(fs_info);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(fs_info, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
@@ -4841,6 +5300,8 @@ long btrfs_ioctl(struct file *file, unsigned int
case BTRFS_IOC_ENCODED_WRITE_32:
return btrfs_ioctl_encoded_write(file, argp, true);
#endif
+ case BTRFS_IOC_SUBVOL_SYNC_WAIT:
+ return btrfs_ioctl_subvol_sync(fs_info, argp);
}
return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 19cd26b0244a..2b760c8778f8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,5 +22,7 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(const u8 *uuid);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
+int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
+void btrfs_uring_read_extent_endio(void *ctx, int err);
#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6a0b7abb5bd9..9a7a7b723305 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -162,21 +162,6 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb)
}
/*
- * Try-lock for write.
- *
- * Return 1 if the rwlock has been taken, 0 otherwise
- */
-int btrfs_try_tree_write_lock(struct extent_buffer *eb)
-{
- if (down_write_trylock(&eb->lock)) {
- btrfs_set_eb_lock_owner(eb, current->pid);
- trace_btrfs_try_tree_write_lock(eb);
- return 1;
- }
- return 0;
-}
-
-/*
* Release read lock.
*/
void btrfs_tree_read_unlock(struct extent_buffer *eb)
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 3c15c75e0582..46c8be2afab1 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -180,7 +180,6 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb)
void btrfs_tree_read_unlock(struct extent_buffer *eb);
int btrfs_try_tree_read_lock(struct extent_buffer *eb);
-int btrfs_try_tree_write_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 72856f6775f7..a45bc11f8665 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -80,7 +80,7 @@ void lzo_free_workspace(struct list_head *ws)
kfree(workspace);
}
-struct list_head *lzo_alloc_workspace(unsigned int level)
+struct list_head *lzo_alloc_workspace(void)
{
struct workspace *workspace;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 2104d60c2161..95c8499a159a 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -346,10 +346,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
ASSERT(file_offset + len <= folio_pos(folio) + folio_size(folio));
/*
- * Ordered (Private2) bit indicates whether we still have
+ * Ordered flag indicates whether we still have
* pending io unfinished for the ordered extent.
*
- * If there's no such bit, we need to skip to next range.
+ * If it's not set, we need to skip to next range.
*/
if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len))
return false;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index c297909f1506..a6f92836c9b1 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -226,8 +226,7 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
return qgroup;
}
-static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
- struct btrfs_qgroup *qgroup)
+static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
{
struct btrfs_qgroup_list *list;
@@ -258,7 +257,7 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
return -ENOENT;
rb_erase(&qgroup->node, &fs_info->qgroup_tree);
- __del_qgroup_rb(fs_info, qgroup);
+ __del_qgroup_rb(qgroup);
return 0;
}
@@ -469,7 +468,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info)
/*
* If a qgroup exists for a subvolume ID, it is possible
* that subvolume has been deleted, in which case
- * re-using that ID would lead to incorrect accounting.
+ * reusing that ID would lead to incorrect accounting.
*
* Ensure that we skip any such subvol ids.
*
@@ -643,7 +642,7 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
while ((n = rb_first(&fs_info->qgroup_tree))) {
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
- __del_qgroup_rb(fs_info, qgroup);
+ __del_qgroup_rb(qgroup);
btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
kfree(qgroup);
}
@@ -1407,7 +1406,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
fs_info->quota_root = NULL;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE;
- fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
+ fs_info->qgroup_drop_subtree_thres = BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT;
spin_unlock(&fs_info->qgroup_lock);
btrfs_free_qgroup_config(fs_info);
@@ -2001,20 +2000,30 @@ out:
* Return <0 for insertion failure, caller can free @record safely.
*/
int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
- struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record)
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record,
+ u64 bytenr)
{
struct btrfs_qgroup_extent_record *existing, *ret;
- unsigned long bytenr = record->bytenr;
+ const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
if (!btrfs_qgroup_full_accounting(fs_info))
return 1;
- lockdep_assert_held(&delayed_refs->lock);
- trace_btrfs_qgroup_trace_extent(fs_info, record);
+#if BITS_PER_LONG == 32
+ if (bytenr >= MAX_LFS_FILESIZE) {
+ btrfs_err_rl(fs_info,
+"qgroup record for extent at %llu is beyond 32bit page cache and xarray index limit",
+ bytenr);
+ btrfs_err_32bit_limit(fs_info);
+ return -EOVERFLOW;
+ }
+#endif
+
+ trace_btrfs_qgroup_trace_extent(fs_info, record, bytenr);
xa_lock(&delayed_refs->dirty_extents);
- existing = xa_load(&delayed_refs->dirty_extents, bytenr);
+ existing = xa_load(&delayed_refs->dirty_extents, index);
if (existing) {
if (record->data_rsv && !existing->data_rsv) {
existing->data_rsv = record->data_rsv;
@@ -2024,7 +2033,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
return 1;
}
- ret = __xa_store(&delayed_refs->dirty_extents, record->bytenr, record, GFP_ATOMIC);
+ ret = __xa_store(&delayed_refs->dirty_extents, index, record, GFP_ATOMIC);
xa_unlock(&delayed_refs->dirty_extents);
if (xa_is_err(ret)) {
qgroup_mark_inconsistent(fs_info);
@@ -2056,12 +2065,17 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
* transaction committing, but not now as qgroup accounting will be wrong again.
*/
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
- struct btrfs_qgroup_extent_record *qrecord)
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr)
{
- struct btrfs_backref_walk_ctx ctx = { 0 };
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_backref_walk_ctx ctx = {
+ .bytenr = bytenr,
+ .fs_info = fs_info,
+ };
int ret;
- if (!btrfs_qgroup_full_accounting(trans->fs_info))
+ if (!btrfs_qgroup_full_accounting(fs_info))
return 0;
/*
* We are always called in a context where we are already holding a
@@ -2084,16 +2098,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*/
ASSERT(trans != NULL);
- if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
return 0;
- ctx.bytenr = qrecord->bytenr;
- ctx.fs_info = trans->fs_info;
-
ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
- qgroup_mark_inconsistent(trans->fs_info);
- btrfs_warn(trans->fs_info,
+ qgroup_mark_inconsistent(fs_info);
+ btrfs_warn(fs_info,
"error accounting new delayed refs extent (err code: %d), quota inconsistent",
ret);
return 0;
@@ -2128,7 +2139,8 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
- struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_root *delayed_refs = &trans->transaction->delayed_refs;
+ const unsigned long index = (bytenr >> fs_info->sectorsize_bits);
int ret;
if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0)
@@ -2137,26 +2149,21 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!record)
return -ENOMEM;
- if (xa_reserve(&trans->transaction->delayed_refs.dirty_extents, bytenr, GFP_NOFS)) {
+ if (xa_reserve(&delayed_refs->dirty_extents, index, GFP_NOFS)) {
kfree(record);
return -ENOMEM;
}
- delayed_refs = &trans->transaction->delayed_refs;
- record->bytenr = bytenr;
record->num_bytes = num_bytes;
- record->old_roots = NULL;
- spin_lock(&delayed_refs->lock);
- ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
- spin_unlock(&delayed_refs->lock);
+ ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record, bytenr);
if (ret) {
/* Clean up if insertion fails or item exists. */
- xa_release(&delayed_refs->dirty_extents, record->bytenr);
+ xa_release(&delayed_refs->dirty_extents, index);
kfree(record);
return 0;
}
- return btrfs_qgroup_trace_extent_post(trans, record);
+ return btrfs_qgroup_trace_extent_post(trans, record, bytenr);
}
/*
@@ -2641,7 +2648,6 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
if (!extent_buffer_uptodate(root_eb)) {
struct btrfs_tree_parent_check check = {
- .has_first_key = false,
.transid = root_gen,
.level = root_level
};
@@ -3032,14 +3038,16 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
delayed_refs = &trans->transaction->delayed_refs;
qgroup_to_skip = delayed_refs->qgroup_to_skip;
xa_for_each(&delayed_refs->dirty_extents, index, record) {
+ const u64 bytenr = (((u64)index) << fs_info->sectorsize_bits);
+
num_dirty_extents++;
- trace_btrfs_qgroup_account_extents(fs_info, record);
+ trace_btrfs_qgroup_account_extents(fs_info, record, bytenr);
if (!ret && !(fs_info->qgroup_flags &
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
struct btrfs_backref_walk_ctx ctx = { 0 };
- ctx.bytenr = record->bytenr;
+ ctx.bytenr = bytenr;
ctx.fs_info = fs_info;
/*
@@ -3081,7 +3089,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
ulist_del(record->old_roots, qgroup_to_skip,
0);
}
- ret = btrfs_qgroup_account_extent(trans, record->bytenr,
+ ret = btrfs_qgroup_account_extent(trans, bytenr,
record->num_bytes,
record->old_roots,
new_roots);
@@ -4185,13 +4193,20 @@ static int try_flush_qgroup(struct btrfs_root *root)
return 0;
}
- btrfs_run_delayed_iputs(root->fs_info);
- btrfs_wait_on_delayed_iputs(root->fs_info);
ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
btrfs_wait_ordered_extents(root, U64_MAX, NULL);
+ /*
+ * After waiting for ordered extents run delayed iputs in order to free
+ * space from unlinked files before committing the current transaction,
+ * as ordered extents may have been holding the last reference of an
+ * inode and they add a delayed iput when they complete.
+ */
+ btrfs_run_delayed_iputs(root->fs_info);
+ btrfs_wait_on_delayed_iputs(root->fs_info);
+
ret = btrfs_commit_current_transaction(root);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
@@ -4676,8 +4691,7 @@ out:
* BOTH POINTERS ARE BEFORE TREE SWAP
* @last_snapshot: last snapshot generation of the subvolume tree
*/
-int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
- struct btrfs_root *subvol_root,
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
@@ -4883,17 +4897,6 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
xa_destroy(&trans->delayed_refs.dirty_extents);
}
-void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes)
-{
- if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE)
- return;
-
- if (!is_fstree(root))
- return;
-
- btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA);
-}
-
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
const struct btrfs_squota_delta *delta)
{
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 98adf4ec7b01..e233cc79af18 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -121,11 +121,18 @@ struct btrfs_inode;
#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63)
#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62)
+#define BTRFS_QGROUP_DROP_SUBTREE_THRES_DEFAULT (3)
+
/*
* Record a dirty extent, and info qgroup to update quota on it
*/
struct btrfs_qgroup_extent_record {
- u64 bytenr;
+ /*
+ * The bytenr of the extent is given by its index in the dirty_extents
+ * xarray of struct btrfs_delayed_ref_root left shifted by
+ * fs_info->sectorsize_bits.
+ */
+
u64 num_bytes;
/*
@@ -343,9 +350,11 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
int btrfs_qgroup_trace_extent_nolock(
struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record);
+ struct btrfs_qgroup_extent_record *record,
+ u64 bytenr);
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
- struct btrfs_qgroup_extent_record *qrecord);
+ struct btrfs_qgroup_extent_record *qrecord,
+ u64 bytenr);
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes);
int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
@@ -430,8 +439,7 @@ void btrfs_qgroup_init_swapped_blocks(
struct btrfs_qgroup_swapped_blocks *swapped_blocks);
void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
-int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
- struct btrfs_root *subvol_root,
+int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root,
struct btrfs_block_group *bg,
struct extent_buffer *subvol_parent, int subvol_slot,
struct extent_buffer *reloc_parent, int reloc_slot,
@@ -440,7 +448,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
bool btrfs_check_quota_leak(const struct btrfs_fs_info *fs_info);
-void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes);
int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info,
const struct btrfs_squota_delta *delta);
diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c
index 4c859b550f6c..9ffc79f250fb 100644
--- a/fs/btrfs/raid-stripe-tree.c
+++ b/fs/btrfs/raid-stripe-tree.c
@@ -13,6 +13,39 @@
#include "volumes.h"
#include "print-tree.h"
+static void btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ const struct btrfs_key *oldkey,
+ u64 newlen, u64 frontpad)
+{
+ struct btrfs_stripe_extent *extent;
+ struct extent_buffer *leaf;
+ int slot;
+ size_t item_size;
+ struct btrfs_key newkey = {
+ .objectid = oldkey->objectid + frontpad,
+ .type = BTRFS_RAID_STRIPE_KEY,
+ .offset = newlen,
+ };
+
+ ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ item_size = btrfs_item_size(leaf, slot);
+ extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);
+
+ for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
+ struct btrfs_raid_stride *stride = &extent->strides[i];
+ u64 phys;
+
+ phys = btrfs_raid_stride_physical(leaf, stride);
+ btrfs_set_raid_stride_physical(leaf, stride, phys + frontpad);
+ }
+
+ btrfs_set_item_key_safe(trans, path, &newkey);
+}
+
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -36,23 +69,24 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
while (1) {
key.objectid = start;
key.type = BTRFS_RAID_STRIPE_KEY;
- key.offset = length;
+ key.offset = 0;
ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
if (ret < 0)
break;
- if (ret > 0) {
- ret = 0;
- if (path->slots[0] == 0)
- break;
+
+ if (path->slots[0] == btrfs_header_nritems(path->nodes[0]))
path->slots[0]--;
- }
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
found_start = key.objectid;
found_end = found_start + key.offset;
+ ret = 0;
+
+ if (key.type != BTRFS_RAID_STRIPE_KEY)
+ break;
/* That stripe ends before we start, we're done. */
if (found_end <= start)
@@ -61,7 +95,40 @@ int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 le
trace_btrfs_raid_extent_delete(fs_info, start, end,
found_start, found_end);
- ASSERT(found_start >= start && found_end <= end);
+ /*
+ * The stripe extent starts before the range we want to delete:
+ *
+ * |--- RAID Stripe Extent ---|
+ * |--- keep ---|--- drop ---|
+ *
+ * This means we have to duplicate the tree item, truncate the
+ * length to the new size and then re-insert the item.
+ */
+ if (found_start < start) {
+ u64 diff = start - found_start;
+
+ btrfs_partially_delete_raid_extent(trans, path, &key,
+ diff, 0);
+ break;
+ }
+
+ /*
+ * The stripe extent ends after the range we want to delete:
+ *
+ * |--- RAID Stripe Extent ---|
+ * |--- drop ---|--- keep ---|
+ *
+ * This means we have to duplicate the tree item, truncate the
+ * length to the new size and then re-insert the item.
+ */
+ if (found_end > end) {
+ u64 diff = found_end - end;
+
+ btrfs_partially_delete_raid_extent(trans, path, &key,
+ diff, diff);
+ break;
+ }
+
ret = btrfs_del_item(trans, stripe_root, path);
if (ret)
break;
@@ -108,8 +175,9 @@ static int update_raid_extent_item(struct btrfs_trans_handle *trans,
return ret;
}
-static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
- struct btrfs_io_context *bioc)
+EXPORT_FOR_TESTS
+int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_key stripe_key;
@@ -233,7 +301,7 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
found_end = found_logical + found_length;
if (found_logical > end) {
- ret = -ENOENT;
+ ret = -ENODATA;
goto out;
}
@@ -279,10 +347,10 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
}
/* If we're here, we haven't found the requested devid in the stripe. */
- ret = -ENOENT;
+ ret = -ENODATA;
out:
if (ret > 0)
- ret = -ENOENT;
+ ret = -ENODATA;
if (ret && ret != -EIO && !stripe->rst_search_commit_root) {
btrfs_debug(fs_info,
"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h
index 1ac1c21aac2f..541836421778 100644
--- a/fs/btrfs/raid-stripe-tree.h
+++ b/fs/btrfs/raid-stripe-tree.h
@@ -28,6 +28,11 @@ int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_extent *ordered_extent);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_io_context *bioc);
+#endif
+
static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info,
u64 map_type)
{
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 39bec672df0c..cdd373c27784 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1272,8 +1272,7 @@ static inline void bio_list_put(struct bio_list *bio_list)
static void assert_rbio(struct btrfs_raid_bio *rbio)
{
- if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
- !IS_ENABLED(CONFIG_BTRFS_ASSERT))
+ if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
/*
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f3834f8d26b4..bf267bdfa8f8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1244,7 +1244,7 @@ again:
* The real subtree rescan is delayed until we have new
* CoW on the subtree root node before transaction commit.
*/
- ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
+ ret = btrfs_qgroup_add_swapped_blocks(dest,
rc->block_group, parent, slot,
path->nodes[level], path->slots[level],
last_snapshot);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3a3427428074..204c928beaf9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1656,8 +1656,7 @@ static u32 stripe_length(const struct scrub_stripe *stripe)
stripe->bg->start + stripe->bg->length - stripe->logical);
}
-static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
- struct scrub_stripe *stripe)
+static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe)
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
@@ -1704,8 +1703,18 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
&stripe_len, &bioc, &io_stripe, &mirror);
btrfs_put_bioc(bioc);
if (err < 0) {
- set_bit(i, &stripe->io_error_bitmap);
- set_bit(i, &stripe->error_bitmap);
+ if (err != -ENODATA) {
+ /*
+ * Earlier btrfs_get_raid_extent_offset()
+ * returned -ENODATA, which means there's
+ * no entry for the corresponding range
+ * in the stripe tree. But if it's in
+ * the extent tree, then it's a preallocated
+ * extent and not an error.
+ */
+ set_bit(i, &stripe->io_error_bitmap);
+ set_bit(i, &stripe->error_bitmap);
+ }
continue;
}
@@ -1743,7 +1752,7 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state));
if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) {
- scrub_submit_extent_sector_read(sctx, stripe);
+ scrub_submit_extent_sector_read(stripe);
return;
}
@@ -1954,7 +1963,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
ASSERT(sctx->raid56_data_stripes);
/*
- * For data stripe search, we cannot re-use the same extent/csum paths,
+ * For data stripe search, we cannot reuse the same extent/csum paths,
* as the data stripe bytenr may be smaller than previous extent. Thus
* we have to use our own extent/csum paths.
*/
@@ -2103,7 +2112,6 @@ out:
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct btrfs_chunk_map *map,
u64 logical_start, u64 logical_length,
struct btrfs_device *device,
u64 physical, int mirror_num)
@@ -2222,7 +2230,7 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
* just RAID1, so we can reuse scrub_simple_mirror() to scrub
* this stripe.
*/
- ret = scrub_simple_mirror(sctx, bg, map, cur_logical,
+ ret = scrub_simple_mirror(sctx, bg, cur_logical,
BTRFS_STRIPE_LEN, device, cur_physical,
mirror_num);
if (ret)
@@ -2256,7 +2264,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
/* Offset inside the chunk */
u64 offset;
u64 stripe_logical;
- int stop_loop = 0;
/* Extent_path should be released by now. */
ASSERT(sctx->extent_path.nodes[0] == NULL);
@@ -2307,7 +2314,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* Only @physical and @mirror_num needs to calculated using
* @stripe_index.
*/
- ret = scrub_simple_mirror(sctx, bg, map, bg->start, bg->length,
+ ret = scrub_simple_mirror(sctx, bg, bg->start, bg->length,
scrub_dev, map->stripes[stripe_index].physical,
stripe_index + 1);
offset = 0;
@@ -2362,7 +2369,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
* We can reuse scrub_simple_mirror() here, as the repair part
* is still based on @mirror_num.
*/
- ret = scrub_simple_mirror(sctx, bg, map, logical, BTRFS_STRIPE_LEN,
+ ret = scrub_simple_mirror(sctx, bg, logical, BTRFS_STRIPE_LEN,
scrub_dev, physical, 1);
if (ret < 0)
goto out;
@@ -2370,14 +2377,8 @@ next:
logical += increment;
physical += BTRFS_STRIPE_LEN;
spin_lock(&sctx->stat_lock);
- if (stop_loop)
- sctx->stat.last_physical =
- map->stripes[stripe_index].physical + dev_stripe_len;
- else
- sctx->stat.last_physical = physical;
+ sctx->stat.last_physical = physical;
spin_unlock(&sctx->stat_lock);
- if (stop_loop)
- break;
}
out:
ret2 = flush_scrub_stripes(sctx);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 27306d98ec43..7254279c3cc9 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -980,9 +980,7 @@ static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
return ret;
}
-typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
- struct fs_path *p,
- void *ctx);
+typedef int (*iterate_inode_ref_t)(u64 dir, struct fs_path *p, void *ctx);
/*
* Helper function to iterate the entries in ONE btrfs_inode_ref or
@@ -1007,8 +1005,6 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
u32 name_len;
char *start;
int ret = 0;
- int num = 0;
- int index;
u64 dir;
unsigned long name_off;
unsigned long elem_size;
@@ -1043,13 +1039,11 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
iref = (struct btrfs_inode_ref *)(ptr + cur);
name_len = btrfs_inode_ref_name_len(eb, iref);
name_off = (unsigned long)(iref + 1);
- index = btrfs_inode_ref_index(eb, iref);
dir = found_key->offset;
} else {
extref = (struct btrfs_inode_extref *)(ptr + cur);
name_len = btrfs_inode_extref_name_len(eb, extref);
name_off = (unsigned long)&extref->name;
- index = btrfs_inode_extref_index(eb, extref);
dir = btrfs_inode_extref_parent(eb, extref);
}
@@ -1094,10 +1088,9 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
}
cur += elem_size + name_len;
- ret = iterate(num, dir, index, p, ctx);
+ ret = iterate(dir, p, ctx);
if (ret)
goto out;
- num++;
}
out:
@@ -1227,8 +1220,7 @@ out:
return ret;
}
-static int __copy_first_ref(int num, u64 dir, int index,
- struct fs_path *p, void *ctx)
+static int __copy_first_ref(u64 dir, struct fs_path *p, void *ctx)
{
int ret;
struct fs_path *pt = ctx;
@@ -3768,7 +3760,6 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
struct recorded_ref *parent_ref,
const bool is_orphan)
{
- struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key di_key;
@@ -3797,7 +3788,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx,
goto out;
}
- di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
+ di = btrfs_match_dir_item_name(path, parent_ref->name,
parent_ref->name_len);
if (!di) {
ret = 0;
@@ -4708,8 +4699,7 @@ out:
return ret;
}
-static int record_new_ref_if_needed(int num, u64 dir, int index,
- struct fs_path *name, void *ctx)
+static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -4738,8 +4728,7 @@ out:
return ret;
}
-static int record_deleted_ref_if_needed(int num, u64 dir, int index,
- struct fs_path *name, void *ctx)
+static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx)
{
int ret = 0;
struct send_ctx *sctx = ctx;
@@ -5677,10 +5666,11 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
* Note that send_buf is a mapping of send_buf_pages, so this is really
* reading into send_buf.
*/
- ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
+ ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode),
disk_bytenr, disk_num_bytes,
sctx->send_buf_pages +
- (data_offset >> PAGE_SHIFT));
+ (data_offset >> PAGE_SHIFT),
+ NULL);
if (ret)
goto out;
@@ -7190,13 +7180,11 @@ static int changed_extent(struct send_ctx *sctx,
static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
{
- int ret = 0;
-
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result == BTRFS_COMPARE_TREE_NEW)
sctx->cur_inode_needs_verity = true;
}
- return ret;
+ return 0;
}
static int dir_changed(struct send_ctx *sctx, u64 dir)
@@ -8137,7 +8125,20 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
* making it RW. This also protects against deletion.
*/
spin_lock(&send_root->root_item_lock);
- if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
+ /*
+ * Unlikely but possible, if the subvolume is marked for deletion but
+ * is slow to remove the directory entry, send can still be started.
+ */
+ if (btrfs_root_dead(send_root)) {
+ spin_unlock(&send_root->root_item_lock);
+ return -EPERM;
+ }
+ /* Userspace tools do the checks and warn the user if it's not RO. */
+ if (!btrfs_root_readonly(send_root)) {
+ spin_unlock(&send_root->root_item_lock);
+ return -EPERM;
+ }
+ if (send_root->dedupe_in_progress) {
dedupe_in_progress_warn(send_root);
spin_unlock(&send_root->root_item_lock);
return -EAGAIN;
@@ -8146,15 +8147,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
spin_unlock(&send_root->root_item_lock);
/*
- * Userspace tools do the checks and warn the user if it's
- * not RO.
- */
- if (!btrfs_root_readonly(send_root)) {
- ret = -EPERM;
- goto out;
- }
-
- /*
* Check that we don't overflow at later allocations, we request
* clone_sources_count + 1 items, and compare to unsigned long inside
* access_ok. Also set an upper limit for allocation size so this can't
@@ -8219,15 +8211,6 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a
}
sctx->send_root = send_root;
- /*
- * Unlikely but possible, if the subvolume is marked for deletion but
- * is slow to remove the directory entry, send can still be started
- */
- if (btrfs_root_dead(sctx->send_root)) {
- ret = -EPERM;
- goto out;
- }
-
sctx->clone_roots_cnt = arg->clone_sources_count;
if (sctx->proto >= 2) {
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index b07f4aa66878..9309886c5ea1 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -16,7 +16,7 @@ struct btrfs_ioctl_send_args;
#define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
/* Conditional support for the upcoming protocol version. */
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
#define BTRFS_SEND_STREAM_VERSION 3
#else
#define BTRFS_SEND_STREAM_VERSION 2
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index d5a9cd8a4fd8..255e85f78313 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -1279,7 +1279,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* If we are freeing inodes, we want to make sure all delayed iputs have
* completed, because they could have been on an inode with i_nlink == 0, and
* thus have been truncated and freed up space. But again this space is not
- * immediately re-usable, it comes in the form of a delayed ref, which must be
+ * immediately reusable, it comes in the form of a delayed ref, which must be
* run and then the transaction must be committed.
*
* COMMIT_TRANS
@@ -1488,8 +1488,7 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
}
-static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
+static void wait_reserve_ticket(struct btrfs_space_info *space_info,
struct reserve_ticket *ticket)
{
@@ -1547,7 +1546,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
case BTRFS_RESERVE_FLUSH_DATA:
case BTRFS_RESERVE_FLUSH_ALL:
case BTRFS_RESERVE_FLUSH_ALL_STEAL:
- wait_reserve_ticket(fs_info, space_info, ticket);
+ wait_reserve_ticket(space_info, ticket);
break;
case BTRFS_RESERVE_FLUSH_LIMIT:
priority_reclaim_metadata_space(fs_info, space_info, ticket,
@@ -1984,8 +1983,7 @@ static bool is_reclaim_urgent(struct btrfs_space_info *space_info)
return unalloc < data_chunk_size;
}
-static void do_reclaim_sweep(const struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, int raid)
+static void do_reclaim_sweep(struct btrfs_space_info *space_info, int raid)
{
struct btrfs_block_group *bg;
int thresh_pct;
@@ -2081,6 +2079,6 @@ void btrfs_reclaim_sweep(const struct btrfs_fs_info *fs_info)
if (!btrfs_should_periodic_reclaim(space_info))
continue;
for (raid = 0; raid < BTRFS_NR_RAID_TYPES; raid++)
- do_reclaim_sweep(fs_info, space_info, raid);
+ do_reclaim_sweep(space_info, raid);
}
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index fe4d719d506b..8c68059ac1b0 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -140,12 +140,10 @@ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
return ERR_PTR(-ENOMEM);
spin_lock_init(&ret->lock);
- if (type == BTRFS_SUBPAGE_METADATA) {
+ if (type == BTRFS_SUBPAGE_METADATA)
atomic_set(&ret->eb_refs, 0);
- } else {
- atomic_set(&ret->readers, 0);
- atomic_set(&ret->writers, 0);
- }
+ else
+ atomic_set(&ret->nr_locked, 0);
return ret;
}
@@ -221,62 +219,6 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
__start_bit; \
})
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
- const int nbits = len >> fs_info->sectorsize_bits;
- unsigned long flags;
-
-
- btrfs_subpage_assert(fs_info, folio, start, len);
-
- spin_lock_irqsave(&subpage->lock, flags);
- /*
- * Even though it's just for reading the page, no one should have
- * locked the subpage range.
- */
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- atomic_add(nbits, &subpage->readers);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
- const int nbits = len >> fs_info->sectorsize_bits;
- unsigned long flags;
- bool is_data;
- bool last;
-
- btrfs_subpage_assert(fs_info, folio, start, len);
- is_data = is_data_inode(BTRFS_I(folio->mapping->host));
-
- spin_lock_irqsave(&subpage->lock, flags);
-
- /* The range should have already been locked. */
- ASSERT(bitmap_test_range_all_set(subpage->bitmaps, start_bit, nbits));
- ASSERT(atomic_read(&subpage->readers) >= nbits);
-
- bitmap_clear(subpage->bitmaps, start_bit, nbits);
- last = atomic_sub_and_test(nbits, &subpage->readers);
-
- /*
- * For data we need to unlock the page if the last read has finished.
- *
- * And please don't replace @last with atomic_sub_and_test() call
- * inside if () condition.
- * As we want the atomic_sub_and_test() to be always executed.
- */
- if (is_data && last)
- folio_unlock(folio);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{
u64 orig_start = *start;
@@ -295,28 +237,8 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
orig_start + orig_len) - *start;
}
-static void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
- const int nbits = (len >> fs_info->sectorsize_bits);
- unsigned long flags;
- int ret;
-
- btrfs_subpage_assert(fs_info, folio, start, len);
-
- spin_lock_irqsave(&subpage->lock, flags);
- ASSERT(atomic_read(&subpage->readers) == 0);
- ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
- bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->writers);
- ASSERT(ret == nbits);
- spin_unlock_irqrestore(&subpage->lock, flags);
-}
-
-static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+static bool btrfs_subpage_end_and_test_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const int start_bit = subpage_calc_start_bit(fs_info, folio, locked, start, len);
@@ -334,9 +256,9 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf
* extent_clear_unlock_delalloc() for compression path.
*
* This @locked_page is locked by plain lock_page(), thus its
- * subpage::writers is 0. Handle them in a special way.
+ * subpage::locked is 0. Handle them in a special way.
*/
- if (atomic_read(&subpage->writers) == 0) {
+ if (atomic_read(&subpage->nr_locked) == 0) {
spin_unlock_irqrestore(&subpage->lock, flags);
return true;
}
@@ -345,40 +267,13 @@ static bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_inf
clear_bit(bit, subpage->bitmaps);
cleared++;
}
- ASSERT(atomic_read(&subpage->writers) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->writers);
+ ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->nr_locked);
spin_unlock_irqrestore(&subpage->lock, flags);
return last;
}
/*
- * Lock a folio for delalloc page writeback.
- *
- * Return -EAGAIN if the page is not properly initialized.
- * Return 0 with the page locked, and writer counter updated.
- *
- * Even with 0 returned, the page still need extra check to make sure
- * it's really the correct page, as the caller is using
- * filemap_get_folios_contig(), which can race with page invalidating.
- */
-int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
-{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
- folio_lock(folio);
- return 0;
- }
- folio_lock(folio);
- if (!folio_test_private(folio) || !folio_get_private(folio)) {
- folio_unlock(folio);
- return -EAGAIN;
- }
- btrfs_subpage_clamp_range(folio, &start, &len);
- btrfs_subpage_start_writer(fs_info, folio, start, len);
- return 0;
-}
-
-/*
* Handle different locked folios:
*
* - Non-subpage folio
@@ -394,8 +289,8 @@ int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
* bitmap, reduce the writer lock number, and unlock the page if that's
* the last locked range.
*/
-void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
@@ -408,24 +303,24 @@ void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
/*
* For subpage case, there are two types of locked page. With or
- * without writers number.
+ * without locked number.
*
- * Since we own the page lock, no one else could touch subpage::writers
+ * Since we own the page lock, no one else could touch subpage::nr_locked
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->writers) == 0) {
- /* No writers, locked by plain lock_page(). */
+ if (atomic_read(&subpage->nr_locked) == 0) {
+ /* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
btrfs_subpage_clamp_range(folio, &start, &len);
- if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
+ if (btrfs_subpage_end_and_test_lock(fs_info, folio, start, len))
folio_unlock(folio);
}
-void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
- struct folio *folio, unsigned long bitmap)
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap)
{
struct btrfs_subpage *subpage = folio_get_private(folio);
const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked;
@@ -434,13 +329,13 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
int cleared = 0;
int bit;
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
folio_unlock(folio);
return;
}
- if (atomic_read(&subpage->writers) == 0) {
- /* No writers, locked by plain lock_page(). */
+ if (atomic_read(&subpage->nr_locked) == 0) {
+ /* No subpage lock, locked by plain lock_page(). */
folio_unlock(folio);
return;
}
@@ -450,8 +345,8 @@ void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
if (test_and_clear_bit(bit + start_bit, subpage->bitmaps))
cleared++;
}
- ASSERT(atomic_read(&subpage->writers) >= cleared);
- last = atomic_sub_and_test(cleared, &subpage->writers);
+ ASSERT(atomic_read(&subpage->nr_locked) >= cleared);
+ last = atomic_sub_and_test(cleared, &subpage->nr_locked);
spin_unlock_irqrestore(&subpage->lock, flags);
if (last)
folio_unlock(folio);
@@ -776,8 +671,8 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* This populates the involved subpage ranges so that subpage helpers can
* properly unlock them.
*/
-void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len)
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
unsigned long flags;
@@ -796,58 +691,11 @@ void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
/* Target range should not yet be locked. */
ASSERT(bitmap_test_range_all_zero(subpage->bitmaps, start_bit, nbits));
bitmap_set(subpage->bitmaps, start_bit, nbits);
- ret = atomic_add_return(nbits, &subpage->writers);
+ ret = atomic_add_return(nbits, &subpage->nr_locked);
ASSERT(ret <= fs_info->sectors_per_page);
spin_unlock_irqrestore(&subpage->lock, flags);
}
-/*
- * Find any subpage writer locked range inside @folio, starting at file offset
- * @search_start. The caller should ensure the folio is locked.
- *
- * Return true and update @found_start_ret and @found_len_ret to the first
- * writer locked range.
- * Return false if there is no writer locked range.
- */
-bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 search_start,
- u64 *found_start_ret, u32 *found_len_ret)
-{
- struct btrfs_subpage *subpage = folio_get_private(folio);
- const u32 sectors_per_page = fs_info->sectors_per_page;
- const unsigned int len = PAGE_SIZE - offset_in_page(search_start);
- const unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
- locked, search_start, len);
- const unsigned int locked_bitmap_start = sectors_per_page * btrfs_bitmap_nr_locked;
- const unsigned int locked_bitmap_end = locked_bitmap_start + sectors_per_page;
- unsigned long flags;
- int first_zero;
- int first_set;
- bool found = false;
-
- ASSERT(folio_test_locked(folio));
- spin_lock_irqsave(&subpage->lock, flags);
- first_set = find_next_bit(subpage->bitmaps, locked_bitmap_end, start_bit);
- if (first_set >= locked_bitmap_end)
- goto out;
-
- found = true;
-
- *found_start_ret = folio_pos(folio) +
- ((first_set - locked_bitmap_start) << fs_info->sectorsize_bits);
- /*
- * Since @first_set is ensured to be smaller than locked_bitmap_end
- * here, @found_start_ret should be inside the folio.
- */
- ASSERT(*found_start_ret < folio_pos(folio) + PAGE_SIZE);
-
- first_zero = find_next_zero_bit(subpage->bitmaps, locked_bitmap_end, first_set);
- *found_len_ret = (first_zero - first_set) << fs_info->sectorsize_bits;
-out:
- spin_unlock_irqrestore(&subpage->lock, flags);
- return found;
-}
-
#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \
{ \
const int sectors_per_page = fs_info->sectors_per_page; \
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 4b85d91d0e18..428fa9389fd4 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -45,14 +45,6 @@ enum {
struct btrfs_subpage {
/* Common members for both data and metadata pages */
spinlock_t lock;
- /*
- * Both data and metadata needs to track how many readers are for the
- * page.
- * Data relies on @readers to unlock the page when last reader finished.
- * While metadata doesn't need page unlock, it needs to prevent
- * page::private get cleared before the last end_page_read().
- */
- atomic_t readers;
union {
/*
* Structures only used by metadata
@@ -62,8 +54,12 @@ struct btrfs_subpage {
*/
atomic_t eb_refs;
- /* Structures only used by data */
- atomic_t writers;
+ /*
+ * Structures only used by data,
+ *
+ * How many sectors inside the page are locked.
+ */
+ atomic_t nr_locked;
};
unsigned long bitmaps[];
};
@@ -95,23 +91,12 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage);
void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
-void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-
-int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_folio_set_writer_lock(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 start, u32 len);
-void btrfs_folio_end_writer_lock_bitmap(const struct btrfs_fs_info *fs_info,
- struct folio *folio, unsigned long bitmap);
-bool btrfs_subpage_find_writer_locked(const struct btrfs_fs_info *fs_info,
- struct folio *folio, u64 search_start,
- u64 *found_start_ret, u32 *found_len_ret);
-
+void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, unsigned long bitmap);
/*
* Template for subpage related operations.
*
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 98fa0f382480..97a85d180b61 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -28,7 +28,6 @@
#include <linux/btrfs.h>
#include <linux/security.h>
#include <linux/fs_parser.h>
-#include <linux/swap.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -340,6 +339,15 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
fallthrough;
case Opt_compress:
case Opt_compress_type:
+ /*
+ * Provide the same semantics as older kernels that don't use fs
+ * context, specifying the "compress" option clears
+ * "force-compress" without the need to pass
+ * "compress-force=[no|none]" before specifying "compress".
+ */
+ if (opt != Opt_compress_force && opt != Opt_compress_force_type)
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+
if (opt == Opt_compress || opt == Opt_compress_force) {
ctx->compress_type = BTRFS_COMPRESS_ZLIB;
ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
@@ -937,8 +945,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
}
static int btrfs_fill_super(struct super_block *sb,
- struct btrfs_fs_devices *fs_devices,
- void *data)
+ struct btrfs_fs_devices *fs_devices)
{
struct inode *inode;
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -962,7 +969,7 @@ static int btrfs_fill_super(struct super_block *sb,
return err;
}
- err = open_ctree(sb, fs_devices, (char *)data);
+ err = open_ctree(sb, fs_devices);
if (err) {
btrfs_err(fs_info, "open_ctree failed");
return err;
@@ -1498,8 +1505,7 @@ static int btrfs_reconfigure(struct fs_context *fc)
sync_filesystem(sb);
set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
- if (!mount_reconfigure &&
- !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
+ if (!btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
return -EINVAL;
ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
@@ -1885,7 +1891,7 @@ static int btrfs_get_tree_super(struct fs_context *fc)
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
- ret = btrfs_fill_super(sb, fs_devices, NULL);
+ ret = btrfs_fill_super(sb, fs_devices);
}
if (ret) {
@@ -1971,25 +1977,10 @@ error:
* fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem
* in fc->sb_flags.
*
- * This disambiguation has rather positive consequences. Mounting a subvolume
- * ro will not also turn the superblock ro. Only the mount for the subvolume
- * will become ro.
- *
- * So, if the superblock creation request comes from the new mount API the
- * caller must have explicitly done:
- *
- * fsconfig(FSCONFIG_SET_FLAG, "ro")
- * fsmount/mount_setattr(MOUNT_ATTR_RDONLY)
- *
- * IOW, at some point the caller must have explicitly turned the whole
- * superblock ro and we shouldn't just undo it like we did for the old mount
- * API. In any case, it lets us avoid the hack in the new mount API.
- *
- * Consequently, the remounting hack must only be used for requests originating
- * from the old mount API and should be marked for full deprecation so it can be
- * turned off in a couple of years.
- *
- * The new mount API has no reason to support this hack.
+ * However, the util-linux mount command already uses the new mount API and
+ * still sets fsconfig(FSCONFIG_SET_FLAG, "ro") regardless of whether the
+ * filesystem is btrfs, turning the whole super block RO. To make per-subvolume
+ * mounts with different options work we need to keep backward compatibility.
*/
static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
{
@@ -2011,7 +2002,7 @@ static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
if (IS_ERR(mnt))
return mnt;
- if (!fc->oldapi || !ro2rw)
+ if (!ro2rw)
return mnt;
/* We need to convert to rw, call reconfigure. */
@@ -2198,7 +2189,8 @@ static struct file_system_type btrfs_fs_type = {
.init_fs_context = btrfs_init_fs_context,
.parameters = btrfs_fs_parameters,
.kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA |
+ FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("btrfs");
@@ -2263,7 +2255,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd,
device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false);
if (IS_ERR_OR_NULL(device)) {
mutex_unlock(&uuid_mutex);
- ret = PTR_ERR(device);
+ if (IS_ERR(device))
+ ret = PTR_ERR(device);
+ else
+ ret = 0;
break;
}
ret = !(device->fs_devices->num_devices ==
@@ -2402,13 +2397,7 @@ static long btrfs_nr_cached_objects(struct super_block *sb, struct shrink_contro
trace_btrfs_extent_map_shrinker_count(fs_info, nr);
- /*
- * Only report the real number for DEBUG builds, as there are reports of
- * serious performance degradation caused by too frequent shrinks.
- */
- if (IS_ENABLED(CONFIG_BTRFS_DEBUG))
- return nr;
- return 0;
+ return nr;
}
static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
@@ -2416,16 +2405,10 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont
const long nr_to_scan = min_t(unsigned long, LONG_MAX, sc->nr_to_scan);
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- /*
- * We may be called from any task trying to allocate memory and we don't
- * want to slow it down with scanning and dropping extent maps. It would
- * also cause heavy lock contention if many tasks concurrently enter
- * here. Therefore only allow kswapd tasks to scan and drop extent maps.
- */
- if (!current_is_kswapd())
- return 0;
+ btrfs_free_extent_maps(fs_info, nr_to_scan);
- return btrfs_free_extent_maps(fs_info, nr_to_scan);
+ /* The extent map shrinker runs asynchronously, so always return 0. */
+ return 0;
}
static const struct super_operations btrfs_super_ops = {
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 03926ad467c9..b843308e2bc6 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1390,7 +1390,7 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
btrfs_bg_reclaim_threshold_store);
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
static ssize_t btrfs_offload_csum_show(struct kobject *kobj,
struct kobj_attribute *a, char *buf)
{
@@ -1450,7 +1450,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, bg_reclaim_threshold),
BTRFS_ATTR_PTR(, commit_stats),
BTRFS_ATTR_PTR(, temp_fsid),
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
BTRFS_ATTR_PTR(, offload_csum),
#endif
NULL,
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ce50847e1e01..e607b5d52fb1 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -29,6 +29,7 @@ const char *test_error[] = {
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
[TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
+ [TEST_ALLOC_IO_CONTEXT] = "cannot allocate io context",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -291,6 +292,9 @@ int btrfs_run_sanity_tests(void)
ret = btrfs_test_free_space_tree(sectorsize, nodesize);
if (ret)
goto out;
+ ret = btrfs_test_raid_stripe_tree(sectorsize, nodesize);
+ if (ret)
+ goto out;
}
}
ret = btrfs_test_extent_map();
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index dc2f2ab15fa5..b524ecf2f452 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,6 +24,7 @@ enum {
TEST_ALLOC_BLOCK_GROUP,
TEST_ALLOC_EXTENT_MAP,
TEST_ALLOC_CHUNK_MAP,
+ TEST_ALLOC_IO_CONTEXT,
};
extern const char *test_error[];
@@ -37,6 +38,7 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
+int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_map(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize);
diff --git a/fs/btrfs/tests/raid-stripe-tree-tests.c b/fs/btrfs/tests/raid-stripe-tree-tests.c
new file mode 100644
index 000000000000..30f17eb7b6a8
--- /dev/null
+++ b/fs/btrfs/tests/raid-stripe-tree-tests.c
@@ -0,0 +1,538 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2024 Western Digital Corporation or its affiliates.
+ */
+
+#include <linux/sizes.h>
+#include "../fs.h"
+#include "../disk-io.h"
+#include "../transaction.h"
+#include "../volumes.h"
+#include "../raid-stripe-tree.h"
+#include "btrfs-tests.h"
+
+/* Number of dummy devices, and of stripes per io context, used by the tests. */
+#define RST_TEST_NUM_DEVICES (2)
+/* Map type handed to the RAID stripe tree helpers: data block group, RAID1. */
+#define RST_TEST_RAID1_TYPE (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_RAID1)
+
+/* Signature of a single test case; a non-zero return marks the test failed. */
+typedef int (*test_func_t)(struct btrfs_trans_handle *trans);
+
+/*
+ * Walk @fs_devices->devices and return the first device whose devid equals
+ * @devid, or NULL when no device on the list matches.
+ */
+static struct btrfs_device *btrfs_device_by_devid(struct btrfs_fs_devices *fs_devices,
+						  u64 devid)
+{
+	struct btrfs_device *match = NULL;
+	struct btrfs_device *cur;
+
+	list_for_each_entry(cur, &fs_devices->devices, dev_list) {
+		if (cur->devid != devid)
+			continue;
+
+		match = cur;
+		break;
+	}
+
+	return match;
+}
+
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * delete the 1st 32K, making the new start address 1M+32K.
+ *
+ * Returns 0 on success and a non-zero error code on the first failing step.
+ */
+static int test_front_delete(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_io_context *bioc;
+	struct btrfs_io_stripe io_stripe = { 0 };
+	u64 map_type = RST_TEST_RAID1_TYPE;
+	u64 logical = SZ_1M;
+	u64 len = SZ_64K;
+	int ret;
+
+	bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+	if (!bioc) {
+		test_std_err(TEST_ALLOC_IO_CONTEXT);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* All lookups below are done against the copy on devid 0. */
+	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+	bioc->map_type = map_type;
+	bioc->size = len;
+
+	/* The copy on device i is placed at physical address 1M + i * 1G. */
+	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+		struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+		if (!stripe->dev) {
+			test_err("cannot find device with devid %d", i);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		stripe->physical = logical + i * SZ_1G;
+	}
+
+	ret = btrfs_insert_one_raid_extent(trans, bioc);
+	if (ret) {
+		test_err("inserting RAID extent failed: %d", ret);
+		goto out;
+	}
+
+	/* The full 64K extent must be found at its original location. */
+	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+	if (ret) {
+		test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+		goto out;
+	}
+
+	if (io_stripe.physical != logical) {
+		test_err("invalid physical address, expected %llu got %llu",
+			 logical, io_stripe.physical);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (len != SZ_64K) {
+		test_err("invalid stripe length, expected %llu got %llu",
+			 (u64)SZ_64K, len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Drop the first 32K of the extent. */
+	ret = btrfs_delete_raid_extent(trans, logical, SZ_32K);
+	if (ret) {
+		test_err("deleting RAID extent [%llu, %llu] failed", logical,
+			 logical + SZ_32K);
+		goto out;
+	}
+
+	/* The remaining 32K must now start at 1M + 32K. */
+	len = SZ_32K;
+	ret = btrfs_get_raid_extent_offset(fs_info, logical + SZ_32K, &len,
+					   map_type, 0, &io_stripe);
+	if (ret) {
+		test_err("lookup of RAID extent [%llu, %llu] failed",
+			 logical + SZ_32K, logical + SZ_32K + len);
+		goto out;
+	}
+
+	if (io_stripe.physical != logical + SZ_32K) {
+		test_err("invalid physical address, expected %llu, got %llu",
+			 logical + SZ_32K, io_stripe.physical);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (len != SZ_32K) {
+		test_err("invalid stripe length, expected %llu, got %llu",
+			 (u64)SZ_32K, len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* The deleted front part must not be found anymore. */
+	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+	if (!ret) {
+		ret = -EINVAL;
+		test_err("lookup of RAID extent [%llu, %llu] succeeded, should fail",
+			 logical, logical + SZ_32K);
+		goto out;
+	}
+
+	/* Clean up the tail; report a failure like the other test cases do. */
+	ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+	if (ret)
+		test_err("deleting RAID extent [%llu, %llu] failed",
+			 logical + SZ_32K, logical + SZ_64K);
+
+out:
+	btrfs_put_bioc(bioc);
+	return ret;
+}
+
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * truncate the stripe extent down to 32K.
+ *
+ * Returns 0 on success and a non-zero error code on the first failing step.
+ */
+static int test_tail_delete(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_io_context *bioc;
+	struct btrfs_io_stripe io_stripe = { 0 };
+	u64 map_type = RST_TEST_RAID1_TYPE;
+	u64 logical = SZ_1M;
+	u64 len = SZ_64K;
+	int ret;
+
+	bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+	if (!bioc) {
+		test_std_err(TEST_ALLOC_IO_CONTEXT);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	bioc->map_type = map_type;
+	bioc->size = len;
+
+	/* The copy on device i is placed at physical address 1M + i * 1G. */
+	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+		struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+		if (!stripe->dev) {
+			test_err("cannot find device with devid %d", i);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		stripe->physical = logical + i * SZ_1G;
+	}
+
+	ret = btrfs_insert_one_raid_extent(trans, bioc);
+	if (ret) {
+		test_err("inserting RAID extent failed: %d", ret);
+		goto out;
+	}
+
+	/* All lookups below are done against the copy on devid 0. */
+	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+	if (!io_stripe.dev) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* The full 64K extent must be found at its original location. */
+	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+	if (ret) {
+		test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+		goto out;
+	}
+
+	if (io_stripe.physical != logical) {
+		test_err("invalid physical address, expected %llu got %llu",
+			 logical, io_stripe.physical);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (len != SZ_64K) {
+		test_err("invalid stripe length, expected %llu got %llu",
+			 (u64)SZ_64K, len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Drop the last 32K of the extent. */
+	ret = btrfs_delete_raid_extent(trans, logical + SZ_32K, SZ_32K);
+	if (ret) {
+		test_err("deleting RAID extent [%llu, %llu] failed",
+			 logical + SZ_32K, logical + SZ_64K);
+		goto out;
+	}
+
+	/* The first 32K must still be found at the unchanged start address. */
+	len = SZ_32K;
+	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+	if (ret) {
+		test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+		goto out;
+	}
+
+	if (io_stripe.physical != logical) {
+		test_err("invalid physical address, expected %llu, got %llu",
+			 logical, io_stripe.physical);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (len != SZ_32K) {
+		test_err("invalid stripe length, expected %llu, got %llu",
+			 (u64)SZ_32K, len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Clean up what is left of the extent. */
+	ret = btrfs_delete_raid_extent(trans, logical, len);
+	if (ret)
+		test_err("deleting RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+
+out:
+	btrfs_put_bioc(bioc);
+	return ret;
+}
+
+/*
+ * Test a 64K RST write on a 2 disk RAID1 at a logical address of 1M and then
+ * overwrite the whole range giving it new physical address at an offset of 1G.
+ * The intent of this test is to exercise the 'update_raid_extent_item()'
+ * function called by btrfs_insert_one_raid_extent().
+ *
+ * Returns 0 on success and a non-zero error code on the first failing step.
+ */
+static int test_create_update_delete(struct btrfs_trans_handle *trans)
+{
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_io_context *bioc;
+	struct btrfs_io_stripe io_stripe = { 0 };
+	u64 map_type = RST_TEST_RAID1_TYPE;
+	u64 logical = SZ_1M;
+	u64 len = SZ_64K;
+	int ret;
+
+	bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+	if (!bioc) {
+		test_std_err(TEST_ALLOC_IO_CONTEXT);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	bioc->map_type = map_type;
+	bioc->size = len;
+
+	/* The copy on device i is placed at physical address 1M + i * 1G. */
+	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+		struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+		if (!stripe->dev) {
+			test_err("cannot find device with devid %d", i);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		stripe->physical = logical + i * SZ_1G;
+	}
+
+	ret = btrfs_insert_one_raid_extent(trans, bioc);
+	if (ret) {
+		test_err("inserting RAID extent failed: %d", ret);
+		goto out;
+	}
+
+	/* All lookups below are done against the copy on devid 0. */
+	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+	if (!io_stripe.dev) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* The freshly inserted extent must be found at its original location. */
+	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+	if (ret) {
+		test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+		goto out;
+	}
+
+	if (io_stripe.physical != logical) {
+		test_err("invalid physical address, expected %llu got %llu",
+			 logical, io_stripe.physical);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (len != SZ_64K) {
+		test_err("invalid stripe length, expected %llu got %llu",
+			 (u64)SZ_64K, len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Same logical range, every physical address shifted up by 1G. */
+	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+		struct btrfs_io_stripe *stripe = &bioc->stripes[i];
+
+		stripe->dev = btrfs_device_by_devid(fs_info->fs_devices, i);
+		if (!stripe->dev) {
+			test_err("cannot find device with devid %d", i);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		stripe->physical = SZ_1G + logical + i * SZ_1G;
+	}
+
+	/* Inserting the same logical range again is expected to update it. */
+	ret = btrfs_insert_one_raid_extent(trans, bioc);
+	if (ret) {
+		test_err("updating RAID extent failed: %d", ret);
+		goto out;
+	}
+
+	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+	if (ret) {
+		test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+		goto out;
+	}
+
+	if (io_stripe.physical != logical + SZ_1G) {
+		test_err("invalid physical address, expected %llu, got %llu",
+			 logical + SZ_1G, io_stripe.physical);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (len != SZ_64K) {
+		test_err("invalid stripe length, expected %llu, got %llu",
+			 (u64)SZ_64K, len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Clean up the extent. */
+	ret = btrfs_delete_raid_extent(trans, logical, len);
+	if (ret)
+		test_err("deleting RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+
+out:
+	btrfs_put_bioc(bioc);
+	return ret;
+}
+
+/*
+ * Simplest case: insert one 64K RAID1 stripe extent at logical 1M on two
+ * devices (device 0 at physical 1M, device 1 at physical 1G+1M), look it up
+ * through device 0 and delete it again.
+ */
+static int test_simple_create_delete(struct btrfs_trans_handle *trans)
+{
+	const u64 map_type = RST_TEST_RAID1_TYPE;
+	const u64 logical = SZ_1M;
+	struct btrfs_fs_info *fs_info = trans->fs_info;
+	struct btrfs_io_stripe io_stripe = { 0 };
+	struct btrfs_io_context *bioc;
+	u64 len = SZ_64K;
+	int devid;
+	int ret;
+
+	bioc = alloc_btrfs_io_context(fs_info, logical, RST_TEST_NUM_DEVICES);
+	if (!bioc) {
+		test_std_err(TEST_ALLOC_IO_CONTEXT);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	bioc->size = SZ_64K;
+	bioc->map_type = map_type;
+
+	/* Fill in one stripe per device. */
+	for (devid = 0; devid < RST_TEST_NUM_DEVICES; devid++) {
+		struct btrfs_device *dev;
+
+		dev = btrfs_device_by_devid(fs_info->fs_devices, devid);
+		bioc->stripes[devid].dev = dev;
+		if (!dev) {
+			test_err("cannot find device with devid %d", devid);
+			ret = -EINVAL;
+			goto out;
+		}
+
+		bioc->stripes[devid].physical = logical + devid * SZ_1G;
+	}
+
+	ret = btrfs_insert_one_raid_extent(trans, bioc);
+	if (ret) {
+		test_err("inserting RAID extent failed: %d", ret);
+		goto out;
+	}
+
+	/* Look the extent up through the copy on devid 0. */
+	io_stripe.dev = btrfs_device_by_devid(fs_info->fs_devices, 0);
+	if (!io_stripe.dev) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = btrfs_get_raid_extent_offset(fs_info, logical, &len, map_type, 0, &io_stripe);
+	if (ret) {
+		test_err("lookup of RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+		goto out;
+	}
+
+	if (io_stripe.physical != logical) {
+		test_err("invalid physical address, expected %llu got %llu",
+			 logical, io_stripe.physical);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (len != SZ_64K) {
+		test_err("invalid stripe length, expected %llu got %llu",
+			 (u64)SZ_64K, len);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = btrfs_delete_raid_extent(trans, logical, len);
+	if (ret)
+		test_err("deleting RAID extent [%llu, %llu] failed", logical,
+			 logical + len);
+
+out:
+	btrfs_put_bioc(bioc);
+	return ret;
+}
+
+/* All test cases, in the order btrfs_test_raid_stripe_tree() runs them. */
+static const test_func_t tests[] = {
+ test_simple_create_delete,
+ test_create_update_delete,
+ test_tail_delete,
+ test_front_delete,
+};
+
+/*
+ * Run one test case against a freshly built dummy filesystem.
+ *
+ * Allocates a dummy fs_info and a dummy root that is installed as both the
+ * stripe root and the tree root, gives the root an empty level 0 node, creates
+ * RST_TEST_NUM_DEVICES dummy devices with devids 0..N-1 and finally calls
+ * @test with a dummy transaction handle.
+ *
+ * Returns 0 on success, otherwise the error of the first step that failed.
+ */
+static int run_test(test_func_t test, u32 sectorsize, u32 nodesize)
+{
+	struct btrfs_trans_handle trans;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *root = NULL;
+	int ret;
+
+	/* The prototype is btrfs_alloc_dummy_fs_info(nodesize, sectorsize). */
+	fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize);
+	if (!fs_info) {
+		test_std_err(TEST_ALLOC_FS_INFO);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	root = btrfs_alloc_dummy_root(fs_info);
+	if (IS_ERR(root)) {
+		test_std_err(TEST_ALLOC_ROOT);
+		ret = PTR_ERR(root);
+		goto out;
+	}
+	/* RAID_STRIPE_TREE is an incompat feature bit, so set it as one. */
+	btrfs_set_super_incompat_flags(root->fs_info->super_copy,
+				       BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE);
+	root->root_key.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = 0;
+	fs_info->stripe_root = root;
+	root->fs_info->tree_root = root;
+
+	root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
+	if (IS_ERR(root->node)) {
+		test_std_err(TEST_ALLOC_EXTENT_BUFFER);
+		ret = PTR_ERR(root->node);
+		goto out;
+	}
+	/* Start out with an empty leaf. */
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_nritems(root->node, 0);
+	root->alloc_bytenr += 2 * nodesize;
+
+	for (int i = 0; i < RST_TEST_NUM_DEVICES; i++) {
+		struct btrfs_device *dev;
+
+		dev = btrfs_alloc_dummy_device(fs_info);
+		if (IS_ERR(dev)) {
+			test_err("cannot allocate device");
+			ret = PTR_ERR(dev);
+			goto out;
+		}
+		/* The test cases look devices up by these devids. */
+		dev->devid = i;
+	}
+
+	btrfs_init_dummy_trans(&trans, root->fs_info);
+	ret = test(&trans);
+
+out:
+	btrfs_free_dummy_root(root);
+	btrfs_free_dummy_fs_info(fs_info);
+
+	return ret;
+}
+
+/*
+ * Entry point of the raid-stripe-tree self tests.
+ *
+ * Runs every entry of tests[] in order with the given @sectorsize and
+ * @nodesize and stops at the first failure.
+ *
+ * Returns 0 if all test cases passed, otherwise the error of the failed one.
+ */
+int btrfs_test_raid_stripe_tree(u32 sectorsize, u32 nodesize)
+{
+	int ret = 0;
+
+	test_msg("running raid-stripe-tree tests");
+	for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+		ret = run_test(tests[i], sectorsize, nodesize);
+		if (ret) {
+			/* No trailing newline, like every other test_err() call here. */
+			test_err("test-case %ps failed with %d", tests[i], ret);
+			break;
+		}
+	}
+
+	return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 0fc873af891f..dc0b837efd5d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -141,8 +141,7 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
- WARN_ON(!RB_EMPTY_ROOT(
- &transaction->delayed_refs.href_root.rb_root));
+ WARN_ON(!xa_empty(&transaction->delayed_refs.head_refs));
WARN_ON(!xa_empty(&transaction->delayed_refs.dirty_extents));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
@@ -349,9 +348,8 @@ loop:
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
- cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
+ xa_init(&cur_trans->delayed_refs.head_refs);
xa_init(&cur_trans->delayed_refs.dirty_extents);
- atomic_set(&cur_trans->delayed_refs.num_entries, 0);
/*
* although the tree mod log is per file system and not per transaction,
@@ -2052,7 +2050,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
spin_unlock(&fs_info->trans_lock);
- btrfs_cleanup_one_transaction(trans->transaction, fs_info);
+ btrfs_cleanup_one_transaction(trans->transaction);
spin_lock(&fs_info->trans_lock);
if (cur_trans == fs_info->running_transaction)
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index dd9ce9b9f69e..184fa5c0062a 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -33,7 +33,7 @@ struct btrfs_path;
*/
#define BTRFS_TRANS_DIO_WRITE_STUB ((void *) 1)
-/* Radix-tree tag for roots that are part of the trasaction. */
+/* Radix-tree tag for roots that are part of the transaction. */
#define BTRFS_ROOT_TRANS_TAG 0
enum btrfs_trans_state {
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 7b50263723bc..148d8cefa40e 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -2183,8 +2183,8 @@ int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
return 0;
}
-int btrfs_verify_level_key(struct extent_buffer *eb, int level,
- struct btrfs_key *first_key, u64 parent_transid)
+int btrfs_verify_level_key(struct extent_buffer *eb,
+ const struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int found_level;
@@ -2192,16 +2192,16 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
int ret;
found_level = btrfs_header_level(eb);
- if (found_level != level) {
+ if (found_level != check->level) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree level check failed\n");
btrfs_err(fs_info,
"tree level mismatch detected, bytenr=%llu level expected=%u has=%u",
- eb->start, level, found_level);
+ eb->start, check->level, found_level);
return -EIO;
}
- if (!first_key)
+ if (!check->has_first_key)
return 0;
/*
@@ -2226,15 +2226,15 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
btrfs_item_key_to_cpu(eb, &found_key, 0);
- ret = btrfs_comp_cpu_keys(first_key, &found_key);
+ ret = btrfs_comp_cpu_keys(&check->first_key, &found_key);
if (ret) {
WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
KERN_ERR "BTRFS: tree first key check failed\n");
btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
- eb->start, parent_transid, first_key->objectid,
- first_key->type, first_key->offset,
+ eb->start, check->transid, check->first_key.objectid,
+ check->first_key.type, check->first_key.offset,
found_key.objectid, found_key.type,
found_key.offset);
}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 01669cfa6578..db67f96cbe4b 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -69,7 +69,7 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
-int btrfs_verify_level_key(struct extent_buffer *eb, int level,
- struct btrfs_key *first_key, u64 parent_transid);
+int btrfs_verify_level_key(struct extent_buffer *eb,
+ const struct btrfs_tree_parent_check *check);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e2ed2a791f8f..c8d6587688b3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1374,7 +1374,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
struct inode *inode = NULL;
unsigned long ref_ptr;
unsigned long ref_end;
- struct fscrypt_str name;
+ struct fscrypt_str name = { 0 };
int ret;
int log_ref_ver = 0;
u64 parent_objectid;
@@ -1845,7 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_dir_item *di,
struct btrfs_key *key)
{
- struct fscrypt_str name;
+ struct fscrypt_str name = { 0 };
struct btrfs_dir_item *dir_dst_di;
struct btrfs_dir_item *index_dst_di;
bool dir_dst_matches = false;
@@ -2125,7 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
int slot;
struct btrfs_dir_item *di;
- struct fscrypt_str name;
+ struct fscrypt_str name = { 0 };
struct inode *inode = NULL;
struct btrfs_key location;
@@ -6204,7 +6204,6 @@ static int log_delayed_deletions_full(struct btrfs_trans_handle *trans,
static int batch_delete_dir_index_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *path,
- struct btrfs_log_ctx *ctx,
const struct list_head *delayed_del_list,
const struct btrfs_delayed_item *first,
const struct btrfs_delayed_item **last_ret)
@@ -6265,7 +6264,7 @@ static int log_delayed_deletions_incremental(struct btrfs_trans_handle *trans,
if (ret < 0) {
return ret;
} else if (ret == 0) {
- ret = batch_delete_dir_index_items(trans, inode, path, ctx,
+ ret = batch_delete_dir_index_items(trans, inode, path,
delayed_del_list, curr,
&last);
if (ret)
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
index b382a4c443d4..1ac2678fc4ca 100644
--- a/fs/btrfs/tree-mod-log.c
+++ b/fs/btrfs/tree-mod-log.c
@@ -909,7 +909,6 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
* is freed (its refcount is decremented).
*/
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
struct extent_buffer *eb,
u64 time_seq)
{
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index 6308c577a4a4..1c12566040db 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -41,7 +41,6 @@ int btrfs_tree_mod_log_insert_key(const struct extent_buffer *eb, int slot,
enum btrfs_mod_log_op op);
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
struct extent_buffer *eb,
u64 time_seq);
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8f340ad1d938..1cccaf9c2b0d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -733,6 +733,114 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
}
/*
+ * We can have very weird soft links passed in.
+ * One example is "/proc/self/fd/<fd>", which can be a soft link to
+ * a block device.
+ *
+ * But it's never a good idea to use those weird names.
+ * Here we check if the path (not following symlinks) is a good one inside
+ * "/dev/".
+ */
+static bool is_good_dev_path(const char *dev_path)
+{
+ struct path path = { .mnt = NULL, .dentry = NULL }; /* starts empty: path_put() at out: also runs when no lookup happened */
+ char *path_buf = NULL; /* scratch buffer handed to d_path() */
+ char *resolved_path;
+ bool is_good = false; /* default answer: every failure below reports "not good" */
+ int ret;
+
+ if (!dev_path)
+ goto out;
+
+ path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!path_buf)
+ goto out;
+
+ /*
+ * Do not follow soft link, just check if the original path is inside
+ * "/dev/".
+ */
+ ret = kern_path(dev_path, 0, &path); /* flags == 0, i.e. no LOOKUP_FOLLOW */
+ if (ret)
+ goto out;
+ resolved_path = d_path(&path, path_buf, PATH_MAX);
+ if (IS_ERR(resolved_path))
+ goto out;
+ if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) /* prefix compare only; non-zero means not under "/dev/" */
+ goto out;
+ is_good = true;
+out: /* shared exit for success and all failures */
+ kfree(path_buf);
+ path_put(&path);
+ return is_good;
+}
+
+/*
+ * Resolve @dev_path, following symlinks, and copy the resulting path string
+ * into @canonical. The copy is bounded by PATH_MAX, so the caller must pass
+ * a buffer of at least PATH_MAX bytes.
+ *
+ * Returns the strscpy() result on success, or a negative errno when
+ * @dev_path is NULL (-EINVAL), the scratch buffer cannot be allocated
+ * (-ENOMEM), the lookup fails, d_path() fails, or the copy fails.
+ */
+static int get_canonical_dev_path(const char *dev_path, char *canonical)
+{
+ struct path path = { .mnt = NULL, .dentry = NULL };
+ char *path_buf = NULL;
+ char *resolved_path;
+ int ret;
+
+ if (!dev_path) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ path_buf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!path_buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = kern_path(dev_path, LOOKUP_FOLLOW, &path);
+ if (ret)
+ goto out;
+ resolved_path = d_path(&path, path_buf, PATH_MAX);
+ /* d_path() reports failure as an ERR_PTR; never hand that to strscpy(). */
+ if (IS_ERR(resolved_path)) {
+ ret = PTR_ERR(resolved_path);
+ goto out;
+ }
+ ret = strscpy(canonical, resolved_path, PATH_MAX);
+out:
+ /* Shared exit: path is still empty if the lookup never ran or failed. */
+ kfree(path_buf);
+ path_put(&path);
+ return ret;
+}
+
+static bool is_same_device(struct btrfs_device *device, const char *new_path)
+{
+ struct path old = { .mnt = NULL, .dentry = NULL }; /* both start empty: path_put() at out: runs on every exit */
+ struct path new = { .mnt = NULL, .dentry = NULL };
+ char *old_path = NULL; /* private copy of the device's recorded name */
+ bool is_same = false; /* default answer: every failure below reports "not the same" */
+ int ret;
+
+ if (!device->name)
+ goto out;
+
+ old_path = kzalloc(PATH_MAX, GFP_NOFS);
+ if (!old_path)
+ goto out;
+
+ rcu_read_lock(); /* device->name is read through rcu_str_deref(), so copy it out under RCU */
+ ret = strscpy(old_path, rcu_str_deref(device->name), PATH_MAX);
+ rcu_read_unlock();
+ if (ret < 0) /* strscpy() failed (presumably the name did not fit in PATH_MAX) */
+ goto out;
+
+ ret = kern_path(old_path, LOOKUP_FOLLOW, &old); /* both lookups follow symlinks */
+ if (ret)
+ goto out;
+ ret = kern_path(new_path, LOOKUP_FOLLOW, &new);
+ if (ret)
+ goto out;
+ if (path_equal(&old, &new)) /* compare the resolved paths, not the name strings */
+ is_same = true;
+out: /* shared exit for success and all failures */
+ kfree(old_path);
+ path_put(&old);
+ path_put(&new);
+ return is_same;
+}
+
+/*
* Add new device to list of registered devices
*
* Returns:
@@ -852,7 +960,7 @@ static noinline struct btrfs_device *device_list_add(const char *path,
MAJOR(path_devt), MINOR(path_devt),
current->comm, task_pid_nr(current));
- } else if (!device->name || strcmp(device->name->str, path)) {
+ } else if (!device->name || !is_same_device(device, path)) {
/*
* When FS is already mounted.
* 1. If you are here and if the device->name is NULL that
@@ -1105,6 +1213,7 @@ static void btrfs_close_one_device(struct btrfs_device *device)
if (device->bdev) {
fs_devices->open_devices--;
device->bdev = NULL;
+ device->bdev_file = NULL;
}
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
btrfs_destroy_dev_zone_info(device);
@@ -1382,12 +1491,23 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
bool new_device_added = false;
struct btrfs_device *device = NULL;
struct file *bdev_file;
+ char *canonical_path = NULL;
u64 bytenr;
dev_t devt;
int ret;
lockdep_assert_held(&uuid_mutex);
+ if (!is_good_dev_path(path)) {
+ canonical_path = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (canonical_path) {
+ ret = get_canonical_dev_path(path, canonical_path);
+ if (ret < 0) {
+ kfree(canonical_path);
+ canonical_path = NULL;
+ }
+ }
+ }
/*
* Avoid an exclusive open here, as the systemd-udev may initiate the
* device scan which may race with the user's mount or mkfs command,
@@ -1432,7 +1552,8 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
goto free_disk_super;
}
- device = device_list_add(path, disk_super, &new_device_added);
+ device = device_list_add(canonical_path ? : path, disk_super,
+ &new_device_added);
if (!IS_ERR(device) && new_device_added)
btrfs_free_stale_devices(device->devt, device);
@@ -1441,6 +1562,7 @@ free_disk_super:
error_bdev_put:
fput(bdev_file);
+ kfree(canonical_path);
return device;
}
@@ -2720,8 +2842,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
if (seeding_dev) {
- btrfs_clear_sb_rdonly(sb);
-
/* GFP_KERNEL allocation must not be under device_list_mutex */
seed_devices = btrfs_init_sprout(fs_info);
if (IS_ERR(seed_devices)) {
@@ -2864,8 +2984,6 @@ error_sysfs:
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
error_trans:
- if (seeding_dev)
- btrfs_set_sb_rdonly(sb);
if (trans)
btrfs_end_transaction(trans);
error_free_zone:
@@ -5309,7 +5427,7 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
- /* stripe_size is fixed in zoned filesysmte. Reduce ndevs instead. */
+ /* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
ctl->stripe_size) + ctl->nparity,
@@ -5841,24 +5959,6 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
return len;
}
-int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
-{
- struct btrfs_chunk_map *map;
- int ret = 0;
-
- if (!btrfs_fs_incompat(fs_info, RAID56))
- return 0;
-
- map = btrfs_get_chunk_map(fs_info, logical, len);
-
- if (!WARN_ON(IS_ERR(map))) {
- if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
- ret = 1;
- btrfs_free_chunk_map(map);
- }
- return ret;
-}
-
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
@@ -5919,9 +6019,9 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
return preferred_mirror;
}
-static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
- u64 logical,
- u16 total_stripes)
+EXPORT_FOR_TESTS
+struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical, u16 total_stripes)
{
struct btrfs_io_context *bioc;
@@ -6480,13 +6580,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
max_len = btrfs_max_io_len(map, map_offset, &io_geom);
*length = min_t(u64, map->chunk_len - map_offset, max_len);
- down_read(&dev_replace->rwsem);
+ if (dev_replace->replace_task != current)
+ down_read(&dev_replace->rwsem);
+
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
/*
* Hold the semaphore for read during the whole operation, write is
* requested at commit time but must wait.
*/
- if (!dev_replace_is_ongoing)
+ if (!dev_replace_is_ongoing && dev_replace->replace_task != current)
up_read(&dev_replace->rwsem);
switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
@@ -6626,7 +6728,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
bioc->mirror_num = io_geom.mirror_num;
out:
- if (dev_replace_is_ongoing) {
+ if (dev_replace_is_ongoing && dev_replace->replace_task != current) {
lockdep_assert_held(&dev_replace->rwsem);
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 4481575dd70f..3a416b1bc24c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -306,7 +306,7 @@ enum btrfs_read_policy {
BTRFS_NR_READ_POLICY,
};
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/*
* Checksum mode - offload it to workqueues or do it synchronously in
* btrfs_submit_chunk().
@@ -430,7 +430,7 @@ struct btrfs_fs_devices {
/* Policy used to read the mirrored stripes. */
enum btrfs_read_policy read_policy;
-#ifdef CONFIG_BTRFS_DEBUG
+#ifdef CONFIG_BTRFS_EXPERIMENTAL
/* Checksum mode - offload it or do it synchronously. */
enum btrfs_offload_csum_mode offload_csum_mode;
#endif
@@ -741,8 +741,6 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans);
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev);
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev);
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev);
-int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
- u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
@@ -840,4 +838,9 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb);
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
+ u64 logical, u16 total_stripes);
+#endif
+
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index ce464cd8e0ac..bc18710d1dcf 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -85,7 +85,6 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
{
struct btrfs_dir_item *di = NULL;
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
size_t name_len = strlen(name);
int ret = 0;
@@ -143,14 +142,14 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode,
*/
ret = 0;
btrfs_assert_tree_write_locked(path->nodes[0]);
- di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ di = btrfs_match_dir_item_name(path, name, name_len);
if (!di && !(flags & XATTR_REPLACE)) {
ret = -ENOSPC;
goto out;
}
} else if (ret == -EEXIST) {
ret = 0;
- di = btrfs_match_dir_item_name(fs_info, path, name, name_len);
+ di = btrfs_match_dir_item_name(path, name, name_len);
ASSERT(di); /* logic error */
} else if (ret) {
goto out;
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 100abc00b794..ddf0d5a448a7 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -194,7 +194,7 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping,
pg_off = offset_in_page(start);
cur_len = btrfs_calc_input_length(orig_end, start);
data_in = kmap_local_folio(in_folio, pg_off);
- start += PAGE_SIZE;
+ start += cur_len;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = cur_len;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 69d03feea4e0..11ed523e528e 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -707,11 +707,14 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* zoned mode. In this case, we don't have a valid max zone
* append size.
*/
- if (bdev_is_zoned(device->bdev)) {
- blk_stack_limits(lim,
- &bdev_get_queue(device->bdev)->limits,
- 0);
- }
+ if (bdev_is_zoned(device->bdev))
+ blk_stack_limits(lim, bdev_limits(device->bdev), 0);
+ }
+
+ ret = blk_validate_limits(lim);
+ if (ret) {
+ btrfs_err(fs_info, "zoned: failed to validate queue limits");
+ return ret;
}
/*
@@ -1739,7 +1742,7 @@ bool btrfs_use_zone_append(struct btrfs_bio *bbio)
return false;
/*
- * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the
+ * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
* extent layout the relocation code has.
* Furthermore we have set aside own block-group from which only the
* relocation "process" can allocate and make sure only one process at a
@@ -1973,7 +1976,7 @@ int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
if (block_group->meta_write_pointer > eb->start)
return -EBUSY;
- /* If for_sync, this hole will be filled with trasnsaction commit. */
+ /* If for_sync, this hole will be filled with transaction commit. */
if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
return -EAGAIN;
return -EBUSY;
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 866607fd3e58..5232b56d5892 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -111,6 +111,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
+ ASSERT(timer == &wsm.timer);
+
spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
@@ -495,7 +497,7 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping,
/* Check if we need more input */
if (workspace->in_buf.pos == workspace->in_buf.size) {
- tot_in += PAGE_SIZE;
+ tot_in += workspace->in_buf.size;
kunmap_local(workspace->in_buf.src);
workspace->in_buf.src = NULL;
folio_put(in_folio);
diff --git a/fs/buffer.c b/fs/buffer.c
index 1fc9a50def0b..bb4a31b9559d 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1649,6 +1649,7 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
if (length == folio_size(folio))
filemap_release_folio(folio, 0);
out:
+ folio_clear_mappedtodisk(folio);
return;
}
EXPORT_SYMBOL(block_invalidate_folio);
@@ -2803,7 +2804,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_write_hint = write_hint;
- __bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+ bio_add_folio_nofail(bio, bh->b_folio, bh->b_size, bh_offset(bh));
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
@@ -2813,7 +2814,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
if (wbc) {
wbc_init_bio(wbc, bio);
- wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
+ wbc_account_cgroup_owner(wbc, bh->b_folio, bh->b_size);
}
submit_bio(bio);
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 35ba2117a6f6..3e63cfe15874 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -327,6 +327,8 @@ static void cachefiles_commit_object(struct cachefiles_object *object,
static void cachefiles_clean_up_object(struct cachefiles_object *object,
struct cachefiles_cache *cache)
{
+ struct file *file;
+
if (test_bit(FSCACHE_COOKIE_RETIRED, &object->cookie->flags)) {
if (!test_bit(CACHEFILES_OBJECT_USING_TMPFILE, &object->flags)) {
cachefiles_see_object(object, cachefiles_obj_see_clean_delete);
@@ -342,10 +344,14 @@ static void cachefiles_clean_up_object(struct cachefiles_object *object,
}
cachefiles_unmark_inode_in_use(object, object->file);
- if (object->file) {
- fput(object->file);
- object->file = NULL;
- }
+
+ spin_lock(&object->lock);
+ file = object->file;
+ object->file = NULL;
+ spin_unlock(&object->lock);
+
+ if (file)
+ fput(file);
}
/*
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 2b3f9935dbb4..7cf59713f0f7 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -691,11 +691,6 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache,
}
if (!d_is_negative(dentry)) {
- if (d_backing_inode(dentry) == file_inode(object->file)) {
- success = true;
- goto out_dput;
- }
-
ret = cachefiles_unlink(volume->cache, object, fan, dentry,
FSCACHE_OBJECT_IS_STALE);
if (ret < 0)
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 470c96658385..fe3de9ad57bf 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -60,26 +60,36 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
{
struct cachefiles_object *object = kiocb->ki_filp->private_data;
struct cachefiles_cache *cache = object->volume->cache;
- struct file *file = object->file;
- size_t len = iter->count;
+ struct file *file;
+ size_t len = iter->count, aligned_len = len;
loff_t pos = kiocb->ki_pos;
const struct cred *saved_cred;
int ret;
- if (!file)
+ spin_lock(&object->lock);
+ file = object->file;
+ if (!file) {
+ spin_unlock(&object->lock);
return -ENOBUFS;
+ }
+ get_file(file);
+ spin_unlock(&object->lock);
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true);
+ ret = __cachefiles_prepare_write(object, file, &pos, &aligned_len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0)
- return ret;
+ goto out;
trace_cachefiles_ondemand_fd_write(object, file_inode(file), pos, len);
ret = __cachefiles_write(object, file, pos, iter, NULL, NULL);
- if (!ret)
+ if (!ret) {
ret = len;
+ kiocb->ki_pos += ret;
+ }
+out:
+ fput(file);
return ret;
}
@@ -87,12 +97,22 @@ static loff_t cachefiles_ondemand_fd_llseek(struct file *filp, loff_t pos,
int whence)
{
struct cachefiles_object *object = filp->private_data;
- struct file *file = object->file;
+ struct file *file;
+ loff_t ret;
- if (!file)
+ spin_lock(&object->lock);
+ file = object->file;
+ if (!file) {
+ spin_unlock(&object->lock);
return -ENOBUFS;
+ }
+ get_file(file);
+ spin_unlock(&object->lock);
- return vfs_llseek(file, pos, whence);
+ ret = vfs_llseek(file, pos, whence);
+ fput(file);
+
+ return ret;
}
static long cachefiles_ondemand_fd_ioctl(struct file *filp, unsigned int ioctl,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index c2a9e2cc03de..4c82348fe1e6 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1054,7 +1054,9 @@ get_more_pages:
if (!nr_folios && !locked_pages)
break;
for (i = 0; i < nr_folios && locked_pages < max_pages; i++) {
- page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
+
+ page = &folio->page;
doutc(cl, "? %p idx %lu\n", page, page->index);
if (locked_pages == 0)
lock_page(page); /* first page */
@@ -1081,8 +1083,6 @@ get_more_pages:
continue;
}
if (page_offset(page) >= ceph_wbc.i_size) {
- struct folio *folio = page_folio(page);
-
doutc(cl, "folio at %lu beyond eof %llu\n",
folio->index, ceph_wbc.i_size);
if ((ceph_wbc.size_stable ||
@@ -1098,16 +1098,16 @@ get_more_pages:
unlock_page(page);
break;
}
- if (PageWriteback(page) ||
- PagePrivate2(page) /* [DEPRECATED] */) {
+ if (folio_test_writeback(folio) ||
+ folio_test_private_2(folio) /* [DEPRECATED] */) {
if (wbc->sync_mode == WB_SYNC_NONE) {
- doutc(cl, "%p under writeback\n", page);
- unlock_page(page);
+ doutc(cl, "%p under writeback\n", folio);
+ folio_unlock(folio);
continue;
}
- doutc(cl, "waiting on writeback %p\n", page);
- wait_on_page_writeback(page);
- folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */
+ doutc(cl, "waiting on writeback %p\n", folio);
+ folio_wait_writeback(folio);
+ folio_wait_private_2(folio); /* [DEPRECATED] */
}
if (!clear_page_dirty_for_io(page)) {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 57cc096c498a..c2ddb998f3c9 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -562,8 +562,8 @@ int cdev_device_add(struct cdev *cdev, struct device *dev)
/**
* cdev_device_del() - inverse of cdev_device_add
- * @dev: the device structure
* @cdev: the cdev structure
+ * @dev: the device structure
*
* cdev_device_del() is a helper function to call cdev_del and device_del.
* It should be used whenever cdev_device_add is used.
diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c
index 8f0af4f62631..d5ef5469e4e6 100644
--- a/fs/compat_binfmt_elf.c
+++ b/fs/compat_binfmt_elf.c
@@ -80,6 +80,16 @@
#define ELF_HWCAP2 COMPAT_ELF_HWCAP2
#endif
+#ifdef COMPAT_ELF_HWCAP3
+#undef ELF_HWCAP3
+#define ELF_HWCAP3 COMPAT_ELF_HWCAP3
+#endif
+
+#ifdef COMPAT_ELF_HWCAP4
+#undef ELF_HWCAP4
+#define ELF_HWCAP4 COMPAT_ELF_HWCAP4
+#endif
+
#ifdef COMPAT_ARCH_DLINFO
#undef ARCH_DLINFO
#define ARCH_DLINFO COMPAT_ARCH_DLINFO
diff --git a/fs/coredump.c b/fs/coredump.c
index 45737b43dda5..d48edb37bc35 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -951,6 +951,7 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
} else {
dump_skip(cprm, PAGE_SIZE);
}
+ cond_resched();
}
dump_page_free(dump_page);
return 1;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 206835e31efa..787e9c8938ba 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -22,6 +22,7 @@
#include <crypto/skcipher.h>
#include <linux/key-type.h>
#include <linux/random.h>
+#include <linux/once.h>
#include <linux/seq_file.h>
#include "fscrypt_private.h"
diff --git a/fs/dax.c b/fs/dax.c
index c62acd2812f8..21b47402b3dc 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1262,35 +1262,46 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
{
struct iomap *iomap = &iter->iomap;
const struct iomap *srcmap = iomap_iter_srcmap(iter);
- loff_t pos = iter->pos;
- loff_t length = iomap_length(iter);
+ loff_t copy_pos = iter->pos;
+ u64 copy_len = iomap_length(iter);
+ u32 mod;
int id = 0;
s64 ret = 0;
void *daddr = NULL, *saddr = NULL;
- /* don't bother with blocks that are not shared to start with */
- if (!(iomap->flags & IOMAP_F_SHARED))
- return length;
+ if (!iomap_want_unshare_iter(iter))
+ return iomap_length(iter);
+
+ /*
+ * Extend the file range to be aligned to fsblock/pagesize, because
+ * we need to copy entire blocks, not just the byte range specified.
+ * Invalidate the mapping because we're about to CoW.
+ */
+ mod = offset_in_page(copy_pos);
+ if (mod) {
+ copy_len += mod;
+ copy_pos -= mod;
+ }
+
+ mod = offset_in_page(copy_pos + copy_len);
+ if (mod)
+ copy_len += PAGE_SIZE - mod;
+
+ invalidate_inode_pages2_range(iter->inode->i_mapping,
+ copy_pos >> PAGE_SHIFT,
+ (copy_pos + copy_len - 1) >> PAGE_SHIFT);
id = dax_read_lock();
- ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
+ ret = dax_iomap_direct_access(iomap, copy_pos, copy_len, &daddr, NULL);
if (ret < 0)
goto out_unlock;
- /* zero the distance if srcmap is HOLE or UNWRITTEN */
- if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) {
- memset(daddr, 0, length);
- dax_flush(iomap->dax_dev, daddr, length);
- ret = length;
- goto out_unlock;
- }
-
- ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
+ ret = dax_iomap_direct_access(srcmap, copy_pos, copy_len, &saddr, NULL);
if (ret < 0)
goto out_unlock;
- if (copy_mc_to_kernel(daddr, saddr, length) == 0)
- ret = length;
+ if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0)
+ ret = iomap_length(iter);
else
ret = -EIO;
diff --git a/fs/dcache.c b/fs/dcache.c
index 0f6b16ba30d0..0099077a2982 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -135,6 +135,7 @@ struct dentry_stat_t {
static DEFINE_PER_CPU(long, nr_dentry);
static DEFINE_PER_CPU(long, nr_dentry_unused);
static DEFINE_PER_CPU(long, nr_dentry_negative);
+static int dentry_negative_policy;
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
/* Statistics gathering. */
@@ -199,6 +200,15 @@ static struct ctl_table fs_dcache_sysctls[] = {
.mode = 0444,
.proc_handler = proc_nr_dentry,
},
+ {
+ .procname = "dentry-negative",
+ .data = &dentry_negative_policy,
+ .maxlen = sizeof(dentry_negative_policy),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
+ },
};
static int __init init_fs_dcache_sysctls(void)
@@ -2039,8 +2049,8 @@ EXPORT_SYMBOL(d_obtain_root);
/**
* d_add_ci - lookup or allocate new dentry with case-exact name
- * @inode: the inode case-insensitive lookup has found
* @dentry: the negative dentry that was passed to the parent's lookup func
+ * @inode: the inode case-insensitive lookup has found
* @name: the case-exact name to be associated with the returned dentry
*
* This is to avoid filling the dcache with case-insensitive names to the
@@ -2093,8 +2103,8 @@ EXPORT_SYMBOL(d_add_ci);
/**
* d_same_name - compare dentry name with case-exact name
- * @parent: parent dentry
* @dentry: the negative dentry that was passed to the parent's lookup func
+ * @parent: parent dentry
* @name: the case-exact name to be associated with the returned dentry
*
* Return: true if names are same, or false
@@ -2401,6 +2411,8 @@ void d_delete(struct dentry * dentry)
* Are we the only user?
*/
if (dentry->d_lockref.count == 1) {
+ if (dentry_negative_policy)
+ __d_drop(dentry);
dentry->d_flags &= ~DCACHE_CANT_MOUNT;
dentry_unlink_inode(dentry);
} else {
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 827278525fd9..69536cacdea8 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -328,10 +328,10 @@ out:
* Convert an eCryptfs page index into a lower byte offset
*/
static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
- struct page *page)
+ struct folio *folio)
{
return ecryptfs_lower_header_size(crypt_stat) +
- ((loff_t)page->index << PAGE_SHIFT);
+ (loff_t)folio->index * PAGE_SIZE;
}
/**
@@ -340,6 +340,7 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
* encryption operation
* @dst_page: The page to write the result into
* @src_page: The page to read from
+ * @page_index: The offset in the file (in units of PAGE_SIZE)
* @extent_offset: Page extent offset for use in generating IV
* @op: ENCRYPT or DECRYPT to indicate the desired operation
*
@@ -350,9 +351,9 @@ static loff_t lower_offset_for_page(struct ecryptfs_crypt_stat *crypt_stat,
static int crypt_extent(struct ecryptfs_crypt_stat *crypt_stat,
struct page *dst_page,
struct page *src_page,
+ pgoff_t page_index,
unsigned long extent_offset, int op)
{
- pgoff_t page_index = op == ENCRYPT ? src_page->index : dst_page->index;
loff_t extent_base;
char extent_iv[ECRYPTFS_MAX_IV_BYTES];
struct scatterlist src_sg, dst_sg;
@@ -392,7 +393,7 @@ out:
/**
* ecryptfs_encrypt_page
- * @page: Page mapped from the eCryptfs inode for the file; contains
+ * @folio: Folio mapped from the eCryptfs inode for the file; contains
* decrypted content that needs to be encrypted (to a temporary
* page; not in place) and written out to the lower file
*
@@ -406,7 +407,7 @@ out:
*
* Returns zero on success; negative on error
*/
-int ecryptfs_encrypt_page(struct page *page)
+int ecryptfs_encrypt_page(struct folio *folio)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
@@ -416,7 +417,7 @@ int ecryptfs_encrypt_page(struct page *page)
loff_t lower_offset;
int rc = 0;
- ecryptfs_inode = page->mapping->host;
+ ecryptfs_inode = folio->mapping->host;
crypt_stat =
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
@@ -431,8 +432,9 @@ int ecryptfs_encrypt_page(struct page *page)
for (extent_offset = 0;
extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- rc = crypt_extent(crypt_stat, enc_extent_page, page,
- extent_offset, ENCRYPT);
+ rc = crypt_extent(crypt_stat, enc_extent_page,
+ folio_page(folio, 0), folio->index,
+ extent_offset, ENCRYPT);
if (rc) {
printk(KERN_ERR "%s: Error encrypting extent; "
"rc = [%d]\n", __func__, rc);
@@ -440,7 +442,7 @@ int ecryptfs_encrypt_page(struct page *page)
}
}
- lower_offset = lower_offset_for_page(crypt_stat, page);
+ lower_offset = lower_offset_for_page(crypt_stat, folio);
enc_extent_virt = kmap_local_page(enc_extent_page);
rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset,
PAGE_SIZE);
@@ -461,7 +463,7 @@ out:
/**
* ecryptfs_decrypt_page
- * @page: Page mapped from the eCryptfs inode for the file; data read
+ * @folio: Folio mapped from the eCryptfs inode for the file; data read
* and decrypted from the lower file will be written into this
* page
*
@@ -475,7 +477,7 @@ out:
*
* Returns zero on success; negative on error
*/
-int ecryptfs_decrypt_page(struct page *page)
+int ecryptfs_decrypt_page(struct folio *folio)
{
struct inode *ecryptfs_inode;
struct ecryptfs_crypt_stat *crypt_stat;
@@ -484,13 +486,13 @@ int ecryptfs_decrypt_page(struct page *page)
loff_t lower_offset;
int rc = 0;
- ecryptfs_inode = page->mapping->host;
+ ecryptfs_inode = folio->mapping->host;
crypt_stat =
&(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat);
BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED));
- lower_offset = lower_offset_for_page(crypt_stat, page);
- page_virt = kmap_local_page(page);
+ lower_offset = lower_offset_for_page(crypt_stat, folio);
+ page_virt = kmap_local_folio(folio, 0);
rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE,
ecryptfs_inode);
kunmap_local(page_virt);
@@ -504,8 +506,9 @@ int ecryptfs_decrypt_page(struct page *page)
for (extent_offset = 0;
extent_offset < (PAGE_SIZE / crypt_stat->extent_size);
extent_offset++) {
- rc = crypt_extent(crypt_stat, page, page,
- extent_offset, DECRYPT);
+ struct page *page = folio_page(folio, 0);
+ rc = crypt_extent(crypt_stat, page, page, folio->index,
+ extent_offset, DECRYPT);
if (rc) {
printk(KERN_ERR "%s: Error decrypting extent; "
"rc = [%d]\n", __func__, rc);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index c586c5db18b5..1f562e75d0e4 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -569,8 +569,8 @@ void ecryptfs_destroy_mount_crypt_stat(
struct ecryptfs_mount_crypt_stat *mount_crypt_stat);
int ecryptfs_init_crypt_ctx(struct ecryptfs_crypt_stat *crypt_stat);
int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode);
-int ecryptfs_encrypt_page(struct page *page);
-int ecryptfs_decrypt_page(struct page *page);
+int ecryptfs_encrypt_page(struct folio *folio);
+int ecryptfs_decrypt_page(struct folio *folio);
int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry,
struct inode *ecryptfs_inode);
int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry);
@@ -653,16 +653,15 @@ int ecryptfs_keyring_auth_tok_for_sig(struct key **auth_tok_key,
int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
loff_t offset, size_t size);
int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
- struct page *page_for_lower,
+ struct folio *folio_for_lower,
size_t offset_in_page, size_t size);
int ecryptfs_write(struct inode *inode, char *data, loff_t offset, size_t size);
int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
struct inode *ecryptfs_inode);
-int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
+int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs,
pgoff_t page_index,
size_t offset_in_page, size_t size,
struct inode *ecryptfs_inode);
-struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index);
int ecryptfs_parse_packet_length(unsigned char *data, size_t *size,
size_t *length_size);
int ecryptfs_write_packet_length(char *dest, size_t size,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index cbdf82f0183f..a9819ddb1ab8 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1008,14 +1008,6 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap,
return rc;
}
-static int ecryptfs_do_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
-{
- if (flags & AT_GETATTR_NOSEC)
- return vfs_getattr_nosec(path, stat, request_mask, flags);
- return vfs_getattr(path, stat, request_mask, flags);
-}
-
static int ecryptfs_getattr(struct mnt_idmap *idmap,
const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
@@ -1024,8 +1016,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap,
struct kstat lower_stat;
int rc;
- rc = ecryptfs_do_getattr(ecryptfs_dentry_to_lower_path(dentry),
- &lower_stat, request_mask, flags);
+ rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry),
+ &lower_stat, request_mask, flags);
if (!rc) {
fsstack_copy_attr_all(d_inode(dentry),
ecryptfs_inode_to_lower(d_inode(dentry)));
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index ceda5555971a..60f0ac8744b5 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -23,47 +23,29 @@
#include "ecryptfs_kernel.h"
/*
- * ecryptfs_get_locked_page
- *
- * Get one page from cache or lower f/s, return error otherwise.
- *
- * Returns locked and up-to-date page (if ok), with increased
- * refcnt.
- */
-struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index)
-{
- struct page *page = read_mapping_page(inode->i_mapping, index, NULL);
- if (!IS_ERR(page))
- lock_page(page);
- return page;
-}
-
-/**
- * ecryptfs_writepage
- * @page: Page that is locked before this call is made
- * @wbc: Write-back control structure
- *
- * Returns zero on success; non-zero otherwise
- *
* This is where we encrypt the data and pass the encrypted data to
* the lower filesystem. In OpenPGP-compatible mode, we operate on
* entire underlying packets.
*/
-static int ecryptfs_writepage(struct page *page, struct writeback_control *wbc)
+static int ecryptfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- int rc;
-
- rc = ecryptfs_encrypt_page(page);
- if (rc) {
- ecryptfs_printk(KERN_WARNING, "Error encrypting "
- "page (upper index [0x%.16lx])\n", page->index);
- ClearPageUptodate(page);
- goto out;
+ struct folio *folio = NULL;
+ int error;
+
+ while ((folio = writeback_iter(mapping, wbc, folio, &error))) {
+ error = ecryptfs_encrypt_page(folio);
+ if (error) {
+ ecryptfs_printk(KERN_WARNING,
+ "Error encrypting folio (index [0x%.16lx])\n",
+ folio->index);
+ folio_clear_uptodate(folio);
+ mapping_set_error(mapping, error);
+ }
+ folio_unlock(folio);
}
- SetPageUptodate(page);
-out:
- unlock_page(page);
- return rc;
+
+ return error;
}
static void strip_xattr_flag(char *page_virt,
@@ -97,7 +79,7 @@ static void strip_xattr_flag(char *page_virt,
/**
* ecryptfs_copy_up_encrypted_with_header
- * @page: Sort of a ``virtual'' representation of the encrypted lower
+ * @folio: Sort of a ``virtual'' representation of the encrypted lower
* file. The actual lower file does not have the metadata in
* the header. This is locked.
* @crypt_stat: The eCryptfs inode's cryptographic context
@@ -106,7 +88,7 @@ static void strip_xattr_flag(char *page_virt,
* seeing, with the header information inserted.
*/
static int
-ecryptfs_copy_up_encrypted_with_header(struct page *page,
+ecryptfs_copy_up_encrypted_with_header(struct folio *folio,
struct ecryptfs_crypt_stat *crypt_stat)
{
loff_t extent_num_in_page = 0;
@@ -115,9 +97,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
int rc = 0;
while (extent_num_in_page < num_extents_per_page) {
- loff_t view_extent_num = ((((loff_t)page->index)
+ loff_t view_extent_num = ((loff_t)folio->index
* num_extents_per_page)
- + extent_num_in_page);
+ + extent_num_in_page;
size_t num_header_extents_at_front =
(crypt_stat->metadata_size / crypt_stat->extent_size);
@@ -125,21 +107,21 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
/* This is a header extent */
char *page_virt;
- page_virt = kmap_local_page(page);
+ page_virt = kmap_local_folio(folio, 0);
memset(page_virt, 0, PAGE_SIZE);
/* TODO: Support more than one header extent */
if (view_extent_num == 0) {
size_t written;
rc = ecryptfs_read_xattr_region(
- page_virt, page->mapping->host);
+ page_virt, folio->mapping->host);
strip_xattr_flag(page_virt + 16, crypt_stat);
ecryptfs_write_header_metadata(page_virt + 20,
crypt_stat,
&written);
}
kunmap_local(page_virt);
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
if (rc) {
printk(KERN_ERR "%s: Error reading xattr "
"region; rc = [%d]\n", __func__, rc);
@@ -152,9 +134,9 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page,
- crypt_stat->metadata_size);
rc = ecryptfs_read_lower_page_segment(
- page, (lower_offset >> PAGE_SHIFT),
+ folio, (lower_offset >> PAGE_SHIFT),
(lower_offset & ~PAGE_MASK),
- crypt_stat->extent_size, page->mapping->host);
+ crypt_stat->extent_size, folio->mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
"extent at offset [%lld] in the lower "
@@ -180,55 +162,50 @@ out:
*/
static int ecryptfs_read_folio(struct file *file, struct folio *folio)
{
- struct page *page = &folio->page;
+ struct inode *inode = folio->mapping->host;
struct ecryptfs_crypt_stat *crypt_stat =
- &ecryptfs_inode_to_private(page->mapping->host)->crypt_stat;
- int rc = 0;
+ &ecryptfs_inode_to_private(inode)->crypt_stat;
+ int err = 0;
if (!crypt_stat || !(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
- rc = ecryptfs_read_lower_page_segment(page, page->index, 0,
- PAGE_SIZE,
- page->mapping->host);
+ err = ecryptfs_read_lower_page_segment(folio, folio->index, 0,
+ folio_size(folio), inode);
} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
- rc = ecryptfs_copy_up_encrypted_with_header(page,
- crypt_stat);
- if (rc) {
+ err = ecryptfs_copy_up_encrypted_with_header(folio,
+ crypt_stat);
+ if (err) {
printk(KERN_ERR "%s: Error attempting to copy "
"the encrypted content from the lower "
"file whilst inserting the metadata "
- "from the xattr into the header; rc = "
- "[%d]\n", __func__, rc);
+ "from the xattr into the header; err = "
+ "[%d]\n", __func__, err);
goto out;
}
} else {
- rc = ecryptfs_read_lower_page_segment(
- page, page->index, 0, PAGE_SIZE,
- page->mapping->host);
- if (rc) {
- printk(KERN_ERR "Error reading page; rc = "
- "[%d]\n", rc);
+ err = ecryptfs_read_lower_page_segment(folio,
+ folio->index, 0, folio_size(folio),
+ inode);
+ if (err) {
+ printk(KERN_ERR "Error reading page; err = "
+ "[%d]\n", err);
goto out;
}
}
} else {
- rc = ecryptfs_decrypt_page(page);
- if (rc) {
+ err = ecryptfs_decrypt_page(folio);
+ if (err) {
ecryptfs_printk(KERN_ERR, "Error decrypting page; "
- "rc = [%d]\n", rc);
+ "err = [%d]\n", err);
goto out;
}
}
out:
- if (rc)
- ClearPageUptodate(page);
- else
- SetPageUptodate(page);
- ecryptfs_printk(KERN_DEBUG, "Unlocking page with index = [0x%.16lx]\n",
- page->index);
- unlock_page(page);
- return rc;
+ ecryptfs_printk(KERN_DEBUG, "Unlocking folio with index = [0x%.16lx]\n",
+ folio->index);
+ folio_end_read(folio, err == 0);
+ return err;
}
/*
@@ -285,7 +262,7 @@ static int ecryptfs_write_begin(struct file *file,
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_read_lower_page_segment(
- &folio->page, index, 0, PAGE_SIZE, mapping->host);
+ folio, index, 0, PAGE_SIZE, mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error attempting to read "
"lower page segment; rc = [%d]\n",
@@ -297,7 +274,7 @@ static int ecryptfs_write_begin(struct file *file,
} else if (crypt_stat->flags & ECRYPTFS_VIEW_AS_ENCRYPTED) {
if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) {
rc = ecryptfs_copy_up_encrypted_with_header(
- &folio->page, crypt_stat);
+ folio, crypt_stat);
if (rc) {
printk(KERN_ERR "%s: Error attempting "
"to copy the encrypted content "
@@ -311,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
folio_mark_uptodate(folio);
} else {
rc = ecryptfs_read_lower_page_segment(
- &folio->page, index, 0, PAGE_SIZE,
+ folio, index, 0, PAGE_SIZE,
mapping->host);
if (rc) {
printk(KERN_ERR "%s: Error reading "
@@ -328,7 +305,7 @@ static int ecryptfs_write_begin(struct file *file,
folio_zero_range(folio, 0, PAGE_SIZE);
folio_mark_uptodate(folio);
} else if (len < PAGE_SIZE) {
- rc = ecryptfs_decrypt_page(&folio->page);
+ rc = ecryptfs_decrypt_page(folio);
if (rc) {
printk(KERN_ERR "%s: Error decrypting "
"page at index [%ld]; "
@@ -477,7 +454,7 @@ static int ecryptfs_write_end(struct file *file,
"(page w/ index = [0x%.16lx], to = [%d])\n", index, to);
if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
- &folio->page, 0, to);
+ folio, 0, to);
if (!rc) {
rc = copied;
fsstack_copy_inode_size(ecryptfs_inode,
@@ -499,7 +476,7 @@ static int ecryptfs_write_end(struct file *file,
"zeros in page with index = [0x%.16lx]\n", index);
goto out;
}
- rc = ecryptfs_encrypt_page(&folio->page);
+ rc = ecryptfs_encrypt_page(folio);
if (rc) {
ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper "
"index [0x%.16lx])\n", index);
@@ -548,9 +525,10 @@ const struct address_space_operations ecryptfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
#endif
- .writepage = ecryptfs_writepage,
+ .writepages = ecryptfs_writepages,
.read_folio = ecryptfs_read_folio,
.write_begin = ecryptfs_write_begin,
.write_end = ecryptfs_write_end,
+ .migrate_folio = filemap_migrate_folio,
.bmap = ecryptfs_bmap,
};
diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c
index 3458f153a588..b3b451c2b941 100644
--- a/fs/ecryptfs/read_write.c
+++ b/fs/ecryptfs/read_write.c
@@ -41,30 +41,29 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data,
/**
* ecryptfs_write_lower_page_segment
* @ecryptfs_inode: The eCryptfs inode
- * @page_for_lower: The page containing the data to be written to the
+ * @folio_for_lower: The folio containing the data to be written to the
* lower file
- * @offset_in_page: The offset in the @page_for_lower from which to
+ * @offset_in_page: The offset in the @folio_for_lower from which to
* start writing the data
- * @size: The amount of data from @page_for_lower to write to the
+ * @size: The amount of data from @folio_for_lower to write to the
* lower file
*
* Determines the byte offset in the file for the given page and
* offset within the page, maps the page, and makes the call to write
- * the contents of @page_for_lower to the lower inode.
+ * the contents of @folio_for_lower to the lower inode.
*
* Returns zero on success; non-zero otherwise
*/
int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
- struct page *page_for_lower,
+ struct folio *folio_for_lower,
size_t offset_in_page, size_t size)
{
char *virt;
loff_t offset;
int rc;
- offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT)
- + offset_in_page);
- virt = kmap_local_page(page_for_lower);
+ offset = (loff_t)folio_for_lower->index * PAGE_SIZE + offset_in_page;
+ virt = kmap_local_folio(folio_for_lower, 0);
rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size);
if (rc > 0)
rc = 0;
@@ -93,7 +92,6 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode,
int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
size_t size)
{
- struct page *ecryptfs_page;
struct ecryptfs_crypt_stat *crypt_stat;
char *ecryptfs_page_virt;
loff_t ecryptfs_file_size = i_size_read(ecryptfs_inode);
@@ -111,6 +109,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
else
pos = offset;
while (pos < (offset + size)) {
+ struct folio *ecryptfs_folio;
pgoff_t ecryptfs_page_idx = (pos >> PAGE_SHIFT);
size_t start_offset_in_page = (pos & ~PAGE_MASK);
size_t num_bytes = (PAGE_SIZE - start_offset_in_page);
@@ -130,17 +129,18 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
if (num_bytes > total_remaining_zeros)
num_bytes = total_remaining_zeros;
}
- ecryptfs_page = ecryptfs_get_locked_page(ecryptfs_inode,
- ecryptfs_page_idx);
- if (IS_ERR(ecryptfs_page)) {
- rc = PTR_ERR(ecryptfs_page);
+ ecryptfs_folio = read_mapping_folio(ecryptfs_inode->i_mapping,
+ ecryptfs_page_idx, NULL);
+ if (IS_ERR(ecryptfs_folio)) {
+ rc = PTR_ERR(ecryptfs_folio);
printk(KERN_ERR "%s: Error getting page at "
"index [%ld] from eCryptfs inode "
"mapping; rc = [%d]\n", __func__,
ecryptfs_page_idx, rc);
goto out;
}
- ecryptfs_page_virt = kmap_local_page(ecryptfs_page);
+ folio_lock(ecryptfs_folio);
+ ecryptfs_page_virt = kmap_local_folio(ecryptfs_folio, 0);
/*
* pos: where we're now writing, offset: where the request was
@@ -164,17 +164,17 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset,
data_offset += num_bytes;
}
kunmap_local(ecryptfs_page_virt);
- flush_dcache_page(ecryptfs_page);
- SetPageUptodate(ecryptfs_page);
- unlock_page(ecryptfs_page);
+ flush_dcache_folio(ecryptfs_folio);
+ folio_mark_uptodate(ecryptfs_folio);
+ folio_unlock(ecryptfs_folio);
if (crypt_stat->flags & ECRYPTFS_ENCRYPTED)
- rc = ecryptfs_encrypt_page(ecryptfs_page);
+ rc = ecryptfs_encrypt_page(ecryptfs_folio);
else
rc = ecryptfs_write_lower_page_segment(ecryptfs_inode,
- ecryptfs_page,
+ ecryptfs_folio,
start_offset_in_page,
data_offset);
- put_page(ecryptfs_page);
+ folio_put(ecryptfs_folio);
if (rc) {
printk(KERN_ERR "%s: Error encrypting "
"page; rc = [%d]\n", __func__, rc);
@@ -228,7 +228,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
/**
* ecryptfs_read_lower_page_segment
- * @page_for_ecryptfs: The page into which data for eCryptfs will be
+ * @folio_for_ecryptfs: The folio into which data for eCryptfs will be
* written
* @page_index: Page index in @page_for_ecryptfs from which to start
* writing
@@ -243,7 +243,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size,
*
* Returns zero on success; non-zero otherwise
*/
-int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
+int ecryptfs_read_lower_page_segment(struct folio *folio_for_ecryptfs,
pgoff_t page_index,
size_t offset_in_page, size_t size,
struct inode *ecryptfs_inode)
@@ -252,12 +252,12 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs,
loff_t offset;
int rc;
- offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page);
- virt = kmap_local_page(page_for_ecryptfs);
+ offset = (loff_t)page_index * PAGE_SIZE + offset_in_page;
+ virt = kmap_local_folio(folio_for_ecryptfs, 0);
rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode);
if (rc > 0)
rc = 0;
kunmap_local(virt);
- flush_dcache_page(page_for_ecryptfs);
+ flush_dcache_folio(folio_for_ecryptfs);
return rc;
}
diff --git a/fs/efs/super.c b/fs/efs/super.c
index e4421c10caeb..c59086b7eabf 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -15,7 +15,6 @@
#include <linux/vfs.h>
#include <linux/blkdev.h>
#include <linux/fs_context.h>
-#include <linux/fs_parser.h>
#include "efs.h"
#include <linux/efs_vh.h>
#include <linux/efs_fs_sb.h>
@@ -49,15 +48,6 @@ static struct pt_types sgi_pt_types[] = {
{0, NULL}
};
-enum {
- Opt_explicit_open,
-};
-
-static const struct fs_parameter_spec efs_param_spec[] = {
- fsparam_flag ("explicit-open", Opt_explicit_open),
- {}
-};
-
/*
* File system definition and registration.
*/
@@ -67,7 +57,6 @@ static struct file_system_type efs_fs_type = {
.kill_sb = efs_kill_sb,
.fs_flags = FS_REQUIRES_DEV,
.init_fs_context = efs_init_fs_context,
- .parameters = efs_param_spec,
};
MODULE_ALIAS_FS("efs");
@@ -265,7 +254,8 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc)
if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
pr_err("device does not support %d byte blocks\n",
EFS_BLOCKSIZE);
- return -EINVAL;
+ return invalf(fc, "device does not support %d byte blocks\n",
+ EFS_BLOCKSIZE);
}
/* read the vh (volume header) block */
@@ -327,43 +317,22 @@ static int efs_fill_super(struct super_block *s, struct fs_context *fc)
return 0;
}
-static void efs_free_fc(struct fs_context *fc)
-{
- kfree(fc->fs_private);
-}
-
static int efs_get_tree(struct fs_context *fc)
{
return get_tree_bdev(fc, efs_fill_super);
}
-static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param)
-{
- int token;
- struct fs_parse_result result;
-
- token = fs_parse(fc, efs_param_spec, param, &result);
- if (token < 0)
- return token;
- return 0;
-}
-
static int efs_reconfigure(struct fs_context *fc)
{
sync_filesystem(fc->root->d_sb);
+ fc->sb_flags |= SB_RDONLY;
return 0;
}
-struct efs_context {
- unsigned long s_mount_opts;
-};
-
static const struct fs_context_operations efs_context_opts = {
- .parse_param = efs_parse_param,
.get_tree = efs_get_tree,
.reconfigure = efs_reconfigure,
- .free = efs_free_fc,
};
/*
@@ -371,12 +340,6 @@ static const struct fs_context_operations efs_context_opts = {
*/
static int efs_init_fs_context(struct fs_context *fc)
{
- struct efs_context *ctx;
-
- ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL);
- if (!ctx)
- return -ENOMEM;
- fc->fs_private = ctx;
fc->ops = &efs_context_opts;
return 0;
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 666873f745da..bed3dbe5b7cb 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -191,10 +191,14 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
if (IS_ERR(file))
return PTR_ERR(file);
- dif->file = file;
- if (!erofs_is_fileio_mode(sbi))
+ if (!erofs_is_fileio_mode(sbi)) {
dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file),
&dif->dax_part_off, NULL, NULL);
+ } else if (!S_ISREG(file_inode(file)->i_mode)) {
+ fput(file);
+ return -EINVAL;
+ }
+ dif->file = file;
}
dif->blocks = le32_to_cpu(dis->blocks);
@@ -705,7 +709,9 @@ static int erofs_fc_get_tree(struct fs_context *fc)
if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && sbi->fsid)
return get_tree_nodev(fc, erofs_fc_fill_super);
- ret = get_tree_bdev(fc, erofs_fc_fill_super);
+ ret = get_tree_bdev_flags(fc, erofs_fc_fill_super,
+ IS_ENABLED(CONFIG_EROFS_FS_BACKED_BY_FILE) ?
+ GET_TREE_BDEV_QUIET_LOOKUP : 0);
#ifdef CONFIG_EROFS_FS_BACKED_BY_FILE
if (ret == -ENOTBLK) {
if (!fc->source)
@@ -714,7 +720,10 @@ static int erofs_fc_get_tree(struct fs_context *fc)
if (IS_ERR(sbi->fdev))
return PTR_ERR(sbi->fdev);
- return get_tree_nodev(fc, erofs_fc_fill_super);
+ if (S_ISREG(file_inode(sbi->fdev)->i_mode) &&
+ sbi->fdev->f_mapping->a_ops->read_folio)
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+ fput(sbi->fdev);
}
#endif
return ret;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 8936790618c6..a569ff9dfd04 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -710,24 +710,6 @@ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
return ret;
}
-static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
-{
- struct z_erofs_pcluster *pcl = f->pcl;
- z_erofs_next_pcluster_t *owned_head = &f->owned_head;
-
- /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
- *owned_head) == Z_EROFS_PCLUSTER_NIL) {
- *owned_head = &pcl->next;
- /* so we can attach this pcluster to our submission chain. */
- f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
- return;
- }
-
- /* type 2, it belongs to an ongoing chain */
- f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
-}
-
static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
@@ -803,7 +785,6 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
int ret;
DBG_BUGON(fe->pcl);
-
/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
@@ -823,7 +804,15 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
if (ret == -EEXIST) {
mutex_lock(&fe->pcl->lock);
- z_erofs_try_to_claim_pcluster(fe);
+ /* check if this pcluster hasn't been linked into any chain. */
+ if (cmpxchg(&fe->pcl->next, Z_EROFS_PCLUSTER_NIL,
+ fe->owned_head) == Z_EROFS_PCLUSTER_NIL) {
+ /* .. so it can be attached to our submission chain */
+ fe->owned_head = &fe->pcl->next;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
+ } else { /* otherwise, it belongs to an inflight chain */
+ fe->mode = Z_EROFS_PCLUSTER_INFLIGHT;
+ }
} else if (ret) {
return ret;
}
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 1253a8456e59..a076cca1f547 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -10,8 +10,6 @@
struct z_erofs_maprecorder {
struct inode *inode;
struct erofs_map_blocks *map;
- void *kaddr;
-
unsigned long lcn;
/* compression extent information gathered */
u8 type, headtype;
@@ -33,14 +31,11 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
struct z_erofs_lcluster_index *di;
unsigned int advise;
- m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- pos, EROFS_KMAP);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
-
- m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index);
+ di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP);
+ if (IS_ERR(di))
+ return PTR_ERR(di);
m->lcn = lcn;
- di = m->kaddr;
+ m->nextpackoff = pos + sizeof(struct z_erofs_lcluster_index);
advise = le16_to_cpu(di->di_advise);
m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK;
@@ -53,8 +48,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
- m->compressedblks = m->delta[0] &
- ~Z_EROFS_LI_D0_CBLKCNT;
+ m->compressedblks = m->delta[0] & ~Z_EROFS_LI_D0_CBLKCNT;
m->delta[0] = 1;
}
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
@@ -110,9 +104,9 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
struct erofs_inode *const vi = EROFS_I(m->inode);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
unsigned int vcnt, lo, lobits, encodebits, nblk, bytes;
- int i;
- u8 *in, type;
bool big_pcluster;
+ u8 *in, type;
+ int i;
if (1 << amortizedshift == 4 && lclusterbits <= 14)
vcnt = 2;
@@ -121,6 +115,10 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
+ in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP);
+ if (IS_ERR(in))
+ return PTR_ERR(in);
+
/* it doesn't equal to round_up(..) */
m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
(vcnt << amortizedshift);
@@ -128,9 +126,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U);
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
bytes = pos & ((vcnt << amortizedshift) - 1);
-
- in = m->kaddr - bytes;
-
+ in -= bytes;
i = bytes >> amortizedshift;
lo = decode_compactedbits(lobits, in, encodebits * i, &type);
@@ -255,10 +251,6 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m,
amortizedshift = 2;
out:
pos += lcn * (1 << amortizedshift);
- m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
- pos, EROFS_KMAP);
- if (IS_ERR(m->kaddr))
- return PTR_ERR(m->kaddr);
return unpack_compacted_index(m, amortizedshift, pos, lookahead);
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 22c934f3a080..76129bfcd663 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -347,13 +347,10 @@ EXPORT_SYMBOL_GPL(eventfd_fget);
*/
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
- struct eventfd_ctx *ctx;
- struct fd f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
- ctx = eventfd_ctx_fileget(fd_file(f));
- fdput(f);
- return ctx;
+ return eventfd_ctx_fileget(fd_file(f));
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1ae4542f0bd8..62433cb3d2c2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -823,7 +823,8 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
to_free = NULL;
head = file->f_ep;
if (head->first == &epi->fllink && !epi->fllink.next) {
- file->f_ep = NULL;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, NULL);
if (!is_file_epoll(file)) {
struct epitems_head *v;
v = container_of(head, struct epitems_head, epitems);
@@ -1002,7 +1003,7 @@ static struct file *epi_fget(const struct epitem *epi)
struct file *file;
file = epi->ffd.file;
- if (!atomic_long_inc_not_zero(&file->f_count))
+ if (!file_ref_get(&file->f_ref))
file = NULL;
return file;
}
@@ -1372,7 +1373,10 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
break;
}
}
- wake_up(&ep->wq);
+ if (sync)
+ wake_up_sync(&ep->wq);
+ else
+ wake_up(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
pwake++;
@@ -1603,7 +1607,8 @@ allocate:
spin_unlock(&file->f_lock);
goto allocate;
}
- file->f_ep = head;
+ /* See eventpoll_release() for details. */
+ WRITE_ONCE(file->f_ep, head);
to_free = NULL;
}
hlist_add_head_rcu(&epi->fllink, file->f_ep);
@@ -2254,25 +2259,22 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
{
int error;
int full_check = 0;
- struct fd f, tf;
struct eventpoll *ep;
struct epitem *epi;
struct eventpoll *tep = NULL;
- error = -EBADF;
- f = fdget(epfd);
- if (!fd_file(f))
- goto error_return;
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
+ return -EBADF;
/* Get the "struct file *" for the target file */
- tf = fdget(fd);
- if (!fd_file(tf))
- goto error_fput;
+ CLASS(fd, tf)(fd);
+ if (fd_empty(tf))
+ return -EBADF;
/* The target file descriptor must support poll */
- error = -EPERM;
if (!file_can_poll(fd_file(tf)))
- goto error_tgt_fput;
+ return -EPERM;
/* Check if EPOLLWAKEUP is allowed */
if (ep_op_has_event(op))
@@ -2391,12 +2393,6 @@ error_tgt_fput:
loop_check_gen++;
mutex_unlock(&epnested_mutex);
}
-
- fdput(tf);
-error_fput:
- fdput(f);
-error_return:
-
return error;
}
@@ -2424,8 +2420,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
static int do_epoll_wait(int epfd, struct epoll_event __user *events,
int maxevents, struct timespec64 *to)
{
- int error;
- struct fd f;
struct eventpoll *ep;
/* The maximum number of event must be greater than zero */
@@ -2437,17 +2431,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
return -EFAULT;
/* Get the "struct file *" for the eventpoll file */
- f = fdget(epfd);
- if (!fd_file(f))
+ CLASS(fd, f)(epfd);
+ if (fd_empty(f))
return -EBADF;
/*
* We have to check that the file structure underneath the fd
* the user passed to us _is_ an eventpoll file.
*/
- error = -EINVAL;
if (!is_file_epoll(fd_file(f)))
- goto error_fput;
+ return -EINVAL;
/*
* At this point it is safe to assume that the "private_data" contains
@@ -2456,11 +2449,7 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events,
ep = fd_file(f)->private_data;
/* Time to fish for events ... */
- error = ep_poll(ep, events, maxevents, to);
-
-error_fput:
- fdput(f);
- return error;
+ return ep_poll(ep, events, maxevents, to);
}
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 591fb3f710be..8042ad873808 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -550,7 +550,8 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group,
trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked);
ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO |
(ignore_locked ? REQ_RAHEAD : 0),
- ext4_end_bitmap_read);
+ ext4_end_bitmap_read,
+ ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_EIO));
return bh;
verify:
err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
@@ -577,7 +578,6 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
if (!desc)
return -EFSCORRUPTED;
wait_on_buffer(bh);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO);
if (!buffer_uptodate(bh)) {
ext4_error_err(sb, EIO, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index ef6a3c8f3a9a..02d47a64e8d1 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -418,7 +418,7 @@ struct fname {
__u32 inode;
__u8 name_len;
__u8 file_type;
- char name[];
+ char name[] __counted_by(name_len);
};
/*
@@ -471,14 +471,13 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
struct rb_node **p, *parent = NULL;
struct fname *fname, *new_fn;
struct dir_private_info *info;
- int len;
info = dir_file->private_data;
p = &info->root.rb_node;
/* Create and allocate the fname structure */
- len = sizeof(struct fname) + ent_name->len + 1;
- new_fn = kzalloc(len, GFP_KERNEL);
+ new_fn = kzalloc(struct_size(new_fn, name, ent_name->len + 1),
+ GFP_KERNEL);
if (!new_fn)
return -ENOMEM;
new_fn->hash = hash;
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 44b0d418143c..74f2071189b2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1729,6 +1729,10 @@ struct ext4_sb_info {
*/
struct work_struct s_sb_upd_work;
+ /* Atomic write unit values in bytes */
+ unsigned int s_awu_min;
+ unsigned int s_awu_max;
+
/* Ext4 fast commit sub transaction ID */
atomic_t s_fc_subtid;
@@ -1865,14 +1869,6 @@ static inline bool ext4_simulate_fail(struct super_block *sb,
return false;
}
-static inline void ext4_simulate_fail_bh(struct super_block *sb,
- struct buffer_head *bh,
- unsigned long code)
-{
- if (!IS_ERR(bh) && ext4_simulate_fail(sb, code))
- clear_buffer_uptodate(bh);
-}
-
/*
* Error number codes for s_{first,last}_error_errno
*
@@ -3100,9 +3096,9 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb,
extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb,
sector_t block);
extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io);
+ bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io);
+ bh_end_io_t *end_io, bool simu_fail);
extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait);
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
@@ -3855,6 +3851,12 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
return buffer_uptodate(bh);
}
+static inline bool ext4_inode_can_atomic_write(struct inode *inode)
+{
+
+ return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
+}
+
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 34e25eee6521..a07a98a4b97a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -568,7 +568,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
if (!bh_uptodate_or_lock(bh)) {
trace_ext4_ext_load_extent(inode, pblk, _RET_IP_);
- err = ext4_read_bh(bh, 0, NULL);
+ err = ext4_read_bh(bh, 0, NULL, false);
if (err < 0)
goto errout;
}
@@ -3138,7 +3138,7 @@ static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
return;
ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN, 0);
+ EXTENT_STATUS_WRITTEN, false);
}
/* FIXME!! we need to try to merge to left or right after zero-out */
@@ -4158,7 +4158,7 @@ insert_hole:
/* Put just found gap into cache to speed up subsequent requests */
ext_debug(inode, " -> %u:%u\n", hole_start, len);
ext4_es_insert_extent(inode, hole_start, len, ~0,
- EXTENT_STATUS_HOLE, 0);
+ EXTENT_STATUS_HOLE, false);
/* Update hole_len to reflect hole size after lblk */
if (hole_start != lblk)
@@ -4482,7 +4482,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
int depth = 0;
struct ext4_map_blocks map;
unsigned int credits;
- loff_t epos;
+ loff_t epos, old_size = i_size_read(inode);
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
map.m_lblk = offset;
@@ -4541,6 +4541,11 @@ retry:
if (ext4_update_inode_size(inode, epos) & 0x1)
inode_set_mtime_to_ts(inode,
inode_get_ctime(inode));
+ if (epos > old_size) {
+ pagecache_isize_extended(inode, old_size, epos);
+ ext4_zero_partial_blocks(handle, inode,
+ old_size, epos - old_size);
+ }
}
ret2 = ext4_mark_inode_dirty(handle, inode);
ext4_update_inode_fsync_trans(handle, inode, 1);
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index c786691dabd3..ae29832aab1e 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -848,7 +848,7 @@ out:
*/
void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status, int flags)
+ unsigned int status, bool delalloc_reserve_used)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
@@ -863,8 +863,8 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
return;
- es_debug("add [%u/%u) %llu %x %x to extent status tree of inode %lu\n",
- lblk, len, pblk, status, flags, inode->i_ino);
+ es_debug("add [%u/%u) %llu %x %d to extent status tree of inode %lu\n",
+ lblk, len, pblk, status, delalloc_reserve_used, inode->i_ino);
if (!len)
return;
@@ -945,7 +945,7 @@ error:
resv_used += pending;
if (resv_used)
ext4_da_update_reserve_space(inode, resv_used,
- flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+ delalloc_reserve_used);
if (err1 || err2 || err3 < 0)
goto retry;
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 4424232de298..8f9c008d11e8 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -135,7 +135,8 @@ extern void ext4_es_init_tree(struct ext4_es_tree *tree);
extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status, int flags);
+ unsigned int status,
+ bool delalloc_reserve_used);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index b33664f6ce2a..26c4fc37edcf 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -291,9 +291,9 @@ void ext4_fc_del(struct inode *inode)
return;
restart:
- spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ spin_lock(&sbi->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
+ spin_unlock(&sbi->s_fc_lock);
return;
}
@@ -357,9 +357,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
}
spin_lock(&sbi->s_fc_lock);
is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- if (has_transaction &&
- (!is_ineligible ||
- (is_ineligible && tid_gt(tid, sbi->s_fc_ineligible_tid))))
+ if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
spin_unlock(&sbi->s_fc_lock);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f14aed14b9cf..3bd96c3d4cd0 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -392,8 +392,9 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
*/
if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) &&
pos + size <= i_size_read(inode))
- return size;
- return ext4_handle_inode_extension(inode, pos, size, size);
+ return 0;
+ error = ext4_handle_inode_extension(inode, pos, size, size);
+ return error < 0 ? error : 0;
}
static const struct iomap_dio_ops ext4_dio_write_ops = {
@@ -564,12 +565,9 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
}
ret = ext4_orphan_add(handle, inode);
- if (ret) {
- ext4_journal_stop(handle);
- goto out;
- }
-
ext4_journal_stop(handle);
+ if (ret)
+ goto out;
}
if (ilock_shared && !unwritten)
@@ -599,6 +597,13 @@ out:
ssize_t err;
loff_t endbyte;
+ /*
+ * There is no support for atomic writes on buffered-io yet,
+ * we should never fallback to buffered-io for DIO atomic
+ * writes.
+ */
+ WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC);
+
offset = iocb->ki_pos;
err = ext4_buffered_write_iter(iocb, from);
if (err < 0)
@@ -692,6 +697,20 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (IS_DAX(inode))
return ext4_dax_write_iter(iocb, from);
#endif
+
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ size_t len = iov_iter_count(from);
+ int ret;
+
+ if (len < EXT4_SB(inode->i_sb)->s_awu_min ||
+ len > EXT4_SB(inode->i_sb)->s_awu_max)
+ return -EINVAL;
+
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT)
return ext4_dio_write_iter(iocb, from);
else
@@ -884,6 +903,9 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
return ret;
}
+ if (ext4_inode_can_atomic_write(inode))
+ filp->f_mode |= FMODE_CAN_ATOMIC_WRITE;
+
filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
return dquot_file_open(inode, filp);
}
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index df853c4d3a8c..383c6edea6dd 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -185,6 +185,56 @@ static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr)
return fmr->fmr_physical + fmr->fmr_length;
}
+static int ext4_getfsmap_meta_helper(struct super_block *sb,
+ ext4_group_t agno, ext4_grpblk_t start,
+ ext4_grpblk_t len, void *priv)
+{
+ struct ext4_getfsmap_info *info = priv;
+ struct ext4_fsmap *p;
+ struct ext4_fsmap *tmp;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_fsblk_t fsb, fs_start, fs_end;
+ int error;
+
+ fs_start = fsb = (EXT4_C2B(sbi, start) +
+ ext4_group_first_block_no(sb, agno));
+ fs_end = fs_start + EXT4_C2B(sbi, len);
+
+ /* Return relevant extents from the meta_list */
+ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) {
+ if (p->fmr_physical < info->gfi_next_fsblk) {
+ list_del(&p->fmr_list);
+ kfree(p);
+ continue;
+ }
+ if (p->fmr_physical <= fs_start ||
+ p->fmr_physical + p->fmr_length <= fs_end) {
+ /* Emit the retained free extent record if present */
+ if (info->gfi_lastfree.fmr_owner) {
+ error = ext4_getfsmap_helper(sb, info,
+ &info->gfi_lastfree);
+ if (error)
+ return error;
+ info->gfi_lastfree.fmr_owner = 0;
+ }
+ error = ext4_getfsmap_helper(sb, info, p);
+ if (error)
+ return error;
+ fsb = p->fmr_physical + p->fmr_length;
+ if (info->gfi_next_fsblk < fsb)
+ info->gfi_next_fsblk = fsb;
+ list_del(&p->fmr_list);
+ kfree(p);
+ continue;
+ }
+ }
+ if (info->gfi_next_fsblk < fsb)
+ info->gfi_next_fsblk = fsb;
+
+ return 0;
+}
+
+
/* Transform a blockgroup's free record into a fsmap */
static int ext4_getfsmap_datadev_helper(struct super_block *sb,
ext4_group_t agno, ext4_grpblk_t start,
@@ -539,6 +589,7 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
error = ext4_mballoc_query_range(sb, info->gfi_agno,
EXT4_B2C(sbi, info->gfi_low.fmr_physical),
EXT4_B2C(sbi, info->gfi_high.fmr_physical),
+ ext4_getfsmap_meta_helper,
ext4_getfsmap_datadev_helper, info);
if (error)
goto err;
@@ -560,7 +611,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
/* Report any gaps at the end of the bg */
info->gfi_last = true;
- error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info);
+ error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster + 1,
+ 0, info);
if (error)
goto err;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 7f1a5f90dbbd..21d228073d79 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -193,8 +193,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
* submit the buffer_head for reading
*/
trace_ext4_load_inode_bitmap(sb, block_group);
- ext4_read_bh(bh, REQ_META | REQ_PRIO, ext4_end_bitmap_read);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_IBITMAP_EIO);
+ ext4_read_bh(bh, REQ_META | REQ_PRIO,
+ ext4_end_bitmap_read,
+ ext4_simulate_fail(sb, EXT4_SIM_IBITMAP_EIO));
if (!buffer_uptodate(bh)) {
put_bh(bh);
ext4_error_err(sb, EIO, "Cannot read inode bitmap - "
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 7404f0935c90..7de327fa7b1c 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -170,7 +170,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
}
if (!bh_uptodate_or_lock(bh)) {
- if (ext4_read_bh(bh, 0, NULL) < 0) {
+ if (ext4_read_bh(bh, 0, NULL, false) < 0) {
put_bh(bh);
goto failure;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 54bdd4884fe6..89aade6f45f6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -483,7 +483,7 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, 0);
+ map->m_pblk, status, false);
return retval;
}
@@ -563,8 +563,8 @@ static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, flags);
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk,
+ status, flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE);
return retval;
}
@@ -856,7 +856,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
if (nowait)
return sb_find_get_block(inode->i_sb, map.m_pblk);
- bh = sb_getblk(inode->i_sb, map.m_pblk);
+ /*
+ * Since bh could introduce extra ref count such as referred by
+ * journal_head etc. Try to avoid using __GFP_MOVABLE here
+ * as it may fail the migration when journal_head remains.
+ */
+ bh = getblk_unmovable(inode->i_sb->s_bdev, map.m_pblk,
+ inode->i_sb->s_blocksize);
+
if (unlikely(!bh))
return ERR_PTR(-ENOMEM);
if (map.m_flags & EXT4_MAP_NEW) {
@@ -1307,8 +1314,10 @@ static int ext4_write_end(struct file *file,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
/*
* Don't mark the inode dirty under folio lock. First, it unnecessarily
* makes the holding time of folio lock longer. Second, it forces lock
@@ -1423,8 +1432,10 @@ static int ext4_journalled_write_end(struct file *file,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos && !verity)
+ if (old_size < pos && !verity) {
pagecache_isize_extended(inode, old_size, pos);
+ ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
+ }
if (size_changed) {
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -2985,7 +2996,8 @@ static int ext4_da_do_write_end(struct address_space *mapping,
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
bool disksize_changed = false;
- loff_t new_i_size;
+ loff_t new_i_size, zero_len = 0;
+ handle_t *handle;
if (unlikely(!folio_buffers(folio))) {
folio_unlock(folio);
@@ -3029,18 +3041,21 @@ static int ext4_da_do_write_end(struct address_space *mapping,
folio_unlock(folio);
folio_put(folio);
- if (old_size < pos)
+ if (pos > old_size) {
pagecache_isize_extended(inode, old_size, pos);
+ zero_len = pos - old_size;
+ }
- if (disksize_changed) {
- handle_t *handle;
+ if (!disksize_changed && !zero_len)
+ return copied;
- handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
- ext4_mark_inode_dirty(handle, inode);
- ext4_journal_stop(handle);
- }
+ handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+ if (zero_len)
+ ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
+ ext4_mark_inode_dirty(handle, inode);
+ ext4_journal_stop(handle);
return copied;
}
@@ -3444,17 +3459,34 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset,
return ret;
}
+static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written)
+{
+ /* must be a directio to fall back to buffered */
+ if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) !=
+ (IOMAP_WRITE | IOMAP_DIRECT))
+ return false;
+
+ /* atomic writes are all-or-nothing */
+ if (flags & IOMAP_ATOMIC)
+ return false;
+
+ /* can only try again if we wrote nothing */
+ return written == 0;
+}
+
static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length,
ssize_t written, unsigned flags, struct iomap *iomap)
{
/*
* Check to see whether an error occurred while writing out the data to
- * the allocated blocks. If so, return the magic error code so that we
- * fallback to buffered I/O and attempt to complete the remainder of
- * the I/O. Any blocks that may have been allocated in preparation for
- * the direct I/O will be reused during buffered I/O.
+ * the allocated blocks. If so, return the magic error code for
+ * non-atomic write so that we fallback to buffered I/O and attempt to
+ * complete the remainder of the I/O.
+ * For non-atomic writes, any blocks that may have been
+ * allocated in preparation for the direct I/O will be reused during
+ * buffered I/O. For atomic write, we never fallback to buffered-io.
*/
- if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0)
+ if (ext4_want_directio_fallback(flags, written))
return -ENOTBLK;
return 0;
@@ -4497,10 +4529,10 @@ make_io:
* Read the block from disk.
*/
trace_ext4_load_inode(sb, ino);
- ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL);
+ ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL,
+ ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO));
blk_finish_plug(&plug);
wait_on_buffer(bh);
- ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO);
if (!buffer_uptodate(bh)) {
if (ret_block)
*ret_block = block;
@@ -5426,6 +5458,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
}
if (attr->ia_size != inode->i_size) {
+ /* attach jbd2 jinode for EOF folio tail zeroing */
+ if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
+ oldsize & (inode->i_sb->s_blocksize - 1)) {
+ error = ext4_inode_attach_jinode(inode);
+ if (error)
+ goto err_out;
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
if (IS_ERR(handle)) {
error = PTR_ERR(handle);
@@ -5436,12 +5476,17 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
orphan = 1;
}
/*
- * Update c/mtime on truncate up, ext4_truncate() will
- * update c/mtime in shrink case below
+ * Update c/mtime and tail zero the EOF folio on
+ * truncate up. ext4_truncate() handles the shrink case
+ * below.
*/
- if (!shrink)
+ if (!shrink) {
inode_set_mtime_to_ts(inode,
inode_set_ctime_current(inode));
+ if (oldsize & (inode->i_sb->s_blocksize - 1))
+ ext4_block_truncate_page(handle,
+ inode->i_mapping, oldsize);
+ }
if (shrink)
ext4_fc_track_range(handle, inode,
@@ -5578,6 +5623,18 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path,
}
}
+ if ((request_mask & STATX_WRITE_ATOMIC) && S_ISREG(inode->i_mode)) {
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ unsigned int awu_min = 0, awu_max = 0;
+
+ if (ext4_inode_can_atomic_write(inode)) {
+ awu_min = sbi->s_awu_min;
+ awu_max = sbi->s_awu_max;
+ }
+
+ generic_fill_statx_atomic_writes(stat, awu_min, awu_max);
+ }
+
flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
if (flags & EXT4_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1c77400bd88e..7b9ce71c1c81 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -1330,7 +1330,6 @@ group_extend_out:
case EXT4_IOC_MOVE_EXT: {
struct move_extent me;
- struct fd donor;
int err;
if (!(filp->f_mode & FMODE_READ) ||
@@ -1342,30 +1341,26 @@ group_extend_out:
return -EFAULT;
me.moved_len = 0;
- donor = fdget(me.donor_fd);
- if (!fd_file(donor))
+ CLASS(fd, donor)(me.donor_fd);
+ if (fd_empty(donor))
return -EBADF;
- if (!(fd_file(donor)->f_mode & FMODE_WRITE)) {
- err = -EBADF;
- goto mext_out;
- }
+ if (!(fd_file(donor)->f_mode & FMODE_WRITE))
+ return -EBADF;
if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with bigalloc");
- err = -EOPNOTSUPP;
- goto mext_out;
+ return -EOPNOTSUPP;
} else if (IS_DAX(inode)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with DAX");
- err = -EOPNOTSUPP;
- goto mext_out;
+ return -EOPNOTSUPP;
}
err = mnt_want_write_file(filp);
if (err)
- goto mext_out;
+ return err;
err = ext4_move_extents(filp, fd_file(donor), me.orig_start,
me.donor_start, me.len, &me.moved_len);
@@ -1374,8 +1369,6 @@ group_extend_out:
if (copy_to_user((struct move_extent __user *)arg,
&me, sizeof(me)))
err = -EFAULT;
-mext_out:
- fdput(donor);
return err;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d73e38323879..b25a27c86696 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -5711,7 +5711,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
mb_debug(sb, "%u found", ac->ac_found);
- mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no");
+ mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa));
if (ac->ac_pa)
mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
"group pa" : "inode pa");
@@ -6056,7 +6056,7 @@ static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb,
}
out_dbg:
- mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no");
+ mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret));
return ret;
}
@@ -6999,13 +6999,14 @@ int
ext4_mballoc_query_range(
struct super_block *sb,
ext4_group_t group,
- ext4_grpblk_t start,
+ ext4_grpblk_t first,
ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn meta_formatter,
ext4_mballoc_query_range_fn formatter,
void *priv)
{
void *bitmap;
- ext4_grpblk_t next;
+ ext4_grpblk_t start, next;
struct ext4_buddy e4b;
int error;
@@ -7016,10 +7017,19 @@ ext4_mballoc_query_range(
ext4_lock_group(sb, group);
- start = max(e4b.bd_info->bb_first_free, start);
+ start = max(e4b.bd_info->bb_first_free, first);
if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
-
+ if (meta_formatter && start != first) {
+ if (start > end)
+ start = end;
+ ext4_unlock_group(sb, group);
+ error = meta_formatter(sb, group, first, start - first,
+ priv);
+ if (error)
+ goto out_unload;
+ ext4_lock_group(sb, group);
+ }
while (start <= end) {
start = mb_find_next_zero_bit(bitmap, end + 1, start);
if (start > end)
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d8553f1498d3..f8280de3e882 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -259,6 +259,7 @@ ext4_mballoc_query_range(
ext4_group_t agno,
ext4_grpblk_t start,
ext4_grpblk_t end,
+ ext4_mballoc_query_range_fn meta_formatter,
ext4_mballoc_query_range_fn formatter,
void *priv);
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index bd946d0c71b7..d64c04ed061a 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -94,7 +94,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
}
lock_buffer(*bh);
- ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL);
+ ret = ext4_read_bh(*bh, REQ_META | REQ_PRIO, NULL, false);
if (ret)
goto warn_exit;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index b64661ea6e0e..898443e98efc 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -213,7 +213,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to)
unlock_buffer(bh);
continue;
}
- ext4_read_bh_nowait(bh, 0, NULL);
+ ext4_read_bh_nowait(bh, 0, NULL, false);
nr++;
} while (block++, (bh = bh->b_this_page) != head);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 790db7eac6c2..bcf2737078b8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1747,7 +1747,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
#endif
frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame))
- return (struct buffer_head *) frame;
+ return ERR_CAST(frame);
do {
block = dx_get_block(frame->at);
bh = ext4_read_dirblock(dir, block, DIRENT_HTREE);
@@ -1952,7 +1952,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
if (IS_ERR(bh2)) {
brelse(*bh);
*bh = NULL;
- return (struct ext4_dir_entry_2 *) bh2;
+ return ERR_CAST(bh2);
}
BUFFER_TRACE(*bh, "get_write_access");
@@ -2000,8 +2000,17 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
else
split = count/2;
+ if (WARN_ON_ONCE(split == 0)) {
+ /* Should never happen, but avoid out-of-bounds access below */
+ ext4_error_inode_block(dir, (*bh)->b_blocknr, 0,
+ "bad indexed directory? hash=%08x:%08x count=%d move=%u",
+ hinfo->hash, hinfo->minor_hash, count, move);
+ err = -EFSCORRUPTED;
+ goto out;
+ }
+
hash2 = map[split].hash;
- continued = split > 0 ? hash2 == map[split - 1].hash : 0;
+ continued = hash2 == map[split - 1].hash;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
(unsigned long)dx_get_block(frame->at),
hash2, split, count-split));
@@ -2043,10 +2052,11 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
return de;
journal_error:
+ ext4_std_error(dir->i_sb, err);
+out:
brelse(*bh);
brelse(bh2);
*bh = NULL;
- ext4_std_error(dir->i_sb, err);
return ERR_PTR(err);
}
@@ -2395,11 +2405,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
-#if IS_ENABLED(CONFIG_UNICODE)
- if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) &&
- utf8_validate(sb->s_encoding, &dentry->d_name))
+ if (!generic_ci_validate_strict_name(dir, &dentry->d_name))
return -EINVAL;
-#endif
retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname);
if (retval)
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index ad5543866d21..69b8a7221a2b 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -417,11 +417,13 @@ static void io_submit_add_bh(struct ext4_io_submit *io,
submit_and_retry:
ext4_io_submit(io);
}
- if (io->io_bio == NULL)
+ if (io->io_bio == NULL) {
io_submit_init_bio(io, bh);
+ io->io_bio->bi_write_hint = inode->i_write_hint;
+ }
if (!bio_add_folio(io->io_bio, io_folio, bh->b_size, bh_offset(bh)))
goto submit_and_retry;
- wbc_account_cgroup_owner(io->io_wbc, &folio->page, bh->b_size);
+ wbc_account_cgroup_owner(io->io_wbc, folio, bh->b_size);
io->io_next_block++;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index a2704f064361..72f77f78ae8d 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1300,7 +1300,7 @@ static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
if (unlikely(!bh))
return NULL;
if (!bh_uptodate_or_lock(bh)) {
- if (ext4_read_bh(bh, 0, NULL) < 0) {
+ if (ext4_read_bh(bh, 0, NULL, false) < 0) {
brelse(bh);
return NULL;
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 16a4ce704460..785809f33ff4 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -161,8 +161,14 @@ MODULE_ALIAS("ext3");
static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io)
+ bh_end_io_t *end_io, bool simu_fail)
{
+ if (simu_fail) {
+ clear_buffer_uptodate(bh);
+ unlock_buffer(bh);
+ return;
+ }
+
/*
* buffer's verified bit is no longer valid after reading from
* disk again due to write out error, clear it to make sure we
@@ -176,7 +182,7 @@ static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
}
void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
- bh_end_io_t *end_io)
+ bh_end_io_t *end_io, bool simu_fail)
{
BUG_ON(!buffer_locked(bh));
@@ -184,10 +190,11 @@ void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags,
unlock_buffer(bh);
return;
}
- __ext4_read_bh(bh, op_flags, end_io);
+ __ext4_read_bh(bh, op_flags, end_io, simu_fail);
}
-int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io)
+int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags,
+ bh_end_io_t *end_io, bool simu_fail)
{
BUG_ON(!buffer_locked(bh));
@@ -196,7 +203,7 @@ int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io
return 0;
}
- __ext4_read_bh(bh, op_flags, end_io);
+ __ext4_read_bh(bh, op_flags, end_io, simu_fail);
wait_on_buffer(bh);
if (buffer_uptodate(bh))
@@ -208,10 +215,10 @@ int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
{
lock_buffer(bh);
if (!wait) {
- ext4_read_bh_nowait(bh, op_flags, NULL);
+ ext4_read_bh_nowait(bh, op_flags, NULL, false);
return 0;
}
- return ext4_read_bh(bh, op_flags, NULL);
+ return ext4_read_bh(bh, op_flags, NULL, false);
}
/*
@@ -266,7 +273,7 @@ void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block)
if (likely(bh)) {
if (trylock_buffer(bh))
- ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
+ ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL, false);
brelse(bh);
}
}
@@ -346,9 +353,9 @@ __u32 ext4_free_group_clusters(struct super_block *sb,
__u32 ext4_free_inodes_count(struct super_block *sb,
struct ext4_group_desc *bg)
{
- return le16_to_cpu(bg->bg_free_inodes_count_lo) |
+ return le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_lo)) |
(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
- (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+ (__u32)le16_to_cpu(READ_ONCE(bg->bg_free_inodes_count_hi)) << 16 : 0);
}
__u32 ext4_used_dirs_count(struct super_block *sb,
@@ -402,9 +409,9 @@ void ext4_free_group_clusters_set(struct super_block *sb,
void ext4_free_inodes_set(struct super_block *sb,
struct ext4_group_desc *bg, __u32 count)
{
- bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count);
+ WRITE_ONCE(bg->bg_free_inodes_count_lo, cpu_to_le16((__u16)count));
if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT)
- bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16);
+ WRITE_ONCE(bg->bg_free_inodes_count_hi, cpu_to_le16(count >> 16));
}
void ext4_used_dirs_set(struct super_block *sb,
@@ -2096,16 +2103,16 @@ static int ext4_parse_test_dummy_encryption(const struct fs_parameter *param,
}
#define EXT4_SET_CTX(name) \
-static inline void ctx_set_##name(struct ext4_fs_context *ctx, \
- unsigned long flag) \
+static inline __maybe_unused \
+void ctx_set_##name(struct ext4_fs_context *ctx, unsigned long flag) \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name |= flag; \
}
#define EXT4_CLEAR_CTX(name) \
-static inline void ctx_clear_##name(struct ext4_fs_context *ctx, \
- unsigned long flag) \
+static inline __maybe_unused \
+void ctx_clear_##name(struct ext4_fs_context *ctx, unsigned long flag) \
{ \
ctx->mask_s_##name |= flag; \
ctx->vals_s_##name &= ~flag; \
@@ -3030,6 +3037,9 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PUTS("mb_optimize_scan=1");
}
+ if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS))
+ SEQ_OPTS_PUTS("prefetch_block_bitmaps");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -3709,12 +3719,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ret = 1;
if (!ret) {
- start_time = ktime_get_real_ns();
+ start_time = ktime_get_ns();
ret = ext4_init_inode_table(sb, group,
elr->lr_timeout ? 0 : 1);
trace_ext4_lazy_itable_init(sb, group);
if (elr->lr_timeout == 0) {
- elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) *
+ elr->lr_timeout = nsecs_to_jiffies((ktime_get_ns() - start_time) *
EXT4_SB(elr->lr_super)->s_li_wait_mult);
}
elr->lr_next_sched = jiffies + elr->lr_timeout;
@@ -3774,8 +3784,9 @@ static int ext4_lazyinit_thread(void *arg)
cont_thread:
while (true) {
- next_wakeup = MAX_JIFFY_OFFSET;
+ bool next_wakeup_initialized = false;
+ next_wakeup = 0;
mutex_lock(&eli->li_list_mtx);
if (list_empty(&eli->li_request_list)) {
mutex_unlock(&eli->li_list_mtx);
@@ -3788,8 +3799,11 @@ cont_thread:
lr_request);
if (time_before(jiffies, elr->lr_next_sched)) {
- if (time_before(elr->lr_next_sched, next_wakeup))
+ if (!next_wakeup_initialized ||
+ time_before(elr->lr_next_sched, next_wakeup)) {
next_wakeup = elr->lr_next_sched;
+ next_wakeup_initialized = true;
+ }
continue;
}
if (down_read_trylock(&elr->lr_super->s_umount)) {
@@ -3817,16 +3831,18 @@ cont_thread:
elr->lr_next_sched = jiffies +
get_random_u32_below(EXT4_DEF_LI_MAX_START_DELAY * HZ);
}
- if (time_before(elr->lr_next_sched, next_wakeup))
+ if (!next_wakeup_initialized ||
+ time_before(elr->lr_next_sched, next_wakeup)) {
next_wakeup = elr->lr_next_sched;
+ next_wakeup_initialized = true;
+ }
}
mutex_unlock(&eli->li_list_mtx);
try_to_freeze();
cur = jiffies;
- if ((time_after_eq(cur, next_wakeup)) ||
- (MAX_JIFFY_OFFSET == next_wakeup)) {
+ if (!next_wakeup_initialized || time_after_eq(cur, next_wakeup)) {
cond_resched();
continue;
}
@@ -4425,6 +4441,36 @@ static int ext4_handle_clustersize(struct super_block *sb)
return 0;
}
+/*
+ * ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
+ * @sb: super block
+ * TODO: Later add support for bigalloc
+ */
+static void ext4_atomic_write_init(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct block_device *bdev = sb->s_bdev;
+
+ if (!bdev_can_atomic_write(bdev))
+ return;
+
+ if (!ext4_has_feature_extents(sb))
+ return;
+
+ sbi->s_awu_min = max(sb->s_blocksize,
+ bdev_atomic_write_unit_min_bytes(bdev));
+ sbi->s_awu_max = min(sb->s_blocksize,
+ bdev_atomic_write_unit_max_bytes(bdev));
+ if (sbi->s_awu_min && sbi->s_awu_max &&
+ sbi->s_awu_min <= sbi->s_awu_max) {
+ ext4_msg(sb, KERN_NOTICE, "Supports (experimental) DIO atomic writes awu_min: %u, awu_max: %u",
+ sbi->s_awu_min, sbi->s_awu_max);
+ } else {
+ sbi->s_awu_min = 0;
+ sbi->s_awu_max = 0;
+ }
+}
+
static void ext4_fast_commit_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -5336,6 +5382,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
spin_lock_init(&sbi->s_bdev_wb_lock);
+ ext4_atomic_write_init(sb);
ext4_fast_commit_init(sb);
sb->s_root = NULL;
@@ -6301,7 +6348,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (unlikely(ext4_forced_shutdown(sb)))
- return 0;
+ return -EIO;
trace_ext4_sync_fs(sb, wait);
flush_workqueue(sbi->rsv_conversion_wq);
@@ -6518,8 +6565,12 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
goto restore_opts;
}
- if (test_opt2(sb, ABORT))
- ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
+ if ((old_opts.s_mount_opt & EXT4_MOUNT_DELALLOC) &&
+ !test_opt(sb, DELALLOC)) {
+ ext4_msg(sb, KERN_ERR, "can't disable delalloc during remount");
+ err = -EINVAL;
+ goto restore_opts;
+ }
sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
(test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0);
@@ -6689,6 +6740,14 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
if (!ext4_has_feature_mmp(sb) || sb_rdonly(sb))
ext4_stop_mmpd(sbi);
+ /*
+ * Handle aborting the filesystem as the last thing during remount to
+ * avoid obscure errors during remount when some option changes fail to
+ * apply due to shutdown filesystem.
+ */
+ if (test_opt2(sb, ABORT))
+ ext4_abort(sb, ESHUTDOWN, "Abort forced by user");
+
return 0;
restore_opts:
@@ -7329,7 +7388,7 @@ static struct file_system_type ext4_fs_type = {
.init_fs_context = ext4_init_fs_context,
.parameters = ext4_param_specs,
.kill_sb = ext4_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("ext4");
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 94f7b084f601..e3ce763cce18 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -711,7 +711,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
}
if (fio->io_wbc && !is_read_io(fio->op))
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
inc_page_count(fio->sbi, is_read_io(fio->op) ?
__read_io_type(page) : WB_DATA_TYPE(fio->page, false));
@@ -911,7 +912,8 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
@@ -1011,7 +1013,8 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page),
+ PAGE_SIZE);
io->last_block_in_bio = fio->new_blkaddr;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 9ae54c4c72fe..84447d5145aa 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -3038,32 +3038,27 @@ out:
static int __f2fs_ioc_move_range(struct file *filp,
struct f2fs_move_range *range)
{
- struct fd dst;
int err;
if (!(filp->f_mode & FMODE_READ) ||
!(filp->f_mode & FMODE_WRITE))
return -EBADF;
- dst = fdget(range->dst_fd);
- if (!fd_file(dst))
+ CLASS(fd, dst)(range->dst_fd);
+ if (fd_empty(dst))
return -EBADF;
- if (!(fd_file(dst)->f_mode & FMODE_WRITE)) {
- err = -EBADF;
- goto err_out;
- }
+ if (!(fd_file(dst)->f_mode & FMODE_WRITE))
+ return -EBADF;
err = mnt_want_write_file(filp);
if (err)
- goto err_out;
+ return err;
err = f2fs_move_file_range(filp, range->pos_in, fd_file(dst),
range->pos_out, range->len);
mnt_drop_write_file(filp);
-err_out:
- fdput(dst);
return err;
}
@@ -4647,7 +4642,8 @@ static ssize_t f2fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
iov_iter_count(to), READ);
/* In LFS mode, if there is inflight dio, wait for its completion */
- if (f2fs_lfs_mode(F2FS_I_SB(inode)))
+ if (f2fs_lfs_mode(F2FS_I_SB(inode)) &&
+ get_pages(F2FS_I_SB(inode), F2FS_DIO_WRITE))
inode_dio_wait(inode);
if (f2fs_should_use_dio(inode, iocb, to)) {
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 6423e1dedf14..15bf32c21ac0 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -1037,7 +1037,7 @@ error_inode:
if (corrupt < 0) {
fat_fs_error(new_dir->i_sb,
"%s: Filesystem corrupted (i_pos %lld)",
- __func__, sinfo.i_pos);
+ __func__, new_i_pos);
}
goto out;
}
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 22dd9dcce7ec..ac77dd912412 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -12,7 +12,6 @@
#include <linux/fs.h>
#include <linux/filelock.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/capability.h>
#include <linux/dnotify.h>
#include <linux/slab.h>
@@ -397,6 +396,9 @@ static long f_dupfd_query(int fd, struct file *filp)
{
CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+
/*
* We can do the 'fdput()' immediately, as the only thing that
* matters is the pointer value which isn't changed by the fdput.
@@ -570,24 +572,21 @@ static int check_fcntl_cmd(unsigned cmd)
SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
- struct fd f = fdget_raw(fd);
- long err = -EBADF;
+ CLASS(fd_raw, f)(fd);
+ long err;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
- goto out1;
+ return -EBADF;
}
err = security_file_fcntl(fd_file(f), cmd, arg);
if (!err)
err = do_fcntl(fd, cmd, arg, fd_file(f));
-out1:
- fdput(f);
-out:
return err;
}
@@ -596,21 +595,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
unsigned long, arg)
{
void __user *argp = (void __user *)arg;
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
struct flock64 flock;
- long err = -EBADF;
+ long err;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
- goto out1;
+ return -EBADF;
}
err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
- goto out1;
+ return err;
switch (cmd) {
case F_GETLK64:
@@ -635,9 +634,6 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
-out1:
- fdput(f);
-out:
return err;
}
#endif
@@ -733,21 +729,21 @@ static int fixup_compat_flock(struct flock *flock)
static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
compat_ulong_t arg)
{
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
struct flock flock;
- long err = -EBADF;
+ long err;
- if (!fd_file(f))
- return err;
+ if (fd_empty(f))
+ return -EBADF;
if (unlikely(fd_file(f)->f_mode & FMODE_PATH)) {
if (!check_fcntl_cmd(cmd))
- goto out_put;
+ return -EBADF;
}
err = security_file_fcntl(fd_file(f), cmd, arg);
if (err)
- goto out_put;
+ return err;
switch (cmd) {
case F_GETLK:
@@ -790,8 +786,6 @@ static long do_compat_fcntl64(unsigned int fd, unsigned int cmd,
err = do_fcntl(fd, cmd, arg, fd_file(f));
break;
}
-out_put:
- fdput(f);
return err;
}
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 82df28d45cd7..5f801139358e 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -139,12 +139,11 @@ static int get_path_from_fd(int fd, struct path *root)
path_get(root);
spin_unlock(&fs->lock);
} else {
- struct fd f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
*root = fd_file(f)->f_path;
path_get(root);
- fdput(f);
}
return 0;
diff --git a/fs/file.c b/fs/file.c
index eb093e736972..fb1011cf6b4a 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -20,10 +20,73 @@
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
+#include <linux/file_ref.h>
#include <net/sock.h>
#include "internal.h"
+/**
+ * __file_ref_put - Slowpath of file_ref_put()
+ * @ref: Pointer to the reference count
+ * @cnt: Current reference count
+ *
+ * Invoked when the reference count is outside of the valid zone.
+ *
+ * Return:
+ * True if this was the last reference with no future references
+ * possible. This signals the caller that it can safely schedule the
+ * object, which is protected by the reference counter, for
+ * deconstruction.
+ *
+ * False if there are still active references or the put() raced
+ * with a concurrent get()/put() pair. Caller is not allowed to
+ * deconstruct the protected object.
+ */
+bool __file_ref_put(file_ref_t *ref, unsigned long cnt)
+{
+ /* Did this drop the last reference? */
+ if (likely(cnt == FILE_REF_NOREF)) {
+ /*
+ * Carefully try to set the reference count to FILE_REF_DEAD.
+ *
+ * This can fail if a concurrent get() operation has
+ * elevated it again or the corresponding put() even marked
+ * it dead already. Both are valid situations and do not
+ * require a retry. If this fails the caller is not
+ * allowed to deconstruct the object.
+ */
+ if (!atomic_long_try_cmpxchg_release(&ref->refcnt, &cnt, FILE_REF_DEAD))
+ return false;
+
+ /*
+ * The caller can safely schedule the object for
+ * deconstruction. Provide acquire ordering.
+ */
+ smp_acquire__after_ctrl_dep();
+ return true;
+ }
+
+ /*
+ * If the reference count was already in the dead zone, then this
+ * put() operation is imbalanced. Warn, put the reference count back to
+ * DEAD and tell the caller to not deconstruct the object.
+ */
+ if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) {
+ atomic_long_set(&ref->refcnt, FILE_REF_DEAD);
+ return false;
+ }
+
+ /*
+ * This is a put() operation on a saturated refcount. Restore the
+ * mean saturation value and tell the caller to not deconstruct the
+ * object.
+ */
+ if (cnt > FILE_REF_MAXREF)
+ atomic_long_set(&ref->refcnt, FILE_REF_SATURATED);
+ return false;
+}
+EXPORT_SYMBOL_GPL(__file_ref_put);
+
unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
@@ -89,18 +152,11 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
* 'unsigned long' in some places, but simply because that is how the Linux
* kernel bitmaps are defined to work: they are not "bits in an array of bytes",
* they are very much "bits in an array of unsigned long".
- *
- * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
- * by that "1024/sizeof(ptr)" before, we already know there are sufficient
- * clear low bits. Clang seems to realize that, gcc ends up being confused.
- *
- * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
- * let's consider it documentation (and maybe a test-case for gcc to improve
- * its code generation ;)
*/
-static struct fdtable * alloc_fdtable(unsigned int nr)
+static struct fdtable *alloc_fdtable(unsigned int slots_wanted)
{
struct fdtable *fdt;
+ unsigned int nr;
void *data;
/*
@@ -108,22 +164,32 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
* Allocation steps are keyed to the size of the fdarray, since it
* grows far faster than any of the other dynamic data. We try to fit
* the fdarray into comfortable page-tuned chunks: starting at 1024B
- * and growing in powers of two from there on.
+ * and growing in powers of two from there on. Since we are called only
+ * with slots_wanted > BITS_PER_LONG (embedded instance in files->fdtab
+ * already gives BITS_PER_LONG slots), the above boils down to
+ * 1. use the smallest power of two large enough to give us that many
+ * slots.
+ * 2. on 32bit skip 64 and 128 - the minimal capacity we want there is
+ * 256 slots (i.e. 1Kb fd array).
+ * 3. on 64bit don't skip anything, 1Kb fd array means 128 slots there
+ * and we are never going to be asked for 64 or less.
*/
- nr /= (1024 / sizeof(struct file *));
- nr = roundup_pow_of_two(nr + 1);
- nr *= (1024 / sizeof(struct file *));
- nr = ALIGN(nr, BITS_PER_LONG);
+ if (IS_ENABLED(CONFIG_32BIT) && slots_wanted < 256)
+ nr = 256;
+ else
+ nr = roundup_pow_of_two(slots_wanted);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
- * had been set lower between the check in expand_files() and here. Deal
- * with that in caller, it's cheaper that way.
+ * had been set lower between the check in expand_files() and here.
*
* We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
* bitmaps handling below becomes unpleasant, to put it mildly...
*/
- if (unlikely(nr > sysctl_nr_open))
- nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
+ if (unlikely(nr > sysctl_nr_open)) {
+ nr = round_down(sysctl_nr_open, BITS_PER_LONG);
+ if (nr < slots_wanted)
+ return ERR_PTR(-EMFILE);
+ }
fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
if (!fdt)
@@ -152,14 +218,14 @@ out_arr:
out_fdt:
kfree(fdt);
out:
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
/*
* Expand the file descriptor table.
* This function will allocate a new fdtable and both fd array and fdset, of
* the given size.
- * Return <0 error code on error; 1 on successful completion.
+ * Return <0 error code on error; 0 on successful completion.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_fdtable(struct files_struct *files, unsigned int nr)
@@ -169,7 +235,7 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
struct fdtable *new_fdt, *cur_fdt;
spin_unlock(&files->file_lock);
- new_fdt = alloc_fdtable(nr);
+ new_fdt = alloc_fdtable(nr + 1);
/* make sure all fd_install() have seen resize_in_progress
* or have finished their rcu_read_lock_sched() section.
@@ -178,16 +244,8 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
synchronize_rcu();
spin_lock(&files->file_lock);
- if (!new_fdt)
- return -ENOMEM;
- /*
- * extremely unlikely race - sysctl_nr_open decreased between the check in
- * caller and alloc_fdtable(). Cheaper to catch it here...
- */
- if (unlikely(new_fdt->max_fds <= nr)) {
- __free_fdtable(new_fdt);
- return -EMFILE;
- }
+ if (IS_ERR(new_fdt))
+ return PTR_ERR(new_fdt);
cur_fdt = files_fdtable(files);
BUG_ON(nr < cur_fdt->max_fds);
copy_fdtable(new_fdt, cur_fdt);
@@ -196,15 +254,14 @@ static int expand_fdtable(struct files_struct *files, unsigned int nr)
call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
/* coupled with smp_rmb() in fd_install() */
smp_wmb();
- return 1;
+ return 0;
}
/*
* Expand files.
* This function will expand the file structures, if the requested size exceeds
* the current capacity and there is room for expansion.
- * Return <0 error code on error; 0 when nothing done; 1 when files were
- * expanded and execution may have blocked.
+ * Return <0 error code on error; 0 on success.
* The files->file_lock should be held on entry, and will be held on exit.
*/
static int expand_files(struct files_struct *files, unsigned int nr)
@@ -212,14 +269,14 @@ static int expand_files(struct files_struct *files, unsigned int nr)
__acquires(files->file_lock)
{
struct fdtable *fdt;
- int expanded = 0;
+ int error;
repeat:
fdt = files_fdtable(files);
/* Do we need to expand? */
if (nr < fdt->max_fds)
- return expanded;
+ return 0;
/* Can we expand? */
if (nr >= sysctl_nr_open)
@@ -227,7 +284,6 @@ repeat:
if (unlikely(files->resize_in_progress)) {
spin_unlock(&files->file_lock);
- expanded = 1;
wait_event(files->resize_wait, !files->resize_in_progress);
spin_lock(&files->file_lock);
goto repeat;
@@ -235,27 +291,28 @@ repeat:
/* All good, so we try */
files->resize_in_progress = true;
- expanded = expand_fdtable(files, nr);
+ error = expand_fdtable(files, nr);
files->resize_in_progress = false;
wake_up_all(&files->resize_wait);
- return expanded;
-}
-
-static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
-{
- __set_bit(fd, fdt->close_on_exec);
+ return error;
}
-static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
+static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt,
+ bool set)
{
- if (test_bit(fd, fdt->close_on_exec))
- __clear_bit(fd, fdt->close_on_exec);
+ if (set) {
+ __set_bit(fd, fdt->close_on_exec);
+ } else {
+ if (test_bit(fd, fdt->close_on_exec))
+ __clear_bit(fd, fdt->close_on_exec);
+ }
}
-static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
+static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt, bool set)
{
__set_bit(fd, fdt->open_fds);
+ __set_close_on_exec(fd, fdt, set);
fd /= BITS_PER_LONG;
if (!~fdt->open_fds[fd])
__set_bit(fd, fdt->full_fds_bits);
@@ -264,7 +321,9 @@ static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
__clear_bit(fd, fdt->open_fds);
- __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
+ fd /= BITS_PER_LONG;
+ if (test_bit(fd, fdt->full_fds_bits))
+ __clear_bit(fd, fdt->full_fds_bits);
}
static inline bool fd_is_open(unsigned int fd, const struct fdtable *fdt)
@@ -306,7 +365,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
struct file **old_fds, **new_fds;
unsigned int open_files, i;
struct fdtable *old_fdt, *new_fdt;
- int error;
newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
if (!newf)
@@ -338,17 +396,10 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
if (new_fdt != &newf->fdtab)
__free_fdtable(new_fdt);
- new_fdt = alloc_fdtable(open_files - 1);
- if (!new_fdt) {
- error = -ENOMEM;
- goto out_release;
- }
-
- /* beyond sysctl_nr_open; nothing to do */
- if (unlikely(new_fdt->max_fds < open_files)) {
- __free_fdtable(new_fdt);
- error = -EMFILE;
- goto out_release;
+ new_fdt = alloc_fdtable(open_files);
+ if (IS_ERR(new_fdt)) {
+ kmem_cache_free(files_cachep, newf);
+ return ERR_CAST(new_fdt);
}
/*
@@ -389,10 +440,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
rcu_assign_pointer(newf->fdt, new_fdt);
return newf;
-
-out_release:
- kmem_cache_free(files_cachep, newf);
- return ERR_PTR(error);
}
static struct fdtable *close_files(struct files_struct * files)
@@ -413,7 +460,7 @@ static struct fdtable *close_files(struct files_struct * files)
set = fdt->open_fds[j++];
while (set) {
if (set & 1) {
- struct file * file = xchg(&fdt->fd[i], NULL);
+ struct file *file = fdt->fd[i];
if (file) {
filp_close(file, files);
cond_resched();
@@ -470,6 +517,15 @@ static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
unsigned int maxfd = fdt->max_fds; /* always multiple of BITS_PER_LONG */
unsigned int maxbit = maxfd / BITS_PER_LONG;
unsigned int bitbit = start / BITS_PER_LONG;
+ unsigned int bit;
+
+ /*
+ * Try to avoid looking at the second level bitmap
+ */
+ bit = find_next_zero_bit(&fdt->open_fds[bitbit], BITS_PER_LONG,
+ start & (BITS_PER_LONG - 1));
+ if (bit < BITS_PER_LONG)
+ return bit + bitbit * BITS_PER_LONG;
bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
if (bitbit >= maxfd)
@@ -496,7 +552,7 @@ repeat:
if (fd < files->next_fd)
fd = files->next_fd;
- if (fd < fdt->max_fds)
+ if (likely(fd < fdt->max_fds))
fd = find_next_fd(fdt, fd);
/*
@@ -504,36 +560,22 @@ repeat:
* will limit the total number of files that can be opened.
*/
error = -EMFILE;
- if (fd >= end)
+ if (unlikely(fd >= end))
goto out;
- error = expand_files(files, fd);
- if (error < 0)
- goto out;
+ if (unlikely(fd >= fdt->max_fds)) {
+ error = expand_files(files, fd);
+ if (error < 0)
+ goto out;
- /*
- * If we needed to expand the fs array we
- * might have blocked - try again.
- */
- if (error)
goto repeat;
+ }
if (start <= files->next_fd)
files->next_fd = fd + 1;
- __set_open_fd(fd, fdt);
- if (flags & O_CLOEXEC)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);
error = fd;
-#if 1
- /* Sanity check */
- if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
- printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
- rcu_assign_pointer(fdt->fd[fd], NULL);
- }
-#endif
out:
spin_unlock(&files->file_lock);
@@ -599,7 +641,7 @@ void fd_install(unsigned int fd, struct file *file)
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
- BUG_ON(fdt->fd[fd] != NULL);
+ WARN_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
@@ -713,7 +755,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
}
/**
- * __close_range() - Close all file descriptors in a given range.
+ * sys_close_range() - Close all file descriptors in a given range.
*
* @fd: starting file descriptor to close
* @max_fd: last file descriptor to close
@@ -721,8 +763,10 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
*
* This closes a range of file descriptors. All file descriptors
* from @fd up to and including @max_fd are closed.
+ * Currently, errors from closing a given file descriptor are ignored.
*/
-int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
+SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
+ unsigned int, flags)
{
struct task_struct *me = current;
struct files_struct *cur_fds = me->files, *fds = NULL;
@@ -839,7 +883,7 @@ static struct file *__get_file_rcu(struct file __rcu **f)
if (!file)
return NULL;
- if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ if (unlikely(!file_ref_get(&file->f_ref)))
return ERR_PTR(-EAGAIN);
file_reloaded = rcu_dereference_raw(*f);
@@ -853,8 +897,8 @@ static struct file *__get_file_rcu(struct file __rcu **f)
OPTIMIZER_HIDE_VAR(file_reloaded_cmp);
/*
- * atomic_long_inc_not_zero() above provided a full memory
- * barrier when we acquired a reference.
+ * file_ref_get() above provided a full memory barrier when we
+ * acquired a reference.
*
* This is paired with the write barrier from assigning to the
* __rcu protected file pointer so that if that pointer still
@@ -952,11 +996,11 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
* We need to confirm it by incrementing the refcount
* and then check the lookup again.
*
- * atomic_long_inc_not_zero() gives us a full memory
- * barrier. We only really need an 'acquire' one to
- * protect the loads below, but we don't have that.
+ * file_ref_get() gives us a full memory barrier. We
+ * only really need an 'acquire' one to protect the
+ * loads below, but we don't have that.
*/
- if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
+ if (unlikely(!file_ref_get(&file->f_ref)))
continue;
/*
@@ -1037,29 +1081,7 @@ struct file *fget_task(struct task_struct *task, unsigned int fd)
return file;
}
-struct file *lookup_fdget_rcu(unsigned int fd)
-{
- return __fget_files_rcu(current->files, fd, 0);
-
-}
-EXPORT_SYMBOL_GPL(lookup_fdget_rcu);
-
-struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd)
-{
- /* Must be called with rcu_read_lock held */
- struct files_struct *files;
- struct file *file = NULL;
-
- task_lock(task);
- files = task->files;
- if (files)
- file = __fget_files_rcu(files, fd, 0);
- task_unlock(task);
-
- return file;
-}
-
-struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd)
+struct file *fget_task_next(struct task_struct *task, unsigned int *ret_fd)
{
/* Must be called with rcu_read_lock held */
struct files_struct *files;
@@ -1069,17 +1091,19 @@ struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *
task_lock(task);
files = task->files;
if (files) {
+ rcu_read_lock();
for (; fd < files_fdtable(files)->max_fds; fd++) {
file = __fget_files_rcu(files, fd, 0);
if (file)
break;
}
+ rcu_read_unlock();
}
task_unlock(task);
*ret_fd = fd;
return file;
}
-EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
+EXPORT_SYMBOL(fget_task_next);
/*
* Lightweight file lookup - no refcnt increment if fd table isn't shared.
@@ -1096,6 +1120,13 @@ EXPORT_SYMBOL(task_lookup_next_fdget_rcu);
*
* The fput_needed flag returned by fget_light should be passed to the
* corresponding fput_light.
+ *
+ * (As an exception to rule 2, you can call filp_close between fget_light and
+ * fput_light provided that you capture a real refcount with get_file before
+ * the call to filp_close, and ensure that this real refcount is fput *after*
+ * the fput_light call.)
+ *
+ * See also the documentation in rust/kernel/file.rs.
*/
static inline struct fd __fget_light(unsigned int fd, fmode_t mask)
{
@@ -1176,13 +1207,8 @@ void __f_unlock_pos(struct file *f)
void set_close_on_exec(unsigned int fd, int flag)
{
struct files_struct *files = current->files;
- struct fdtable *fdt;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- if (flag)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_close_on_exec(fd, files_fdtable(files), flag);
spin_unlock(&files->file_lock);
}
@@ -1223,11 +1249,7 @@ __releases(&files->file_lock)
goto Ebusy;
get_file(file);
rcu_assign_pointer(fdt->fd[fd], file);
- __set_open_fd(fd, fdt);
- if (flags & O_CLOEXEC)
- __set_close_on_exec(fd, fdt);
- else
- __clear_close_on_exec(fd, fdt);
+ __set_open_fd(fd, fdt, flags & O_CLOEXEC);
spin_unlock(&files->file_lock);
if (tofree)
diff --git a/fs/file_table.c b/fs/file_table.c
index eed5ffad9997..976736be47cb 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -9,7 +9,6 @@
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
@@ -40,13 +39,17 @@ static struct files_stat_struct files_stat = {
/* SLAB cache for file structures */
static struct kmem_cache *filp_cachep __ro_after_init;
+static struct kmem_cache *bfilp_cachep __ro_after_init;
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
/* Container for backing file with optional user path */
struct backing_file {
struct file file;
- struct path user_path;
+ union {
+ struct path user_path;
+ freeptr_t bf_freeptr;
+ };
};
static inline struct backing_file *backing_file(struct file *f)
@@ -68,7 +71,7 @@ static inline void file_free(struct file *f)
put_cred(f->f_cred);
if (unlikely(f->f_mode & FMODE_BACKING)) {
path_put(backing_file_user_path(f));
- kfree(backing_file(f));
+ kmem_cache_free(bfilp_cachep, backing_file(f));
} else {
kmem_cache_free(filp_cachep, f);
}
@@ -165,16 +168,32 @@ static int init_file(struct file *f, int flags, const struct cred *cred)
* the respective member when opening the file.
*/
mutex_init(&f->f_pos_lock);
- f->f_flags = flags;
- f->f_mode = OPEN_FMODE(flags);
- /* f->f_version: 0 */
+ memset(&f->f_path, 0, sizeof(f->f_path));
+ memset(&f->f_ra, 0, sizeof(f->f_ra));
+
+ f->f_flags = flags;
+ f->f_mode = OPEN_FMODE(flags);
+
+ f->f_op = NULL;
+ f->f_mapping = NULL;
+ f->private_data = NULL;
+ f->f_inode = NULL;
+ f->f_owner = NULL;
+#ifdef CONFIG_EPOLL
+ f->f_ep = NULL;
+#endif
+
+ f->f_iocb_flags = 0;
+ f->f_pos = 0;
+ f->f_wb_err = 0;
+ f->f_sb_err = 0;
/*
* We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While
* fget-rcu pattern users need to be able to handle spurious
* refcount bumps we should reinitialize the reused file first.
*/
- atomic_long_set(&f->f_count, 1);
+ file_ref_init(&f->f_ref, 1);
return 0;
}
@@ -206,7 +225,7 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
goto over;
}
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -240,7 +259,7 @@ struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred)
struct file *f;
int error;
- f = kmem_cache_zalloc(filp_cachep, GFP_KERNEL);
+ f = kmem_cache_alloc(filp_cachep, GFP_KERNEL);
if (unlikely(!f))
return ERR_PTR(-ENOMEM);
@@ -267,13 +286,13 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
struct backing_file *ff;
int error;
- ff = kzalloc(sizeof(struct backing_file), GFP_KERNEL);
+ ff = kmem_cache_alloc(bfilp_cachep, GFP_KERNEL);
if (unlikely(!ff))
return ERR_PTR(-ENOMEM);
error = init_file(&ff->file, flags, cred);
if (unlikely(error)) {
- kfree(ff);
+ kmem_cache_free(bfilp_cachep, ff);
return ERR_PTR(error);
}
@@ -479,7 +498,7 @@ static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
void fput(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count)) {
+ if (file_ref_put(&file->f_ref)) {
struct task_struct *task = current;
if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
@@ -512,7 +531,7 @@ void fput(struct file *file)
*/
void __fput_sync(struct file *file)
{
- if (atomic_long_dec_and_test(&file->f_count))
+ if (file_ref_put(&file->f_ref))
__fput(file);
}
@@ -529,6 +548,11 @@ void __init files_init(void)
filp_cachep = kmem_cache_create("filp", sizeof(struct file), &args,
SLAB_HWCACHE_ALIGN | SLAB_PANIC |
SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
+
+ args.freeptr_offset = offsetof(struct backing_file, bf_freeptr);
+ bfilp_cachep = kmem_cache_create("bfilp", sizeof(struct backing_file),
+ &args, SLAB_HWCACHE_ALIGN | SLAB_PANIC |
+ SLAB_ACCOUNT | SLAB_TYPESAFE_BY_RCU);
percpu_counter_init(&nr_files, 0, GFP_KERNEL);
}
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index fbcd603365ad..8c67627f2a3d 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -25,7 +25,7 @@
struct vxfs_dirblk {
__fs16 d_free; /* free space in dirblock */
__fs16 d_nhash; /* no of hash chains */
- __fs16 d_hash[1]; /* hash chain */
+ __fs16 d_hash[]; /* hash chain */
};
/*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d8bec3c1bb1f..3cd99e2dc6ac 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -290,7 +290,6 @@ void __inode_attach_wb(struct inode *inode, struct folio *folio)
if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
wb_put(wb);
}
-EXPORT_SYMBOL_GPL(__inode_attach_wb);
/**
* inode_cgwb_move_to_attached - put the inode onto wb->b_attached list
@@ -731,8 +730,9 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb)
* writeback completion, wbc_detach_inode() should be called. This is used
* to track the cgroup writeback context.
*/
-void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
- struct inode *inode)
+static void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock)
{
if (!inode_cgwb_enabled(inode)) {
spin_unlock(&inode->i_lock);
@@ -762,7 +762,24 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
inode_switch_wbs(inode, wbc->wb_id);
}
-EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode);
+
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ inode_attach_wb(inode, NULL);
+ wbc_attach_and_unlock_inode(wbc, inode);
+}
+EXPORT_SYMBOL_GPL(wbc_attach_fdatawrite_inode);
/**
* wbc_detach_inode - disassociate wbc from inode and perform foreign detection
@@ -890,17 +907,16 @@ EXPORT_SYMBOL_GPL(wbc_detach_inode);
/**
* wbc_account_cgroup_owner - account writeback to update inode cgroup ownership
* @wbc: writeback_control of the writeback in progress
- * @page: page being written out
+ * @folio: folio being written out
* @bytes: number of bytes being written out
*
- * @bytes from @page are about to written out during the writeback
+ * @bytes from @folio are about to written out during the writeback
* controlled by @wbc. Keep the book for foreign inode detection. See
* wbc_detach_inode().
*/
-void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
+void wbc_account_cgroup_owner(struct writeback_control *wbc, struct folio *folio,
size_t bytes)
{
- struct folio *folio;
struct cgroup_subsys_state *css;
int id;
@@ -913,7 +929,6 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page,
if (!wbc->wb || wbc->no_cgroup_owner)
return;
- folio = page_folio(page);
css = mem_cgroup_css_from_folio(folio);
/* dead cgroups shouldn't contribute to inode ownership arbitration */
if (!(css->flags & CSS_ONLINE))
@@ -1227,6 +1242,13 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
}
}
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+ struct inode *inode)
+ __releases(&inode->i_lock)
+{
+ spin_unlock(&inode->i_lock);
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
/*
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index 24727ec34e5a..16fa61ef56bf 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -156,6 +156,7 @@ int fs_lookup_param(struct fs_context *fc,
f = getname_kernel(param->string);
if (IS_ERR(f))
return PTR_ERR(f);
+ param->dirfd = AT_FDCWD;
put_f = true;
break;
case fs_value_is_filename:
@@ -308,6 +309,26 @@ int fs_param_is_fd(struct p_log *log, const struct fs_parameter_spec *p,
}
EXPORT_SYMBOL(fs_param_is_fd);
+int fs_param_is_file_or_string(struct p_log *log,
+ const struct fs_parameter_spec *p,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ switch (param->type) {
+ case fs_value_is_string:
+ return fs_param_is_string(log, p, param, result);
+ case fs_value_is_file:
+ result->uint_32 = param->dirfd;
+ if (result->uint_32 <= INT_MAX)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ return fs_param_bad_value(log, param);
+}
+EXPORT_SYMBOL(fs_param_is_file_or_string);
+
int fs_param_is_uid(struct p_log *log, const struct fs_parameter_spec *p,
struct fs_parameter *param, struct fs_parse_result *result)
{
diff --git a/fs/fsopen.c b/fs/fsopen.c
index 6cef3deccded..094a7f510edf 100644
--- a/fs/fsopen.c
+++ b/fs/fsopen.c
@@ -349,7 +349,6 @@ SYSCALL_DEFINE5(fsconfig,
int, aux)
{
struct fs_context *fc;
- struct fd f;
int ret;
int lookup_flags = 0;
@@ -392,12 +391,11 @@ SYSCALL_DEFINE5(fsconfig,
return -EOPNOTSUPP;
}
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- ret = -EINVAL;
if (fd_file(f)->f_op != &fscontext_fops)
- goto out_f;
+ return -EINVAL;
fc = fd_file(f)->private_data;
if (fc->ops == &legacy_fs_context_ops) {
@@ -407,17 +405,14 @@ SYSCALL_DEFINE5(fsconfig,
case FSCONFIG_SET_PATH_EMPTY:
case FSCONFIG_SET_FD:
case FSCONFIG_CMD_CREATE_EXCL:
- ret = -EOPNOTSUPP;
- goto out_f;
+ return -EOPNOTSUPP;
}
}
if (_key) {
param.key = strndup_user(_key, 256);
- if (IS_ERR(param.key)) {
- ret = PTR_ERR(param.key);
- goto out_f;
- }
+ if (IS_ERR(param.key))
+ return PTR_ERR(param.key);
}
switch (cmd) {
@@ -496,7 +491,5 @@ SYSCALL_DEFINE5(fsconfig,
}
out_key:
kfree(param.key);
-out_f:
- fdput(f);
return ret;
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1f64ae6d7a69..0723c6344b20 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2371,13 +2371,12 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
int res;
int oldfd;
struct fuse_dev *fud = NULL;
- struct fd f;
if (get_user(oldfd, argp))
return -EFAULT;
- f = fdget(oldfd);
- if (!fd_file(f))
+ CLASS(fd, f)(oldfd);
+ if (fd_empty(f))
return -EINVAL;
/*
@@ -2394,7 +2393,6 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
mutex_unlock(&fuse_mutex);
}
- fdput(f);
return res;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f33fbce86ae0..dafdf766b1d5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2288,6 +2288,13 @@ static int fuse_writepages_fill(struct folio *folio,
struct folio *tmp_folio;
int err;
+ if (!data->ff) {
+ err = -EIO;
+ data->ff = fuse_write_file_get(fi);
+ if (!data->ff)
+ goto out_unlock;
+ }
+
if (wpa && fuse_writepage_need_send(fc, &folio->page, ap, data)) {
fuse_writepages_send(data);
data->wpa = NULL;
@@ -2351,13 +2358,13 @@ static int fuse_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- struct fuse_inode *fi = get_fuse_inode(inode);
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_fill_wb_data data;
int err;
+ err = -EIO;
if (fuse_is_bad(inode))
- return -EIO;
+ goto out;
if (wbc->sync_mode == WB_SYNC_NONE &&
fc->num_background >= fc->congestion_threshold)
@@ -2365,9 +2372,7 @@ static int fuse_writepages(struct address_space *mapping,
data.inode = inode;
data.wpa = NULL;
- data.ff = fuse_write_file_get(fi);
- if (!data.ff)
- return -EIO;
+ data.ff = NULL;
err = -ENOMEM;
data.orig_pages = kcalloc(fc->max_pages,
@@ -2381,10 +2386,11 @@ static int fuse_writepages(struct address_space *mapping,
WARN_ON(!data.wpa->ia.ap.num_pages);
fuse_writepages_send(&data);
}
+ if (data.ff)
+ fuse_file_put(data.ff, false);
kfree(data.orig_pages);
out:
- fuse_file_put(data.ff, false);
return err;
}
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
index 62aee8289d11..bbac547dfcb3 100644
--- a/fs/fuse/passthrough.c
+++ b/fs/fuse/passthrough.c
@@ -18,11 +18,11 @@ static void fuse_file_accessed(struct file *file)
fuse_invalidate_atime(inode);
}
-static void fuse_file_modified(struct file *file)
+static void fuse_passthrough_end_write(struct file *file, loff_t pos, ssize_t ret)
{
struct inode *inode = file_inode(file);
- fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+ fuse_write_update_attr(inode, pos, ret);
}
ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -63,7 +63,7 @@ ssize_t fuse_passthrough_write_iter(struct kiocb *iocb,
struct backing_file_ctx ctx = {
.cred = ff->cred,
.user_file = file,
- .end_write = fuse_file_modified,
+ .end_write = fuse_passthrough_end_write,
};
pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__,
@@ -110,7 +110,7 @@ ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
struct backing_file_ctx ctx = {
.cred = ff->cred,
.user_file = out,
- .end_write = fuse_file_modified,
+ .end_write = fuse_passthrough_end_write,
};
pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
@@ -234,7 +234,6 @@ int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
goto out;
backing_sb = file_inode(file)->i_sb;
- pr_info("%s: %x:%pD %i\n", __func__, backing_sb->s_dev, file, backing_sb->s_stack_depth);
res = -ELOOP;
if (backing_sb->s_stack_depth >= fc->max_stack_depth)
goto out_fput;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index d418d8b5367f..3334c394ce9c 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -190,6 +190,5 @@ const struct export_operations gfs2_export_ops = {
.fh_to_parent = gfs2_fh_to_parent,
.get_name = gfs2_get_name,
.get_parent = gfs2_get_parent,
- .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index f7dd64856c9b..1e73cf87ff88 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1586,6 +1586,7 @@ const struct file_operations gfs2_file_fops = {
.splice_write = gfs2_file_splice_write,
.setlease = simple_nosetlease,
.fallocate = gfs2_fallocate,
+ .fop_flags = FOP_ASYNC_LOCK,
};
const struct file_operations gfs2_dir_fops = {
@@ -1598,6 +1599,7 @@ const struct file_operations gfs2_dir_fops = {
.lock = gfs2_lock,
.flock = gfs2_flock,
.llseek = default_llseek,
+ .fop_flags = FOP_ASYNC_LOCK,
};
#endif /* CONFIG_GFS2_FS_LOCKING_DLM */
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 269c3bc7fced..4701c4aafbf4 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -34,7 +34,6 @@
#include <linux/lockref.h>
#include <linux/rhashtable.h>
#include <linux/pid_namespace.h>
-#include <linux/fdtable.h>
#include <linux/file.h>
#include "gfs2.h"
@@ -2768,25 +2767,18 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i)
i->file = NULL;
}
- rcu_read_lock();
for(;; i->fd++) {
- struct inode *inode;
-
- i->file = task_lookup_next_fdget_rcu(i->task, &i->fd);
+ i->file = fget_task_next(i->task, &i->fd);
if (!i->file) {
i->fd = 0;
break;
}
- inode = file_inode(i->file);
- if (inode->i_sb == i->sb)
+ if (file_inode(i->file)->i_sb == i->sb)
break;
- rcu_read_unlock();
fput(i->file);
- rcu_read_lock();
}
- rcu_read_unlock();
return i->file;
}
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index eeac99765f0d..3bee9b5dba5e 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -15,10 +15,11 @@
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/mount.h>
#include <linux/init.h>
#include <linux/nls.h>
-#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/vfs.h>
@@ -111,21 +112,24 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static int hfs_remount(struct super_block *sb, int *flags, char *data)
+static int hfs_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+
sync_filesystem(sb);
- *flags |= SB_NODIRATIME;
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ fc->sb_flags |= SB_NODIRATIME;
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (!(*flags & SB_RDONLY)) {
+
+ if (!(fc->sb_flags & SB_RDONLY)) {
if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) {
pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
} else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) {
pr_warn("filesystem is marked locked, leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
}
}
return 0;
@@ -180,7 +184,6 @@ static const struct super_operations hfs_super_operations = {
.put_super = hfs_put_super,
.sync_fs = hfs_sync_fs,
.statfs = hfs_statfs,
- .remount_fs = hfs_remount,
.show_options = hfs_show_options,
};
@@ -188,181 +191,112 @@ enum {
opt_uid, opt_gid, opt_umask, opt_file_umask, opt_dir_umask,
opt_part, opt_session, opt_type, opt_creator, opt_quiet,
opt_codepage, opt_iocharset,
- opt_err
};
-static const match_table_t tokens = {
- { opt_uid, "uid=%u" },
- { opt_gid, "gid=%u" },
- { opt_umask, "umask=%o" },
- { opt_file_umask, "file_umask=%o" },
- { opt_dir_umask, "dir_umask=%o" },
- { opt_part, "part=%u" },
- { opt_session, "session=%u" },
- { opt_type, "type=%s" },
- { opt_creator, "creator=%s" },
- { opt_quiet, "quiet" },
- { opt_codepage, "codepage=%s" },
- { opt_iocharset, "iocharset=%s" },
- { opt_err, NULL }
+static const struct fs_parameter_spec hfs_param_spec[] = {
+ fsparam_uid ("uid", opt_uid), /* uid parser: the parse routine reads result.uid */
+ fsparam_gid ("gid", opt_gid), /* gid parser: the parse routine reads result.gid */
+ fsparam_u32oct ("umask", opt_umask),
+ fsparam_u32oct ("file_umask", opt_file_umask),
+ fsparam_u32oct ("dir_umask", opt_dir_umask),
+ fsparam_u32 ("part", opt_part),
+ fsparam_u32 ("session", opt_session),
+ fsparam_string ("type", opt_type),
+ fsparam_string ("creator", opt_creator),
+ fsparam_flag ("quiet", opt_quiet),
+ fsparam_string ("codepage", opt_codepage),
+ fsparam_string ("iocharset", opt_iocharset),
+ {}
+};
-static inline int match_fourchar(substring_t *arg, u32 *result)
-{
- if (arg->to - arg->from != 4)
- return -EINVAL;
- memcpy(result, arg->from, 4);
- return 0;
-}
-
/*
- * parse_options()
+ * hfs_parse_param()
*
- * adapted from linux/fs/msdos/inode.c written 1992,93 by Werner Almesberger
- * This function is called by hfs_read_super() to parse the mount options.
+ * This function is called by the vfs to parse the mount options.
*/
-static int parse_options(char *options, struct hfs_sb_info *hsb)
+static int hfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int tmp, token;
-
- /* initialize the sb with defaults */
- hsb->s_uid = current_uid();
- hsb->s_gid = current_gid();
- hsb->s_file_umask = 0133;
- hsb->s_dir_umask = 0022;
- hsb->s_type = hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */
- hsb->s_quiet = 0;
- hsb->part = -1;
- hsb->session = -1;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case opt_uid:
- if (match_int(&args[0], &tmp)) {
- pr_err("uid requires an argument\n");
- return 0;
- }
- hsb->s_uid = make_kuid(current_user_ns(), (uid_t)tmp);
- if (!uid_valid(hsb->s_uid)) {
- pr_err("invalid uid %d\n", tmp);
- return 0;
- }
- break;
- case opt_gid:
- if (match_int(&args[0], &tmp)) {
- pr_err("gid requires an argument\n");
- return 0;
- }
- hsb->s_gid = make_kgid(current_user_ns(), (gid_t)tmp);
- if (!gid_valid(hsb->s_gid)) {
- pr_err("invalid gid %d\n", tmp);
- return 0;
- }
- break;
- case opt_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("umask requires a value\n");
- return 0;
- }
- hsb->s_file_umask = (umode_t)tmp;
- hsb->s_dir_umask = (umode_t)tmp;
- break;
- case opt_file_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("file_umask requires a value\n");
- return 0;
- }
- hsb->s_file_umask = (umode_t)tmp;
- break;
- case opt_dir_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("dir_umask requires a value\n");
- return 0;
- }
- hsb->s_dir_umask = (umode_t)tmp;
- break;
- case opt_part:
- if (match_int(&args[0], &hsb->part)) {
- pr_err("part requires an argument\n");
- return 0;
- }
- break;
- case opt_session:
- if (match_int(&args[0], &hsb->session)) {
- pr_err("session requires an argument\n");
- return 0;
- }
- break;
- case opt_type:
- if (match_fourchar(&args[0], &hsb->s_type)) {
- pr_err("type requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_creator:
- if (match_fourchar(&args[0], &hsb->s_creator)) {
- pr_err("creator requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_quiet:
- hsb->s_quiet = 1;
- break;
- case opt_codepage:
- if (hsb->nls_disk) {
- pr_err("unable to change codepage\n");
- return 0;
- }
- p = match_strdup(&args[0]);
- if (p)
- hsb->nls_disk = load_nls(p);
- if (!hsb->nls_disk) {
- pr_err("unable to load codepage \"%s\"\n", p);
- kfree(p);
- return 0;
- }
- kfree(p);
- break;
- case opt_iocharset:
- if (hsb->nls_io) {
- pr_err("unable to change iocharset\n");
- return 0;
- }
- p = match_strdup(&args[0]);
- if (p)
- hsb->nls_io = load_nls(p);
- if (!hsb->nls_io) {
- pr_err("unable to load iocharset \"%s\"\n", p);
- kfree(p);
- return 0;
- }
- kfree(p);
- break;
- default:
- return 0;
- }
- }
+ struct hfs_sb_info *hsb = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
+
+ /* hfs does not honor any fs-specific options on remount */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)
+ return 0;
- if (hsb->nls_disk && !hsb->nls_io) {
- hsb->nls_io = load_nls_default();
+ opt = fs_parse(fc, hfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case opt_uid:
+ hsb->s_uid = result.uid;
+ break;
+ case opt_gid:
+ hsb->s_gid = result.gid;
+ break;
+ case opt_umask:
+ hsb->s_file_umask = (umode_t)result.uint_32;
+ hsb->s_dir_umask = (umode_t)result.uint_32;
+ break;
+ case opt_file_umask:
+ hsb->s_file_umask = (umode_t)result.uint_32;
+ break;
+ case opt_dir_umask:
+ hsb->s_dir_umask = (umode_t)result.uint_32;
+ break;
+ case opt_part:
+ hsb->part = result.uint_32;
+ break;
+ case opt_session:
+ hsb->session = result.uint_32;
+ break;
+ case opt_type:
+ if (strlen(param->string) != 4) {
+ pr_err("type requires a 4 character value\n");
+ return -EINVAL;
+ }
+ memcpy(&hsb->s_type, param->string, 4);
+ break;
+ case opt_creator:
+ if (strlen(param->string) != 4) {
+ pr_err("creator requires a 4 character value\n");
+ return -EINVAL;
+ }
+ memcpy(&hsb->s_creator, param->string, 4);
+ break;
+ case opt_quiet:
+ hsb->s_quiet = 1;
+ break;
+ case opt_codepage:
+ if (hsb->nls_disk) {
+ pr_err("unable to change codepage\n");
+ return -EINVAL;
+ }
+ hsb->nls_disk = load_nls(param->string);
+ if (!hsb->nls_disk) {
+ pr_err("unable to load codepage \"%s\"\n",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case opt_iocharset:
+ if (hsb->nls_io) {
+ pr_err("unable to change iocharset\n");
+ return -EINVAL;
+ }
+ hsb->nls_io = load_nls(param->string);
if (!hsb->nls_io) {
- pr_err("unable to load default iocharset\n");
- return 0;
+ pr_err("unable to load iocharset \"%s\"\n",
+ param->string);
+ return -EINVAL;
}
+ break;
+ default:
+ return -EINVAL;
}
- hsb->s_dir_umask &= 0777;
- hsb->s_file_umask &= 0577;
- return 1;
+ return 0;
}
/*
@@ -376,29 +310,25 @@ static int parse_options(char *options, struct hfs_sb_info *hsb)
* hfs_btree_init() to get the necessary data about the extents and
* catalog B-trees and, finally, reading the root inode into memory.
*/
-static int hfs_fill_super(struct super_block *sb, void *data, int silent)
+static int hfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
- struct hfs_sb_info *sbi;
+ struct hfs_sb_info *sbi = HFS_SB(sb);
struct hfs_find_data fd;
hfs_cat_rec rec;
struct inode *root_inode;
+ int silent = fc->sb_flags & SB_SILENT;
int res;
- sbi = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
- if (!sbi)
- return -ENOMEM;
+ /* load_nls_default does not fail */
+ if (sbi->nls_disk && !sbi->nls_io)
+ sbi->nls_io = load_nls_default();
+ sbi->s_dir_umask &= 0777;
+ sbi->s_file_umask &= 0577;
- sbi->sb = sb;
- sb->s_fs_info = sbi;
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->mdb_work, flush_mdb);
- res = -EINVAL;
- if (!parse_options((char *)data, sbi)) {
- pr_err("unable to parse mount options\n");
- goto bail;
- }
-
+ sbi->sb = sb;
sb->s_op = &hfs_super_operations;
sb->s_xattr = hfs_xattr_handlers;
sb->s_flags |= SB_NODIRATIME;
@@ -451,18 +381,56 @@ bail:
return res;
}
-static struct dentry *hfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, hfs_fill_super);
+}
+
+static void hfs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations hfs_context_ops = {
+ .parse_param = hfs_parse_param,
+ .get_tree = hfs_get_tree,
+ .reconfigure = hfs_reconfigure,
+ .free = hfs_free_fc,
+};
+
+static int hfs_init_fs_context(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, hfs_fill_super);
+ struct hfs_sb_info *hsb;
+
+ hsb = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL);
+ if (!hsb)
+ return -ENOMEM;
+
+ fc->s_fs_info = hsb;
+ fc->ops = &hfs_context_ops;
+
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
+ /* initialize options with defaults */
+ hsb->s_uid = current_uid();
+ hsb->s_gid = current_gid();
+ hsb->s_file_umask = 0133;
+ hsb->s_dir_umask = 0022;
+ hsb->s_type = cpu_to_be32(0x3f3f3f3f); /* == '????' */
+ hsb->s_creator = cpu_to_be32(0x3f3f3f3f); /* == '????' */
+ hsb->s_quiet = 0;
+ hsb->part = -1;
+ hsb->session = -1;
+ }
+
+ return 0;
}
static struct file_system_type hfs_fs_type = {
.owner = THIS_MODULE,
.name = "hfs",
- .mount = hfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = hfs_init_fs_context,
};
MODULE_ALIAS_FS("hfs");
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 59ce81dca73f..2f089bff0095 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -21,6 +21,7 @@
#include <linux/mutex.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
+#include <linux/fs_context.h>
#include "hfsplus_raw.h"
#define DBG_BNODE_REFS 0x00000001
@@ -156,6 +157,7 @@ struct hfsplus_sb_info {
/* Runtime variables */
u32 blockoffset;
+ u32 min_io_size;
sector_t part_start;
sector_t sect_count;
int fs_shift;
@@ -307,7 +309,7 @@ struct hfsplus_readdir_data {
*/
static inline unsigned short hfsplus_min_io_size(struct super_block *sb)
{
- return max_t(unsigned short, bdev_logical_block_size(sb->s_bdev),
+ return max_t(unsigned short, HFSPLUS_SB(sb)->min_io_size,
HFSPLUS_SECTOR_SIZE);
}
@@ -496,8 +498,7 @@ long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
/* options.c */
void hfsplus_fill_defaults(struct hfsplus_sb_info *opts);
-int hfsplus_parse_options_remount(char *input, int *force);
-int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi);
+int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param);
int hfsplus_show_options(struct seq_file *seq, struct dentry *root);
/* part_tbl.c */
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index c94a58762ad6..a66a09a56bf7 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -12,7 +12,8 @@
#include <linux/string.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/nls.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
@@ -23,26 +24,23 @@ enum {
opt_creator, opt_type,
opt_umask, opt_uid, opt_gid,
opt_part, opt_session, opt_nls,
- opt_nodecompose, opt_decompose,
- opt_barrier, opt_nobarrier,
- opt_force, opt_err
+ opt_decompose, opt_barrier,
+ opt_force,
};
-static const match_table_t tokens = {
- { opt_creator, "creator=%s" },
- { opt_type, "type=%s" },
- { opt_umask, "umask=%o" },
- { opt_uid, "uid=%u" },
- { opt_gid, "gid=%u" },
- { opt_part, "part=%u" },
- { opt_session, "session=%u" },
- { opt_nls, "nls=%s" },
- { opt_decompose, "decompose" },
- { opt_nodecompose, "nodecompose" },
- { opt_barrier, "barrier" },
- { opt_nobarrier, "nobarrier" },
- { opt_force, "force" },
- { opt_err, NULL }
+static const struct fs_parameter_spec hfs_param_spec[] = {
+ fsparam_string ("creator", opt_creator),
+ fsparam_string ("type", opt_type),
+ fsparam_u32oct ("umask", opt_umask),
+ fsparam_uid ("uid", opt_uid), /* uid parser: the parse routine reads result.uid */
+ fsparam_gid ("gid", opt_gid), /* gid parser: the parse routine reads result.gid */
+ fsparam_u32 ("part", opt_part),
+ fsparam_u32 ("session", opt_session),
+ fsparam_string ("nls", opt_nls),
+ fsparam_flag_no ("decompose", opt_decompose),
+ fsparam_flag_no ("barrier", opt_barrier),
+ fsparam_flag ("force", opt_force),
+ {}
+};
/* Initialize an options object to reasonable defaults */
@@ -60,162 +58,89 @@ void hfsplus_fill_defaults(struct hfsplus_sb_info *opts)
opts->session = -1;
}
-/* convert a "four byte character" to a 32 bit int with error checks */
-static inline int match_fourchar(substring_t *arg, u32 *result)
+/* Parse options from mount. Returns nonzero errno on failure */
+int hfsplus_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- if (arg->to - arg->from != 4)
- return -EINVAL;
- memcpy(result, arg->from, 4);
- return 0;
-}
-
-int hfsplus_parse_options_remount(char *input, int *force)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int token;
-
- if (!input)
- return 1;
-
- while ((p = strsep(&input, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case opt_force:
- *force = 1;
- break;
- default:
- break;
+ struct hfsplus_sb_info *sbi = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
+
+ /*
+ * Only the force option is examined during remount, all others
+ * are ignored.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE &&
+ strncmp(param->key, "force", 5))
+ return 0;
+
+ opt = fs_parse(fc, hfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case opt_creator:
+ if (strlen(param->string) != 4) {
+ pr_err("creator requires a 4 character value\n");
+ return -EINVAL;
}
- }
-
- return 1;
-}
-
-/* Parse options from mount. Returns 0 on failure */
-/* input is the options passed to mount() as a string */
-int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
-{
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int tmp, token;
-
- if (!input)
- goto done;
-
- while ((p = strsep(&input, ",")) != NULL) {
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case opt_creator:
- if (match_fourchar(&args[0], &sbi->creator)) {
- pr_err("creator requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_type:
- if (match_fourchar(&args[0], &sbi->type)) {
- pr_err("type requires a 4 character value\n");
- return 0;
- }
- break;
- case opt_umask:
- if (match_octal(&args[0], &tmp)) {
- pr_err("umask requires a value\n");
- return 0;
- }
- sbi->umask = (umode_t)tmp;
- break;
- case opt_uid:
- if (match_int(&args[0], &tmp)) {
- pr_err("uid requires an argument\n");
- return 0;
- }
- sbi->uid = make_kuid(current_user_ns(), (uid_t)tmp);
- if (!uid_valid(sbi->uid)) {
- pr_err("invalid uid specified\n");
- return 0;
- } else {
- set_bit(HFSPLUS_SB_UID, &sbi->flags);
- }
- break;
- case opt_gid:
- if (match_int(&args[0], &tmp)) {
- pr_err("gid requires an argument\n");
- return 0;
- }
- sbi->gid = make_kgid(current_user_ns(), (gid_t)tmp);
- if (!gid_valid(sbi->gid)) {
- pr_err("invalid gid specified\n");
- return 0;
- } else {
- set_bit(HFSPLUS_SB_GID, &sbi->flags);
- }
- break;
- case opt_part:
- if (match_int(&args[0], &sbi->part)) {
- pr_err("part requires an argument\n");
- return 0;
- }
- break;
- case opt_session:
- if (match_int(&args[0], &sbi->session)) {
- pr_err("session requires an argument\n");
- return 0;
- }
- break;
- case opt_nls:
- if (sbi->nls) {
- pr_err("unable to change nls mapping\n");
- return 0;
- }
- p = match_strdup(&args[0]);
- if (p)
- sbi->nls = load_nls(p);
- if (!sbi->nls) {
- pr_err("unable to load nls mapping \"%s\"\n",
- p);
- kfree(p);
- return 0;
- }
- kfree(p);
- break;
- case opt_decompose:
- clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
- break;
- case opt_nodecompose:
+ memcpy(&sbi->creator, param->string, 4);
+ break;
+ case opt_type:
+ if (strlen(param->string) != 4) {
+ pr_err("type requires a 4 character value\n");
+ return -EINVAL;
+ }
+ memcpy(&sbi->type, param->string, 4);
+ break;
+ case opt_umask:
+ sbi->umask = (umode_t)result.uint_32;
+ break;
+ case opt_uid:
+ sbi->uid = result.uid;
+ set_bit(HFSPLUS_SB_UID, &sbi->flags);
+ break;
+ case opt_gid:
+ sbi->gid = result.gid;
+ set_bit(HFSPLUS_SB_GID, &sbi->flags);
+ break;
+ case opt_part:
+ sbi->part = result.uint_32;
+ break;
+ case opt_session:
+ sbi->session = result.uint_32;
+ break;
+ case opt_nls:
+ if (sbi->nls) {
+ pr_err("unable to change nls mapping\n");
+ return -EINVAL;
+ }
+ sbi->nls = load_nls(param->string);
+ if (!sbi->nls) {
+ pr_err("unable to load nls mapping \"%s\"\n",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case opt_decompose:
+ if (result.negated)
set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
- break;
- case opt_barrier:
- clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
- break;
- case opt_nobarrier:
+ else
+ clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
+ break;
+ case opt_barrier:
+ if (result.negated)
set_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
- break;
- case opt_force:
- set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
- break;
- default:
- return 0;
- }
- }
-
-done:
- if (!sbi->nls) {
- /* try utf8 first, as this is the old default behaviour */
- sbi->nls = load_nls("utf8");
- if (!sbi->nls)
- sbi->nls = load_nls_default();
- if (!sbi->nls)
- return 0;
+ else
+ clear_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags);
+ break;
+ case opt_force:
+ set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
+ break;
+ default:
+ return -EINVAL;
}
- return 1;
+ return 0;
}
int hfsplus_show_options(struct seq_file *seq, struct dentry *root)
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 97920202790f..948b8aaee33e 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -14,6 +14,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
#include <linux/slab.h>
#include <linux/vfs.h>
#include <linux/nls.h>
@@ -332,34 +333,33 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
+static int hfsplus_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
+
sync_filesystem(sb);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+ if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb))
return 0;
- if (!(*flags & SB_RDONLY)) {
- struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
- int force = 0;
-
- if (!hfsplus_parse_options_remount(data, &force))
- return -EINVAL;
+ if (!(fc->sb_flags & SB_RDONLY)) {
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
+ struct hfsplus_vh *vhdr = sbi->s_vhdr;
if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) {
pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
- } else if (force) {
+ fc->sb_flags |= SB_RDONLY;
+ } else if (test_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
/* nothing */
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
pr_warn("filesystem is marked locked, leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
} else if (vhdr->attributes &
cpu_to_be32(HFSPLUS_VOL_JOURNALED)) {
pr_warn("filesystem is marked journaled, leaving read-only.\n");
sb->s_flags |= SB_RDONLY;
- *flags |= SB_RDONLY;
+ fc->sb_flags |= SB_RDONLY;
}
}
return 0;
@@ -373,38 +373,33 @@ static const struct super_operations hfsplus_sops = {
.put_super = hfsplus_put_super,
.sync_fs = hfsplus_sync_fs,
.statfs = hfsplus_statfs,
- .remount_fs = hfsplus_remount,
.show_options = hfsplus_show_options,
};
-static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
+static int hfsplus_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct hfsplus_vh *vhdr;
- struct hfsplus_sb_info *sbi;
+ struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
hfsplus_cat_entry entry;
struct hfs_find_data fd;
struct inode *root, *inode;
struct qstr str;
- struct nls_table *nls = NULL;
+ struct nls_table *nls;
u64 last_fs_block, last_fs_page;
+ int silent = fc->sb_flags & SB_SILENT;
int err;
- err = -ENOMEM;
- sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
- if (!sbi)
- goto out;
-
- sb->s_fs_info = sbi;
mutex_init(&sbi->alloc_mutex);
mutex_init(&sbi->vh_mutex);
spin_lock_init(&sbi->work_lock);
INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs);
- hfsplus_fill_defaults(sbi);
err = -EINVAL;
- if (!hfsplus_parse_options(data, sbi)) {
- pr_err("unable to parse mount options\n");
- goto out_unload_nls;
+ if (!sbi->nls) {
+ /* try utf8 first, as this is the old default behaviour */
+ sbi->nls = load_nls("utf8");
+ if (!sbi->nls)
+ sbi->nls = load_nls_default();
}
/* temporarily use utf8 to correctly find the hidden dir below */
@@ -616,7 +611,6 @@ out_unload_nls:
unload_nls(sbi->nls);
unload_nls(nls);
kfree(sbi);
-out:
return err;
}
@@ -641,18 +635,46 @@ static void hfsplus_free_inode(struct inode *inode)
#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
-static struct dentry *hfsplus_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hfsplus_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, hfsplus_fill_super);
+}
+
+static void hfsplus_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, hfsplus_fill_super);
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations hfsplus_context_ops = {
+ .parse_param = hfsplus_parse_param,
+ .get_tree = hfsplus_get_tree,
+ .reconfigure = hfsplus_reconfigure,
+ .free = hfsplus_free_fc,
+};
+
+static int hfsplus_init_fs_context(struct fs_context *fc)
+{
+ struct hfsplus_sb_info *sbi;
+
+ sbi = kzalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL);
+ if (!sbi)
+ return -ENOMEM;
+
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
+ hfsplus_fill_defaults(sbi);
+
+ fc->s_fs_info = sbi;
+ fc->ops = &hfsplus_context_ops;
+
+ return 0;
}
static struct file_system_type hfsplus_fs_type = {
.owner = THIS_MODULE,
.name = "hfsplus",
- .mount = hfsplus_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = hfsplus_init_fs_context,
};
MODULE_ALIAS_FS("hfsplus");
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 9592ffcb44e5..74801911bc1c 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -172,6 +172,8 @@ int hfsplus_read_wrapper(struct super_block *sb)
if (!blocksize)
goto out;
+ sbi->min_io_size = blocksize;
+
if (hfsplus_get_last_session(sb, &part_start, &part_size))
goto out;
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index e73717daa5f9..27567920abe4 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -9,7 +9,8 @@
#include "hpfs_fn.h"
#include <linux/module.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/init.h>
#include <linux/statfs.h>
#include <linux/magic.h>
@@ -90,7 +91,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...)
hpfs_sb(s)->sb_was_error = 1;
}
-/*
+/*
* A little trick to detect cycles in many hpfs structures and don't let the
* kernel crash on corrupted filesystem. When first called, set c2 to 0.
*
@@ -272,146 +273,70 @@ static void destroy_inodecache(void)
kmem_cache_destroy(hpfs_inode_cachep);
}
-/*
- * A tiny parser for option strings, stolen from dosfs.
- * Stolen again from read-only hpfs.
- * And updated for table-driven option parsing.
- */
-
enum {
- Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case_lower, Opt_case_asis,
- Opt_check_none, Opt_check_normal, Opt_check_strict,
- Opt_err_cont, Opt_err_ro, Opt_err_panic,
- Opt_eas_no, Opt_eas_ro, Opt_eas_rw,
- Opt_chkdsk_no, Opt_chkdsk_errors, Opt_chkdsk_always,
- Opt_timeshift, Opt_err,
+ Opt_help, Opt_uid, Opt_gid, Opt_umask, Opt_case,
+ Opt_check, Opt_err, Opt_eas, Opt_chkdsk, Opt_timeshift,
};
-static const match_table_t tokens = {
- {Opt_help, "help"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%o"},
- {Opt_case_lower, "case=lower"},
- {Opt_case_asis, "case=asis"},
- {Opt_check_none, "check=none"},
- {Opt_check_normal, "check=normal"},
- {Opt_check_strict, "check=strict"},
- {Opt_err_cont, "errors=continue"},
- {Opt_err_ro, "errors=remount-ro"},
- {Opt_err_panic, "errors=panic"},
- {Opt_eas_no, "eas=no"},
- {Opt_eas_ro, "eas=ro"},
- {Opt_eas_rw, "eas=rw"},
- {Opt_chkdsk_no, "chkdsk=no"},
- {Opt_chkdsk_errors, "chkdsk=errors"},
- {Opt_chkdsk_always, "chkdsk=always"},
- {Opt_timeshift, "timeshift=%d"},
- {Opt_err, NULL},
+static const struct constant_table hpfs_param_case[] = {
+ {"asis", 0},
+ {"lower", 1},
+ {}
};
-static int parse_opts(char *opts, kuid_t *uid, kgid_t *gid, umode_t *umask,
- int *lowercase, int *eas, int *chk, int *errs,
- int *chkdsk, int *timeshift)
-{
- char *p;
- int option;
+static const struct constant_table hpfs_param_check[] = {
+ {"none", 0},
+ {"normal", 1},
+ {"strict", 2},
+ {}
+};
- if (!opts)
- return 1;
+static const struct constant_table hpfs_param_err[] = {
+ {"continue", 0},
+ {"remount-ro", 1},
+ {"panic", 2},
+ {}
+};
- /*pr_info("Parsing opts: '%s'\n",opts);*/
-
- while ((p = strsep(&opts, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_help:
- return 2;
- case Opt_uid:
- if (match_int(args, &option))
- return 0;
- *uid = make_kuid(current_user_ns(), option);
- if (!uid_valid(*uid))
- return 0;
- break;
- case Opt_gid:
- if (match_int(args, &option))
- return 0;
- *gid = make_kgid(current_user_ns(), option);
- if (!gid_valid(*gid))
- return 0;
- break;
- case Opt_umask:
- if (match_octal(args, &option))
- return 0;
- *umask = option;
- break;
- case Opt_case_lower:
- *lowercase = 1;
- break;
- case Opt_case_asis:
- *lowercase = 0;
- break;
- case Opt_check_none:
- *chk = 0;
- break;
- case Opt_check_normal:
- *chk = 1;
- break;
- case Opt_check_strict:
- *chk = 2;
- break;
- case Opt_err_cont:
- *errs = 0;
- break;
- case Opt_err_ro:
- *errs = 1;
- break;
- case Opt_err_panic:
- *errs = 2;
- break;
- case Opt_eas_no:
- *eas = 0;
- break;
- case Opt_eas_ro:
- *eas = 1;
- break;
- case Opt_eas_rw:
- *eas = 2;
- break;
- case Opt_chkdsk_no:
- *chkdsk = 0;
- break;
- case Opt_chkdsk_errors:
- *chkdsk = 1;
- break;
- case Opt_chkdsk_always:
- *chkdsk = 2;
- break;
- case Opt_timeshift:
- {
- int m = 1;
- char *rhs = args[0].from;
- if (!rhs || !*rhs)
- return 0;
- if (*rhs == '-') m = -1;
- if (*rhs == '+' || *rhs == '-') rhs++;
- *timeshift = simple_strtoul(rhs, &rhs, 0) * m;
- if (*rhs)
- return 0;
- break;
- }
- default:
- return 0;
- }
- }
- return 1;
-}
+static const struct constant_table hpfs_param_eas[] = {
+ {"no", 0},
+ {"ro", 1},
+ {"rw", 2},
+ {}
+};
+
+static const struct constant_table hpfs_param_chkdsk[] = {
+ {"no", 0},
+ {"errors", 1},
+ {"always", 2},
+ {}
+};
+
+static const struct fs_parameter_spec hpfs_param_spec[] = {
+ fsparam_flag ("help", Opt_help),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_enum ("case", Opt_case, hpfs_param_case),
+ fsparam_enum ("check", Opt_check, hpfs_param_check),
+ fsparam_enum ("errors", Opt_err, hpfs_param_err),
+ fsparam_enum ("eas", Opt_eas, hpfs_param_eas),
+ fsparam_enum ("chkdsk", Opt_chkdsk, hpfs_param_chkdsk),
+ fsparam_s32 ("timeshift", Opt_timeshift),
+ {}
+};
+
+struct hpfs_fc_context {
+ kuid_t uid;
+ kgid_t gid;
+ umode_t umask;
+ int lowercase;
+ int eas;
+ int chk;
+ int errs;
+ int chkdsk;
+ int timeshift;
+};
static inline void hpfs_help(void)
{
@@ -439,49 +364,92 @@ HPFS filesystem options:\n\
\n");
}
-static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
+static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- kuid_t uid;
- kgid_t gid;
- umode_t umask;
- int lowercase, eas, chk, errs, chkdsk, timeshift;
- int o;
+ struct hpfs_fc_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, hpfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_help:
+ hpfs_help();
+ return -EINVAL;
+ case Opt_uid:
+ ctx->uid = result.uid;
+ break;
+ case Opt_gid:
+ ctx->gid = result.gid;
+ break;
+ case Opt_umask:
+ ctx->umask = result.uint_32;
+ break;
+ case Opt_case:
+ ctx->lowercase = result.uint_32;
+ break;
+ case Opt_check:
+ ctx->chk = result.uint_32;
+ break;
+ case Opt_err:
+ ctx->errs = result.uint_32;
+ break;
+ case Opt_eas:
+ ctx->eas = result.uint_32;
+ break;
+ case Opt_chkdsk:
+ ctx->chkdsk = result.uint_32;
+ break;
+ case Opt_timeshift:
+ {
+ int m = 1;
+ char *rhs = param->string;
+ int timeshift;
+
+ if (*rhs == '-') m = -1;
+ if (*rhs == '+' || *rhs == '-') rhs++;
+ timeshift = simple_strtoul(rhs, &rhs, 0) * m;
+ if (*rhs)
+ return -EINVAL;
+ ctx->timeshift = timeshift;
+ break;
+ }
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int hpfs_reconfigure(struct fs_context *fc)
+{
+ struct hpfs_fc_context *ctx = fc->fs_private;
+ struct super_block *s = fc->root->d_sb;
struct hpfs_sb_info *sbi = hpfs_sb(s);
sync_filesystem(s);
- *flags |= SB_NOATIME;
+ fc->sb_flags |= SB_NOATIME;
hpfs_lock(s);
- uid = sbi->sb_uid; gid = sbi->sb_gid;
- umask = 0777 & ~sbi->sb_mode;
- lowercase = sbi->sb_lowercase;
- eas = sbi->sb_eas; chk = sbi->sb_chk; chkdsk = sbi->sb_chkdsk;
- errs = sbi->sb_err; timeshift = sbi->sb_timeshift;
-
- if (!(o = parse_opts(data, &uid, &gid, &umask, &lowercase,
- &eas, &chk, &errs, &chkdsk, &timeshift))) {
- pr_err("bad mount options.\n");
- goto out_err;
- }
- if (o == 2) {
- hpfs_help();
- goto out_err;
- }
- if (timeshift != sbi->sb_timeshift) {
+
+ if (ctx->timeshift != sbi->sb_timeshift) {
pr_err("timeshift can't be changed using remount.\n");
goto out_err;
}
unmark_dirty(s);
- sbi->sb_uid = uid; sbi->sb_gid = gid;
- sbi->sb_mode = 0777 & ~umask;
- sbi->sb_lowercase = lowercase;
- sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk;
- sbi->sb_err = errs; sbi->sb_timeshift = timeshift;
+ sbi->sb_uid = ctx->uid; sbi->sb_gid = ctx->gid;
+ sbi->sb_mode = 0777 & ~ctx->umask;
+ sbi->sb_lowercase = ctx->lowercase;
+ sbi->sb_eas = ctx->eas; sbi->sb_chk = ctx->chk;
+ sbi->sb_chkdsk = ctx->chkdsk;
+ sbi->sb_err = ctx->errs; sbi->sb_timeshift = ctx->timeshift;
- if (!(*flags & SB_RDONLY)) mark_dirty(s, 1);
+ if (!(fc->sb_flags & SB_RDONLY)) mark_dirty(s, 1);
hpfs_unlock(s);
return 0;
@@ -530,30 +498,24 @@ static const struct super_operations hpfs_sops =
.evict_inode = hpfs_evict_inode,
.put_super = hpfs_put_super,
.statfs = hpfs_statfs,
- .remount_fs = hpfs_remount_fs,
.show_options = hpfs_show_options,
};
-static int hpfs_fill_super(struct super_block *s, void *options, int silent)
+static int hpfs_fill_super(struct super_block *s, struct fs_context *fc)
{
+ struct hpfs_fc_context *ctx = fc->fs_private;
struct buffer_head *bh0, *bh1, *bh2;
struct hpfs_boot_block *bootblock;
struct hpfs_super_block *superblock;
struct hpfs_spare_block *spareblock;
struct hpfs_sb_info *sbi;
struct inode *root;
-
- kuid_t uid;
- kgid_t gid;
- umode_t umask;
- int lowercase, eas, chk, errs, chkdsk, timeshift;
+ int silent = fc->sb_flags & SB_SILENT;
dnode_secno root_dno;
struct hpfs_dirent *de = NULL;
struct quad_buffer_head qbh;
- int o;
-
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi) {
return -ENOMEM;
@@ -563,26 +525,6 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
mutex_init(&sbi->hpfs_mutex);
hpfs_lock(s);
- uid = current_uid();
- gid = current_gid();
- umask = current_umask();
- lowercase = 0;
- eas = 2;
- chk = 1;
- errs = 1;
- chkdsk = 1;
- timeshift = 0;
-
- if (!(o = parse_opts(options, &uid, &gid, &umask, &lowercase,
- &eas, &chk, &errs, &chkdsk, &timeshift))) {
- pr_err("bad mount options.\n");
- goto bail0;
- }
- if (o==2) {
- hpfs_help();
- goto bail0;
- }
-
/*sbi->sb_mounting = 1;*/
sb_set_blocksize(s, 512);
sbi->sb_fs_size = -1;
@@ -622,17 +564,17 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
sbi->sb_dirband_start = le32_to_cpu(superblock->dir_band_start);
sbi->sb_dirband_size = le32_to_cpu(superblock->n_dir_band);
sbi->sb_dmap = le32_to_cpu(superblock->dir_band_bitmap);
- sbi->sb_uid = uid;
- sbi->sb_gid = gid;
- sbi->sb_mode = 0777 & ~umask;
+ sbi->sb_uid = ctx->uid;
+ sbi->sb_gid = ctx->gid;
+ sbi->sb_mode = 0777 & ~ctx->umask;
sbi->sb_n_free = -1;
sbi->sb_n_free_dnodes = -1;
- sbi->sb_lowercase = lowercase;
- sbi->sb_eas = eas;
- sbi->sb_chk = chk;
- sbi->sb_chkdsk = chkdsk;
- sbi->sb_err = errs;
- sbi->sb_timeshift = timeshift;
+ sbi->sb_lowercase = ctx->lowercase;
+ sbi->sb_eas = ctx->eas;
+ sbi->sb_chk = ctx->chk;
+ sbi->sb_chkdsk = ctx->chkdsk;
+ sbi->sb_err = ctx->errs;
+ sbi->sb_timeshift = ctx->timeshift;
sbi->sb_was_error = 0;
sbi->sb_cp_table = NULL;
sbi->sb_c_bitmap = -1;
@@ -653,7 +595,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
/* Check for general fs errors*/
if (spareblock->dirty && !spareblock->old_wrote) {
- if (errs == 2) {
+ if (sbi->sb_err == 2) {
pr_err("Improperly stopped, not mounted\n");
goto bail4;
}
@@ -667,16 +609,16 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
}
if (le32_to_cpu(spareblock->n_dnode_spares) != le32_to_cpu(spareblock->n_dnode_spares_free)) {
- if (errs >= 2) {
+ if (sbi->sb_err >= 2) {
pr_err("Spare dnodes used, try chkdsk\n");
mark_dirty(s, 0);
goto bail4;
}
hpfs_error(s, "warning: spare dnodes used, try chkdsk");
- if (errs == 0)
+ if (sbi->sb_err == 0)
pr_err("Proceeding, but your filesystem could be corrupted if you delete files or directories\n");
}
- if (chk) {
+ if (sbi->sb_chk) {
unsigned a;
if (le32_to_cpu(superblock->dir_band_end) - le32_to_cpu(superblock->dir_band_start) + 1 != le32_to_cpu(superblock->n_dir_band) ||
le32_to_cpu(superblock->dir_band_end) < le32_to_cpu(superblock->dir_band_start) || le32_to_cpu(superblock->n_dir_band) > 0x4000) {
@@ -755,18 +697,70 @@ bail0:
return -EINVAL;
}
-static struct dentry *hpfs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int hpfs_get_tree(struct fs_context *fc)
+{
+ return get_tree_bdev(fc, hpfs_fill_super);
+}
+
+static void hpfs_free_fc(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, hpfs_fill_super);
+ kfree(fc->fs_private);
}
+static const struct fs_context_operations hpfs_fc_context_ops = {
+ .parse_param = hpfs_parse_param,
+ .get_tree = hpfs_get_tree,
+ .reconfigure = hpfs_reconfigure,
+ .free = hpfs_free_fc,
+};
+
+static int hpfs_init_fs_context(struct fs_context *fc)
+{
+ struct hpfs_fc_context *ctx;
+
+ ctx = kzalloc(sizeof(struct hpfs_fc_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ struct super_block *sb = fc->root->d_sb;
+ struct hpfs_sb_info *sbi = hpfs_sb(sb);
+
+ ctx->uid = sbi->sb_uid;
+ ctx->gid = sbi->sb_gid;
+ ctx->umask = 0777 & ~sbi->sb_mode;
+ ctx->lowercase = sbi->sb_lowercase;
+ ctx->eas = sbi->sb_eas;
+ ctx->chk = sbi->sb_chk;
+ ctx->chkdsk = sbi->sb_chkdsk;
+ ctx->errs = sbi->sb_err;
+ ctx->timeshift = sbi->sb_timeshift;
+
+ } else {
+ ctx->uid = current_uid();
+ ctx->gid = current_gid();
+ ctx->umask = current_umask();
+ ctx->lowercase = 0;
+ ctx->eas = 2;
+ ctx->chk = 1;
+ ctx->errs = 1;
+ ctx->chkdsk = 1;
+ ctx->timeshift = 0;
+ }
+
+ fc->fs_private = ctx;
+ fc->ops = &hpfs_fc_context_ops;
+
+ return 0;
+};
+
static struct file_system_type hpfs_fs_type = {
.owner = THIS_MODULE,
.name = "hpfs",
- .mount = hpfs_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = hpfs_init_fs_context,
+ .parameters = hpfs_param_spec,
};
MODULE_ALIAS_FS("hpfs");
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5cf327337e22..1bbf783b244a 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -39,6 +39,9 @@
#include <linux/uaccess.h>
#include <linux/sched/mm.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/hugetlbfs.h>
+
static const struct address_space_operations hugetlbfs_aops;
static const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
@@ -110,7 +113,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* way when do_mmap unwinds (may be important on powerpc
* and ia64).
*/
- vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND);
+ vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND | VM_MTE_ALLOWED);
vma->vm_ops = &hugetlb_vm_ops;
ret = seal_check_write(info->seals, vma);
@@ -687,6 +690,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
{
struct resv_map *resv_map;
+ trace_hugetlbfs_evict_inode(inode);
remove_inode_hugepages(inode, 0, LLONG_MAX);
/*
@@ -814,8 +818,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_PUNCH_HOLE)
- return hugetlbfs_punch_hole(inode, offset, len);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ error = hugetlbfs_punch_hole(inode, offset, len);
+ goto out_nolock;
+ }
/*
* Default preallocate case.
@@ -919,6 +925,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
inode_set_ctime_current(inode);
out:
inode_unlock(inode);
+
+out_nolock:
+ trace_hugetlbfs_fallocate(inode, mode, offset, len, error);
return error;
}
@@ -935,6 +944,8 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
if (error)
return error;
+ trace_hugetlbfs_setattr(inode, dentry, attr);
+
if (ia_valid & ATTR_SIZE) {
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
@@ -1033,6 +1044,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
break;
}
lockdep_annotate_inode_mutex_key(inode);
+ trace_hugetlbfs_alloc_inode(inode, dir, mode);
} else {
if (resv_map)
kref_put(&resv_map->refs, resv_map_release);
@@ -1272,6 +1284,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
static void hugetlbfs_free_inode(struct inode *inode)
{
+ trace_hugetlbfs_free_inode(inode);
kmem_cache_free(hugetlbfs_inode_cachep, HUGETLBFS_I(inode));
}
diff --git a/fs/inode.c b/fs/inode.c
index 8dabb224f941..b13b778257ae 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -21,7 +21,12 @@
#include <linux/list_lru.h>
#include <linux/iversion.h>
#include <linux/rw_hint.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
#include <trace/events/writeback.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/timestamp.h>
+
#include "internal.h"
/*
@@ -98,6 +103,70 @@ long get_nr_dirty_inodes(void)
return nr_dirty > 0 ? nr_dirty : 0;
}
+#ifdef CONFIG_DEBUG_FS
+static DEFINE_PER_CPU(long, mg_ctime_updates);
+static DEFINE_PER_CPU(long, mg_fine_stamps);
+static DEFINE_PER_CPU(long, mg_ctime_swaps);
+
+static unsigned long get_mg_ctime_updates(void)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ sum += data_race(per_cpu(mg_ctime_updates, i));
+ return sum;
+}
+
+static unsigned long get_mg_fine_stamps(void)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ sum += data_race(per_cpu(mg_fine_stamps, i));
+ return sum;
+}
+
+static unsigned long get_mg_ctime_swaps(void)
+{
+ unsigned long sum = 0;
+ int i;
+
+ for_each_possible_cpu(i)
+ sum += data_race(per_cpu(mg_ctime_swaps, i));
+ return sum;
+}
+
+#define mgtime_counter_inc(__var) this_cpu_inc(__var)
+
+static int mgts_show(struct seq_file *s, void *p)
+{
+ unsigned long ctime_updates = get_mg_ctime_updates();
+ unsigned long ctime_swaps = get_mg_ctime_swaps();
+ unsigned long fine_stamps = get_mg_fine_stamps();
+ unsigned long floor_swaps = timekeeping_get_mg_floor_swaps();
+
+ seq_printf(s, "%lu %lu %lu %lu\n",
+ ctime_updates, ctime_swaps, fine_stamps, floor_swaps);
+ return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(mgts);
+
+static int __init mg_debugfs_init(void)
+{
+ debugfs_create_file("multigrain_timestamps", S_IFREG | S_IRUGO, NULL, NULL, &mgts_fops);
+ return 0;
+}
+late_initcall(mg_debugfs_init);
+
+#else /* ! CONFIG_DEBUG_FS */
+
+#define mgtime_counter_inc(__var) do { } while (0)
+
+#endif /* CONFIG_DEBUG_FS */
+
/*
* Handle nr_inode sysctl
*/
@@ -174,6 +243,8 @@ int inode_init_always_gfp(struct super_block *sb, struct inode *inode, gfp_t gfp
inode->i_opflags = 0;
if (sb->s_xattr)
inode->i_opflags |= IOP_XATTR;
+ if (sb->s_type->fs_flags & FS_MGTIME)
+ inode->i_opflags |= IOP_MGTIME;
i_uid_write(inode, 0);
i_gid_write(inode, 0);
atomic_set(&inode->i_writecount, 0);
@@ -748,7 +819,7 @@ static void evict(struct inode *inode)
* ___wait_var_event() either sees the bit cleared or
* waitqueue_active() check in wake_up_var() sees the waiter.
*/
- smp_mb();
+ smp_mb__after_spinlock();
inode_wake_up_bit(inode, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
spin_unlock(&inode->i_lock);
@@ -1241,16 +1312,15 @@ EXPORT_SYMBOL(unlock_two_nondirectories);
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
- * and if present it is return it with an increased reference count. This is
- * a variant of iget5_locked() for callers that don't want to fail on memory
- * allocation of inode.
+ * and if present return it with an increased reference count. This is a
+ * variant of iget5_locked() that doesn't allocate an inode.
*
- * If the inode is not in cache, insert the pre-allocated inode to cache and
+ * If the inode is not present in the cache, insert the pre-allocated inode and
* return it locked, hashed, and with the I_NEW flag set. The file system gets
* to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note that both @test and @set are called with the inode_hash_lock held, so
+ * they can't sleep.
*/
struct inode *inode_insert5(struct inode *inode, unsigned long hashval,
int (*test)(struct inode *, void *),
@@ -1314,16 +1384,16 @@ EXPORT_SYMBOL(inode_insert5);
* @data: opaque data pointer to pass to @test and @set
*
* Search for the inode specified by @hashval and @data in the inode cache,
- * and if present it is return it with an increased reference count. This is
- * a generalized version of iget_locked() for file systems where the inode
+ * and if present return it with an increased reference count. This is a
+ * generalized version of iget_locked() for file systems where the inode
* number is not sufficient for unique identification of an inode.
*
- * If the inode is not in cache, allocate a new inode and return it locked,
- * hashed, and with the I_NEW flag set. The file system gets to fill it in
- * before unlocking it via unlock_new_inode().
+ * If the inode is not present in the cache, allocate and insert a new inode
+ * and return it locked, hashed, and with the I_NEW flag set. The file system
+ * gets to fill it in before unlocking it via unlock_new_inode().
*
- * Note both @test and @set are called with the inode_hash_lock held, so can't
- * sleep.
+ * Note that both @test and @set are called with the inode_hash_lock held, so
+ * they can't sleep.
*/
struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
int (*test)(struct inode *, void *),
@@ -2211,19 +2281,58 @@ int file_remove_privs(struct file *file)
}
EXPORT_SYMBOL(file_remove_privs);
+/**
+ * current_time - Return FS time (possibly fine-grained)
+ * @inode: inode.
+ *
+ * Return the current time truncated to the time granularity supported by
+ * the fs, as suitable for a ctime/mtime change. If the ctime is flagged
+ * as having been QUERIED, get a fine-grained timestamp, but don't update
+ * the floor.
+ *
+ * For a multigrain inode, this is effectively an estimate of the timestamp
+ * that a file would receive. An actual update must go through
+ * inode_set_ctime_current().
+ */
+struct timespec64 current_time(struct inode *inode)
+{
+ struct timespec64 now;
+ u32 cns;
+
+ ktime_get_coarse_real_ts64_mg(&now);
+
+ if (!is_mgtime(inode))
+ goto out;
+
+ /* If nothing has queried it, then coarse time is fine */
+ cns = smp_load_acquire(&inode->i_ctime_nsec);
+ if (cns & I_CTIME_QUERIED) {
+ /*
+ * If there is no apparent change, then get a fine-grained
+ * timestamp.
+ */
+ if (now.tv_nsec == (cns & ~I_CTIME_QUERIED))
+ ktime_get_real_ts64(&now);
+ }
+out:
+ return timestamp_truncate(now, inode);
+}
+EXPORT_SYMBOL(current_time);
+
static int inode_needs_update_time(struct inode *inode)
{
+ struct timespec64 now, ts;
int sync_it = 0;
- struct timespec64 now = current_time(inode);
- struct timespec64 ts;
/* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode))
return 0;
+ now = current_time(inode);
+
ts = inode_get_mtime(inode);
if (!timespec64_equal(&ts, &now))
- sync_it = S_MTIME;
+ sync_it |= S_MTIME;
ts = inode_get_ctime(inode);
if (!timespec64_equal(&ts, &now))
@@ -2600,6 +2709,16 @@ void inode_nohighmem(struct inode *inode)
}
EXPORT_SYMBOL(inode_nohighmem);
+struct timespec64 inode_set_ctime_to_ts(struct inode *inode, struct timespec64 ts)
+{
+ trace_inode_set_ctime_to_ts(inode, &ts);
+ set_normalized_timespec64(&ts, ts.tv_sec, ts.tv_nsec);
+ inode->i_ctime_sec = ts.tv_sec;
+ inode->i_ctime_nsec = ts.tv_nsec;
+ return ts;
+}
+EXPORT_SYMBOL(inode_set_ctime_to_ts);
+
/**
* timestamp_truncate - Truncate timespec to a granularity
* @t: Timespec
@@ -2632,39 +2751,159 @@ struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode)
EXPORT_SYMBOL(timestamp_truncate);
/**
- * current_time - Return FS time
- * @inode: inode.
+ * inode_set_ctime_current - set the ctime to current_time
+ * @inode: inode
*
- * Return the current time truncated to the time granularity supported by
- * the fs.
+ * Set the inode's ctime to the current value for the inode. Returns the
+ * current value that was assigned. If this is not a multigrain inode, then we
+ * set it to the later of the coarse time and floor value.
+ *
+ * If it is multigrain, then we first see if the coarse-grained timestamp is
+ * distinct from what is already there. If so, then use that. Otherwise, get a
+ * fine-grained timestamp.
*
- * Note that inode and inode->sb cannot be NULL.
- * Otherwise, the function warns and returns time without truncation.
+ * After that, try to swap the new value into i_ctime_nsec. Accept the
+ * resulting ctime, regardless of the outcome of the swap. If it has
+ * already been replaced, then that timestamp is later than the earlier
+ * unacceptable one, and is thus acceptable.
*/
-struct timespec64 current_time(struct inode *inode)
+struct timespec64 inode_set_ctime_current(struct inode *inode)
{
struct timespec64 now;
+ u32 cns, cur;
- ktime_get_coarse_real_ts64(&now);
- return timestamp_truncate(now, inode);
+ ktime_get_coarse_real_ts64_mg(&now);
+ now = timestamp_truncate(now, inode);
+
+ /* Just return that if this is not a multigrain fs */
+ if (!is_mgtime(inode)) {
+ inode_set_ctime_to_ts(inode, now);
+ goto out;
+ }
+
+ /*
+ * A fine-grained time is only needed if someone has queried
+ * for timestamps, and the current coarse grained time isn't
+ * later than what's already there.
+ */
+ cns = smp_load_acquire(&inode->i_ctime_nsec);
+ if (cns & I_CTIME_QUERIED) {
+ struct timespec64 ctime = { .tv_sec = inode->i_ctime_sec,
+ .tv_nsec = cns & ~I_CTIME_QUERIED };
+
+ if (timespec64_compare(&now, &ctime) <= 0) {
+ ktime_get_real_ts64_mg(&now);
+ now = timestamp_truncate(now, inode);
+ mgtime_counter_inc(mg_fine_stamps);
+ }
+ }
+ mgtime_counter_inc(mg_ctime_updates);
+
+ /* No need to cmpxchg if it's exactly the same */
+ if (cns == now.tv_nsec && inode->i_ctime_sec == now.tv_sec) {
+ trace_ctime_xchg_skip(inode, &now);
+ goto out;
+ }
+ cur = cns;
+retry:
+ /* Try to swap the nsec value into place. */
+ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, now.tv_nsec)) {
+ /* If swap occurred, then we're (mostly) done */
+ inode->i_ctime_sec = now.tv_sec;
+ trace_ctime_ns_xchg(inode, cns, now.tv_nsec, cur);
+ mgtime_counter_inc(mg_ctime_swaps);
+ } else {
+ /*
+ * Was the change due to someone marking the old ctime QUERIED?
+ * If so then retry the swap. This can only happen once since
+ * the only way to clear I_CTIME_QUERIED is to stamp the inode
+ * with a new ctime.
+ */
+ if (!(cns & I_CTIME_QUERIED) && (cns | I_CTIME_QUERIED) == cur) {
+ cns = cur;
+ goto retry;
+ }
+ /* Otherwise, keep the existing ctime */
+ now.tv_sec = inode->i_ctime_sec;
+ now.tv_nsec = cur & ~I_CTIME_QUERIED;
+ }
+out:
+ return now;
}
-EXPORT_SYMBOL(current_time);
+EXPORT_SYMBOL(inode_set_ctime_current);
/**
- * inode_set_ctime_current - set the ctime to current_time
- * @inode: inode
+ * inode_set_ctime_deleg - try to update the ctime on a delegated inode
+ * @inode: inode to update
+ * @update: timespec64 to set the ctime
*
- * Set the inode->i_ctime to the current value for the inode. Returns
- * the current value that was assigned to i_ctime.
+ * Attempt to atomically update the ctime on behalf of a delegation holder.
+ *
+ * The nfs server can call back the holder of a delegation to get updated
+ * inode attributes, including the mtime. When updating the mtime, update
+ * the ctime to a value at least equal to that.
+ *
+ * This can race with concurrent updates to the inode, in which
+ * case the update is skipped.
+ *
+ * Note that this works even when multigrain timestamps are not enabled,
+ * so it is used in either case.
*/
-struct timespec64 inode_set_ctime_current(struct inode *inode)
+struct timespec64 inode_set_ctime_deleg(struct inode *inode, struct timespec64 update)
{
- struct timespec64 now = current_time(inode);
+ struct timespec64 now, cur_ts;
+ u32 cur, old;
- inode_set_ctime_to_ts(inode, now);
- return now;
+ /* pairs with try_cmpxchg below */
+ cur = smp_load_acquire(&inode->i_ctime_nsec);
+ cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
+ cur_ts.tv_sec = inode->i_ctime_sec;
+
+ /* If the update is older than the existing value, skip it. */
+ if (timespec64_compare(&update, &cur_ts) <= 0)
+ return cur_ts;
+
+ ktime_get_coarse_real_ts64_mg(&now);
+
+ /* Clamp the update to "now" if it's in the future */
+ if (timespec64_compare(&update, &now) > 0)
+ update = now;
+
+ update = timestamp_truncate(update, inode);
+
+ /* No need to update if the values are already the same */
+ if (timespec64_equal(&update, &cur_ts))
+ return cur_ts;
+
+ /*
+ * Try to swap the nsec value into place. If it fails, that means
+ * it raced with an update due to a write or similar activity. That
+ * stamp takes precedence, so just skip the update.
+ */
+retry:
+ old = cur;
+ if (try_cmpxchg(&inode->i_ctime_nsec, &cur, update.tv_nsec)) {
+ inode->i_ctime_sec = update.tv_sec;
+ mgtime_counter_inc(mg_ctime_swaps);
+ return update;
+ }
+
+ /*
+ * Was the change due to another task marking the old ctime QUERIED?
+ *
+ * If so, then retry the swap. This can only happen once since
+ * the only way to clear I_CTIME_QUERIED is to stamp the inode
+ * with a new ctime.
+ */
+ if (!(old & I_CTIME_QUERIED) && (cur == (old | I_CTIME_QUERIED)))
+ goto retry;
+
+ /* Otherwise, it was a new timestamp. */
+ cur_ts.tv_sec = inode->i_ctime_sec;
+ cur_ts.tv_nsec = cur & ~I_CTIME_QUERIED;
+ return cur_ts;
}
-EXPORT_SYMBOL(inode_set_ctime_current);
+EXPORT_SYMBOL(inode_set_ctime_deleg);
/**
* in_group_or_capable - check whether caller is CAP_FSETID privileged
@@ -2672,7 +2911,7 @@ EXPORT_SYMBOL(inode_set_ctime_current);
* @inode: inode to check
* @vfsgid: the new/current vfsgid of @inode
*
- * Check wether @vfsgid is in the caller's group list or if the caller is
+ * Check whether @vfsgid is in the caller's group list or if the caller is
* privileged with CAP_FSETID over @inode. This can be used to determine
* whether the setgid bit can be kept or must be dropped.
*
diff --git a/fs/internal.h b/fs/internal.h
index 8c1b7acbbe8f..e7f02ae1e098 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -246,7 +246,6 @@ int open_namespace(struct ns_common *ns);
* fs/stat.c:
*/
-int getname_statx_lookup_flags(int flags);
int do_statx(int dfd, struct filename *filename, unsigned int flags,
unsigned int mask, struct statx __user *buffer);
int do_statx_fd(int fd, unsigned int flags, unsigned int mask,
@@ -267,7 +266,7 @@ struct xattr_name {
char name[XATTR_NAME_MAX + 1];
};
-struct xattr_ctx {
+struct kernel_xattr_ctx {
/* Value of attribute */
union {
const void __user *cvalue;
@@ -280,14 +279,15 @@ struct xattr_ctx {
unsigned int flags;
};
+ssize_t file_getxattr(struct file *file, struct kernel_xattr_ctx *ctx);
+ssize_t filename_getxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx);
+int file_setxattr(struct file *file, struct kernel_xattr_ctx *ctx);
+int filename_setxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx);
+int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx);
+int import_xattr_name(struct xattr_name *kname, const char __user *name);
-ssize_t do_getxattr(struct mnt_idmap *idmap,
- struct dentry *d,
- struct xattr_ctx *ctx);
-
-int setxattr_copy(const char __user *name, struct xattr_ctx *ctx);
-int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct xattr_ctx *ctx);
int may_write_xattr(struct mnt_idmap *idmap, struct inode *inode);
#ifdef CONFIG_FS_POSIX_ACL
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 6e0c954388d4..638a36be31c1 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -231,11 +231,11 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
u64 off, u64 olen, u64 destoff)
{
- struct fd src_file = fdget(srcfd);
+ CLASS(fd, src_file)(srcfd);
loff_t cloned;
int ret;
- if (!fd_file(src_file))
+ if (fd_empty(src_file))
return -EBADF;
cloned = vfs_clone_file_range(fd_file(src_file), off, dst_file, destoff,
olen, 0);
@@ -245,7 +245,6 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EINVAL;
else
ret = 0;
- fdput(src_file);
return ret;
}
@@ -892,22 +891,20 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int error;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = security_file_ioctl(fd_file(f), cmd, arg);
if (error)
- goto out;
+ return error;
error = do_vfs_ioctl(fd_file(f), fd, cmd, arg);
if (error == -ENOIOCTLCMD)
error = vfs_ioctl(fd_file(f), cmd, arg);
-out:
- fdput(f);
return error;
}
@@ -950,15 +947,15 @@ EXPORT_SYMBOL(compat_ptr_ioctl);
COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
compat_ulong_t, arg)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
int error;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = security_file_ioctl_compat(fd_file(f), cmd, arg);
if (error)
- goto out;
+ return error;
switch (cmd) {
/* FICLONE takes an int argument, so don't use compat_ptr() */
@@ -1009,10 +1006,6 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
error = -ENOTTY;
break;
}
-
- out:
- fdput(f);
-
return error;
}
#endif
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 78ebd265f425..ce73d2a48c1e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1145,10 +1145,36 @@ static void iomap_write_delalloc_scan(struct inode *inode,
}
/*
+ * When a short write occurs, the filesystem might need to use ->iomap_end
+ * to remove space reservations created in ->iomap_begin.
+ *
+ * For filesystems that use delayed allocation, there can be dirty pages over
+ * the delalloc extent outside the range of a short write but still within the
+ * delalloc extent allocated for this iomap if the write raced with page
+ * faults.
+ *
* Punch out all the delalloc blocks in the range given except for those that
* have dirty data still pending in the page cache - those are going to be
* written and so must still retain the delalloc backing for writeback.
*
+ * The punch() callback *must* only punch delalloc extents in the range passed
+ * to it. It must skip over all other types of extents in the range and leave
+ * them completely unchanged. It must do this punch atomically with respect to
+ * other extent modifications.
+ *
+ * The punch() callback may be called with a folio locked to prevent writeback
+ * extent allocation racing at the edge of the range we are currently punching.
+ * The locked folio may or may not cover the range being punched, so it is not
+ * safe for the punch() callback to lock folios itself.
+ *
+ * Lock order is:
+ *
+ * inode->i_rwsem (shared or exclusive)
+ * inode->i_mapping->invalidate_lock (exclusive)
+ * folio_lock()
+ * ->punch
+ * internal filesystem allocation lock
+ *
* As we are scanning the page cache for data, we don't need to reimplement the
* wheel - mapping_seek_hole_data() does exactly what we need to identify the
* start and end of data ranges correctly even for sub-folio block sizes. This
@@ -1177,7 +1203,7 @@ static void iomap_write_delalloc_scan(struct inode *inode,
* require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
* the code to subtle off-by-one bugs....
*/
-static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
+void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
loff_t end_byte, unsigned flags, struct iomap *iomap,
iomap_punch_t punch)
{
@@ -1185,12 +1211,13 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
loff_t scan_end_byte = min(i_size_read(inode), end_byte);
/*
- * Lock the mapping to avoid races with page faults re-instantiating
- * folios and dirtying them via ->page_mkwrite whilst we walk the
- * cache and perform delalloc extent removal. Failing to do this can
- * leave dirty pages with no space reservation in the cache.
+ * The caller must hold invalidate_lock to avoid races with page faults
+ * re-instantiating folios and dirtying them via ->page_mkwrite whilst
+ * we walk the cache and perform delalloc extent removal. Failing to do
+ * this can leave dirty pages with no space reservation in the cache.
*/
- filemap_invalidate_lock(inode->i_mapping);
+ lockdep_assert_held_write(&inode->i_mapping->invalidate_lock);
+
while (start_byte < scan_end_byte) {
loff_t data_end;
@@ -1207,7 +1234,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
if (start_byte == -ENXIO || start_byte == scan_end_byte)
break;
if (WARN_ON_ONCE(start_byte < 0))
- goto out_unlock;
+ return;
WARN_ON_ONCE(start_byte < punch_start_byte);
WARN_ON_ONCE(start_byte > scan_end_byte);
@@ -1218,7 +1245,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
scan_end_byte, SEEK_HOLE);
if (WARN_ON_ONCE(data_end < 0))
- goto out_unlock;
+ return;
/*
* If we race with post-direct I/O invalidation of the page cache,
@@ -1240,74 +1267,8 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte,
if (punch_start_byte < end_byte)
punch(inode, punch_start_byte, end_byte - punch_start_byte,
iomap);
-out_unlock:
- filemap_invalidate_unlock(inode->i_mapping);
}
-
-/*
- * When a short write occurs, the filesystem may need to remove reserved space
- * that was allocated in ->iomap_begin from it's ->iomap_end method. For
- * filesystems that use delayed allocation, we need to punch out delalloc
- * extents from the range that are not dirty in the page cache. As the write can
- * race with page faults, there can be dirty pages over the delalloc extent
- * outside the range of a short write but still within the delalloc extent
- * allocated for this iomap.
- *
- * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
- * simplify range iterations.
- *
- * The punch() callback *must* only punch delalloc extents in the range passed
- * to it. It must skip over all other types of extents in the range and leave
- * them completely unchanged. It must do this punch atomically with respect to
- * other extent modifications.
- *
- * The punch() callback may be called with a folio locked to prevent writeback
- * extent allocation racing at the edge of the range we are currently punching.
- * The locked folio may or may not cover the range being punched, so it is not
- * safe for the punch() callback to lock folios itself.
- *
- * Lock order is:
- *
- * inode->i_rwsem (shared or exclusive)
- * inode->i_mapping->invalidate_lock (exclusive)
- * folio_lock()
- * ->punch
- * internal filesystem allocation lock
- */
-void iomap_file_buffered_write_punch_delalloc(struct inode *inode,
- loff_t pos, loff_t length, ssize_t written, unsigned flags,
- struct iomap *iomap, iomap_punch_t punch)
-{
- loff_t start_byte;
- loff_t end_byte;
- unsigned int blocksize = i_blocksize(inode);
-
- if (iomap->type != IOMAP_DELALLOC)
- return;
-
- /* If we didn't reserve the blocks, we're not allowed to punch them. */
- if (!(iomap->flags & IOMAP_F_NEW))
- return;
-
- /*
- * start_byte refers to the first unused block after a short write. If
- * nothing was written, round offset down to point at the first block in
- * the range.
- */
- if (unlikely(!written))
- start_byte = round_down(pos, blocksize);
- else
- start_byte = round_up(pos + written, blocksize);
- end_byte = round_up(pos + length, blocksize);
-
- /* Nothing to do if we've written the entire delalloc extent */
- if (start_byte >= end_byte)
- return;
-
- iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap,
- punch);
-}
-EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
+EXPORT_SYMBOL_GPL(iomap_write_delalloc_release);
static loff_t iomap_unshare_iter(struct iomap_iter *iter)
{
@@ -1316,22 +1277,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
loff_t length = iomap_length(iter);
loff_t written = 0;
- /* Don't bother with blocks that are not shared to start with. */
- if (!(iomap->flags & IOMAP_F_SHARED))
- return length;
-
- /*
- * Don't bother with delalloc reservations, holes or unwritten extents.
- *
- * Note that we use srcmap directly instead of iomap_iter_srcmap as
- * unsharing requires providing a separate source map, and the presence
- * of one is a good indicator that unsharing is needed, unlike
- * IOMAP_F_SHARED which can be set for any data that goes into the COW
- * fork for XFS.
- */
- if (iter->srcmap.type == IOMAP_HOLE ||
- iter->srcmap.type == IOMAP_DELALLOC ||
- iter->srcmap.type == IOMAP_UNWRITTEN)
+ if (!iomap_want_unshare_iter(iter))
return length;
do {
@@ -1838,7 +1784,7 @@ new_ioend:
if (ifs)
atomic_add(len, &ifs->write_bytes_pending);
wpc->ioend->io_size += len;
- wbc_account_cgroup_owner(wbc, &folio->page, len);
+ wbc_account_cgroup_owner(wbc, folio, len);
return 0;
}
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index f637aa0706a3..b521eb15759e 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -271,7 +271,7 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
* clearing the WRITE_THROUGH flag in the dio request.
*/
static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
- const struct iomap *iomap, bool use_fua)
+ const struct iomap *iomap, bool use_fua, bool atomic)
{
blk_opf_t opflags = REQ_SYNC | REQ_IDLE;
@@ -283,6 +283,8 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio,
opflags |= REQ_FUA;
else
dio->flags &= ~IOMAP_DIO_WRITE_THROUGH;
+ if (atomic)
+ opflags |= REQ_ATOMIC;
return opflags;
}
@@ -293,7 +295,8 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
const struct iomap *iomap = &iter->iomap;
struct inode *inode = iter->inode;
unsigned int fs_block_size = i_blocksize(inode), pad;
- loff_t length = iomap_length(iter);
+ const loff_t length = iomap_length(iter);
+ bool atomic = iter->flags & IOMAP_ATOMIC;
loff_t pos = iter->pos;
blk_opf_t bio_opf;
struct bio *bio;
@@ -303,6 +306,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
size_t copied = 0;
size_t orig_count;
+ if (atomic && length != fs_block_size)
+ return -EINVAL;
+
if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) ||
!bdev_iter_is_aligned(iomap->bdev, dio->submit.iter))
return -EINVAL;
@@ -377,12 +383,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
- /*
- * Set the operation flags early so that bio_iov_iter_get_pages
- * can set up the page vector appropriately for a ZONE_APPEND
- * operation.
- */
- bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua);
+ bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic);
nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS);
do {
@@ -415,6 +416,17 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
}
n = bio->bi_iter.bi_size;
+ if (WARN_ON_ONCE(atomic && n != length)) {
+ /*
+ * This bio should have covered the complete length,
+ * which it doesn't, so error. We may need to zero out
+ * the tail (complete FS block), similar to when
+ * bio_iov_iter_get_pages() returns an error, above.
+ */
+ ret = -EINVAL;
+ bio_put(bio);
+ goto zero_tail;
+ }
if (dio->flags & IOMAP_DIO_WRITE) {
task_io_account_write(n);
} else {
@@ -598,6 +610,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (iocb->ki_flags & IOCB_NOWAIT)
iomi.flags |= IOMAP_NOWAIT;
+ if (iocb->ki_flags & IOCB_ATOMIC)
+ iomi.flags |= IOMAP_ATOMIC;
+
if (iov_iter_rw(iter) == READ) {
/* reads can always complete inline */
dio->flags |= IOMAP_DIO_INLINE_COMP;
@@ -659,7 +674,17 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (ret != -EAGAIN) {
trace_iomap_dio_invalidate_fail(inode, iomi.pos,
iomi.len);
- ret = -ENOTBLK;
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ /*
+ * folio invalidation failed, maybe
+ * this is transient, unlock and see if
+ * the caller tries again.
+ */
+ ret = -EAGAIN;
+ } else {
+ /* fall back to buffered write */
+ ret = -ENOTBLK;
+ }
}
goto out_free_dio;
}
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index 0a991c4ce87d..4118a42cdab0 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -98,7 +98,8 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued);
{ IOMAP_REPORT, "REPORT" }, \
{ IOMAP_FAULT, "FAULT" }, \
{ IOMAP_DIRECT, "DIRECT" }, \
- { IOMAP_NOWAIT, "NOWAIT" }
+ { IOMAP_NOWAIT, "NOWAIT" }, \
+ { IOMAP_ATOMIC, "ATOMIC" }
#define IOMAP_F_FLAGS_STRINGS \
{ IOMAP_F_NEW, "NEW" }, \
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4305a1ac808a..9153ff3a08e7 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -662,10 +662,6 @@ void jbd2_journal_commit_transaction(journal_t *journal)
JBUFFER_TRACE(jh, "ph3: write metadata");
escape = jbd2_journal_write_metadata_buffer(commit_transaction,
jh, &wbuf[bufs], blocknr);
- if (escape < 0) {
- jbd2_journal_abort(journal, escape);
- continue;
- }
jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
/* Record the new block's tag in the current descriptor
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 97f487c3d8fc..7e49d912b091 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -318,7 +318,6 @@ static inline void jbd2_data_do_escape(char *data)
*
*
* Return value:
- * <0: Error
* =0: Finished OK without escape
* =1: Finished OK with escape
*/
@@ -386,12 +385,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
goto escape_done;
spin_unlock(&jh_in->b_state_lock);
- tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
- if (!tmp) {
- brelse(new_bh);
- free_buffer_head(new_bh);
- return -ENOMEM;
- }
+ tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS | __GFP_NOFAIL);
spin_lock(&jh_in->b_state_lock);
if (jh_in->b_frozen_data) {
jbd2_free(tmp, bh_in->b_size);
@@ -1518,9 +1512,10 @@ static int journal_load_superblock(journal_t *journal)
* destroy journal_t structures, and to initialise and read existing
* journal blocks from disk. */
-/* First: create and setup a journal_t object in memory. We initialise
- * very few fields yet: that has to wait until we have created the
- * journal structures from from scratch, or loaded them from disk. */
+/* The journal_init_common() function creates and fills a journal_t object
+ * in memory. It calls journal_load_superblock() to load the on-disk journal
+ * superblock and initialize the journal_t object.
+ */
static journal_t *journal_init_common(struct block_device *bdev,
struct block_device *fs_dev,
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 667f67342c52..9192be7c19d8 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -485,6 +485,104 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
return tag->t_checksum == cpu_to_be16(csum32);
}
+static __always_inline int jbd2_do_replay(journal_t *journal,
+ struct recovery_info *info,
+ struct buffer_head *bh,
+ unsigned long *next_log_block,
+ unsigned int next_commit_ID)
+{
+ char *tagp; /* cursor over the tags stored in descriptor block bh */
+ int flags;
+ int ret = 0; /* last non-fatal error seen; returned once all tags are tried */
+ int tag_bytes = journal_tag_bytes(journal);
+ int descr_csum_size = 0; /* space excluded from the tag area when csum v2/v3 is on */
+ unsigned long io_block;
+ journal_block_tag_t tag;
+ struct buffer_head *obh; /* block read from the log */
+ struct buffer_head *nbh; /* buffer on j_fs_dev that receives the data */
+ /* Replay each block tagged in descriptor bh, advancing *next_log_block. */
+ if (jbd2_journal_has_csum_v2or3(journal))
+ descr_csum_size = sizeof(struct jbd2_journal_block_tail);
+ /* Tags start right after the journal header. */
+ tagp = &bh->b_data[sizeof(journal_header_t)];
+ while (tagp - bh->b_data + tag_bytes <=
+ journal->j_blocksize - descr_csum_size) {
+ int err;
+ /* Work on a local copy of the tag. */
+ memcpy(&tag, tagp, sizeof(tag));
+ flags = be16_to_cpu(tag.t_flags);
+ /* Take the next log block for this tag and advance the caller's position. */
+ io_block = (*next_log_block)++;
+ wrap(journal, *next_log_block);
+ err = jread(&obh, journal, io_block);
+ if (err) {
+ /* Recover what we can, but report failure at the end. */
+ ret = err;
+ pr_err("JBD2: IO error %d recovering block %lu in log\n",
+ err, io_block);
+ } else {
+ unsigned long long blocknr;
+
+ J_ASSERT(obh != NULL);
+ blocknr = read_tag_block(journal, &tag);
+
+ /* If the block has been revoked, then we're all done here. */
+ if (jbd2_journal_test_revoke(journal, blocknr,
+ next_commit_ID)) {
+ brelse(obh);
+ ++info->nr_revoke_hits;
+ goto skip_write;
+ }
+
+ /* Look for block corruption */
+ if (!jbd2_block_tag_csum_verify(journal, &tag,
+ (journal_block_tag3_t *)tagp,
+ obh->b_data, next_commit_ID)) {
+ brelse(obh);
+ ret = -EFSBADCRC; /* non-fatal: skip this block, keep going */
+ pr_err("JBD2: Invalid checksum recovering data block %llu in journal block %lu\n",
+ blocknr, io_block);
+ goto skip_write;
+ }
+
+ /* Find a buffer for the new data being restored */
+ nbh = __getblk(journal->j_fs_dev, blocknr,
+ journal->j_blocksize);
+ if (nbh == NULL) {
+ pr_err("JBD2: Out of memory during recovery.\n");
+ brelse(obh);
+ return -ENOMEM; /* unlike the other errors, stop right away */
+ }
+ /* Overwrite the filesystem buffer with the journalled copy. */
+ lock_buffer(nbh);
+ memcpy(nbh->b_data, obh->b_data, journal->j_blocksize);
+ if (flags & JBD2_FLAG_ESCAPE) { /* escaped block: first word becomes the magic number */
+ *((__be32 *)nbh->b_data) =
+ cpu_to_be32(JBD2_MAGIC_NUMBER);
+ }
+
+ BUFFER_TRACE(nbh, "marking dirty");
+ set_buffer_uptodate(nbh);
+ mark_buffer_dirty(nbh);
+ BUFFER_TRACE(nbh, "marking uptodate");
+ ++info->nr_replays;
+ unlock_buffer(nbh);
+ brelse(obh);
+ brelse(nbh);
+ }
+ /* Step to the next tag; 16 more bytes are skipped unless SAME_UUID is set. */
+skip_write:
+ tagp += tag_bytes;
+ if (!(flags & JBD2_FLAG_SAME_UUID))
+ tagp += 16;
+
+ if (flags & JBD2_FLAG_LAST_TAG)
+ break;
+ }
+ /* 0 on success, otherwise the last I/O or checksum error recorded above. */
+ return ret;
+}
+
static int do_one_pass(journal_t *journal,
struct recovery_info *info, enum passtype pass)
{
@@ -493,13 +591,10 @@ static int do_one_pass(journal_t *journal,
int err, success = 0;
journal_superblock_t * sb;
journal_header_t * tmp;
- struct buffer_head * bh;
+ struct buffer_head *bh = NULL;
unsigned int sequence;
int blocktype;
- int tag_bytes = journal_tag_bytes(journal);
__u32 crc32_sum = ~0; /* Transactional Checksums */
- int descr_csum_size = 0;
- int block_error = 0;
bool need_check_commit_time = false;
__u64 last_trans_commit_time = 0, commit_time;
@@ -528,12 +623,6 @@ static int do_one_pass(journal_t *journal,
*/
while (1) {
- int flags;
- char * tagp;
- journal_block_tag_t tag;
- struct buffer_head * obh;
- struct buffer_head * nbh;
-
cond_resched();
/* If we already know where to stop the log traversal,
@@ -552,6 +641,8 @@ static int do_one_pass(journal_t *journal,
* record. */
jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
+ brelse(bh);
+ bh = NULL;
err = jread(&bh, journal, next_log_block);
if (err)
goto failed;
@@ -567,20 +658,16 @@ static int do_one_pass(journal_t *journal,
tmp = (journal_header_t *)bh->b_data;
- if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
- brelse(bh);
+ if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER))
break;
- }
blocktype = be32_to_cpu(tmp->h_blocktype);
sequence = be32_to_cpu(tmp->h_sequence);
jbd2_debug(3, "Found magic %d, sequence %d\n",
blocktype, sequence);
- if (sequence != next_commit_ID) {
- brelse(bh);
+ if (sequence != next_commit_ID)
break;
- }
/* OK, we have a valid descriptor block which matches
* all of the sequence number checks. What are we going
@@ -589,11 +676,7 @@ static int do_one_pass(journal_t *journal,
switch(blocktype) {
case JBD2_DESCRIPTOR_BLOCK:
/* Verify checksum first */
- if (jbd2_journal_has_csum_v2or3(journal))
- descr_csum_size =
- sizeof(struct jbd2_journal_block_tail);
- if (descr_csum_size > 0 &&
- !jbd2_descriptor_block_csum_verify(journal,
+ if (!jbd2_descriptor_block_csum_verify(journal,
bh->b_data)) {
/*
* PASS_SCAN can see stale blocks due to lazy
@@ -603,7 +686,6 @@ static int do_one_pass(journal_t *journal,
pr_err("JBD2: Invalid checksum recovering block %lu in log\n",
next_log_block);
err = -EFSBADCRC;
- brelse(bh);
goto failed;
}
need_check_commit_time = true;
@@ -619,125 +701,39 @@ static int do_one_pass(journal_t *journal,
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
jbd2_has_feature_checksum(journal) &&
- !need_check_commit_time &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
- &crc32_sum)) {
- put_bh(bh);
+ &crc32_sum))
break;
- }
- put_bh(bh);
continue;
}
next_log_block += count_tags(journal, bh);
wrap(journal, next_log_block);
- put_bh(bh);
continue;
}
- /* A descriptor block: we can now write all of
- * the data blocks. Yay, useful work is finally
- * getting done here! */
-
- tagp = &bh->b_data[sizeof(journal_header_t)];
- while ((tagp - bh->b_data + tag_bytes)
- <= journal->j_blocksize - descr_csum_size) {
- unsigned long io_block;
-
- memcpy(&tag, tagp, sizeof(tag));
- flags = be16_to_cpu(tag.t_flags);
-
- io_block = next_log_block++;
- wrap(journal, next_log_block);
- err = jread(&obh, journal, io_block);
- if (err) {
- /* Recover what we can, but
- * report failure at the end. */
- success = err;
- printk(KERN_ERR
- "JBD2: IO error %d recovering "
- "block %lu in log\n",
- err, io_block);
- } else {
- unsigned long long blocknr;
-
- J_ASSERT(obh != NULL);
- blocknr = read_tag_block(journal,
- &tag);
-
- /* If the block has been
- * revoked, then we're all done
- * here. */
- if (jbd2_journal_test_revoke
- (journal, blocknr,
- next_commit_ID)) {
- brelse(obh);
- ++info->nr_revoke_hits;
- goto skip_write;
- }
-
- /* Look for block corruption */
- if (!jbd2_block_tag_csum_verify(
- journal, &tag, (journal_block_tag3_t *)tagp,
- obh->b_data, be32_to_cpu(tmp->h_sequence))) {
- brelse(obh);
- success = -EFSBADCRC;
- printk(KERN_ERR "JBD2: Invalid "
- "checksum recovering "
- "data block %llu in "
- "journal block %lu\n",
- blocknr, io_block);
- block_error = 1;
- goto skip_write;
- }
-
- /* Find a buffer for the new
- * data being restored */
- nbh = __getblk(journal->j_fs_dev,
- blocknr,
- journal->j_blocksize);
- if (nbh == NULL) {
- printk(KERN_ERR
- "JBD2: Out of memory "
- "during recovery.\n");
- err = -ENOMEM;
- brelse(bh);
- brelse(obh);
- goto failed;
- }
-
- lock_buffer(nbh);
- memcpy(nbh->b_data, obh->b_data,
- journal->j_blocksize);
- if (flags & JBD2_FLAG_ESCAPE) {
- *((__be32 *)nbh->b_data) =
- cpu_to_be32(JBD2_MAGIC_NUMBER);
- }
-
- BUFFER_TRACE(nbh, "marking dirty");
- set_buffer_uptodate(nbh);
- mark_buffer_dirty(nbh);
- BUFFER_TRACE(nbh, "marking uptodate");
- ++info->nr_replays;
- unlock_buffer(nbh);
- brelse(obh);
- brelse(nbh);
- }
-
- skip_write:
- tagp += tag_bytes;
- if (!(flags & JBD2_FLAG_SAME_UUID))
- tagp += 16;
-
- if (flags & JBD2_FLAG_LAST_TAG)
- break;
+ /*
+ * A descriptor block: we can now write all of the
+ * data blocks. Yay, useful work is finally getting
+ * done here!
+ */
+ err = jbd2_do_replay(journal, info, bh, &next_log_block,
+ next_commit_ID);
+ if (err) {
+ if (err == -ENOMEM)
+ goto failed;
+ success = err;
}
- brelse(bh);
continue;
case JBD2_COMMIT_BLOCK:
+ if (pass != PASS_SCAN) {
+ next_commit_ID++;
+ continue;
+ }
+
/* How to differentiate between interrupted commit
* and journal corruption ?
*
@@ -782,7 +778,6 @@ static int do_one_pass(journal_t *journal,
pr_err("JBD2: Invalid checksum found in transaction %u\n",
next_commit_ID);
err = -EFSBADCRC;
- brelse(bh);
goto failed;
}
ignore_crc_mismatch:
@@ -792,7 +787,6 @@ static int do_one_pass(journal_t *journal,
*/
jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
next_commit_ID);
- brelse(bh);
goto done;
}
@@ -802,8 +796,7 @@ static int do_one_pass(journal_t *journal,
* much to do other than move on to the next sequence
* number.
*/
- if (pass == PASS_SCAN &&
- jbd2_has_feature_checksum(journal)) {
+ if (jbd2_has_feature_checksum(journal)) {
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
unsigned found_chksum =
@@ -812,7 +805,6 @@ static int do_one_pass(journal_t *journal,
if (info->end_transaction) {
journal->j_failed_commit =
info->end_transaction;
- brelse(bh);
break;
}
@@ -828,36 +820,33 @@ static int do_one_pass(journal_t *journal,
goto chksum_error;
crc32_sum = ~0;
+ goto chksum_ok;
}
- if (pass == PASS_SCAN &&
- !jbd2_commit_block_csum_verify(journal,
- bh->b_data)) {
- if (jbd2_commit_block_csum_verify_partial(
- journal,
- bh->b_data)) {
- pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
- next_commit_ID, next_log_block);
- goto chksum_ok;
- }
- chksum_error:
- if (commit_time < last_trans_commit_time)
- goto ignore_crc_mismatch;
- info->end_transaction = next_commit_ID;
- info->head_block = head_block;
- if (!jbd2_has_feature_async_commit(journal)) {
- journal->j_failed_commit =
- next_commit_ID;
- brelse(bh);
- break;
- }
+ if (jbd2_commit_block_csum_verify(journal, bh->b_data))
+ goto chksum_ok;
+
+ if (jbd2_commit_block_csum_verify_partial(journal,
+ bh->b_data)) {
+ pr_notice("JBD2: Find incomplete commit block in transaction %u block %lu\n",
+ next_commit_ID, next_log_block);
+ goto chksum_ok;
}
- if (pass == PASS_SCAN) {
- chksum_ok:
- last_trans_commit_time = commit_time;
- head_block = next_log_block;
+
+chksum_error:
+ if (commit_time < last_trans_commit_time)
+ goto ignore_crc_mismatch;
+ info->end_transaction = next_commit_ID;
+ info->head_block = head_block;
+
+ if (!jbd2_has_feature_async_commit(journal)) {
+ journal->j_failed_commit = next_commit_ID;
+ break;
}
- brelse(bh);
+
+chksum_ok:
+ last_trans_commit_time = commit_time;
+ head_block = next_log_block;
next_commit_ID++;
continue;
@@ -876,14 +865,11 @@ static int do_one_pass(journal_t *journal,
/* If we aren't in the REVOKE pass, then we can
* just skip over this block. */
- if (pass != PASS_REVOKE) {
- brelse(bh);
+ if (pass != PASS_REVOKE)
continue;
- }
err = scan_revoke_records(journal, bh,
next_commit_ID, info);
- brelse(bh);
if (err)
goto failed;
continue;
@@ -891,12 +877,12 @@ static int do_one_pass(journal_t *journal,
default:
jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
blocktype);
- brelse(bh);
goto done;
}
}
done:
+ brelse(bh);
/*
* We broke out of the log scan loop: either we came to the
* known end of the log or we found an unexpected block in the
@@ -927,11 +913,10 @@ static int do_one_pass(journal_t *journal,
success = err;
}
- if (block_error && success == 0)
- success = -EIO;
return success;
failed:
+ brelse(bh);
return err;
}
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 974ecf5e0d95..3ab410059dc2 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -187,7 +187,7 @@ int dbMount(struct inode *ipbmap)
}
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
- if (!bmp->db_numag || bmp->db_numag >= MAXAG) {
+ if (!bmp->db_numag || bmp->db_numag > MAXAG) {
err = -EINVAL;
goto err_release_metapage;
}
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index 33ef13a0b110..8794281f8ffd 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -24,6 +24,7 @@
#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */
#define JFS_ERR_CONTINUE 0x00000004 /* continue */
#define JFS_ERR_PANIC 0x00000008 /* panic */
+#define JFS_ERR_MASK (JFS_ERR_REMOUNT_RO|JFS_ERR_CONTINUE|JFS_ERR_PANIC)
/* Quota support */
#define JFS_USRQUOTA 0x00000010
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index e1be21ca5d6e..223d9ac59839 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -6,11 +6,11 @@
#include <linux/fs.h>
#include <linux/module.h>
-#include <linux/parser.h>
#include <linux/completion.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
-#include <linux/mount.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/moduleparam.h>
#include <linux/kthread.h>
#include <linux/posix_acl.h>
@@ -210,240 +210,195 @@ enum {
Opt_discard, Opt_nodiscard, Opt_discard_minblk
};
-static const match_table_t tokens = {
- {Opt_integrity, "integrity"},
- {Opt_nointegrity, "nointegrity"},
- {Opt_iocharset, "iocharset=%s"},
- {Opt_resize, "resize=%u"},
- {Opt_resize_nosize, "resize"},
- {Opt_errors, "errors=%s"},
- {Opt_ignore, "noquota"},
- {Opt_quota, "quota"},
- {Opt_usrquota, "usrquota"},
- {Opt_grpquota, "grpquota"},
- {Opt_uid, "uid=%u"},
- {Opt_gid, "gid=%u"},
- {Opt_umask, "umask=%u"},
- {Opt_discard, "discard"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_discard_minblk, "discard=%u"},
- {Opt_err, NULL}
+static const struct constant_table jfs_param_errors[] = {
+ {"continue", JFS_ERR_CONTINUE},
+ {"remount-ro", JFS_ERR_REMOUNT_RO},
+ {"panic", JFS_ERR_PANIC},
+ {}
};
-static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
- int *flag)
-{
- void *nls_map = (void *)-1; /* -1: no change; NULL: none */
- char *p;
- struct jfs_sb_info *sbi = JFS_SBI(sb);
+static const struct fs_parameter_spec jfs_param_spec[] = {
+ fsparam_flag_no ("integrity", Opt_integrity), /* negated form is seen via result.negated */
+ fsparam_string ("iocharset", Opt_iocharset),
+ fsparam_u64 ("resize", Opt_resize), /* "resize" given with a value */
+ fsparam_flag ("resize", Opt_resize_nosize), /* bare "resize" */
+ fsparam_enum ("errors", Opt_errors, jfs_param_errors),
+ fsparam_flag ("quota", Opt_quota),
+ fsparam_flag ("noquota", Opt_ignore), /* accepted and ignored */
+ fsparam_flag ("usrquota", Opt_usrquota),
+ fsparam_flag ("grpquota", Opt_grpquota),
+ fsparam_uid ("uid", Opt_uid),
+ fsparam_gid ("gid", Opt_gid),
+ fsparam_u32oct ("umask", Opt_umask),
+ fsparam_flag ("discard", Opt_discard), /* bare "discard" */
+ fsparam_u32 ("discard", Opt_discard_minblk), /* "discard" given with a value */
+ fsparam_flag ("nodiscard", Opt_nodiscard),
+ {}
+};
- *newLVSize = 0;
-
- if (!options)
- return 1;
-
- while ((p = strsep(&options, ",")) != NULL) {
- substring_t args[MAX_OPT_ARGS];
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_integrity:
- *flag &= ~JFS_NOINTEGRITY;
- break;
- case Opt_nointegrity:
- *flag |= JFS_NOINTEGRITY;
- break;
- case Opt_ignore:
- /* Silently ignore the quota options */
- /* Don't do anything ;-) */
- break;
- case Opt_iocharset:
- if (nls_map && nls_map != (void *) -1)
- unload_nls(nls_map);
- if (!strcmp(args[0].from, "none"))
- nls_map = NULL;
- else {
- nls_map = load_nls(args[0].from);
- if (!nls_map) {
- pr_err("JFS: charset not found\n");
- goto cleanup;
- }
- }
- break;
- case Opt_resize:
- {
- char *resize = args[0].from;
- int rc = kstrtoll(resize, 0, newLVSize);
+struct jfs_context { /* option state collected by jfs_parse_param() */
+ int flag; /* JFS_NOINTEGRITY, JFS_ERR_*, quota and JFS_DISCARD bits */
+ kuid_t uid; /* from "uid" */
+ kgid_t gid; /* from "gid" */
+ uint umask; /* from "umask"; bits outside 0777 are rejected */
+ uint minblks_trim; /* 64 for bare "discard", else the value given */
+ void *nls_map; /* table from load_nls(); NULL for iocharset=none */
+ bool resize; /* set when a "resize" option was parsed */
+ s64 newLVSize; /* value given with "resize" */
+};
- if (rc)
- goto cleanup;
- break;
- }
- case Opt_resize_nosize:
- {
- *newLVSize = sb_bdev_nr_blocks(sb);
- if (*newLVSize == 0)
- pr_err("JFS: Cannot determine volume size\n");
- break;
+static int jfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+ struct jfs_context *ctx = fc->fs_private;
+ int reconfigure = (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE);
+ struct fs_parse_result result;
+ struct nls_table *nls_map;
+ int opt;
+
+ opt = fs_parse(fc, jfs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_integrity:
+ if (result.negated)
+ ctx->flag |= JFS_NOINTEGRITY;
+ else
+ ctx->flag &= ~JFS_NOINTEGRITY;
+ break;
+ case Opt_ignore:
+ /* Silently ignore the quota options */
+ /* Don't do anything ;-) */
+ break;
+ case Opt_iocharset:
+ if (ctx->nls_map && ctx->nls_map != (void *) -1) {
+ unload_nls(ctx->nls_map);
+ ctx->nls_map = NULL;
}
- case Opt_errors:
- {
- char *errors = args[0].from;
- if (!errors || !*errors)
- goto cleanup;
- if (!strcmp(errors, "continue")) {
- *flag &= ~JFS_ERR_REMOUNT_RO;
- *flag &= ~JFS_ERR_PANIC;
- *flag |= JFS_ERR_CONTINUE;
- } else if (!strcmp(errors, "remount-ro")) {
- *flag &= ~JFS_ERR_CONTINUE;
- *flag &= ~JFS_ERR_PANIC;
- *flag |= JFS_ERR_REMOUNT_RO;
- } else if (!strcmp(errors, "panic")) {
- *flag &= ~JFS_ERR_CONTINUE;
- *flag &= ~JFS_ERR_REMOUNT_RO;
- *flag |= JFS_ERR_PANIC;
- } else {
- pr_err("JFS: %s is an invalid error handler\n",
- errors);
- goto cleanup;
+ if (!strcmp(param->string, "none"))
+ ctx->nls_map = NULL;
+ else {
+ nls_map = load_nls(param->string);
+ if (!nls_map) {
+ pr_err("JFS: charset not found\n");
+ return -EINVAL;
}
- break;
+ ctx->nls_map = nls_map;
}
+ break;
+ case Opt_resize:
+ if (!reconfigure)
+ return -EINVAL;
+ ctx->resize = true;
+ ctx->newLVSize = result.uint_64;
+ break;
+ case Opt_resize_nosize:
+ if (!reconfigure)
+ return -EINVAL;
+ ctx->resize = true;
+ break;
+ case Opt_errors:
+ ctx->flag &= ~JFS_ERR_MASK;
+ ctx->flag |= result.uint_32;
+ break;
#ifdef CONFIG_QUOTA
- case Opt_quota:
- case Opt_usrquota:
- *flag |= JFS_USRQUOTA;
- break;
- case Opt_grpquota:
- *flag |= JFS_GRPQUOTA;
- break;
+ case Opt_quota:
+ case Opt_usrquota:
+ ctx->flag |= JFS_USRQUOTA;
+ break;
+ case Opt_grpquota:
+ ctx->flag |= JFS_GRPQUOTA;
+ break;
#else
- case Opt_usrquota:
- case Opt_grpquota:
- case Opt_quota:
- pr_err("JFS: quota operations not supported\n");
- break;
+ case Opt_usrquota:
+ case Opt_grpquota:
+ case Opt_quota:
+ pr_err("JFS: quota operations not supported\n");
+ break;
#endif
- case Opt_uid:
- {
- char *uid = args[0].from;
- uid_t val;
- int rc = kstrtouint(uid, 0, &val);
-
- if (rc)
- goto cleanup;
- sbi->uid = make_kuid(current_user_ns(), val);
- if (!uid_valid(sbi->uid))
- goto cleanup;
- break;
- }
-
- case Opt_gid:
- {
- char *gid = args[0].from;
- gid_t val;
- int rc = kstrtouint(gid, 0, &val);
-
- if (rc)
- goto cleanup;
- sbi->gid = make_kgid(current_user_ns(), val);
- if (!gid_valid(sbi->gid))
- goto cleanup;
- break;
+ case Opt_uid:
+ ctx->uid = result.uid;
+ break;
+
+ case Opt_gid:
+ ctx->gid = result.gid;
+ break;
+
+ case Opt_umask:
+ if (result.uint_32 & ~0777) {
+ pr_err("JFS: Invalid value of umask\n");
+ return -EINVAL;
}
+ ctx->umask = result.uint_32;
+ break;
- case Opt_umask:
- {
- char *umask = args[0].from;
- int rc = kstrtouint(umask, 8, &sbi->umask);
+ case Opt_discard:
+ /* if set to 1, even copying files will cause
+ * trimming :O
+ * -> user has more control over the online trimming
+ */
+ ctx->minblks_trim = 64;
+ ctx->flag |= JFS_DISCARD;
+ break;
- if (rc)
- goto cleanup;
- if (sbi->umask & ~0777) {
- pr_err("JFS: Invalid value of umask\n");
- goto cleanup;
- }
- break;
- }
+ case Opt_nodiscard:
+ ctx->flag &= ~JFS_DISCARD;
+ break;
- case Opt_discard:
- /* if set to 1, even copying files will cause
- * trimming :O
- * -> user has more control over the online trimming
- */
- sbi->minblks_trim = 64;
- if (bdev_max_discard_sectors(sb->s_bdev))
- *flag |= JFS_DISCARD;
- else
- pr_err("JFS: discard option not supported on device\n");
- break;
-
- case Opt_nodiscard:
- *flag &= ~JFS_DISCARD;
- break;
-
- case Opt_discard_minblk:
- {
- char *minblks_trim = args[0].from;
- int rc;
- if (bdev_max_discard_sectors(sb->s_bdev)) {
- *flag |= JFS_DISCARD;
- rc = kstrtouint(minblks_trim, 0,
- &sbi->minblks_trim);
- if (rc)
- goto cleanup;
- } else
- pr_err("JFS: discard option not supported on device\n");
- break;
- }
+ case Opt_discard_minblk:
+ ctx->minblks_trim = result.uint_32;
+ ctx->flag |= JFS_DISCARD;
+ break;
- default:
- printk("jfs: Unrecognized mount option \"%s\" or missing value\n",
- p);
- goto cleanup;
- }
- }
-
- if (nls_map != (void *) -1) {
- /* Discard old (if remount) */
- unload_nls(sbi->nls_tab);
- sbi->nls_tab = nls_map;
+ default:
+ return -EINVAL;
}
- return 1;
-cleanup:
- if (nls_map && nls_map != (void *) -1)
- unload_nls(nls_map);
return 0;
}
-static int jfs_remount(struct super_block *sb, int *flags, char *data)
+static int jfs_reconfigure(struct fs_context *fc)
{
- s64 newLVSize = 0;
+ struct jfs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
+ int readonly = fc->sb_flags & SB_RDONLY;
int rc = 0;
- int flag = JFS_SBI(sb)->flag;
+ int flag = ctx->flag;
int ret;
sync_filesystem(sb);
- if (!parse_options(data, sb, &newLVSize, &flag))
- return -EINVAL;
- if (newLVSize) {
+ /* Transfer results of parsing to the sbi */
+ JFS_SBI(sb)->flag = ctx->flag;
+ JFS_SBI(sb)->uid = ctx->uid;
+ JFS_SBI(sb)->gid = ctx->gid;
+ JFS_SBI(sb)->umask = ctx->umask;
+ JFS_SBI(sb)->minblks_trim = ctx->minblks_trim;
+ if (ctx->nls_map != (void *) -1) {
+ unload_nls(JFS_SBI(sb)->nls_tab);
+ JFS_SBI(sb)->nls_tab = ctx->nls_map;
+ }
+ ctx->nls_map = NULL;
+
+ if (ctx->resize) {
if (sb_rdonly(sb)) {
pr_err("JFS: resize requires volume to be mounted read-write\n");
return -EROFS;
}
- rc = jfs_extendfs(sb, newLVSize, 0);
+
+ if (!ctx->newLVSize) {
+ ctx->newLVSize = sb_bdev_nr_blocks(sb);
+ if (ctx->newLVSize == 0)
+ pr_err("JFS: Cannot determine volume size\n");
+ }
+
+ rc = jfs_extendfs(sb, ctx->newLVSize, 0);
if (rc)
return rc;
}
- if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
+ if (sb_rdonly(sb) && !readonly) {
/*
* Invalidate any previously read metadata. fsck may have
* changed the on-disk data since we mounted r/o
@@ -459,7 +414,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
dquot_resume(sb, -1);
return ret;
}
- if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
+ if (!sb_rdonly(sb) && readonly) {
rc = dquot_suspend(sb, -1);
if (rc < 0)
return rc;
@@ -467,7 +422,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
JFS_SBI(sb)->flag = flag;
return rc;
}
- if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
+ if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY)) {
if (!sb_rdonly(sb)) {
rc = jfs_umount_rw(sb);
if (rc)
@@ -477,18 +432,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
ret = jfs_mount_rw(sb, 1);
return ret;
}
+ }
JFS_SBI(sb)->flag = flag;
return 0;
}
-static int jfs_fill_super(struct super_block *sb, void *data, int silent)
+static int jfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct jfs_context *ctx = fc->fs_private;
+ int silent = fc->sb_flags & SB_SILENT;
struct jfs_sb_info *sbi;
struct inode *inode;
int rc;
- s64 newLVSize = 0;
- int flag, ret = -EINVAL;
+ int ret = -EINVAL;
jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
@@ -501,24 +458,34 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_min = 0;
sb->s_time_max = U32_MAX;
sbi->sb = sb;
- sbi->uid = INVALID_UID;
- sbi->gid = INVALID_GID;
- sbi->umask = -1;
-
- /* initialize the mount flag and determine the default error handler */
- flag = JFS_ERR_REMOUNT_RO;
- if (!parse_options((char *) data, sb, &newLVSize, &flag))
- goto out_kfree;
- sbi->flag = flag;
+ /* Transfer results of parsing to the sbi */
+ sbi->flag = ctx->flag;
+ sbi->uid = ctx->uid;
+ sbi->gid = ctx->gid;
+ sbi->umask = ctx->umask;
+ if (ctx->nls_map != (void *) -1) {
+ unload_nls(sbi->nls_tab);
+ sbi->nls_tab = ctx->nls_map;
+ }
+ ctx->nls_map = NULL;
+
+ if (sbi->flag & JFS_DISCARD) {
+ if (!bdev_max_discard_sectors(sb->s_bdev)) {
+ pr_err("JFS: discard option not supported on device\n");
+ sbi->flag &= ~JFS_DISCARD;
+ } else {
+ sbi->minblks_trim = ctx->minblks_trim;
+ }
+ }
#ifdef CONFIG_JFS_POSIX_ACL
sb->s_flags |= SB_POSIXACL;
#endif
- if (newLVSize) {
+ if (ctx->resize) {
pr_err("resize option for remount only\n");
- goto out_kfree;
+ goto out_unload;
}
/*
@@ -608,7 +575,6 @@ out_mount_failed:
sbi->direct_inode = NULL;
out_unload:
unload_nls(sbi->nls_tab);
-out_kfree:
kfree(sbi);
return ret;
}
@@ -664,10 +630,9 @@ out:
return rc;
}
-static struct dentry *jfs_do_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int jfs_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, jfs_fill_super);
+ return get_tree_bdev(fc, jfs_fill_super);
}
static int jfs_sync_fs(struct super_block *sb, int wait)
@@ -886,7 +851,6 @@ static const struct super_operations jfs_super_operations = {
.freeze_fs = jfs_freeze,
.unfreeze_fs = jfs_unfreeze,
.statfs = jfs_statfs,
- .remount_fs = jfs_remount,
.show_options = jfs_show_options,
#ifdef CONFIG_QUOTA
.quota_read = jfs_quota_read,
@@ -902,12 +866,71 @@ static const struct export_operations jfs_export_operations = {
.get_parent = jfs_get_parent,
};
+/*
+ * Seed a freshly allocated jfs_context before any parameters are parsed.
+ *
+ * For a reconfigure the starting point is the live superblock's current
+ * settings; for a new mount it is the built-in defaults.  In both cases
+ * nls_map starts out as (void *)-1, the value the consumers of the context
+ * test against before replacing the superblock's NLS table.
+ */
+static void jfs_init_options(struct fs_context *fc, struct jfs_context *ctx)
+{
+	struct jfs_sb_info *sbi;
+
+	ctx->nls_map = (void *)-1;
+
+	if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
+		/*
+		 * Initialize the mount flag and determine the default
+		 * error handler
+		 */
+		ctx->flag = JFS_ERR_REMOUNT_RO;
+		ctx->uid = INVALID_UID;
+		ctx->gid = INVALID_GID;
+		ctx->umask = -1;
+		return;
+	}
+
+	/* Copy over current option values and mount flags */
+	sbi = JFS_SBI(fc->root->d_sb);
+	ctx->uid = sbi->uid;
+	ctx->gid = sbi->gid;
+	ctx->umask = sbi->umask;
+	ctx->minblks_trim = sbi->minblks_trim;
+	ctx->flag = sbi->flag;
+}
+
+/*
+ * ->free callback: release whatever the context still owns.
+ *
+ * An nls_map of (void *)-1 means no table was ever attached to the context,
+ * so there is nothing to unload in that case.
+ */
+static void jfs_free_fc(struct fs_context *fc)
+{
+	struct jfs_context *ctx = fc->fs_private;
+	bool has_map = ctx->nls_map != (void *) -1;
+
+	if (has_map)
+		unload_nls(ctx->nls_map);
+
+	kfree(ctx);
+}
+
+/*
+ * fs_context callbacks for JFS; jfs_init_fs_context() installs this table
+ * on every context it sets up.
+ */
+static const struct fs_context_operations jfs_context_ops = {
+ .parse_param = jfs_parse_param,
+ .get_tree = jfs_get_tree, /* get_tree_bdev() with jfs_fill_super() */
+ .reconfigure = jfs_reconfigure, /* takes over from the old ->remount_fs */
+ .free = jfs_free_fc, /* unloads a pending NLS table, frees the context */
+};
+
+/*
+ * ->init_fs_context: allocate the per-context JFS state, fill in its
+ * starting values and hook up the JFS context operations.
+ *
+ * Returns 0 on success or -ENOMEM if the context cannot be allocated.
+ */
+static int jfs_init_fs_context(struct fs_context *fc)
+{
+	struct jfs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+
+	if (ctx == NULL)
+		return -ENOMEM;
+
+	jfs_init_options(fc, ctx);
+
+	fc->ops = &jfs_context_ops;
+	fc->fs_private = ctx;
+	return 0;
+}
+
static struct file_system_type jfs_fs_type = {
.owner = THIS_MODULE,
.name = "jfs",
- .mount = jfs_do_mount,
.kill_sb = kill_block_super,
.fs_flags = FS_REQUIRES_DEV,
+ .init_fs_context = jfs_init_fs_context,
+ .parameters = jfs_param_spec,
};
MODULE_ALIAS_FS("jfs");
diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c
index 9ff37ae650ea..de32c95d823d 100644
--- a/fs/kernel_read_file.c
+++ b/fs/kernel_read_file.c
@@ -175,15 +175,11 @@ ssize_t kernel_read_file_from_fd(int fd, loff_t offset, void **buf,
size_t buf_size, size_t *file_size,
enum kernel_read_file_id id)
{
- struct fd f = fdget(fd);
- ssize_t ret = -EBADF;
+ CLASS(fd, f)(fd);
- if (!fd_file(f) || !(fd_file(f)->f_mode & FMODE_READ))
- goto out;
+ if (fd_empty(f) || !(fd_file(f)->f_mode & FMODE_READ))
+ return -EBADF;
- ret = kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id);
-out:
- fdput(f);
- return ret;
+ return kernel_read_file(fd_file(f), offset, buf, buf_size, file_size, id);
}
EXPORT_SYMBOL_GPL(kernel_read_file_from_fd);
diff --git a/fs/libfs.c b/fs/libfs.c
index 46966fd8bcf9..748ac5923154 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -77,6 +77,10 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned
return ERR_PTR(-ENAMETOOLONG);
if (!dentry->d_sb->s_d_op)
d_set_d_op(dentry, &simple_dentry_operations);
+
+ if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
+ return NULL;
+
d_add(dentry, NULL);
return NULL;
}
@@ -1711,15 +1715,6 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENOENT);
}
-static int empty_dir_getattr(struct mnt_idmap *idmap,
- const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
-{
- struct inode *inode = d_inode(path->dentry);
- generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
- return 0;
-}
-
static int empty_dir_setattr(struct mnt_idmap *idmap,
struct dentry *dentry, struct iattr *attr)
{
@@ -1733,9 +1728,7 @@ static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t siz
static const struct inode_operations empty_dir_inode_operations = {
.lookup = empty_dir_lookup,
- .permission = generic_permission,
.setattr = empty_dir_setattr,
- .getattr = empty_dir_getattr,
.listxattr = empty_dir_listxattr,
};
@@ -1791,8 +1784,8 @@ bool is_empty_dir_inode(struct inode *inode)
*
* Return: 0 if names match, 1 if mismatch, or -ERRNO
*/
-static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
- const char *str, const struct qstr *name)
+int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
{
const struct dentry *parent;
const struct inode *dir;
@@ -1835,6 +1828,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}
+EXPORT_SYMBOL(generic_ci_d_compare);
/**
* generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
@@ -1843,7 +1837,7 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
*
* Return: 0 if hash was successful or unchanged, and -EINVAL on error
*/
-static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct inode *dir = READ_ONCE(dentry->d_inode);
struct super_block *sb = dentry->d_sb;
@@ -1858,6 +1852,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
return -EINVAL;
return 0;
}
+EXPORT_SYMBOL(generic_ci_d_hash);
static const struct dentry_operations generic_ci_dentry_ops = {
.d_hash = generic_ci_d_hash,
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 1f2149db10f2..2359347c9fbd 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -30,7 +30,6 @@
#include <linux/sunrpc/svc_xprt.h>
#include <linux/lockd/nlm.h>
#include <linux/lockd/lockd.h>
-#include <linux/exportfs.h>
#define NLMDBG_FACILITY NLMDBG_SVCLOCK
@@ -481,7 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie, int reclaim)
{
- struct inode *inode = nlmsvc_file_inode(file);
+ struct inode *inode __maybe_unused = nlmsvc_file_inode(file);
struct nlm_block *block = NULL;
int error;
int mode;
@@ -496,7 +495,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
- if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) {
+ if (!locks_can_async_lock(nlmsvc_file_file(file)->f_op)) {
async_block = wait;
wait = 0;
}
@@ -550,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
* requests on the underlaying ->lock() implementation but
* only one nlm_block to being granted by lm_grant().
*/
- if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) &&
+ if (locks_can_async_lock(nlmsvc_file_file(file)->f_op) &&
!list_empty(&block->b_list)) {
spin_unlock(&nlm_blocked_lock);
ret = nlm_lck_blocked;
diff --git a/fs/locks.c b/fs/locks.c
index 204847628f3e..25afc8d9c9d1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2136,7 +2136,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
{
int can_sleep, error, type;
struct file_lock fl;
- struct fd f;
/*
* LOCK_MAND locks were broken for a long time in that they never
@@ -2155,19 +2154,18 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
if (type < 0)
return type;
- error = -EBADF;
- f = fdget(fd);
- if (!fd_file(f))
- return error;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
if (type != F_UNLCK && !(fd_file(f)->f_mode & (FMODE_READ | FMODE_WRITE)))
- goto out_putf;
+ return -EBADF;
flock_make_lock(fd_file(f), &fl, type);
error = security_file_lock(fd_file(f), fl.c.flc_type);
if (error)
- goto out_putf;
+ return error;
can_sleep = !(cmd & LOCK_NB);
if (can_sleep)
@@ -2181,9 +2179,6 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
error = locks_lock_file_wait(fd_file(f), &fl);
locks_release_private(&fl);
- out_putf:
- fdput(f);
-
return error;
}
diff --git a/fs/mpage.c b/fs/mpage.c
index b5b5ddf9d513..82aecf372743 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -606,7 +606,7 @@ alloc_new:
* the confused fail path above (OOM) will be very confused when
* it finds all bh marked clean (i.e. it will not write anything)
*/
- wbc_account_cgroup_owner(wbc, &folio->page, folio_size(folio));
+ wbc_account_cgroup_owner(wbc, folio, folio_size(folio));
length = first_unmapped << blkbits;
if (!bio_add_folio(bio, folio, length, 0)) {
bio = mpage_bio_submit_write(bio);
diff --git a/fs/namei.c b/fs/namei.c
index 4a4a22a08ac2..9d30c7aa9aa6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -211,22 +211,38 @@ getname_flags(const char __user *filename, int flags)
return result;
}
-struct filename *
-getname_uflags(const char __user *filename, int uflags)
+struct filename *getname_uflags(const char __user *filename, int uflags)
{
int flags = (uflags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0;
return getname_flags(filename, flags);
}
-struct filename *
-getname(const char __user * filename)
+struct filename *getname(const char __user * filename)
{
return getname_flags(filename, 0);
}
-struct filename *
-getname_kernel(const char * filename)
+/*
+ * Like getname_flags(pathname, LOOKUP_EMPTY), except that an empty string
+ * yields NULL instead of an allocated struct filename.
+ *
+ * Returns a struct filename on success, NULL for an empty path, or an
+ * ERR_PTR() (-EFAULT if the first byte cannot be read, otherwise whatever
+ * getname_flags() reports).
+ */
+struct filename *__getname_maybe_null(const char __user *pathname)
+{
+	struct filename *result;
+	char first;
+
+	/*
+	 * Peek at the first byte so the common empty-string case costs no
+	 * allocation (the original note: this is a loss on um, though).
+	 */
+	if (get_user(first, pathname))
+		return ERR_PTR(-EFAULT);
+	if (first == '\0')
+		return NULL;
+
+	result = getname_flags(pathname, LOOKUP_EMPTY);
+	if (IS_ERR(result))
+		return result;
+	if (result->name[0] != '\0')
+		return result;
+
+	/* The full copy still came back empty; treat it like the peek case. */
+	putname(result);
+	return NULL;
+}
+
+struct filename *getname_kernel(const char * filename)
{
struct filename *result;
int len = strlen(filename) + 1;
@@ -264,7 +280,7 @@ EXPORT_SYMBOL(getname_kernel);
void putname(struct filename *name)
{
- if (IS_ERR(name))
+ if (IS_ERR_OR_NULL(name))
return;
if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
@@ -326,6 +342,25 @@ static int check_acl(struct mnt_idmap *idmap,
return -EAGAIN;
}
+/*
+ * Cheap, optimistic test for "this inode is known to have no access ACL".
+ *
+ * Only the cached ACL_TYPE_ACCESS pointer is looked at.  A true result
+ * means there are definitely no ACLs.  A false result is inconclusive:
+ * there may still be none (that is the is_uncached_acl() case), so the
+ * caller has to fall back to the full check.
+ *
+ * Without CONFIG_FS_POSIX_ACL there can be no ACLs, so the answer is
+ * always true.
+ */
+static inline bool no_acl_inode(struct inode *inode)
+{
+#ifndef CONFIG_FS_POSIX_ACL
+	return true;
+#else
+	return likely(READ_ONCE(inode->i_acl) == NULL);
+#endif
+}
+
/**
* acl_permission_check - perform basic UNIX permission checking
* @idmap: idmap of the mount the inode was found from
@@ -348,6 +383,28 @@ static int acl_permission_check(struct mnt_idmap *idmap,
unsigned int mode = inode->i_mode;
vfsuid_t vfsuid;
+ /*
+ * Common cheap case: everybody has the requested
+ * rights, and there are no ACLs to check. No need
+ * to do any owner/group checks in that case.
+ *
+ * - 'mask&7' is the requested permission bit set
+ * - multiplying by 0111 spreads them out to all of ugo
+ * - '& ~mode' looks for missing inode permission bits
+ * - the '!' is for "no missing permissions"
+ *
+ * After that, we just need to check that there are no
+ * ACL's on the inode - do the 'IS_POSIXACL()' check last
+ * because it will dereference the ->i_sb pointer and we
+ * want to avoid that if at all possible.
+ */
+ if (!((mask & 7) * 0111 & ~mode)) {
+ if (no_acl_inode(inode))
+ return 0;
+ if (!IS_POSIXACL(inode))
+ return 0;
+ }
+
/* Are we the owner? If so, ACL's don't matter */
vfsuid = i_uid_into_vfsuid(idmap, inode);
if (likely(vfsuid_eq_kuid(vfsuid, current_fsuid()))) {
@@ -588,6 +645,7 @@ struct nameidata {
unsigned seq;
} *stack, internal[EMBEDDED_LEVELS];
struct filename *name;
+ const char *pathname;
struct nameidata *saved;
unsigned root_seq;
int dfd;
@@ -606,6 +664,7 @@ static void __set_nameidata(struct nameidata *p, int dfd, struct filename *name)
p->depth = 0;
p->dfd = dfd;
p->name = name;
+ p->pathname = likely(name) ? name->name : "";
p->path.mnt = NULL;
p->path.dentry = NULL;
p->total_link_count = old ? old->total_link_count : 0;
@@ -2439,7 +2498,7 @@ OK:
static const char *path_init(struct nameidata *nd, unsigned flags)
{
int error;
- const char *s = nd->name->name;
+ const char *s = nd->pathname;
/* LOOKUP_CACHED requires RCU, ask caller to retry */
if ((flags & (LOOKUP_RCU | LOOKUP_CACHED)) == LOOKUP_CACHED)
@@ -2503,26 +2562,22 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
}
} else {
/* Caller must check execute permissions on the starting path component */
- struct fd f = fdget_raw(nd->dfd);
+ CLASS(fd_raw, f)(nd->dfd);
struct dentry *dentry;
- if (!fd_file(f))
+ if (fd_empty(f))
return ERR_PTR(-EBADF);
if (flags & LOOKUP_LINKAT_EMPTY) {
if (fd_file(f)->f_cred != current_cred() &&
- !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH)) {
- fdput(f);
+ !ns_capable(fd_file(f)->f_cred->user_ns, CAP_DAC_READ_SEARCH))
return ERR_PTR(-ENOENT);
- }
}
dentry = fd_file(f)->f_path.dentry;
- if (*s && unlikely(!d_can_lookup(dentry))) {
- fdput(f);
+ if (*s && unlikely(!d_can_lookup(dentry)))
return ERR_PTR(-ENOTDIR);
- }
nd->path = fd_file(f)->f_path;
if (flags & LOOKUP_RCU) {
@@ -2532,7 +2587,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
path_get(&nd->path);
nd->inode = nd->path.dentry->d_inode;
}
- fdput(f);
}
/* For scoped-lookups we need to set the root to the dirfd as well. */
diff --git a/fs/namespace.c b/fs/namespace.c
index 93c377816d75..6b0a17487d0f 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3901,7 +3901,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
}
new_ns->ns.ops = &mntns_operations;
if (!anon)
- new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
+ new_ns->seq = atomic64_inc_return(&mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
refcount_set(&new_ns->passive, 1);
new_ns->mounts = RB_ROOT;
@@ -3944,7 +3944,9 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
new = copy_tree(old, old->mnt.mnt_root, copy_flags);
if (IS_ERR(new)) {
namespace_unlock();
- free_mnt_ns(new_ns);
+ ns_free_inum(&new_ns->ns);
+ dec_mnt_namespaces(new_ns->ucounts);
+ mnt_ns_release(new_ns);
return ERR_CAST(new);
}
if (user_ns != ns->user_ns) {
@@ -4105,7 +4107,6 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
struct file *file;
struct path newmount;
struct mount *mnt;
- struct fd f;
unsigned int mnt_flags = 0;
long ret;
@@ -4133,19 +4134,18 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
return -EINVAL;
}
- f = fdget(fs_fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fs_fd);
+ if (fd_empty(f))
return -EBADF;
- ret = -EINVAL;
if (fd_file(f)->f_op != &fscontext_fops)
- goto err_fsfd;
+ return -EINVAL;
fc = fd_file(f)->private_data;
ret = mutex_lock_interruptible(&fc->uapi_mutex);
if (ret < 0)
- goto err_fsfd;
+ return ret;
/* There must be a valid superblock or we can't mount it */
ret = -EINVAL;
@@ -4212,8 +4212,6 @@ err_path:
path_put(&newmount);
err_unlock:
mutex_unlock(&fc->uapi_mutex);
-err_fsfd:
- fdput(f);
return ret;
}
@@ -4668,10 +4666,8 @@ out:
static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
struct mount_kattr *kattr, unsigned int flags)
{
- int err = 0;
struct ns_common *ns;
struct user_namespace *mnt_userns;
- struct fd f;
if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP))
return 0;
@@ -4687,20 +4683,16 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
if (attr->userns_fd > INT_MAX)
return -EINVAL;
- f = fdget(attr->userns_fd);
- if (!fd_file(f))
+ CLASS(fd, f)(attr->userns_fd);
+ if (fd_empty(f))
return -EBADF;
- if (!proc_ns_file(fd_file(f))) {
- err = -EINVAL;
- goto out_fput;
- }
+ if (!proc_ns_file(fd_file(f)))
+ return -EINVAL;
ns = get_proc_ns(file_inode(fd_file(f)));
- if (ns->ops->type != CLONE_NEWUSER) {
- err = -EINVAL;
- goto out_fput;
- }
+ if (ns->ops->type != CLONE_NEWUSER)
+ return -EINVAL;
/*
* The initial idmapping cannot be used to create an idmapped
@@ -4711,22 +4703,15 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize,
* result.
*/
mnt_userns = container_of(ns, struct user_namespace, ns);
- if (mnt_userns == &init_user_ns) {
- err = -EPERM;
- goto out_fput;
- }
+ if (mnt_userns == &init_user_ns)
+ return -EPERM;
/* We're not controlling the target namespace. */
- if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) {
- err = -EPERM;
- goto out_fput;
- }
+ if (!ns_capable(mnt_userns, CAP_SYS_ADMIN))
+ return -EPERM;
kattr->mnt_userns = get_user_ns(mnt_userns);
-
-out_fput:
- fdput(f);
- return err;
+ return 0;
}
static int build_mount_kattr(const struct mount_attr *attr, size_t usize,
@@ -5004,6 +4989,40 @@ static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+/*
+ * Append the superblock's subtype string to @seq.  Nothing is written when
+ * the superblock has no subtype.
+ */
+static void statmount_fs_subtype(struct kstatmount *s, struct seq_file *seq)
+{
+	const char *subtype = s->mnt->mnt_sb->s_subtype;
+
+	if (!subtype)
+		return;
+
+	seq_puts(seq, subtype);
+}
+
+/*
+ * Append the mount source to @seq.
+ *
+ * If the filesystem provides ->show_devname its output is used, with octal
+ * escapes undone in place; otherwise the mount's recorded device name is
+ * copied, if it has one.
+ *
+ * Returns 0 on success, the error from ->show_devname, or -EAGAIN if the
+ * seq buffer overflowed.
+ */
+static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq)
+{
+	struct super_block *sb = s->mnt->mnt_sb;
+	struct mount *r = real_mount(s->mnt);
+	size_t start;
+	int ret;
+
+	if (!sb->s_op->show_devname) {
+		/* No callback: fall back to the recorded device name. */
+		if (r->mnt_devname)
+			seq_puts(seq, r->mnt_devname);
+		return 0;
+	}
+
+	start = seq->count;
+	ret = sb->s_op->show_devname(seq, s->mnt->mnt_root);
+	if (ret)
+		return ret;
+
+	if (unlikely(seq_has_overflowed(seq)))
+		return -EAGAIN;
+
+	/*
+	 * Unescape the result: terminate what was emitted, rewind to where
+	 * it began, then commit the length of the unescaped string.
+	 */
+	seq->buf[seq->count] = '\0';
+	seq->count = start;
+	seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
+	return 0;
+}
+
static void statmount_mnt_ns_id(struct kstatmount *s, struct mnt_namespace *ns)
{
s->sm.mask |= STATMOUNT_MNT_NS_ID;
@@ -5038,35 +5057,134 @@ static int statmount_mnt_opts(struct kstatmount *s, struct seq_file *seq)
return 0;
}
+/*
+ * Rewrite, in place, the comma-separated option text that starts at
+ * @buf_start and runs to the current end of @seq into a packed run of
+ * unescaped strings, one per option.
+ *
+ * The scan begins at buf_start + 1, i.e. the first byte is skipped --
+ * presumably the leading ',' emitted by the show_options callbacks; confirm
+ * against the callers.
+ *
+ * Returns the number of options processed, or -EOVERFLOW if that count would
+ * reach INT_MAX.  On success seq->count is moved back to the end of the
+ * rewritten data.
+ */
+static inline int statmount_opt_unescape(struct seq_file *seq, char *buf_start)
+{
+ char *buf_end, *opt_start, *opt_end;
+ int count = 0;
+
+ /* Terminate the emitted text so the string helpers stop at its end. */
+ buf_end = seq->buf + seq->count;
+ *buf_end = '\0';
+ for (opt_start = buf_start + 1; opt_start < buf_end; opt_start = opt_end + 1) {
+ /* Cut the current option at the next ',' (or at the end of the text). */
+ opt_end = strchrnul(opt_start, ',');
+ *opt_end = '\0';
+ /*
+ * Unescape into the write cursor, then advance the cursor past the
+ * result plus one more byte (presumably the NUL that terminates the
+ * unescaped option -- confirm against string_unescape()).
+ */
+ buf_start += string_unescape(opt_start, buf_start, 0, UNESCAPE_OCTAL) + 1;
+ if (WARN_ON_ONCE(++count == INT_MAX))
+ return -EOVERFLOW;
+ }
+ /* The cursor sits one byte past the last option's extra byte; back it off. */
+ seq->count = buf_start - 1 - seq->buf;
+ return count;
+}
+
+/*
+ * Emit the filesystem's mount options into @seq as an array of unescaped
+ * strings and record how many there are in s->sm.opt_num.
+ *
+ * Returns 0 on success (including when the filesystem has no
+ * ->show_options or emits nothing), the error from ->show_options,
+ * -EAGAIN if the seq buffer overflowed, or the error from
+ * statmount_opt_unescape().
+ */
+static int statmount_opt_array(struct kstatmount *s, struct seq_file *seq)
+{
+	struct vfsmount *mnt = s->mnt;
+	struct super_block *sb = mnt->mnt_sb;
+	const size_t before = seq->count;
+	char *opts = seq->buf + before;
+	int nr;
+
+	if (!sb->s_op->show_options)
+		return 0;
+
+	nr = sb->s_op->show_options(seq, mnt->mnt_root);
+	if (nr)
+		return nr;
+
+	if (unlikely(seq_has_overflowed(seq)))
+		return -EAGAIN;
+
+	/* Nothing emitted: leave opt_num untouched. */
+	if (seq->count == before)
+		return 0;
+
+	nr = statmount_opt_unescape(seq, opts);
+	if (nr < 0)
+		return nr;
+
+	s->sm.opt_num = nr;
+	return 0;
+}
+
+/*
+ * Emit the security module's mount options into @seq as an array of
+ * unescaped strings and record how many there are in s->sm.opt_sec_num.
+ *
+ * Mirrors statmount_opt_array(), with security_sb_show_options() as the
+ * source of the option text.
+ *
+ * Returns 0 on success (including when nothing was emitted), the error from
+ * security_sb_show_options(), -EAGAIN if the seq buffer overflowed, or the
+ * error from statmount_opt_unescape().
+ */
+static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq)
+{
+	struct vfsmount *mnt = s->mnt;
+	struct super_block *sb = mnt->mnt_sb;
+	size_t start = seq->count;
+	char *buf_start;
+	int err;
+
+	buf_start = seq->buf + start;
+
+	/*
+	 * Bail out only on failure.  The check used to be inverted
+	 * ("if (!err)"), which returned early on success -- so the options
+	 * were never unescaped or counted -- and kept going on error.
+	 */
+	err = security_sb_show_options(seq, sb);
+	if (err)
+		return err;
+
+	if (unlikely(seq_has_overflowed(seq)))
+		return -EAGAIN;
+
+	/* Nothing emitted: leave opt_sec_num untouched. */
+	if (seq->count == start)
+		return 0;
+
+	err = statmount_opt_unescape(seq, buf_start);
+	if (err < 0)
+		return err;
+
+	s->sm.opt_sec_num = err;
+	return 0;
+}
+
static int statmount_string(struct kstatmount *s, u64 flag)
{
- int ret;
+ int ret = 0;
size_t kbufsize;
struct seq_file *seq = &s->seq;
struct statmount *sm = &s->sm;
+ u32 start = seq->count;
switch (flag) {
case STATMOUNT_FS_TYPE:
- sm->fs_type = seq->count;
+ sm->fs_type = start;
ret = statmount_fs_type(s, seq);
break;
case STATMOUNT_MNT_ROOT:
- sm->mnt_root = seq->count;
+ sm->mnt_root = start;
ret = statmount_mnt_root(s, seq);
break;
case STATMOUNT_MNT_POINT:
- sm->mnt_point = seq->count;
+ sm->mnt_point = start;
ret = statmount_mnt_point(s, seq);
break;
case STATMOUNT_MNT_OPTS:
- sm->mnt_opts = seq->count;
+ sm->mnt_opts = start;
ret = statmount_mnt_opts(s, seq);
break;
+ case STATMOUNT_OPT_ARRAY:
+ sm->opt_array = start;
+ ret = statmount_opt_array(s, seq);
+ break;
+ case STATMOUNT_OPT_SEC_ARRAY:
+ sm->opt_sec_array = start;
+ ret = statmount_opt_sec_array(s, seq);
+ break;
+ case STATMOUNT_FS_SUBTYPE:
+ sm->fs_subtype = start;
+ statmount_fs_subtype(s, seq);
+ break;
+ case STATMOUNT_SB_SOURCE:
+ sm->sb_source = start;
+ ret = statmount_sb_source(s, seq);
+ break;
default:
WARN_ON_ONCE(true);
return -EINVAL;
}
+ /*
+ * If nothing was emitted, return to avoid setting the flag
+ * and terminating the buffer.
+ */
+ if (seq->count == start)
+ return ret;
if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
return -EOVERFLOW;
if (kbufsize >= s->bufsize)
@@ -5201,6 +5319,18 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!err && s->mask & STATMOUNT_MNT_OPTS)
err = statmount_string(s, STATMOUNT_MNT_OPTS);
+ if (!err && s->mask & STATMOUNT_OPT_ARRAY)
+ err = statmount_string(s, STATMOUNT_OPT_ARRAY);
+
+ if (!err && s->mask & STATMOUNT_OPT_SEC_ARRAY)
+ err = statmount_string(s, STATMOUNT_OPT_SEC_ARRAY);
+
+ if (!err && s->mask & STATMOUNT_FS_SUBTYPE)
+ err = statmount_string(s, STATMOUNT_FS_SUBTYPE);
+
+ if (!err && s->mask & STATMOUNT_SB_SOURCE)
+ err = statmount_string(s, STATMOUNT_SB_SOURCE);
+
if (!err && s->mask & STATMOUNT_MNT_NS_ID)
statmount_mnt_ns_id(s, ns);
@@ -5222,7 +5352,9 @@ static inline bool retry_statmount(const long ret, size_t *seq_size)
}
#define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \
- STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS)
+ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \
+ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \
+ STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY)
static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
struct statmount __user *buf, size_t bufsize,
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index c40e226053cc..7ac34550c403 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -67,7 +67,8 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in
* Decant the list of folios to read into a rolling buffer.
*/
static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
- struct folio_queue *folioq)
+ struct folio_queue *folioq,
+ struct folio_batch *put_batch)
{
unsigned int order, nr;
size_t size = 0;
@@ -82,6 +83,9 @@ static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
order = folio_order(folio);
folioq->orders[i] = order;
size += PAGE_SIZE << order;
+
+ if (!folio_batch_add(put_batch, folio))
+ folio_batch_release(put_batch);
}
for (int i = nr; i < folioq_nr_slots(folioq); i++)
@@ -120,6 +124,9 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
* that we will need to release later - but we don't want to do
* that until after we've started the I/O.
*/
+ struct folio_batch put_batch;
+
+ folio_batch_init(&put_batch);
while (rreq->submitted < subreq->start + rsize) {
struct folio_queue *tail = rreq->buffer_tail, *new;
size_t added;
@@ -132,10 +139,11 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
new->prev = tail;
tail->next = new;
rreq->buffer_tail = new;
- added = netfs_load_buffer_from_ra(rreq, new);
+ added = netfs_load_buffer_from_ra(rreq, new, &put_batch);
rreq->iter.count += added;
rreq->submitted += added;
}
+ folio_batch_release(&put_batch);
}
subreq->len = rsize;
@@ -348,6 +356,7 @@ static int netfs_wait_for_read(struct netfs_io_request *rreq)
static int netfs_prime_buffer(struct netfs_io_request *rreq)
{
struct folio_queue *folioq;
+ struct folio_batch put_batch;
size_t added;
folioq = kmalloc(sizeof(*folioq), GFP_KERNEL);
@@ -360,39 +369,14 @@ static int netfs_prime_buffer(struct netfs_io_request *rreq)
rreq->submitted = rreq->start;
iov_iter_folio_queue(&rreq->iter, ITER_DEST, folioq, 0, 0, 0);
- added = netfs_load_buffer_from_ra(rreq, folioq);
+ folio_batch_init(&put_batch);
+ added = netfs_load_buffer_from_ra(rreq, folioq, &put_batch);
+ folio_batch_release(&put_batch);
rreq->iter.count += added;
rreq->submitted += added;
return 0;
}
-/*
- * Drop the ref on each folio that we inherited from the VM readahead code. We
- * still have the folio locks to pin the page until we complete the I/O.
- *
- * Note that we can't just release the batch in each queue struct as we use the
- * occupancy count in other places.
- */
-static void netfs_put_ra_refs(struct folio_queue *folioq)
-{
- struct folio_batch fbatch;
-
- folio_batch_init(&fbatch);
- while (folioq) {
- for (unsigned int slot = 0; slot < folioq_count(folioq); slot++) {
- struct folio *folio = folioq_folio(folioq, slot);
- if (!folio)
- continue;
- trace_netfs_folio(folio, netfs_folio_trace_read_put);
- if (!folio_batch_add(&fbatch, folio))
- folio_batch_release(&fbatch);
- }
- folioq = folioq->next;
- }
-
- folio_batch_release(&fbatch);
-}
-
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@@ -436,9 +420,6 @@ void netfs_readahead(struct readahead_control *ractl)
goto cleanup_free;
netfs_read_to_pagecache(rreq);
- /* Release the folio refs whilst we're waiting for the I/O. */
- netfs_put_ra_refs(rreq->buffer);
-
netfs_put_request(rreq, true, netfs_rreq_trace_put_return);
return;
@@ -646,7 +627,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
if (unlikely(always_fill)) {
if (pos - offset + len <= i_size)
return false; /* Page entirely before EOF */
- zero_user_segment(&folio->page, 0, plen);
+ folio_zero_segment(folio, 0, plen);
folio_mark_uptodate(folio);
return true;
}
@@ -665,7 +646,7 @@ static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
return false;
zero_out:
- zero_user_segments(&folio->page, 0, offset, offset + len, plen);
+ folio_zero_segments(folio, 0, offset, offset + len, plen);
return true;
}
@@ -732,7 +713,7 @@ retry:
if (folio_test_uptodate(folio))
goto have_folio;
- /* If the page is beyond the EOF, we want to clear it - unless it's
+ /* If the folio is beyond the EOF, we want to clear it - unless it's
* within the cache granule containing the EOF, in which case we need
* to preload the granule.
*/
@@ -792,7 +773,7 @@ error:
EXPORT_SYMBOL(netfs_write_begin);
/*
- * Preload the data into a page we're proposing to write into.
+ * Preload the data into a folio we're proposing to write into.
*/
int netfs_prefetch_for_write(struct file *file, struct folio *folio,
size_t offset, size_t len)
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
index b3910dfcb56d..b4826360a411 100644
--- a/fs/netfs/buffered_write.c
+++ b/fs/netfs/buffered_write.c
@@ -83,13 +83,13 @@ static void netfs_update_i_size(struct netfs_inode *ctx, struct inode *inode,
* netfs_perform_write - Copy data into the pagecache.
* @iocb: The operation parameters
* @iter: The source buffer
- * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
*
- * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * Copy data into pagecache folios attached to the inode specified by @iocb.
* The caller must hold appropriate inode locks.
*
- * Dirty pages are tagged with a netfs_folio struct if they're not up to date
- * to indicate the range modified. Dirty pages may also be tagged with a
+ * Dirty folios are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified. Dirty folios may also be tagged with a
* netfs-specific grouping such that data from an old group gets flushed before
* a new one is started.
*/
@@ -223,11 +223,11 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
* we try to read it.
*/
if (fpos >= ctx->zero_point) {
- zero_user_segment(&folio->page, 0, offset);
+ folio_zero_segment(folio, 0, offset);
copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
if (unlikely(copied == 0))
goto copy_failed;
- zero_user_segment(&folio->page, offset + copied, flen);
+ folio_zero_segment(folio, offset + copied, flen);
__netfs_set_group(folio, netfs_group);
folio_mark_uptodate(folio);
trace_netfs_folio(folio, netfs_modify_and_clear);
@@ -407,7 +407,7 @@ EXPORT_SYMBOL(netfs_perform_write);
* netfs_buffered_write_iter_locked - write data to a file
* @iocb: IO state structure (file, offset, etc.)
* @from: iov_iter with data to write
- * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ * @netfs_group: Grouping for dirty folios (eg. ceph snaps).
*
* This function does all the work needed for actually writing data to a
* file. It does all basic checks, removes SUID from the file, updates
@@ -491,7 +491,9 @@ EXPORT_SYMBOL(netfs_file_write_iter);
/*
* Notification that a previously read-only page is about to become writable.
- * Note that the caller indicates a single page of a multipage folio.
+ * The caller indicates the precise page that needs to be written to, but
+ * we only track group on a per-folio basis, so we block more often than
+ * we might otherwise.
*/
vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
{
@@ -501,7 +503,7 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
struct address_space *mapping = file->f_mapping;
struct inode *inode = file_inode(file);
struct netfs_inode *ictx = netfs_inode(inode);
- vm_fault_t ret = VM_FAULT_RETRY;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
int err;
_enter("%lx", folio->index);
@@ -510,21 +512,15 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
if (folio_lock_killable(folio) < 0)
goto out;
- if (folio->mapping != mapping) {
- folio_unlock(folio);
- ret = VM_FAULT_NOPAGE;
- goto out;
- }
-
- if (folio_wait_writeback_killable(folio)) {
- ret = VM_FAULT_LOCKED;
- goto out;
- }
+ if (folio->mapping != mapping)
+ goto unlock;
+ if (folio_wait_writeback_killable(folio) < 0)
+ goto unlock;
/* Can we see a streaming write here? */
if (WARN_ON(!folio_test_uptodate(folio))) {
- ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
- goto out;
+ ret = VM_FAULT_SIGBUS;
+ goto unlock;
}
group = netfs_folio_group(folio);
@@ -559,5 +555,8 @@ vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_gr
out:
sb_end_pagefault(inode->i_sb);
return ret;
+unlock:
+ folio_unlock(folio);
+ goto out;
}
EXPORT_SYMBOL(netfs_page_mkwrite);
diff --git a/fs/netfs/fscache_volume.c b/fs/netfs/fscache_volume.c
index cb75c07b5281..ced14ac78cc1 100644
--- a/fs/netfs/fscache_volume.c
+++ b/fs/netfs/fscache_volume.c
@@ -322,8 +322,7 @@ maybe_wait:
}
return;
no_wait:
- clear_bit_unlock(FSCACHE_VOLUME_CREATING, &volume->flags);
- wake_up_bit(&volume->flags, FSCACHE_VOLUME_CREATING);
+ clear_and_wake_up_bit(FSCACHE_VOLUME_CREATING, &volume->flags);
}
/*
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
index 21eab56ee2f9..2249ecd09d0a 100644
--- a/fs/netfs/locking.c
+++ b/fs/netfs/locking.c
@@ -109,6 +109,7 @@ int netfs_start_io_write(struct inode *inode)
up_write(&inode->i_rwsem);
return -ERESTARTSYS;
}
+ downgrade_write(&inode->i_rwsem);
return 0;
}
EXPORT_SYMBOL(netfs_start_io_write);
@@ -123,7 +124,7 @@ EXPORT_SYMBOL(netfs_start_io_write);
void netfs_end_io_write(struct inode *inode)
__releases(inode->i_rwsem)
{
- up_write(&inode->i_rwsem);
+ up_read(&inode->i_rwsem);
}
EXPORT_SYMBOL(netfs_end_io_write);
diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c
index b18c65ba5580..3cbb289535a8 100644
--- a/fs/netfs/read_collect.c
+++ b/fs/netfs/read_collect.c
@@ -77,6 +77,8 @@ static void netfs_unlock_read_folio(struct netfs_io_subrequest *subreq,
folio_unlock(folio);
}
}
+
+ folioq_clear(folioq, slot);
}
/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 114282398716..03ecc7765615 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -181,8 +181,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
#if IS_ENABLED(CONFIG_NFS_LOCALIO)
seqlock_init(&clp->cl_boot_lock);
ktime_get_real_ts64(&clp->cl_nfssvc_boot);
- clp->cl_uuid.net = NULL;
- clp->cl_uuid.dom = NULL;
+ nfs_uuid_init(&clp->cl_uuid);
spin_lock_init(&clp->cl_localio_lock);
#endif /* CONFIG_NFS_LOCALIO */
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 542c7d97b235..596f35170137 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -205,12 +205,15 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags)
nfs_fscache_invalidate(inode, 0);
flags &= ~NFS_INO_REVAL_FORCED;
- nfsi->cache_validity |= flags;
+ flags |= nfsi->cache_validity;
+ if (inode->i_mapping->nrpages == 0)
+ flags &= ~NFS_INO_INVALID_DATA;
- if (inode->i_mapping->nrpages == 0) {
- nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
- nfs_ooo_clear(nfsi);
- } else if (nfsi->cache_validity & NFS_INO_INVALID_DATA) {
+ /* pairs with nfs_clear_invalid_mapping()'s smp_load_acquire() */
+ smp_store_release(&nfsi->cache_validity, flags);
+
+ if (inode->i_mapping->nrpages == 0 ||
+ nfsi->cache_validity & NFS_INO_INVALID_DATA) {
nfs_ooo_clear(nfsi);
}
trace_nfs_set_cache_invalid(inode, 0);
@@ -628,23 +631,35 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr)
}
}
+static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid)
+{
+ enum file_time_flags time_flags = 0;
+ unsigned int cache_flags = 0;
+
+ if (ia_valid & ATTR_MTIME) {
+ time_flags |= S_MTIME | S_CTIME;
+ cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME;
+ }
+ if (ia_valid & ATTR_ATIME) {
+ time_flags |= S_ATIME;
+ cache_flags |= NFS_INO_INVALID_ATIME;
+ }
+ inode_update_timestamps(inode, time_flags);
+ NFS_I(inode)->cache_validity &= ~cache_flags;
+}
+
void nfs_update_delegated_atime(struct inode *inode)
{
spin_lock(&inode->i_lock);
- if (nfs_have_delegated_atime(inode)) {
- inode_update_timestamps(inode, S_ATIME);
- NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ATIME;
- }
+ if (nfs_have_delegated_atime(inode))
+ nfs_update_timestamps(inode, ATTR_ATIME);
spin_unlock(&inode->i_lock);
}
void nfs_update_delegated_mtime_locked(struct inode *inode)
{
- if (nfs_have_delegated_mtime(inode)) {
- inode_update_timestamps(inode, S_CTIME | S_MTIME);
- NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_CTIME |
- NFS_INO_INVALID_MTIME);
- }
+ if (nfs_have_delegated_mtime(inode))
+ nfs_update_timestamps(inode, ATTR_MTIME);
}
void nfs_update_delegated_mtime(struct inode *inode)
@@ -682,15 +697,16 @@ nfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
attr->ia_valid &= ~ATTR_SIZE;
}
- if (nfs_have_delegated_mtime(inode)) {
- if (attr->ia_valid & ATTR_MTIME) {
- nfs_update_delegated_mtime(inode);
- attr->ia_valid &= ~ATTR_MTIME;
- }
- if (attr->ia_valid & ATTR_ATIME) {
- nfs_update_delegated_atime(inode);
- attr->ia_valid &= ~ATTR_ATIME;
- }
+ if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) {
+ spin_lock(&inode->i_lock);
+ nfs_update_timestamps(inode, attr->ia_valid);
+ spin_unlock(&inode->i_lock);
+ attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME);
+ } else if (nfs_have_delegated_atime(inode) &&
+ attr->ia_valid & ATTR_ATIME &&
+ !(attr->ia_valid & ATTR_MTIME)) {
+ nfs_update_delegated_atime(inode);
+ attr->ia_valid &= ~ATTR_ATIME;
}
/* Optimization: if the end result is no change, don't RPC */
@@ -1408,6 +1424,13 @@ int nfs_clear_invalid_mapping(struct address_space *mapping)
TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
if (ret)
goto out;
+ smp_rmb(); /* pairs with smp_wmb() below */
+ if (test_bit(NFS_INO_INVALIDATING, bitlock))
+ continue;
+ /* pairs with nfs_set_cache_invalid()'s smp_store_release() */
+ if (!(smp_load_acquire(&nfsi->cache_validity) & NFS_INO_INVALID_DATA))
+ goto out;
+ /* Slow-path that double-checks with spinlock held */
spin_lock(&inode->i_lock);
if (test_bit(NFS_INO_INVALIDATING, bitlock)) {
spin_unlock(&inode->i_lock);
@@ -1633,6 +1656,7 @@ void nfs_fattr_init(struct nfs_fattr *fattr)
fattr->gencount = nfs_inc_attr_generation_counter();
fattr->owner_name = NULL;
fattr->group_name = NULL;
+ fattr->mdsthreshold = NULL;
}
EXPORT_SYMBOL_GPL(nfs_fattr_init);
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index d0aa680ec816..8f0ce82a677e 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -205,7 +205,8 @@ void nfs_local_probe(struct nfs_client *clp)
nfs_local_disable(clp);
}
- nfs_uuid_begin(&clp->cl_uuid);
+ if (!nfs_uuid_begin(&clp->cl_uuid))
+ return;
if (nfs_server_uuid_is_local(clp))
nfs_local_enable(clp);
nfs_uuid_end(&clp->cl_uuid);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cd2fbde2e6d7..9d40319e063d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3452,6 +3452,10 @@ static int nfs4_do_setattr(struct inode *inode, const struct cred *cred,
adjust_flags |= NFS_INO_INVALID_MODE;
if (sattr->ia_valid & (ATTR_UID | ATTR_GID))
adjust_flags |= NFS_INO_INVALID_OTHER;
+ if (sattr->ia_valid & ATTR_ATIME)
+ adjust_flags |= NFS_INO_INVALID_ATIME;
+ if (sattr->ia_valid & ATTR_MTIME)
+ adjust_flags |= NFS_INO_INVALID_MTIME;
do {
nfs4_bitmap_copy_adjust(bitmask, nfs4_bitmask(server, fattr->label),
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 9723b6c53397..ae5c5e39afa0 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -885,7 +885,15 @@ static int nfs_request_mount(struct fs_context *fc,
* Now ask the mount server to map our export path
* to a file handle.
*/
- status = nfs_mount(&request, ctx->timeo, ctx->retrans);
+ if ((request.protocol == XPRT_TRANSPORT_UDP) ==
+ !(ctx->flags & NFS_MOUNT_TCP))
+ /*
+ * NFS protocol and mount protocol are both UDP or neither UDP
+ * so timeouts are compatible. Use NFS timeouts for MOUNT
+ */
+ status = nfs_mount(&request, ctx->timeo, ctx->retrans);
+ else
+ status = nfs_mount(&request, NFS_UNSPEC_TIMEO, NFS_UNSPEC_RETRANS);
if (status != 0) {
dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
request.hostname, status);
diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c
index 5c8ce5066c16..09404d142d1a 100644
--- a/fs/nfs_common/nfslocalio.c
+++ b/fs/nfs_common/nfslocalio.c
@@ -5,7 +5,7 @@
*/
#include <linux/module.h>
-#include <linux/rculist.h>
+#include <linux/list.h>
#include <linux/nfslocalio.h>
#include <net/netns/generic.h>
@@ -20,15 +20,27 @@ static DEFINE_SPINLOCK(nfs_uuid_lock);
*/
static LIST_HEAD(nfs_uuids);
-void nfs_uuid_begin(nfs_uuid_t *nfs_uuid)
+void nfs_uuid_init(nfs_uuid_t *nfs_uuid)
{
nfs_uuid->net = NULL;
nfs_uuid->dom = NULL;
- uuid_gen(&nfs_uuid->uuid);
+ INIT_LIST_HEAD(&nfs_uuid->list);
+}
+EXPORT_SYMBOL_GPL(nfs_uuid_init);
+bool nfs_uuid_begin(nfs_uuid_t *nfs_uuid)
+{
spin_lock(&nfs_uuid_lock);
- list_add_tail_rcu(&nfs_uuid->list, &nfs_uuids);
+ /* Is this nfs_uuid already in use? */
+ if (!list_empty(&nfs_uuid->list)) {
+ spin_unlock(&nfs_uuid_lock);
+ return false;
+ }
+ uuid_gen(&nfs_uuid->uuid);
+ list_add_tail(&nfs_uuid->list, &nfs_uuids);
spin_unlock(&nfs_uuid_lock);
+
+ return true;
}
EXPORT_SYMBOL_GPL(nfs_uuid_begin);
@@ -36,7 +48,8 @@ void nfs_uuid_end(nfs_uuid_t *nfs_uuid)
{
if (nfs_uuid->net == NULL) {
spin_lock(&nfs_uuid_lock);
- list_del_init(&nfs_uuid->list);
+ if (nfs_uuid->net == NULL)
+ list_del_init(&nfs_uuid->list);
spin_unlock(&nfs_uuid_lock);
}
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index b5a6bf4f459f..d32f2dfd148f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1841,14 +1841,12 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if (!async_copy)
goto out_err;
async_copy->cp_nn = nn;
+ INIT_LIST_HEAD(&async_copy->copies);
+ refcount_set(&async_copy->refcount, 1);
/* Arbitrary cap on number of pending async copy operations */
if (atomic_inc_return(&nn->pending_async_copies) >
- (int)rqstp->rq_pool->sp_nrthreads) {
- atomic_dec(&nn->pending_async_copies);
+ (int)rqstp->rq_pool->sp_nrthreads)
goto out_err;
- }
- INIT_LIST_HEAD(&async_copy->copies);
- refcount_set(&async_copy->refcount, 1);
async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL);
if (!async_copy->cp_src)
goto out_err;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 56b261608af4..d80406f8b568 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1359,21 +1359,47 @@ static void destroy_delegation(struct nfs4_delegation *dp)
destroy_unhashed_deleg(dp);
}
+/**
+ * revoke_delegation - perform nfs4 delegation structure cleanup
+ * @dp: pointer to the delegation
+ *
+ * This function assumes that it's called either from the administrative
+ * interface (nfsd4_revoke_states()) that's revoking a specific delegation
+ * stateid or it's called from a laundromat thread (nfs4_laundromat()) that
+ * determined that this specific state has expired and needs to be revoked
+ * (both mark state with the appropriate stid sc_status mode). It is also
+ * assumed that a reference was taken on the @dp state.
+ *
+ * If this function finds that the @dp state is SC_STATUS_FREED it means
+ * that a FREE_STATEID operation for this stateid has been processed and
+ * we can proceed to removing it from recalled list. However, if @dp state
+ * isn't marked SC_STATUS_FREED, it means we need to place it on the cl_revoked
+ * list and wait for the FREE_STATEID to arrive from the client. At the same
+ * time, we need to mark it as SC_STATUS_FREEABLE to indicate to the
+ * nfsd4_free_stateid() function that this stateid has already been added
+ * to the cl_revoked list and that nfsd4_free_stateid() is now responsible
+ * for removing it from the list. Inspection of where the delegation state is
+ * in the revocation process is protected by clp->cl_lock.
+ */
static void revoke_delegation(struct nfs4_delegation *dp)
{
struct nfs4_client *clp = dp->dl_stid.sc_client;
WARN_ON(!list_empty(&dp->dl_recall_lru));
+ WARN_ON_ONCE(!(dp->dl_stid.sc_status &
+ (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)));
trace_nfsd_stid_revoke(&dp->dl_stid);
- if (dp->dl_stid.sc_status &
- (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) {
- spin_lock(&clp->cl_lock);
- refcount_inc(&dp->dl_stid.sc_count);
- list_add(&dp->dl_recall_lru, &clp->cl_revoked);
- spin_unlock(&clp->cl_lock);
+ spin_lock(&clp->cl_lock);
+ if (dp->dl_stid.sc_status & SC_STATUS_FREED) {
+ list_del_init(&dp->dl_recall_lru);
+ goto out;
}
+ list_add(&dp->dl_recall_lru, &clp->cl_revoked);
+ dp->dl_stid.sc_status |= SC_STATUS_FREEABLE;
+out:
+ spin_unlock(&clp->cl_lock);
destroy_unhashed_deleg(dp);
}
@@ -1780,6 +1806,7 @@ void nfsd4_revoke_states(struct net *net, struct super_block *sb)
mutex_unlock(&stp->st_mutex);
break;
case SC_TYPE_DELEG:
+ refcount_inc(&stid->sc_count);
dp = delegstateid(stid);
spin_lock(&state_lock);
if (!unhash_delegation_locked(
@@ -6545,6 +6572,7 @@ nfs4_laundromat(struct nfsd_net *nn)
dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
if (!state_expired(&lt, dp->dl_time))
break;
+ refcount_inc(&dp->dl_stid.sc_count);
unhash_delegation_locked(dp, SC_STATUS_REVOKED);
list_add(&dp->dl_recall_lru, &reaplist);
}
@@ -7157,7 +7185,9 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
s->sc_status |= SC_STATUS_CLOSED;
spin_unlock(&s->sc_lock);
dp = delegstateid(s);
- list_del_init(&dp->dl_recall_lru);
+ if (s->sc_status & SC_STATUS_FREEABLE)
+ list_del_init(&dp->dl_recall_lru);
+ s->sc_status |= SC_STATUS_FREED;
spin_unlock(&cl->cl_lock);
nfs4_put_stid(s);
ret = nfs_ok;
@@ -7487,7 +7517,9 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
return status;
- status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn);
+ status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG,
+ SC_STATUS_REVOKED | SC_STATUS_FREEABLE,
+ &s, nn);
if (status)
goto out;
dp = delegstateid(s);
@@ -7969,9 +8001,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
- if (nfsd4_has_session(cstate) ||
- exportfs_lock_op_is_async(sb->s_export_op))
- flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
spin_lock(&fp->fi_lock);
@@ -7982,9 +8011,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
- if (nfsd4_has_session(cstate) ||
- exportfs_lock_op_is_async(sb->s_export_op))
- flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
spin_lock(&fp->fi_lock);
@@ -8004,15 +8030,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
}
- /*
- * Most filesystems with their own ->lock operations will block
- * the nfsd thread waiting to acquire the lock. That leads to
- * deadlocks (we don't want every nfsd thread tied up waiting
- * for file locks), so don't attempt blocking lock notifications
- * on those filesystems:
- */
- if (!exportfs_lock_op_is_async(sb->s_export_op))
- flags &= ~FL_SLEEP;
+ if (lock->lk_type & (NFS4_READW_LT | NFS4_WRITEW_LT) &&
+ nfsd4_has_session(cstate) &&
+ locks_can_async_lock(nf->nf_file->f_op))
+ flags |= FL_SLEEP;
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
if (!nbl) {
@@ -8684,7 +8705,7 @@ nfs4_state_shutdown_net(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
shrinker_free(nn->nfsd_client_shrinker);
- cancel_work(&nn->nfsd_shrinker_work);
+ cancel_work_sync(&nn->nfsd_shrinker_work);
cancel_delayed_work_sync(&nn->laundromat_work);
locks_end_grace(&nn->nfsd4_manager);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 79c743c01a47..35b3564c065f 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -114,6 +114,8 @@ struct nfs4_stid {
/* For a deleg stateid kept around only to process free_stateid's: */
#define SC_STATUS_REVOKED BIT(1)
#define SC_STATUS_ADMIN_REVOKED BIT(2)
+#define SC_STATUS_FREEABLE BIT(3)
+#define SC_STATUS_FREED BIT(4)
unsigned short sc_status;
struct list_head sc_cp_list;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 22325b590e17..d6d4f2a0e898 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -903,11 +903,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
goto out;
}
- if (may_flags & NFSD_MAY_64BIT_COOKIE)
- file->f_mode |= FMODE_64BITHASH;
- else
- file->f_mode |= FMODE_32BITHASH;
-
*filp = file;
out:
return host_err;
@@ -2174,13 +2169,15 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
loff_t offset = *offsetp;
int may_flags = NFSD_MAY_READ;
- if (fhp->fh_64bit_cookies)
- may_flags |= NFSD_MAY_64BIT_COOKIE;
-
err = nfsd_open(rqstp, fhp, S_IFDIR, may_flags, &file);
if (err)
goto out;
+ if (fhp->fh_64bit_cookies)
+ file->f_mode |= FMODE_64BITHASH;
+ else
+ file->f_mode |= FMODE_32BITHASH;
+
offset = vfs_llseek(file, offset, SEEK_SET);
if (offset < 0) {
err = nfserrno((int)offset);
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 57b4af5ad646..501ad7be5174 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -68,7 +68,6 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
goto failed;
}
memset(bh->b_data, 0, i_blocksize(inode));
- bh->b_bdev = inode->i_sb->s_bdev;
bh->b_blocknr = blocknr;
set_buffer_mapped(bh);
set_buffer_uptodate(bh);
@@ -133,7 +132,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
goto found;
}
set_buffer_mapped(bh);
- bh->b_bdev = inode->i_sb->s_bdev;
bh->b_blocknr = pblocknr; /* set block address for read */
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index fe5b1a30c509..a8602729586a 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -289,7 +289,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
* The folio is mapped and unlocked. When the caller is finished with
* the entry, it should call folio_release_kmap().
*
- * On failure, returns NULL and the caller should ignore foliop.
+ * On failure, returns an error pointer and the caller should ignore foliop.
*/
struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir,
const struct qstr *qstr, struct folio **foliop)
@@ -312,22 +312,24 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir,
do {
char *kaddr = nilfs_get_folio(dir, n, foliop);
- if (!IS_ERR(kaddr)) {
- de = (struct nilfs_dir_entry *)kaddr;
- kaddr += nilfs_last_byte(dir, n) - reclen;
- while ((char *) de <= kaddr) {
- if (de->rec_len == 0) {
- nilfs_error(dir->i_sb,
- "zero-length directory entry");
- folio_release_kmap(*foliop, kaddr);
- goto out;
- }
- if (nilfs_match(namelen, name, de))
- goto found;
- de = nilfs_next_entry(de);
+ if (IS_ERR(kaddr))
+ return ERR_CAST(kaddr);
+
+ de = (struct nilfs_dir_entry *)kaddr;
+ kaddr += nilfs_last_byte(dir, n) - reclen;
+ while ((char *)de <= kaddr) {
+ if (de->rec_len == 0) {
+ nilfs_error(dir->i_sb,
+ "zero-length directory entry");
+ folio_release_kmap(*foliop, kaddr);
+ goto out;
}
- folio_release_kmap(*foliop, kaddr);
+ if (nilfs_match(namelen, name, de))
+ goto found;
+ de = nilfs_next_entry(de);
}
+ folio_release_kmap(*foliop, kaddr);
+
if (++n >= npages)
n = 0;
/* next folio is past the blocks we've got */
@@ -340,7 +342,7 @@ struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir,
}
} while (n != start);
out:
- return NULL;
+ return ERR_PTR(-ENOENT);
found:
ei->i_dir_start_lookup = n;
@@ -384,18 +386,18 @@ fail:
return NULL;
}
-ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
+int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino)
{
- ino_t res = 0;
struct nilfs_dir_entry *de;
struct folio *folio;
de = nilfs_find_entry(dir, qstr, &folio);
- if (de) {
- res = le64_to_cpu(de->inode);
- folio_release_kmap(folio, de);
- }
- return res;
+ if (IS_ERR(de))
+ return PTR_ERR(de);
+
+ *ino = le64_to_cpu(de->inode);
+ folio_release_kmap(folio, de);
+ return 0;
}
void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 1c9ae36a03ab..ace22253fed0 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -83,10 +83,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
goto out;
}
- if (!buffer_mapped(bh)) {
- bh->b_bdev = inode->i_sb->s_bdev;
+ if (!buffer_mapped(bh))
set_buffer_mapped(bh);
- }
bh->b_blocknr = pbn;
bh->b_end_io = end_buffer_read_sync;
get_bh(bh);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index ceb7dc0b5bad..2db6350b5ac2 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -89,7 +89,6 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
if (buffer_uptodate(bh))
goto failed_bh;
- bh->b_bdev = sb->s_bdev;
err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
if (likely(!err)) {
get_bh(bh);
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index c950139db6ef..9b108052d9f7 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -55,12 +55,20 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
struct inode *inode;
ino_t ino;
+ int res;
if (dentry->d_name.len > NILFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- ino = nilfs_inode_by_name(dir, &dentry->d_name);
- inode = ino ? nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino) : NULL;
+ res = nilfs_inode_by_name(dir, &dentry->d_name, &ino);
+ if (res) {
+ if (res != -ENOENT)
+ return ERR_PTR(res);
+ inode = NULL;
+ } else {
+ inode = nilfs_iget(dir->i_sb, NILFS_I(dir)->i_root, ino);
+ }
+
return d_splice_alias(inode, dentry);
}
@@ -149,6 +157,9 @@ static int nilfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
/* slow symlink */
inode->i_op = &nilfs_symlink_inode_operations;
inode_nohighmem(inode);
+ mapping_set_gfp_mask(inode->i_mapping,
+ mapping_gfp_constraint(inode->i_mapping,
+ ~__GFP_FS));
inode->i_mapping->a_ops = &nilfs_aops;
err = page_symlink(inode, symname, l);
if (err)
@@ -263,10 +274,11 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
struct folio *folio;
int err;
- err = -ENOENT;
de = nilfs_find_entry(dir, &dentry->d_name, &folio);
- if (!de)
+ if (IS_ERR(de)) {
+ err = PTR_ERR(de);
goto out;
+ }
inode = d_inode(dentry);
err = -EIO;
@@ -362,10 +374,11 @@ static int nilfs_rename(struct mnt_idmap *idmap,
if (unlikely(err))
return err;
- err = -ENOENT;
old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
- if (!old_de)
+ if (IS_ERR(old_de)) {
+ err = PTR_ERR(old_de);
goto out;
+ }
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
@@ -382,10 +395,12 @@ static int nilfs_rename(struct mnt_idmap *idmap,
if (dir_de && !nilfs_empty_dir(new_inode))
goto out_dir;
- err = -ENOENT;
- new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
- if (!new_de)
+ new_de = nilfs_find_entry(new_dir, &new_dentry->d_name,
+ &new_folio);
+ if (IS_ERR(new_de)) {
+ err = PTR_ERR(new_de);
goto out_dir;
+ }
nilfs_set_link(new_dir, new_de, new_folio, old_inode);
folio_release_kmap(new_folio, new_de);
nilfs_mark_inode_dirty(new_dir);
@@ -440,12 +455,13 @@ out:
*/
static struct dentry *nilfs_get_parent(struct dentry *child)
{
- unsigned long ino;
+ ino_t ino;
+ int res;
struct nilfs_root *root;
- ino = nilfs_inode_by_name(d_inode(child), &dotdot_name);
- if (!ino)
- return ERR_PTR(-ENOENT);
+ res = nilfs_inode_by_name(d_inode(child), &dotdot_name, &ino);
+ if (res)
+ return ERR_PTR(res);
root = NILFS_I(d_inode(child))->i_root;
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index fb1c4c5bae7c..45d03826eaf1 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -254,7 +254,7 @@ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
/* dir.c */
int nilfs_add_link(struct dentry *, struct inode *);
-ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
+int nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr, ino_t *ino);
int nilfs_make_empty(struct inode *, struct inode *);
struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *,
struct folio **);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 9c0b7cddeaae..9a849397c768 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -39,7 +39,6 @@ static struct buffer_head *__nilfs_get_folio_block(struct folio *folio,
first_block = (unsigned long)index << (PAGE_SHIFT - blkbits);
bh = get_nth_bh(bh, block - first_block);
- touch_buffer(bh);
wait_on_buffer(bh);
return bh;
}
@@ -64,6 +63,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
folio_put(folio);
return NULL;
}
+ bh->b_bdev = inode->i_sb->s_bdev;
return bh;
}
@@ -77,7 +77,8 @@ void nilfs_forget_buffer(struct buffer_head *bh)
const unsigned long clear_bits =
(BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
- BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) |
+ BIT(BH_Delay));
lock_buffer(bh);
set_mask_bits(&bh->b_state, clear_bits, 0);
@@ -98,16 +99,16 @@ void nilfs_forget_buffer(struct buffer_head *bh)
*/
void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
{
- void *kaddr0, *kaddr1;
+ void *saddr, *daddr;
unsigned long bits;
- struct page *spage = sbh->b_page, *dpage = dbh->b_page;
+ struct folio *sfolio = sbh->b_folio, *dfolio = dbh->b_folio;
struct buffer_head *bh;
- kaddr0 = kmap_local_page(spage);
- kaddr1 = kmap_local_page(dpage);
- memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
- kunmap_local(kaddr1);
- kunmap_local(kaddr0);
+ saddr = kmap_local_folio(sfolio, bh_offset(sbh));
+ daddr = kmap_local_folio(dfolio, bh_offset(dbh));
+ memcpy(daddr, saddr, sbh->b_size);
+ kunmap_local(daddr);
+ kunmap_local(saddr);
dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
dbh->b_blocknr = sbh->b_blocknr;
@@ -121,13 +122,13 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
unlock_buffer(bh);
}
if (bits & BIT(BH_Uptodate))
- SetPageUptodate(dpage);
+ folio_mark_uptodate(dfolio);
else
- ClearPageUptodate(dpage);
+ folio_clear_uptodate(dfolio);
if (bits & BIT(BH_Mapped))
- SetPageMappedToDisk(dpage);
+ folio_set_mappedtodisk(dfolio);
else
- ClearPageMappedToDisk(dpage);
+ folio_clear_mappedtodisk(dfolio);
}
/**
@@ -400,13 +401,15 @@ void nilfs_clear_folio_dirty(struct folio *folio)
folio_clear_uptodate(folio);
folio_clear_mappedtodisk(folio);
+ folio_clear_checked(folio);
head = folio_buffers(folio);
if (head) {
const unsigned long clear_bits =
(BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
- BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected) |
+ BIT(BH_Delay));
bh = head;
do {
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index d5dbef7f5c95..6004dfdfdf0f 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -16,7 +16,6 @@
#include <linux/security.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
-#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
static int dir_notify_enable __read_mostly = 1;
@@ -347,9 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg)
new_fsn_mark = NULL;
}
- rcu_read_lock();
- f = lookup_fdget_rcu(fd);
- rcu_read_unlock();
+ f = fget_raw(fd);
/* if (f != filp) means that we lost a race and another task/thread
* actually closed the fd we are still playing with before we grabbed
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 224bccaab4cc..24c7c5df4998 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -1,6 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
-#include <linux/fdtable.h>
#include <linux/fsnotify_backend.h>
#include <linux/init.h>
#include <linux/jiffies.h>
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 9644bc72e457..35159fa0b063 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1,7 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/fanotify.h>
#include <linux/fcntl.h>
-#include <linux/fdtable.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/anon_inodes.h>
@@ -1003,22 +1002,17 @@ static int fanotify_find_path(int dfd, const char __user *filename,
dfd, filename, flags);
if (filename == NULL) {
- struct fd f = fdget(dfd);
+ CLASS(fd, f)(dfd);
- ret = -EBADF;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
- ret = -ENOTDIR;
if ((flags & FAN_MARK_ONLYDIR) &&
- !(S_ISDIR(file_inode(fd_file(f))->i_mode))) {
- fdput(f);
- goto out;
- }
+ !(S_ISDIR(file_inode(fd_file(f))->i_mode)))
+ return -ENOTDIR;
*path = fd_file(f)->f_path;
path_get(path);
- fdput(f);
} else {
unsigned int lookup_flags = 0;
@@ -1682,7 +1676,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct inode *inode = NULL;
struct vfsmount *mnt = NULL;
struct fsnotify_group *group;
- struct fd f;
struct path path;
struct fan_fsid __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
@@ -1752,14 +1745,13 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
umask = FANOTIFY_EVENT_FLAGS;
}
- f = fdget(fanotify_fd);
- if (unlikely(!fd_file(f)))
+ CLASS(fd, f)(fanotify_fd);
+ if (fd_empty(f))
return -EBADF;
/* verify that this is indeed an fanotify instance */
- ret = -EINVAL;
if (unlikely(fd_file(f)->f_op != &fanotify_fops))
- goto fput_and_out;
+ return -EINVAL;
group = fd_file(f)->private_data;
/*
@@ -1767,23 +1759,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* marks. This also includes setting up such marks by a group that
* was initialized by an unprivileged user.
*/
- ret = -EPERM;
if ((!capable(CAP_SYS_ADMIN) ||
FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
mark_type != FAN_MARK_INODE)
- goto fput_and_out;
+ return -EPERM;
/*
* Permission events require minimum priority FAN_CLASS_CONTENT.
*/
- ret = -EINVAL;
if (mask & FANOTIFY_PERM_EVENTS &&
group->priority < FSNOTIFY_PRIO_CONTENT)
- goto fput_and_out;
+ return -EINVAL;
if (mask & FAN_FS_ERROR &&
mark_type != FAN_MARK_FILESYSTEM)
- goto fput_and_out;
+ return -EINVAL;
/*
* Evictable is only relevant for inode marks, because only inode object
@@ -1791,7 +1781,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
*/
if (flags & FAN_MARK_EVICTABLE &&
mark_type != FAN_MARK_INODE)
- goto fput_and_out;
+ return -EINVAL;
/*
* Events that do not carry enough information to report
@@ -1803,7 +1793,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS);
if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) &&
(!fid_mode || mark_type == FAN_MARK_MOUNT))
- goto fput_and_out;
+ return -EINVAL;
/*
* FAN_RENAME uses special info type records to report the old and
@@ -1811,23 +1801,22 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
* useful and was not implemented.
*/
if (mask & FAN_RENAME && !(fid_mode & FAN_REPORT_NAME))
- goto fput_and_out;
+ return -EINVAL;
if (mark_cmd == FAN_MARK_FLUSH) {
- ret = 0;
if (mark_type == FAN_MARK_MOUNT)
fsnotify_clear_vfsmount_marks_by_group(group);
else if (mark_type == FAN_MARK_FILESYSTEM)
fsnotify_clear_sb_marks_by_group(group);
else
fsnotify_clear_inode_marks_by_group(group);
- goto fput_and_out;
+ return 0;
}
ret = fanotify_find_path(dfd, pathname, &path, flags,
(mask & ALL_FSNOTIFY_EVENTS), obj_type);
if (ret)
- goto fput_and_out;
+ return ret;
if (mark_cmd == FAN_MARK_ADD) {
ret = fanotify_events_supported(group, &path, mask, flags);
@@ -1906,8 +1895,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
path_put_and_out:
path_put(&path);
-fput_and_out:
- fdput(f);
return ret;
}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 0794dcaf1e47..e0c48956608a 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -732,7 +732,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
struct fsnotify_group *group;
struct inode *inode;
struct path path;
- struct fd f;
int ret;
unsigned flags = 0;
@@ -752,21 +751,17 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
if (unlikely(!(mask & ALL_INOTIFY_BITS)))
return -EINVAL;
- f = fdget(fd);
- if (unlikely(!fd_file(f)))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
/* IN_MASK_ADD and IN_MASK_CREATE don't make sense together */
- if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE))) {
- ret = -EINVAL;
- goto fput_and_out;
- }
+ if (unlikely((mask & IN_MASK_ADD) && (mask & IN_MASK_CREATE)))
+ return -EINVAL;
/* verify that this is indeed an inotify instance */
- if (unlikely(fd_file(f)->f_op != &inotify_fops)) {
- ret = -EINVAL;
- goto fput_and_out;
- }
+ if (unlikely(fd_file(f)->f_op != &inotify_fops))
+ return -EINVAL;
if (!(mask & IN_DONT_FOLLOW))
flags |= LOOKUP_FOLLOW;
@@ -776,7 +771,7 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
ret = inotify_find_inode(pathname, &path, flags,
(mask & IN_ALL_EVENTS));
if (ret)
- goto fput_and_out;
+ return ret;
/* inode held in place by reference to path; group by fget on fd */
inode = path.dentry->d_inode;
@@ -785,8 +780,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
/* create/update an inode mark */
ret = inotify_update_watch(group, inode, mask);
path_put(&path);
-fput_and_out:
- fdput(f);
return ret;
}
@@ -794,33 +787,26 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
{
struct fsnotify_group *group;
struct inotify_inode_mark *i_mark;
- struct fd f;
- int ret = -EINVAL;
+ CLASS(fd, f)(fd);
- f = fdget(fd);
- if (unlikely(!fd_file(f)))
+ if (fd_empty(f))
return -EBADF;
/* verify that this is indeed an inotify instance */
if (unlikely(fd_file(f)->f_op != &inotify_fops))
- goto out;
+ return -EINVAL;
group = fd_file(f)->private_data;
i_mark = inotify_idr_find(group, wd);
if (unlikely(!i_mark))
- goto out;
-
- ret = 0;
+ return -EINVAL;
fsnotify_destroy_mark(&i_mark->fsn_mark, group);
/* match ref taken by inotify_idr_find */
fsnotify_put_mark(&i_mark->fsn_mark);
-
-out:
- fdput(f);
- return ret;
+ return 0;
}
/*
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4b9f45d7049e..4200a0341343 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1765,42 +1765,41 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
long fd;
int sectsize;
char *p = (char *)page;
- struct fd f;
ssize_t ret = -EINVAL;
int live_threshold;
if (reg->hr_bdev_file)
- goto out;
+ return -EINVAL;
/* We can't heartbeat without having had our node number
* configured yet. */
if (o2nm_this_node() == O2NM_MAX_NODES)
- goto out;
+ return -EINVAL;
fd = simple_strtol(p, &p, 0);
if (!p || (*p && (*p != '\n')))
- goto out;
+ return -EINVAL;
if (fd < 0 || fd >= INT_MAX)
- goto out;
+ return -EINVAL;
- f = fdget(fd);
- if (fd_file(f) == NULL)
- goto out;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EINVAL;
if (reg->hr_blocks == 0 || reg->hr_start_block == 0 ||
reg->hr_block_bytes == 0)
- goto out2;
+ return -EINVAL;
if (!S_ISBLK(fd_file(f)->f_mapping->host->i_mode))
- goto out2;
+ return -EINVAL;
reg->hr_bdev_file = bdev_file_open_by_dev(fd_file(f)->f_mapping->host->i_rdev,
BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
if (IS_ERR(reg->hr_bdev_file)) {
ret = PTR_ERR(reg->hr_bdev_file);
reg->hr_bdev_file = NULL;
- goto out2;
+ return ret;
}
sectsize = bdev_logical_block_size(reg_bdev(reg));
@@ -1906,9 +1905,6 @@ out3:
fput(reg->hr_bdev_file);
reg->hr_bdev_file = NULL;
}
-out2:
- fdput(f);
-out:
return ret;
}
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 96b684763b39..b95724b767e1 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -280,5 +280,4 @@ const struct export_operations ocfs2_export_ops = {
.fh_to_dentry = ocfs2_fh_to_dentry,
.fh_to_parent = ocfs2_fh_to_parent,
.get_parent = ocfs2_get_parent,
- .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ad131a2fc58e..4fa6c840d20b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1129,9 +1129,12 @@ int ocfs2_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
trace_ocfs2_setattr(inode, dentry,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
dentry->d_name.len, dentry->d_name.name,
- attr->ia_valid, attr->ia_mode,
- from_kuid(&init_user_ns, attr->ia_uid),
- from_kgid(&init_user_ns, attr->ia_gid));
+ attr->ia_valid,
+ attr->ia_valid & ATTR_MODE ? attr->ia_mode : 0,
+ attr->ia_valid & ATTR_UID ?
+ from_kuid(&init_user_ns, attr->ia_uid) : 0,
+ attr->ia_valid & ATTR_GID ?
+ from_kgid(&init_user_ns, attr->ia_gid) : 0);
/* ensuring we don't even attempt to truncate a symlink */
if (S_ISLNK(inode->i_mode))
@@ -1784,6 +1787,14 @@ int ocfs2_remove_inode_range(struct inode *inode,
return 0;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
+ int id_count = ocfs2_max_inline_data_with_xattr(inode->i_sb, di);
+
+ if (byte_start > id_count || byte_start + byte_len > id_count) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ goto out;
+ }
+
ret = ocfs2_truncate_inline(inode, di_bh, byte_start,
byte_start + byte_len, 0);
if (ret) {
@@ -2801,6 +2812,7 @@ const struct file_operations ocfs2_fops = {
.splice_write = iter_file_splice_write,
.fallocate = ocfs2_fallocate,
.remap_file_range = ocfs2_remap_file_range,
+ .fop_flags = FOP_ASYNC_LOCK,
};
WRAP_DIR_ITER(ocfs2_readdir) // FIXME!
@@ -2817,6 +2829,7 @@ const struct file_operations ocfs2_dops = {
#endif
.lock = ocfs2_lock,
.flock = ocfs2_flock,
+ .fop_flags = FOP_ASYNC_LOCK,
};
/*
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index c4a4016d3866..b0733c08ed13 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -574,6 +574,8 @@ out_commit:
ocfs2_commit_trans(osb, handle);
out_free_group_bh:
+ if (ret < 0)
+ ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh);
brelse(group_bh);
out_unlock:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3d404624bb96..c79b4291777f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -2319,6 +2319,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
struct ocfs2_blockcheck_stats *stats)
{
int status = -EAGAIN;
+ u32 blksz_bits;
if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
@@ -2333,11 +2334,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
goto out;
}
status = -EINVAL;
- if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
+ /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */
+ blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
+ if (blksz_bits < 9 || blksz_bits > 12) {
mlog(ML_ERROR, "found superblock with incorrect block "
- "size: found %u, should be %u\n",
- 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits),
- blksz);
+ "size bits: found %u, should be 9, 10, 11, or 12\n",
+ blksz_bits);
+ } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) {
+ mlog(ML_ERROR, "found superblock with incorrect block "
+ "size: found %u, should be %u\n", 1 << blksz_bits, blksz);
} else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) !=
OCFS2_MAJOR_REV_LEVEL ||
le16_to_cpu(di->id2.i_super.s_minor_rev_level) !=
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index dd0a05365e79..73a6f6fd8a8e 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -2036,8 +2036,7 @@ static int ocfs2_xa_remove(struct ocfs2_xa_loc *loc,
rc = 0;
ocfs2_xa_cleanup_value_truncate(loc, "removing",
orig_clusters);
- if (rc)
- goto out;
+ goto out;
}
}
diff --git a/fs/open.c b/fs/open.c
index acaeb3e25c88..4b37c5912e6c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -187,19 +187,13 @@ long do_ftruncate(struct file *file, loff_t length, int small)
long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
- struct fd f;
- int error;
-
if (length < 0)
return -EINVAL;
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- error = do_ftruncate(fd_file(f), length, small);
-
- fdput(f);
- return error;
+ return do_ftruncate(fd_file(f), length, small);
}
SYSCALL_DEFINE2(ftruncate, unsigned int, fd, off_t, length)
@@ -349,14 +343,12 @@ EXPORT_SYMBOL_GPL(vfs_fallocate);
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len)
{
- struct fd f = fdget(fd);
- int error = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- error = vfs_fallocate(fd_file(f), mode, offset, len);
- fdput(f);
- }
- return error;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fallocate(fd_file(f), mode, offset, len);
}
SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len)
@@ -580,23 +572,18 @@ out:
SYSCALL_DEFINE1(fchdir, unsigned int, fd)
{
- struct fd f = fdget_raw(fd);
+ CLASS(fd_raw, f)(fd);
int error;
- error = -EBADF;
- if (!fd_file(f))
- goto out;
+ if (fd_empty(f))
+ return -EBADF;
- error = -ENOTDIR;
if (!d_can_lookup(fd_file(f)->f_path.dentry))
- goto out_putf;
+ return -ENOTDIR;
error = file_permission(fd_file(f), MAY_EXEC | MAY_CHDIR);
if (!error)
set_fs_pwd(current->fs, &fd_file(f)->f_path);
-out_putf:
- fdput(f);
-out:
return error;
}
@@ -671,14 +658,12 @@ int vfs_fchmod(struct file *file, umode_t mode)
SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode)
{
- struct fd f = fdget(fd);
- int err = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- err = vfs_fchmod(fd_file(f), mode);
- fdput(f);
- }
- return err;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fchmod(fd_file(f), mode);
}
static int do_fchmodat(int dfd, const char __user *filename, umode_t mode,
@@ -865,14 +850,12 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group)
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
- struct fd f = fdget(fd);
- int error = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- error = vfs_fchown(fd_file(f), user, group);
- fdput(f);
- }
- return error;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fchown(fd_file(f), user, group);
}
SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
@@ -1457,6 +1440,8 @@ SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename,
if (unlikely(usize < OPEN_HOW_SIZE_VER0))
return -EINVAL;
+ if (unlikely(usize > PAGE_SIZE))
+ return -E2BIG;
err = copy_struct_from_user(&tmp, sizeof(tmp), how, usize);
if (err)
@@ -1574,23 +1559,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
return retval;
}
-/**
- * sys_close_range() - Close all file descriptors in a given range.
- *
- * @fd: starting file descriptor to close
- * @max_fd: last file descriptor to close
- * @flags: reserved for future extensions
- *
- * This closes a range of file descriptors. All file descriptors
- * from @fd up to and including @max_fd are closed.
- * Currently, errors to close a given file descriptor are ignored.
- */
-SYSCALL_DEFINE3(close_range, unsigned int, fd, unsigned int, max_fd,
- unsigned int, flags)
-{
- return __close_range(fd, max_fd, flags);
-}
-
/*
* This routine simulates a hangup on the tty, to arrange that users
* are given clean terminals at login time.
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 2ed6ad641a20..ee2cbd044ce6 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -16,7 +16,6 @@
#include <linux/sched/signal.h>
#include <linux/cred.h>
#include <linux/namei.h>
-#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include <linux/exportfs.h>
#include "overlayfs.h"
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 4504493b20be..4444c78e2e0c 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -231,6 +231,11 @@ static void ovl_file_modified(struct file *file)
ovl_copyattr(file_inode(file));
}
+static void ovl_file_end_write(struct file *file, loff_t pos, ssize_t ret)
+{
+ ovl_file_modified(file);
+}
+
static void ovl_file_accessed(struct file *file)
{
struct inode *inode, *upperinode;
@@ -294,7 +299,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
struct backing_file_ctx ctx = {
.cred = ovl_creds(inode->i_sb),
.user_file = file,
- .end_write = ovl_file_modified,
+ .end_write = ovl_file_end_write,
};
if (!iov_iter_count(iter))
@@ -364,7 +369,7 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
struct backing_file_ctx ctx = {
.cred = ovl_creds(inode->i_sb),
.user_file = out,
- .end_write = ovl_file_modified,
+ .end_write = ovl_file_end_write,
};
inode_lock(inode);
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index 35fd3e3e1778..8b31f44c12cd 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -170,7 +170,7 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
type = ovl_path_real(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
- err = ovl_do_getattr(&realpath, stat, request_mask, flags);
+ err = vfs_getattr_nosec(&realpath, stat, request_mask, flags);
if (err)
goto out;
@@ -195,8 +195,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
(!is_dir ? STATX_NLINK : 0);
ovl_path_lower(dentry, &realpath);
- err = ovl_do_getattr(&realpath, &lowerstat, lowermask,
- flags);
+ err = vfs_getattr_nosec(&realpath, &lowerstat, lowermask,
+ flags);
if (err)
goto out;
@@ -248,8 +248,8 @@ int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
ovl_path_lowerdata(dentry, &realpath);
if (realpath.dentry) {
- err = ovl_do_getattr(&realpath, &lowerdatastat,
- lowermask, flags);
+ err = vfs_getattr_nosec(&realpath, &lowerdatastat,
+ lowermask, flags);
if (err)
goto out;
} else {
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 0bfe35da4b7b..910dbbb2bb7b 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -412,14 +412,6 @@ static inline bool ovl_open_flags_need_copy_up(int flags)
return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
}
-static inline int ovl_do_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int flags)
-{
- if (flags & AT_GETATTR_NOSEC)
- return vfs_getattr_nosec(path, stat, request_mask, flags);
- return vfs_getattr(path, stat, request_mask, flags);
-}
-
/* util.c */
int ovl_get_write_access(struct dentry *dentry);
void ovl_put_write_access(struct dentry *dentry);
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index e42546c6c5df..1115c22deca0 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -141,10 +141,10 @@ static int ovl_verity_mode_def(void)
const struct fs_parameter_spec ovl_parameter_spec[] = {
fsparam_string_empty("lowerdir", Opt_lowerdir),
- fsparam_string("lowerdir+", Opt_lowerdir_add),
- fsparam_string("datadir+", Opt_datadir_add),
- fsparam_string("upperdir", Opt_upperdir),
- fsparam_string("workdir", Opt_workdir),
+ fsparam_file_or_string("lowerdir+", Opt_lowerdir_add),
+ fsparam_file_or_string("datadir+", Opt_datadir_add),
+ fsparam_file_or_string("upperdir", Opt_upperdir),
+ fsparam_file_or_string("workdir", Opt_workdir),
fsparam_flag("default_permissions", Opt_default_permissions),
fsparam_enum("redirect_dir", Opt_redirect_dir, ovl_parameter_redirect_dir),
fsparam_enum("index", Opt_index, ovl_parameter_bool),
@@ -367,40 +367,100 @@ static void ovl_add_layer(struct fs_context *fc, enum ovl_opt layer,
}
}
-static int ovl_parse_layer(struct fs_context *fc, const char *layer_name, enum ovl_opt layer)
+static inline bool is_upper_layer(enum ovl_opt layer)
+{
+ return layer == Opt_upperdir || layer == Opt_workdir;
+}
+
+/* Handle non-file descriptor-based layer options that require path lookup. */
+static inline int ovl_kern_path(const char *layer_name, struct path *layer_path,
+ enum ovl_opt layer)
{
- char *name = kstrdup(layer_name, GFP_KERNEL);
- bool upper = (layer == Opt_upperdir || layer == Opt_workdir);
- struct path path;
int err;
+ switch (layer) {
+ case Opt_upperdir:
+ fallthrough;
+ case Opt_workdir:
+ fallthrough;
+ case Opt_lowerdir:
+ err = ovl_mount_dir(layer_name, layer_path);
+ break;
+ case Opt_lowerdir_add:
+ fallthrough;
+ case Opt_datadir_add:
+ err = ovl_mount_dir_noesc(layer_name, layer_path);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+
+static int ovl_do_parse_layer(struct fs_context *fc, const char *layer_name,
+ struct path *layer_path, enum ovl_opt layer)
+{
+ char *name __free(kfree) = kstrdup(layer_name, GFP_KERNEL);
+ bool upper;
+ int err = 0;
+
if (!name)
return -ENOMEM;
- if (upper || layer == Opt_lowerdir)
- err = ovl_mount_dir(name, &path);
- else
- err = ovl_mount_dir_noesc(name, &path);
+ upper = is_upper_layer(layer);
+ err = ovl_mount_dir_check(fc, layer_path, layer, name, upper);
if (err)
- goto out_free;
-
- err = ovl_mount_dir_check(fc, &path, layer, name, upper);
- if (err)
- goto out_put;
+ return err;
if (!upper) {
err = ovl_ctx_realloc_lower(fc);
if (err)
- goto out_put;
+ return err;
}
/* Store the user provided path string in ctx to show in mountinfo */
- ovl_add_layer(fc, layer, &path, &name);
+ ovl_add_layer(fc, layer, layer_path, &name);
+ return err;
+}
+
+static int ovl_parse_layer(struct fs_context *fc, struct fs_parameter *param,
+ enum ovl_opt layer)
+{
+ struct path layer_path __free(path_put) = {};
+ int err = 0;
+
+ switch (param->type) {
+ case fs_value_is_string:
+ err = ovl_kern_path(param->string, &layer_path, layer);
+ if (err)
+ return err;
+ err = ovl_do_parse_layer(fc, param->string, &layer_path, layer);
+ break;
+ case fs_value_is_file: {
+ char *buf __free(kfree);
+ char *layer_name;
+
+ buf = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
+ if (!buf)
+ return -ENOMEM;
+
+ layer_path = param->file->f_path;
+ path_get(&layer_path);
+
+ layer_name = d_path(&layer_path, buf, PATH_MAX);
+ if (IS_ERR(layer_name))
+ return PTR_ERR(layer_name);
+
+ err = ovl_do_parse_layer(fc, layer_name, &layer_path, layer);
+ break;
+ }
+ default:
+ WARN_ON_ONCE(true);
+ err = -EINVAL;
+ }
-out_put:
- path_put(&path);
-out_free:
- kfree(name);
return err;
}
@@ -474,7 +534,13 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc)
iter = dup;
for (nr = 0; nr < nr_lower; nr++) {
- err = ovl_parse_layer(fc, iter, Opt_lowerdir);
+ struct path path __free(path_put) = {};
+
+ err = ovl_kern_path(iter, &path, Opt_lowerdir);
+ if (err)
+ goto out_err;
+
+ err = ovl_do_parse_layer(fc, iter, &path, Opt_lowerdir);
if (err)
goto out_err;
@@ -555,7 +621,7 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_datadir_add:
case Opt_upperdir:
case Opt_workdir:
- err = ovl_parse_layer(fc, param->string, opt);
+ err = ovl_parse_layer(fc, param, opt);
break;
case Opt_default_permissions:
config->default_permissions = true;
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 80675b6bf884..618abb1fa1b8 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -2,6 +2,7 @@
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/fs.h>
+#include <linux/cgroup.h>
#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/pid.h>
@@ -114,6 +115,81 @@ static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
return poll_flags;
}
+static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+ struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg;
+ size_t usize = _IOC_SIZE(cmd);
+ struct pidfd_info kinfo = {};
+ struct user_namespace *user_ns;
+ const struct cred *c;
+ __u64 mask;
+#ifdef CONFIG_CGROUPS
+ struct cgroup *cgrp;
+#endif
+
+ if (!uinfo)
+ return -EINVAL;
+ if (usize < PIDFD_INFO_SIZE_VER0)
+ return -EINVAL; /* First version, no smaller struct possible */
+
+ if (copy_from_user(&mask, &uinfo->mask, sizeof(mask)))
+ return -EFAULT;
+
+ c = get_task_cred(task);
+ if (!c)
+ return -ESRCH;
+
+ /* Unconditionally return identifiers and credentials, the rest only on request */
+
+ user_ns = current_user_ns();
+ kinfo.ruid = from_kuid_munged(user_ns, c->uid);
+ kinfo.rgid = from_kgid_munged(user_ns, c->gid);
+ kinfo.euid = from_kuid_munged(user_ns, c->euid);
+ kinfo.egid = from_kgid_munged(user_ns, c->egid);
+ kinfo.suid = from_kuid_munged(user_ns, c->suid);
+ kinfo.sgid = from_kgid_munged(user_ns, c->sgid);
+ kinfo.fsuid = from_kuid_munged(user_ns, c->fsuid);
+ kinfo.fsgid = from_kgid_munged(user_ns, c->fsgid);
+ kinfo.mask |= PIDFD_INFO_CREDS;
+ put_cred(c);
+
+#ifdef CONFIG_CGROUPS
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(task);
+ kinfo.cgroupid = cgroup_id(cgrp);
+ kinfo.mask |= PIDFD_INFO_CGROUPID;
+ rcu_read_unlock();
+#endif
+
+ /*
+ * Copy pid/tgid last, to reduce the chances the information might be
+ * stale. Note that it is not possible to ensure it will be valid as the
+ * task might return as soon as the copy_to_user finishes, but that's ok
+ * and userspace expects that might happen and can act accordingly, so
+ * this is just best-effort. What we can do however is checking that all
+ * the fields are set correctly, or return ESRCH to avoid providing
+ * incomplete information. */
+
+ kinfo.ppid = task_ppid_nr_ns(task, NULL);
+ kinfo.tgid = task_tgid_vnr(task);
+ kinfo.pid = task_pid_vnr(task);
+ kinfo.mask |= PIDFD_INFO_PID;
+
+ if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1))
+ return -ESRCH;
+
+ /*
+ * If userspace and the kernel have the same struct size it can just
+ * be copied. If userspace provides an older struct, only the bits that
+ * userspace knows about will be copied. If userspace provides a new
+ * struct, only the bits that the kernel knows about will be copied.
+ */
+ if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo))))
+ return -EFAULT;
+
+ return 0;
+}
+
static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct task_struct *task __free(put_task) = NULL;
@@ -122,13 +198,17 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct ns_common *ns_common = NULL;
struct pid_namespace *pid_ns;
- if (arg)
- return -EINVAL;
-
task = get_pid_task(pid, PIDTYPE_PID);
if (!task)
return -ESRCH;
+ /* Extensible IOCTL that does not open namespace FDs, take a shortcut */
+ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO))
+ return pidfd_info(task, cmd, arg);
+
+ if (arg)
+ return -EINVAL;
+
scoped_guard(task_lock, task) {
nsp = task->nsproxy;
if (nsp)
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 6c66a37522d0..4050942ab52f 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -200,11 +200,11 @@ EXPORT_SYMBOL(posix_acl_init);
* Allocate a new ACL with the specified number of entries.
*/
struct posix_acl *
-posix_acl_alloc(int count, gfp_t flags)
+posix_acl_alloc(unsigned int count, gfp_t flags)
{
- const size_t size = sizeof(struct posix_acl) +
- count * sizeof(struct posix_acl_entry);
- struct posix_acl *acl = kmalloc(size, flags);
+ struct posix_acl *acl;
+
+ acl = kmalloc(struct_size(acl, a_entries, count), flags);
if (acl)
posix_acl_init(acl, count);
return acl;
@@ -220,9 +220,8 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
struct posix_acl *clone = NULL;
if (acl) {
- int size = sizeof(struct posix_acl) + acl->a_count *
- sizeof(struct posix_acl_entry);
- clone = kmemdup(acl, size, flags);
+ clone = kmemdup(acl, struct_size(acl, a_entries, acl->a_count),
+ flags);
if (clone)
refcount_set(&clone->a_refcount, 1);
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b31283d81c52..e9d7ddc52f69 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -58,7 +58,6 @@
#include <linux/init.h>
#include <linux/capability.h>
#include <linux/file.h>
-#include <linux/fdtable.h>
#include <linux/generic-radix-tree.h>
#include <linux/string.h>
#include <linux/seq_file.h>
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 1f54a54bfb91..24baf23e864f 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -77,7 +77,7 @@ static int seq_fdinfo_open(struct inode *inode, struct file *file)
return single_open(file, seq_show, inode);
}
-/**
+/*
* Shared /proc/pid/fdinfo and /proc/pid/fdinfo/fd permission helper to ensure
* that the current task has PTRACE_MODE_READ in addition to the normal
* POSIX-like checks.
@@ -116,9 +116,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode)
{
struct file *file;
- rcu_read_lock();
- file = task_lookup_fdget_rcu(task, fd);
- rcu_read_unlock();
+ file = fget_task(task, fd);
if (file) {
*mode = file->f_mode;
fput(file);
@@ -258,19 +256,17 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!dir_emit_dots(file, ctx))
goto out;
- rcu_read_lock();
for (fd = ctx->pos - 2;; fd++) {
struct file *f;
struct fd_data data;
char name[10 + 1];
unsigned int len;
- f = task_lookup_next_fdget_rcu(p, &fd);
+ f = fget_task_next(p, &fd);
ctx->pos = fd + 2LL;
if (!f)
break;
data.mode = f->f_mode;
- rcu_read_unlock();
fput(f);
data.fd = fd;
@@ -278,11 +274,9 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
if (!proc_fill_cache(file, ctx,
name, len, instantiate, p,
&data))
- goto out;
+ break;
cond_resched();
- rcu_read_lock();
}
- rcu_read_unlock();
out:
put_task_struct(p);
return 0;
diff --git a/fs/proc/softirqs.c b/fs/proc/softirqs.c
index f4616083faef..04bb29721419 100644
--- a/fs/proc/softirqs.c
+++ b/fs/proc/softirqs.c
@@ -20,7 +20,7 @@ static int show_softirqs(struct seq_file *p, void *v)
for (i = 0; i < NR_SOFTIRQS; i++) {
seq_printf(p, "%12s:", softirq_to_name[i]);
for_each_possible_cpu(j)
- seq_printf(p, " %10u", kstat_softirqs_cpu(i, j));
+ seq_put_decimal_ull_width(p, " ", kstat_softirqs_cpu(i, j), 10);
seq_putc(p, '\n');
}
return 0;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 72f14fd59c2d..38a5a3e9cba2 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -909,8 +909,15 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
{
/*
* Don't forget to update Documentation/ on changes.
+ *
+ * The length of the second argument of mnemonics[]
+ * needs to be 3 instead of previously set 2
+ * (i.e. from [BITS_PER_LONG][2] to [BITS_PER_LONG][3])
+ * to avoid spurious
+ * -Werror=unterminated-string-initialization warning
+ * with GCC 15
*/
- static const char mnemonics[BITS_PER_LONG][2] = {
+ static const char mnemonics[BITS_PER_LONG][3] = {
/*
* In case if we meet a flag we don't know about.
*/
@@ -971,7 +978,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
[ilog2(VM_UFFD_MINOR)] = "ui",
#endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */
-#ifdef CONFIG_X86_USER_SHADOW_STACK
+#ifdef CONFIG_ARCH_HAS_USER_SHADOW_STACK
[ilog2(VM_SHADOW_STACK)] = "ss",
#endif
#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32)
@@ -987,11 +994,8 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
for (i = 0; i < BITS_PER_LONG; i++) {
if (!mnemonics[i][0])
continue;
- if (vma->vm_flags & (1UL << i)) {
- seq_putc(m, mnemonics[i][0]);
- seq_putc(m, mnemonics[i][1]);
- seq_putc(m, ' ');
- }
+ if (vma->vm_flags & (1UL << i))
+ seq_printf(m, "%s ", mnemonics[i]);
}
seq_putc(m, '\n');
}
@@ -2661,8 +2665,10 @@ static int pagemap_scan_get_args(struct pm_scan_arg *arg,
return -EFAULT;
if (!arg->vec && arg->vec_len)
return -EINVAL;
+ if (UINT_MAX == SIZE_MAX && arg->vec_len > SIZE_MAX)
+ return -EINVAL;
if (arg->vec && !access_ok((void __user *)(long)arg->vec,
- arg->vec_len * sizeof(struct page_region)))
+ size_mul(arg->vec_len, sizeof(struct page_region))))
return -EFAULT;
/* Fixup default values */
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index b52d85f8ad59..b4521b096058 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -457,10 +457,6 @@ static vm_fault_t mmap_vmcore_fault(struct vm_fault *vmf)
#endif
}
-static const struct vm_operations_struct vmcore_mmap_ops = {
- .fault = mmap_vmcore_fault,
-};
-
/**
* vmcore_alloc_buf - allocate buffer in vmalloc memory
* @size: size of buffer
@@ -488,6 +484,11 @@ static inline char *vmcore_alloc_buf(size_t size)
* virtually contiguous user-space in ELF layout.
*/
#ifdef CONFIG_MMU
+
+static const struct vm_operations_struct vmcore_mmap_ops = {
+ .fault = mmap_vmcore_fault,
+};
+
/*
* remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages
* reported as not being ram with the zero page.
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 290157bc7bec..7c2b75a44485 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -976,21 +976,19 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
struct super_block *sb;
unsigned int cmds = cmd >> SUBCMDSHIFT;
unsigned int type = cmd & SUBCMDMASK;
- struct fd f;
+ CLASS(fd_raw, f)(fd);
int ret;
- f = fdget_raw(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- ret = -EINVAL;
if (type >= MAXQUOTAS)
- goto out;
+ return -EINVAL;
if (quotactl_cmd_write(cmds)) {
ret = mnt_want_write(fd_file(f)->f_path.mnt);
if (ret)
- goto out;
+ return ret;
}
sb = fd_file(f)->f_path.mnt->mnt_sb;
@@ -1008,7 +1006,5 @@ SYSCALL_DEFINE4(quotactl_fd, unsigned int, fd, unsigned int, cmd,
if (quotactl_cmd_write(cmds))
mnt_drop_write(fd_file(f)->f_path.mnt);
-out:
- fdput(f);
return ret;
}
diff --git a/fs/read_write.c b/fs/read_write.c
index 64dc24afdb3a..a6133241dfb8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -386,8 +386,8 @@ EXPORT_SYMBOL(vfs_llseek);
static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
{
off_t retval;
- struct fd f = fdget_pos(fd);
- if (!fd_file(f))
+ CLASS(fd_pos, f)(fd);
+ if (fd_empty(f))
return -EBADF;
retval = -EINVAL;
@@ -397,7 +397,6 @@ static off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence)
if (res != (loff_t)retval)
retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */
}
- fdput_pos(f);
return retval;
}
@@ -420,15 +419,14 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
unsigned int, whence)
{
int retval;
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
loff_t offset;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
- retval = -EINVAL;
if (whence > SEEK_MAX)
- goto out_putf;
+ return -EINVAL;
offset = vfs_llseek(fd_file(f), ((loff_t) offset_high << 32) | offset_low,
whence);
@@ -439,8 +437,6 @@ SYSCALL_DEFINE5(llseek, unsigned int, fd, unsigned long, offset_high,
if (!copy_to_user(result, &offset, sizeof(offset)))
retval = 0;
}
-out_putf:
- fdput_pos(f);
return retval;
}
#endif
@@ -700,10 +696,10 @@ static inline loff_t *file_ppos(struct file *file)
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -712,7 +708,6 @@ ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
ret = vfs_read(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
return ret;
}
@@ -724,10 +719,10 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -736,7 +731,6 @@ ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
ret = vfs_write(fd_file(f), buf, count, ppos);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
return ret;
@@ -751,21 +745,17 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
loff_t pos)
{
- struct fd f;
- ssize_t ret = -EBADF;
-
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
- ret = -ESPIPE;
- if (fd_file(f)->f_mode & FMODE_PREAD)
- ret = vfs_read(fd_file(f), buf, count, &pos);
- fdput(f);
- }
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- return ret;
+ if (fd_file(f)->f_mode & FMODE_PREAD)
+ return vfs_read(fd_file(f), buf, count, &pos);
+
+ return -ESPIPE;
}
SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf,
@@ -785,21 +775,17 @@ COMPAT_SYSCALL_DEFINE5(pread64, unsigned int, fd, char __user *, buf,
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
size_t count, loff_t pos)
{
- struct fd f;
- ssize_t ret = -EBADF;
-
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
- ret = -ESPIPE;
- if (fd_file(f)->f_mode & FMODE_PWRITE)
- ret = vfs_write(fd_file(f), buf, count, &pos);
- fdput(f);
- }
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
- return ret;
+ if (fd_file(f)->f_mode & FMODE_PWRITE)
+ return vfs_write(fd_file(f), buf, count, &pos);
+
+ return -ESPIPE;
}
SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf,
@@ -1075,10 +1061,10 @@ out:
static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, rwf_t flags)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -1087,7 +1073,6 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
ret = vfs_readv(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
if (ret > 0)
@@ -1099,10 +1084,10 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec,
static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, rwf_t flags)
{
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
ssize_t ret = -EBADF;
- if (fd_file(f)) {
+ if (!fd_empty(f)) {
loff_t pos, *ppos = file_ppos(fd_file(f));
if (ppos) {
pos = *ppos;
@@ -1111,7 +1096,6 @@ static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec,
ret = vfs_writev(fd_file(f), vec, vlen, ppos, flags);
if (ret >= 0 && ppos)
fd_file(f)->f_pos = pos;
- fdput_pos(f);
}
if (ret > 0)
@@ -1129,18 +1113,16 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, loff_t pos, rwf_t flags)
{
- struct fd f;
ssize_t ret = -EBADF;
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
+ CLASS(fd, f)(fd);
+ if (!fd_empty(f)) {
ret = -ESPIPE;
if (fd_file(f)->f_mode & FMODE_PREAD)
ret = vfs_readv(fd_file(f), vec, vlen, &pos, flags);
- fdput(f);
}
if (ret > 0)
@@ -1152,18 +1134,16 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec,
static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec,
unsigned long vlen, loff_t pos, rwf_t flags)
{
- struct fd f;
ssize_t ret = -EBADF;
if (pos < 0)
return -EINVAL;
- f = fdget(fd);
- if (fd_file(f)) {
+ CLASS(fd, f)(fd);
+ if (!fd_empty(f)) {
ret = -ESPIPE;
if (fd_file(f)->f_mode & FMODE_PWRITE)
ret = vfs_writev(fd_file(f), vec, vlen, &pos, flags);
- fdput(f);
}
if (ret > 0)
@@ -1315,7 +1295,6 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
size_t count, loff_t max)
{
- struct fd in, out;
struct inode *in_inode, *out_inode;
struct pipe_inode_info *opipe;
loff_t pos;
@@ -1326,35 +1305,32 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
/*
* Get input file, and verify that it is ok..
*/
- retval = -EBADF;
- in = fdget(in_fd);
- if (!fd_file(in))
- goto out;
+ CLASS(fd, in)(in_fd);
+ if (fd_empty(in))
+ return -EBADF;
if (!(fd_file(in)->f_mode & FMODE_READ))
- goto fput_in;
- retval = -ESPIPE;
+ return -EBADF;
if (!ppos) {
pos = fd_file(in)->f_pos;
} else {
pos = *ppos;
if (!(fd_file(in)->f_mode & FMODE_PREAD))
- goto fput_in;
+ return -ESPIPE;
}
retval = rw_verify_area(READ, fd_file(in), &pos, count);
if (retval < 0)
- goto fput_in;
+ return retval;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
/*
* Get output file, and verify that it is ok..
*/
- retval = -EBADF;
- out = fdget(out_fd);
- if (!fd_file(out))
- goto fput_in;
+ CLASS(fd, out)(out_fd);
+ if (fd_empty(out))
+ return -EBADF;
if (!(fd_file(out)->f_mode & FMODE_WRITE))
- goto fput_out;
+ return -EBADF;
in_inode = file_inode(fd_file(in));
out_inode = file_inode(fd_file(out));
out_pos = fd_file(out)->f_pos;
@@ -1363,9 +1339,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes);
if (unlikely(pos + count > max)) {
- retval = -EOVERFLOW;
if (pos >= max)
- goto fput_out;
+ return -EOVERFLOW;
count = max - pos;
}
@@ -1384,7 +1359,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
if (!opipe) {
retval = rw_verify_area(WRITE, fd_file(out), &out_pos, count);
if (retval < 0)
- goto fput_out;
+ return retval;
retval = do_splice_direct(fd_file(in), &pos, fd_file(out), &out_pos,
count, fl);
} else {
@@ -1410,12 +1385,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
inc_syscw(current);
if (pos > max)
retval = -EOVERFLOW;
-
-fput_out:
- fdput(out);
-fput_in:
- fdput(in);
-out:
return retval;
}
@@ -1671,36 +1640,32 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
{
loff_t pos_in;
loff_t pos_out;
- struct fd f_in;
- struct fd f_out;
ssize_t ret = -EBADF;
- f_in = fdget(fd_in);
- if (!fd_file(f_in))
- goto out2;
+ CLASS(fd, f_in)(fd_in);
+ if (fd_empty(f_in))
+ return -EBADF;
- f_out = fdget(fd_out);
- if (!fd_file(f_out))
- goto out1;
+ CLASS(fd, f_out)(fd_out);
+ if (fd_empty(f_out))
+ return -EBADF;
- ret = -EFAULT;
if (off_in) {
if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
- goto out;
+ return -EFAULT;
} else {
pos_in = fd_file(f_in)->f_pos;
}
if (off_out) {
if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
- goto out;
+ return -EFAULT;
} else {
pos_out = fd_file(f_out)->f_pos;
}
- ret = -EINVAL;
if (flags != 0)
- goto out;
+ return -EINVAL;
ret = vfs_copy_file_range(fd_file(f_in), pos_in, fd_file(f_out), pos_out, len,
flags);
@@ -1722,12 +1687,6 @@ SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
fd_file(f_out)->f_pos = pos_out;
}
}
-
-out:
- fdput(f_out);
-out1:
- fdput(f_in);
-out2:
return ret;
}
@@ -1830,18 +1789,22 @@ int generic_file_rw_checks(struct file *file_in, struct file *file_out)
return 0;
}
-bool generic_atomic_write_valid(struct iov_iter *iter, loff_t pos)
+int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter)
{
size_t len = iov_iter_count(iter);
if (!iter_is_ubuf(iter))
- return false;
+ return -EINVAL;
if (!is_power_of_2(len))
- return false;
+ return -EINVAL;
- if (!IS_ALIGNED(pos, len))
- return false;
+ if (!IS_ALIGNED(iocb->ki_pos, len))
+ return -EINVAL;
- return true;
+ if (!(iocb->ki_flags & IOCB_DIRECT))
+ return -EOPNOTSUPP;
+
+ return 0;
}
+EXPORT_SYMBOL_GPL(generic_atomic_write_valid);
diff --git a/fs/readdir.c b/fs/readdir.c
index 6d29cab8576e..0038efda417b 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -219,20 +219,19 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
struct old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
struct readdir_callback buf = {
.ctx.actor = fillonedir,
.dirent = dirent
};
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
if (buf.result)
error = buf.result;
- fdput_pos(f);
return error;
}
@@ -309,7 +308,7 @@ efault:
SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct linux_dirent __user *, dirent, unsigned int, count)
{
- struct fd f;
+ CLASS(fd_pos, f)(fd);
struct getdents_callback buf = {
.ctx.actor = filldir,
.count = count,
@@ -317,8 +316,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
@@ -333,7 +331,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
return error;
}
@@ -392,7 +389,7 @@ efault:
SYSCALL_DEFINE3(getdents64, unsigned int, fd,
struct linux_dirent64 __user *, dirent, unsigned int, count)
{
- struct fd f;
+ CLASS(fd_pos, f)(fd);
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
.count = count,
@@ -400,8 +397,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
@@ -417,7 +413,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
return error;
}
@@ -477,20 +472,19 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
struct compat_old_linux_dirent __user *, dirent, unsigned int, count)
{
int error;
- struct fd f = fdget_pos(fd);
+ CLASS(fd_pos, f)(fd);
struct compat_readdir_callback buf = {
.ctx.actor = compat_fillonedir,
.dirent = dirent
};
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
if (buf.result)
error = buf.result;
- fdput_pos(f);
return error;
}
@@ -560,7 +554,7 @@ efault:
COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct compat_linux_dirent __user *, dirent, unsigned int, count)
{
- struct fd f;
+ CLASS(fd_pos, f)(fd);
struct compat_getdents_callback buf = {
.ctx.actor = compat_filldir,
.current_dir = dirent,
@@ -568,8 +562,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
};
int error;
- f = fdget_pos(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
error = iterate_dir(fd_file(f), &buf.ctx);
@@ -584,7 +577,6 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd,
else
error = count - buf.count;
}
- fdput_pos(f);
return error;
}
#endif
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 4403d5c68fcb..26afbbbfb10c 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -536,20 +536,19 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
}
for (i = 0, info = same->info; i < count; i++, info++) {
- struct fd dst_fd = fdget(info->dest_fd);
- struct file *dst_file = fd_file(dst_fd);
+ CLASS(fd, dst_fd)(info->dest_fd);
- if (!dst_file) {
+ if (fd_empty(dst_fd)) {
info->status = -EBADF;
goto next_loop;
}
if (info->reserved) {
info->status = -EINVAL;
- goto next_fdput;
+ goto next_loop;
}
- deduped = vfs_dedupe_file_range_one(file, off, dst_file,
+ deduped = vfs_dedupe_file_range_one(file, off, fd_file(dst_fd),
info->dest_offset, len,
REMAP_FILE_CAN_SHORTEN);
if (deduped == -EBADE)
@@ -559,8 +558,6 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
else
info->bytes_deduped = len;
-next_fdput:
- fdput(dst_fd);
next_loop:
if (fatal_signal_pending(current))
break;
diff --git a/fs/select.c b/fs/select.c
index a77907faf2b4..e223d1fe9d55 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -462,15 +462,22 @@ get_max:
EPOLLNVAL)
#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)
-static inline void wait_key_set(poll_table *wait, unsigned long in,
+static inline __poll_t select_poll_one(int fd, poll_table *wait, unsigned long in,
unsigned long out, unsigned long bit,
__poll_t ll_flag)
{
+ CLASS(fd, f)(fd);
+
+ if (fd_empty(f))
+ return EPOLLNVAL;
+
wait->_key = POLLEX_SET | ll_flag;
if (in & bit)
wait->_key |= POLLIN_SET;
if (out & bit)
wait->_key |= POLLOUT_SET;
+
+ return vfs_poll(fd_file(f), wait);
}
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
@@ -522,20 +529,12 @@ static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec
}
for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
- struct fd f;
if (i >= n)
break;
if (!(bit & all_bits))
continue;
- mask = EPOLLNVAL;
- f = fdget(i);
- if (fd_file(f)) {
- wait_key_set(wait, in, out, bit,
- busy_flag);
- mask = vfs_poll(fd_file(f), wait);
-
- fdput(f);
- }
+ mask = select_poll_one(i, wait, in, out, bit,
+ busy_flag);
if ((mask & POLLIN_SET) && (in & bit)) {
res_in |= bit;
retval++;
@@ -856,15 +855,14 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
__poll_t busy_flag)
{
int fd = pollfd->fd;
- __poll_t mask = 0, filter;
- struct fd f;
+ __poll_t mask, filter;
if (fd < 0)
- goto out;
- mask = EPOLLNVAL;
- f = fdget(fd);
- if (!fd_file(f))
- goto out;
+ return 0;
+
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return EPOLLNVAL;
/* userland u16 ->events contains POLL... bitmap */
filter = demangle_poll(pollfd->events) | EPOLLERR | EPOLLHUP;
@@ -872,13 +870,7 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait,
mask = vfs_poll(fd_file(f), pwait);
if (mask & busy_flag)
*can_busy_poll = true;
- mask &= filter; /* Mask out unneeded events. */
- fdput(f);
-
-out:
- /* ... and so does ->revents */
- pollfd->revents = mangle_poll(mask);
- return mask;
+ return mask & filter; /* Mask out unneeded events. */
}
static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
@@ -910,6 +902,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
pfd = walk->entries;
pfd_end = pfd + walk->len;
for (; pfd != pfd_end; pfd++) {
+ __poll_t mask;
/*
* Fish for events. If we found one, record it
* and kill poll_table->_qproc, so we don't
@@ -917,8 +910,9 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
* this. They'll get immediately deregistered
* when we break out and return.
*/
- if (do_pollfd(pfd, pt, &can_busy_loop,
- busy_flag)) {
+ mask = do_pollfd(pfd, pt, &can_busy_loop, busy_flag);
+ pfd->revents = mangle_poll(mask);
+ if (mask) {
count++;
pt->_qproc = NULL;
/* found something, stop busy polling */
diff --git a/fs/seq_file.c b/fs/seq_file.c
index e676c8b0cf5d..8bbb1ad46335 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -343,8 +343,8 @@ EXPORT_SYMBOL(seq_lseek);
/**
* seq_release - free the structures associated with sequential file.
- * @file: file in question
* @inode: its inode
+ * @file: file in question
*
* Frees the structures associated with sequential file; can be used
* as ->f_op->release() if you don't have private data to destroy.
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 736bebf93591..d1a5f43ce466 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -288,20 +288,17 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
fd_install(ufd, file);
} else {
- struct fd f = fdget(ufd);
- if (!fd_file(f))
+ CLASS(fd, f)(ufd);
+ if (fd_empty(f))
return -EBADF;
ctx = fd_file(f)->private_data;
- if (fd_file(f)->f_op != &signalfd_fops) {
- fdput(f);
+ if (fd_file(f)->f_op != &signalfd_fops)
return -EINVAL;
- }
spin_lock_irq(&current->sighand->siglock);
ctx->sigmask = *mask;
spin_unlock_irq(&current->sighand->siglock);
wake_up(&current->sighand->signalfd_wqh);
- fdput(f);
}
return ufd;
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 000e1ef3beea..20cafdff5081 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -1780,7 +1780,7 @@ static int cifs_init_netfs(void)
nomem_subreqpool:
kmem_cache_destroy(cifs_io_subrequest_cachep);
nomem_subreq:
- mempool_destroy(&cifs_io_request_pool);
+ mempool_exit(&cifs_io_request_pool);
nomem_reqpool:
kmem_cache_destroy(cifs_io_request_cachep);
nomem_req:
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 68c716e6261b..1d3470bca45e 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -252,10 +252,6 @@ extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read);
extern ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server,
size_t to_read);
-extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
- struct page *page,
- unsigned int page_offset,
- unsigned int to_read);
int cifs_read_iter_from_socket(struct TCP_Server_Info *server,
struct iov_iter *iter,
unsigned int to_read);
@@ -623,8 +619,6 @@ enum securityEnum cifs_select_sectype(struct TCP_Server_Info *,
int cifs_alloc_hash(const char *name, struct shash_desc **sdesc);
void cifs_free_hash(struct shash_desc **sdesc);
-struct cifs_chan *
-cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server);
int cifs_try_adding_channels(struct cifs_ses *ses);
bool is_server_using_iface(struct TCP_Server_Info *server,
struct cifs_server_iface *iface);
@@ -640,9 +634,6 @@ cifs_chan_set_in_reconnect(struct cifs_ses *ses,
void
cifs_chan_clear_in_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server);
-bool
-cifs_chan_in_reconnect(struct cifs_ses *ses,
- struct TCP_Server_Info *server);
void
cifs_chan_set_need_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server);
diff --git a/fs/smb/client/compress.c b/fs/smb/client/compress.c
index 63b5a55b7a57..766b4de13da7 100644
--- a/fs/smb/client/compress.c
+++ b/fs/smb/client/compress.c
@@ -166,7 +166,6 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
loff_t start = iter->xarray_start + iter->iov_offset;
pgoff_t last, index = start / PAGE_SIZE;
size_t len, off, foff;
- ssize_t ret = 0;
void *p;
int s = 0;
@@ -193,9 +192,6 @@ static int collect_sample(const struct iov_iter *iter, ssize_t max, u8 *sample)
memcpy(&sample[s], p, len2);
kunmap_local(p);
- if (ret < 0)
- return ret;
-
s += len2;
if (len2 < SZ_2K || s >= max - SZ_2K)
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index adf8758847f6..0ce2d704b1f3 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -795,18 +795,6 @@ cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read)
}
int
-cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
- unsigned int page_offset, unsigned int to_read)
-{
- struct msghdr smb_msg = {};
- struct bio_vec bv;
-
- bvec_set_page(&bv, page, to_read, page_offset);
- iov_iter_bvec(&smb_msg.msg_iter, ITER_DEST, &bv, 1, to_read);
- return cifs_readv_from_socket(server, &smb_msg);
-}
-
-int
cifs_read_iter_from_socket(struct TCP_Server_Info *server, struct iov_iter *iter,
unsigned int to_read)
{
@@ -1049,6 +1037,7 @@ clean_demultiplex_info(struct TCP_Server_Info *server)
*/
}
+ put_net(cifs_net_ns(server));
kfree(server->leaf_fullpath);
kfree(server);
@@ -1647,8 +1636,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect)
/* srv_count can never go negative */
WARN_ON(server->srv_count < 0);
- put_net(cifs_net_ns(server));
-
list_del_init(&server->tcp_ses_list);
spin_unlock(&cifs_tcp_ses_lock);
@@ -3082,13 +3069,22 @@ generic_ip_connect(struct TCP_Server_Info *server)
if (server->ssocket) {
socket = server->ssocket;
} else {
- rc = __sock_create(cifs_net_ns(server), sfamily, SOCK_STREAM,
+ struct net *net = cifs_net_ns(server);
+ struct sock *sk;
+
+ rc = __sock_create(net, sfamily, SOCK_STREAM,
IPPROTO_TCP, &server->ssocket, 1);
if (rc < 0) {
cifs_server_dbg(VFS, "Error %d creating socket\n", rc);
return rc;
}
+ sk = server->ssocket->sk;
+ __netns_tracker_free(net, &sk->ns_tracker, false);
+ sk->sk_net_refcnt = 1;
+ get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+ sock_inuse_add(net, 1);
+
/* BB other socket options to set KEEPALIVE, NODELAY? */
cifs_dbg(FYI, "Socket created\n");
socket = server->ssocket;
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 28c4e576d460..5c5a52019efa 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -920,8 +920,15 @@ static int smb3_reconfigure(struct fs_context *fc)
else {
kfree_sensitive(ses->password);
ses->password = kstrdup(ctx->password, GFP_KERNEL);
+ if (!ses->password)
+ return -ENOMEM;
kfree_sensitive(ses->password2);
ses->password2 = kstrdup(ctx->password2, GFP_KERNEL);
+ if (!ses->password2) {
+ kfree_sensitive(ses->password);
+ ses->password = NULL;
+ return -ENOMEM;
+ }
}
STEAL_STRING(cifs_sb, ctx, domainname);
STEAL_STRING(cifs_sb, ctx, nodename);
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index 2ce193609d8b..56439da4f119 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -72,7 +72,6 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
unsigned long srcfd)
{
int rc;
- struct fd src_file;
struct inode *src_inode;
cifs_dbg(FYI, "ioctl copychunk range\n");
@@ -89,8 +88,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
return rc;
}
- src_file = fdget(srcfd);
- if (!fd_file(src_file)) {
+ CLASS(fd, src_file)(srcfd);
+ if (fd_empty(src_file)) {
rc = -EBADF;
goto out_drop_write;
}
@@ -98,20 +97,18 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file,
if (fd_file(src_file)->f_op->unlocked_ioctl != cifs_ioctl) {
rc = -EBADF;
cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
- goto out_fput;
+ goto out_drop_write;
}
src_inode = file_inode(fd_file(src_file));
rc = -EINVAL;
if (S_ISDIR(src_inode->i_mode))
- goto out_fput;
+ goto out_drop_write;
rc = cifs_file_copychunk_range(xid, fd_file(src_file), 0, dst_file, 0,
src_inode->i_size, 0);
if (rc > 0)
rc = 0;
-out_fput:
- fdput(src_file);
out_drop_write:
mnt_drop_write_file(dst_file);
return rc;
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 3216f786908f..c88e9657f47a 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -115,18 +115,6 @@ cifs_chan_clear_in_reconnect(struct cifs_ses *ses,
ses->chans[chan_index].in_reconnect = false;
}
-bool
-cifs_chan_in_reconnect(struct cifs_ses *ses,
- struct TCP_Server_Info *server)
-{
- unsigned int chan_index = cifs_ses_get_chan_index(ses, server);
-
- if (chan_index == CIFS_INVAL_CHAN_INDEX)
- return true; /* err on the safer side */
-
- return CIFS_CHAN_IN_RECONNECT(ses, chan_index);
-}
-
void
cifs_chan_set_need_reconnect(struct cifs_ses *ses,
struct TCP_Server_Info *server)
@@ -487,26 +475,6 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
spin_unlock(&ses->chan_lock);
}
-/*
- * If server is a channel of ses, return the corresponding enclosing
- * cifs_chan otherwise return NULL.
- */
-struct cifs_chan *
-cifs_ses_find_chan(struct cifs_ses *ses, struct TCP_Server_Info *server)
-{
- int i;
-
- spin_lock(&ses->chan_lock);
- for (i = 0; i < ses->chan_count; i++) {
- if (ses->chans[i].server == server) {
- spin_unlock(&ses->chan_lock);
- return &ses->chans[i];
- }
- }
- spin_unlock(&ses->chan_lock);
- return NULL;
-}
-
static int
cifs_ses_add_channel(struct cifs_ses *ses,
struct cifs_server_iface *iface)
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 6b385fce3f2a..24a2aa04a108 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -1158,7 +1158,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
unsigned int size[1];
void *data[1];
- struct smb2_file_full_ea_info *ea = NULL;
+ struct smb2_file_full_ea_info *ea;
struct smb2_query_info_rsp *rsp;
int rc, used_len = 0;
int retries = 0, cur_sleep = 1;
@@ -1179,6 +1179,7 @@ replay_again:
if (!utf16_path)
return -ENOMEM;
+ ea = NULL;
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
vars = kzalloc(sizeof(*vars), GFP_KERNEL);
if (!vars) {
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index b2f16a7b696d..6584b5cddc28 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -3313,6 +3313,15 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
return rc;
if (indatalen) {
+ unsigned int len;
+
+ if (WARN_ON_ONCE(smb3_encryption_required(tcon) &&
+ (check_add_overflow(total_len - 1,
+ ALIGN(indatalen, 8), &len) ||
+ len > MAX_CIFS_SMALL_BUFFER_SIZE))) {
+ cifs_small_buf_release(req);
+ return -EIO;
+ }
/*
* indatalen is usually small at a couple of bytes max, so
* just allocate through generic pool
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index 09b20039636e..611716bc8f27 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -512,6 +512,7 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,
int in_len, char *out_blob, int *out_len)
{
struct ksmbd_spnego_authen_response *resp;
+ struct ksmbd_login_response_ext *resp_ext = NULL;
struct ksmbd_user *user = NULL;
int retval;
@@ -540,7 +541,10 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,
goto out;
}
- user = ksmbd_alloc_user(&resp->login_response);
+ if (resp->login_response.status & KSMBD_USER_FLAG_EXTENSION)
+ resp_ext = ksmbd_ipc_login_request_ext(resp->login_response.account);
+
+ user = ksmbd_alloc_user(&resp->login_response, resp_ext);
if (!user) {
ksmbd_debug(AUTH, "login failure\n");
retval = -ENOMEM;
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index aa2a37a7ce84..e6a72f75ab94 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -70,6 +70,7 @@ struct ksmbd_conn *ksmbd_conn_alloc(void)
atomic_set(&conn->req_running, 0);
atomic_set(&conn->r_count, 0);
atomic_set(&conn->refcnt, 1);
+ atomic_set(&conn->mux_smb_requests, 0);
conn->total_credits = 1;
conn->outstanding_credits = 0;
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index b379ae4fdcdf..8ddd5a3c7baf 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -107,6 +107,7 @@ struct ksmbd_conn {
__le16 signing_algorithm;
bool binding;
atomic_t refcnt;
+ atomic_t mux_smb_requests;
};
struct ksmbd_conn_ops {
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index 38e6fd2da3b8..3d01d9d15293 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -51,6 +51,9 @@
* - KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST/RESPONSE(ksmbd_spnego_authen_request/response)
* This event is to make kerberos authentication to be processed in
* userspace.
+ *
+ * - KSMBD_EVENT_LOGIN_REQUEST_EXT/RESPONSE_EXT(ksmbd_login_request_ext/response_ext)
+ * This event is to get user account extension info to user IPC daemon.
*/
#define KSMBD_GENL_NAME "SMBD_GENL"
@@ -146,6 +149,16 @@ struct ksmbd_login_response {
};
/*
+ * IPC user login response extension.
+ */
+struct ksmbd_login_response_ext {
+ __u32 handle;
+ __s32 ngroups; /* supplementary group count */
+ __s8 reserved[128]; /* Reserved room */
+ __s8 ____payload[];
+};
+
+/*
* IPC request to fetch net share config.
*/
struct ksmbd_share_config_request {
@@ -306,6 +319,9 @@ enum ksmbd_event {
KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST,
KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15,
+ KSMBD_EVENT_LOGIN_REQUEST_EXT,
+ KSMBD_EVENT_LOGIN_RESPONSE_EXT,
+
__KSMBD_EVENT_MAX,
KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1
};
@@ -336,6 +352,7 @@ enum KSMBD_TREE_CONN_STATUS {
#define KSMBD_USER_FLAG_BAD_USER BIT(3)
#define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4)
#define KSMBD_USER_FLAG_DELAY_SESSION BIT(5)
+#define KSMBD_USER_FLAG_EXTENSION BIT(6)
/*
* Share config flags.
diff --git a/fs/smb/server/mgmt/user_config.c b/fs/smb/server/mgmt/user_config.c
index 279d00feff21..421a4a95e216 100644
--- a/fs/smb/server/mgmt/user_config.c
+++ b/fs/smb/server/mgmt/user_config.c
@@ -12,6 +12,7 @@
struct ksmbd_user *ksmbd_login_user(const char *account)
{
struct ksmbd_login_response *resp;
+ struct ksmbd_login_response_ext *resp_ext = NULL;
struct ksmbd_user *user = NULL;
resp = ksmbd_ipc_login_request(account);
@@ -21,15 +22,19 @@ struct ksmbd_user *ksmbd_login_user(const char *account)
if (!(resp->status & KSMBD_USER_FLAG_OK))
goto out;
- user = ksmbd_alloc_user(resp);
+ if (resp->status & KSMBD_USER_FLAG_EXTENSION)
+ resp_ext = ksmbd_ipc_login_request_ext(account);
+
+ user = ksmbd_alloc_user(resp, resp_ext);
out:
kvfree(resp);
return user;
}
-struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp)
+struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp,
+ struct ksmbd_login_response_ext *resp_ext)
{
- struct ksmbd_user *user = NULL;
+ struct ksmbd_user *user;
user = kmalloc(sizeof(struct ksmbd_user), GFP_KERNEL);
if (!user)
@@ -44,18 +49,42 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp)
if (user->passkey)
memcpy(user->passkey, resp->hash, resp->hash_sz);
- if (!user->name || !user->passkey) {
- kfree(user->name);
- kfree(user->passkey);
- kfree(user);
- user = NULL;
+ user->ngroups = 0;
+ user->sgid = NULL;
+
+ if (!user->name || !user->passkey)
+ goto err_free;
+
+ if (resp_ext) {
+ if (resp_ext->ngroups > NGROUPS_MAX) {
+ pr_err("ngroups(%u) from login response exceeds max groups(%d)\n",
+ resp_ext->ngroups, NGROUPS_MAX);
+ goto err_free;
+ }
+
+ user->sgid = kmemdup(resp_ext->____payload,
+ resp_ext->ngroups * sizeof(gid_t),
+ GFP_KERNEL);
+ if (!user->sgid)
+ goto err_free;
+
+ user->ngroups = resp_ext->ngroups;
+ ksmbd_debug(SMB, "supplementary groups : %d\n", user->ngroups);
}
+
return user;
+
+err_free:
+ kfree(user->name);
+ kfree(user->passkey);
+ kfree(user);
+ return NULL;
}
void ksmbd_free_user(struct ksmbd_user *user)
{
ksmbd_ipc_logout_request(user->name, user->flags);
+ kfree(user->sgid);
kfree(user->name);
kfree(user->passkey);
kfree(user);
diff --git a/fs/smb/server/mgmt/user_config.h b/fs/smb/server/mgmt/user_config.h
index e068a19fd904..8c227b8d4954 100644
--- a/fs/smb/server/mgmt/user_config.h
+++ b/fs/smb/server/mgmt/user_config.h
@@ -18,6 +18,8 @@ struct ksmbd_user {
size_t passkey_sz;
char *passkey;
+ int ngroups;
+ gid_t *sgid;
};
static inline bool user_guest(struct ksmbd_user *user)
@@ -60,7 +62,8 @@ static inline unsigned int user_gid(struct ksmbd_user *user)
}
struct ksmbd_user *ksmbd_login_user(const char *account);
-struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp);
+struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp,
+ struct ksmbd_login_response_ext *resp_ext);
void ksmbd_free_user(struct ksmbd_user *user);
int ksmbd_anonymous_user(struct ksmbd_user *user);
bool ksmbd_compare_user(struct ksmbd_user *u1, struct ksmbd_user *u2);
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 99416ce9f501..ad02fe555fda 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -90,7 +90,7 @@ static int __rpc_method(char *rpc_name)
int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
{
- struct ksmbd_session_rpc *entry;
+ struct ksmbd_session_rpc *entry, *old;
struct ksmbd_rpc_command *resp;
int method;
@@ -106,16 +106,19 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name)
entry->id = ksmbd_ipc_id_alloc();
if (entry->id < 0)
goto free_entry;
- xa_store(&sess->rpc_handle_list, entry->id, entry, GFP_KERNEL);
+ old = xa_store(&sess->rpc_handle_list, entry->id, entry, GFP_KERNEL);
+ if (xa_is_err(old))
+ goto free_id;
resp = ksmbd_rpc_open(sess, entry->id);
if (!resp)
- goto free_id;
+ goto erase_xa;
kvfree(resp);
return entry->id;
-free_id:
+erase_xa:
xa_erase(&sess->rpc_handle_list, entry->id);
+free_id:
ksmbd_rpc_id_free(entry->id);
free_entry:
kfree(entry);
@@ -175,11 +178,13 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
unsigned long id;
struct ksmbd_session *sess;
+ down_write(&sessions_table_lock);
down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
- if (sess->state != SMB2_SESSION_VALID ||
- time_after(jiffies,
- sess->last_active + SMB2_SESSION_TIMEOUT)) {
+ if (atomic_read(&sess->refcnt) == 0 &&
+ (sess->state != SMB2_SESSION_VALID ||
+ time_after(jiffies,
+ sess->last_active + SMB2_SESSION_TIMEOUT))) {
xa_erase(&conn->sessions, sess->id);
hash_del(&sess->hlist);
ksmbd_session_destroy(sess);
@@ -187,6 +192,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn)
}
}
up_write(&conn->session_lock);
+ up_write(&sessions_table_lock);
}
int ksmbd_session_register(struct ksmbd_conn *conn,
@@ -228,7 +234,6 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
}
}
}
- up_write(&sessions_table_lock);
down_write(&conn->session_lock);
xa_for_each(&conn->sessions, id, sess) {
@@ -248,6 +253,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn)
}
}
up_write(&conn->session_lock);
+ up_write(&sessions_table_lock);
}
struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
@@ -269,8 +275,6 @@ struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id)
down_read(&sessions_table_lock);
sess = __session_lookup(id);
- if (sess)
- sess->last_active = jiffies;
up_read(&sessions_table_lock);
return sess;
@@ -289,6 +293,22 @@ struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
return sess;
}
+void ksmbd_user_session_get(struct ksmbd_session *sess)
+{
+ atomic_inc(&sess->refcnt);
+}
+
+void ksmbd_user_session_put(struct ksmbd_session *sess)
+{
+ if (!sess)
+ return;
+
+ if (atomic_read(&sess->refcnt) <= 0)
+ WARN_ON(1);
+ else
+ atomic_dec(&sess->refcnt);
+}
+
struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
u64 sess_id)
{
@@ -393,6 +413,7 @@ static struct ksmbd_session *__session_create(int protocol)
xa_init(&sess->rpc_handle_list);
sess->sequence_number = 1;
rwlock_init(&sess->tree_conns_lock);
+ atomic_set(&sess->refcnt, 1);
ret = __init_smb2_session(sess);
if (ret)
diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h
index dc9fded2cd43..c1c4b20bd5c6 100644
--- a/fs/smb/server/mgmt/user_session.h
+++ b/fs/smb/server/mgmt/user_session.h
@@ -61,6 +61,8 @@ struct ksmbd_session {
struct ksmbd_file_table file_table;
unsigned long last_active;
rwlock_t tree_conns_lock;
+
+ atomic_t refcnt;
};
static inline int test_session_flag(struct ksmbd_session *sess, int bit)
@@ -104,4 +106,6 @@ void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id);
int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name);
void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id);
int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id);
+void ksmbd_user_session_get(struct ksmbd_session *sess);
+void ksmbd_user_session_put(struct ksmbd_session *sess);
#endif /* __USER_SESSION_MANAGEMENT_H__ */
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 231d2d224656..e6cfedba9992 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -241,6 +241,8 @@ send:
if (work->tcon)
ksmbd_tree_connect_put(work->tcon);
smb3_preauth_hash_rsp(work);
+ if (work->sess)
+ ksmbd_user_session_put(work->sess);
if (work->sess && work->sess->enc && work->encrypted &&
conn->ops->encrypt_resp) {
rc = conn->ops->encrypt_resp(work);
@@ -268,6 +270,7 @@ static void handle_ksmbd_work(struct work_struct *wk)
ksmbd_conn_try_dequeue_request(work);
ksmbd_free_work_struct(work);
+ atomic_dec(&conn->mux_smb_requests);
/*
* Checking waitqueue to dropping pending requests on
* disconnection. waitqueue_active is safe because it
@@ -289,6 +292,15 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn)
struct ksmbd_work *work;
int err;
+ err = ksmbd_init_smb_server(conn);
+ if (err)
+ return 0;
+
+ if (atomic_inc_return(&conn->mux_smb_requests) >= conn->vals->max_credits) {
+ atomic_dec_return(&conn->mux_smb_requests);
+ return -ENOSPC;
+ }
+
work = ksmbd_alloc_work_struct();
if (!work) {
pr_err("allocation for work failed\n");
@@ -299,12 +311,6 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn)
work->request_buf = conn->request_buf;
conn->request_buf = NULL;
- err = ksmbd_init_smb_server(work);
- if (err) {
- ksmbd_free_work_struct(work);
- return 0;
- }
-
ksmbd_conn_enqueue_request(work);
atomic_inc(&conn->r_count);
/* update activity on connection */
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 797b0f24097b..599118aed205 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -605,8 +605,10 @@ int smb2_check_user_session(struct ksmbd_work *work)
/* Check for validity of user session */
work->sess = ksmbd_session_lookup_all(conn, sess_id);
- if (work->sess)
+ if (work->sess) {
+ ksmbd_user_session_get(work->sess);
return 1;
+ }
ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id);
return -ENOENT;
}
@@ -1740,6 +1742,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
}
conn->binding = true;
+ ksmbd_user_session_get(sess);
} else if ((conn->dialect < SMB30_PROT_ID ||
server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) &&
(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) {
@@ -1766,6 +1769,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
}
conn->binding = false;
+ ksmbd_user_session_get(sess);
}
work->sess = sess;
@@ -2228,7 +2232,9 @@ int smb2_session_logoff(struct ksmbd_work *work)
}
ksmbd_destroy_file_table(&sess->file_table);
+ down_write(&conn->session_lock);
sess->state = SMB2_SESSION_EXPIRED;
+ up_write(&conn->session_lock);
ksmbd_free_user(sess->user);
sess->user = NULL;
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 5b8d75e78ffb..75b4eb856d32 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -388,6 +388,10 @@ static struct smb_version_ops smb1_server_ops = {
.set_rsp_status = set_smb1_rsp_status,
};
+static struct smb_version_values smb1_server_values = {
+ .max_credits = SMB2_MAX_CREDITS,
+};
+
static int smb1_negotiate(struct ksmbd_work *work)
{
return ksmbd_smb_negotiate_common(work, SMB_COM_NEGOTIATE);
@@ -399,18 +403,18 @@ static struct smb_version_cmds smb1_server_cmds[1] = {
static int init_smb1_server(struct ksmbd_conn *conn)
{
+ conn->vals = &smb1_server_values;
conn->ops = &smb1_server_ops;
conn->cmds = smb1_server_cmds;
conn->max_cmds = ARRAY_SIZE(smb1_server_cmds);
return 0;
}
-int ksmbd_init_smb_server(struct ksmbd_work *work)
+int ksmbd_init_smb_server(struct ksmbd_conn *conn)
{
- struct ksmbd_conn *conn = work->conn;
__le32 proto;
- proto = *(__le32 *)((struct smb_hdr *)work->request_buf)->Protocol;
+ proto = *(__le32 *)((struct smb_hdr *)conn->request_buf)->Protocol;
if (conn->need_neg == false) {
if (proto == SMB1_PROTO_NUMBER)
return -EINVAL;
@@ -736,13 +740,15 @@ int __ksmbd_override_fsids(struct ksmbd_work *work,
struct ksmbd_share_config *share)
{
struct ksmbd_session *sess = work->sess;
+ struct ksmbd_user *user = sess->user;
struct cred *cred;
struct group_info *gi;
unsigned int uid;
unsigned int gid;
+ int i;
- uid = user_uid(sess->user);
- gid = user_gid(sess->user);
+ uid = user_uid(user);
+ gid = user_gid(user);
if (share->force_uid != KSMBD_SHARE_INVALID_UID)
uid = share->force_uid;
if (share->force_gid != KSMBD_SHARE_INVALID_GID)
@@ -755,11 +761,18 @@ int __ksmbd_override_fsids(struct ksmbd_work *work,
cred->fsuid = make_kuid(&init_user_ns, uid);
cred->fsgid = make_kgid(&init_user_ns, gid);
- gi = groups_alloc(0);
+ gi = groups_alloc(user->ngroups);
if (!gi) {
abort_creds(cred);
return -ENOMEM;
}
+
+ for (i = 0; i < user->ngroups; i++)
+ gi->gid[i] = make_kgid(&init_user_ns, user->sgid[i]);
+
+ if (user->ngroups)
+ groups_sort(gi);
+
set_groups(cred, gi);
put_group_info(gi);
diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h
index cc1d6dfe29d5..a3d8a905b07e 100644
--- a/fs/smb/server/smb_common.h
+++ b/fs/smb/server/smb_common.h
@@ -427,7 +427,7 @@ bool ksmbd_smb_request(struct ksmbd_conn *conn);
int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count);
-int ksmbd_init_smb_server(struct ksmbd_work *work);
+int ksmbd_init_smb_server(struct ksmbd_conn *conn);
struct ksmbd_kstat;
int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work,
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index 8752ac82c557..2f27afb695f6 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -120,6 +120,12 @@ static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = {
},
[KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE] = {
},
+ [KSMBD_EVENT_LOGIN_REQUEST_EXT] = {
+ .len = sizeof(struct ksmbd_login_request),
+ },
+ [KSMBD_EVENT_LOGIN_RESPONSE_EXT] = {
+ .len = sizeof(struct ksmbd_login_response_ext),
+ },
};
static struct genl_ops ksmbd_genl_ops[] = {
@@ -187,6 +193,14 @@ static struct genl_ops ksmbd_genl_ops[] = {
.cmd = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE,
.doit = handle_generic_event,
},
+ {
+ .cmd = KSMBD_EVENT_LOGIN_REQUEST_EXT,
+ .doit = handle_unsupported_event,
+ },
+ {
+ .cmd = KSMBD_EVENT_LOGIN_RESPONSE_EXT,
+ .doit = handle_generic_event,
+ },
};
static struct genl_family ksmbd_genl_family = {
@@ -198,7 +212,7 @@ static struct genl_family ksmbd_genl_family = {
.module = THIS_MODULE,
.ops = ksmbd_genl_ops,
.n_ops = ARRAY_SIZE(ksmbd_genl_ops),
- .resv_start_op = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE + 1,
+ .resv_start_op = KSMBD_EVENT_LOGIN_RESPONSE_EXT + 1,
};
static void ksmbd_nl_init_fixup(void)
@@ -459,16 +473,24 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry)
{
unsigned int msg_sz = entry->msg_sz;
- if (entry->type == KSMBD_EVENT_RPC_REQUEST) {
+ switch (entry->type) {
+ case KSMBD_EVENT_RPC_REQUEST:
+ {
struct ksmbd_rpc_command *resp = entry->response;
msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz;
- } else if (entry->type == KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST) {
+ break;
+ }
+ case KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST:
+ {
struct ksmbd_spnego_authen_response *resp = entry->response;
msg_sz = sizeof(struct ksmbd_spnego_authen_response) +
resp->session_key_len + resp->spnego_blob_len;
- } else if (entry->type == KSMBD_EVENT_SHARE_CONFIG_REQUEST) {
+ break;
+ }
+ case KSMBD_EVENT_SHARE_CONFIG_REQUEST:
+ {
struct ksmbd_share_config_response *resp = entry->response;
if (resp->payload_sz) {
@@ -478,6 +500,17 @@ static int ipc_validate_msg(struct ipc_msg_table_entry *entry)
msg_sz = sizeof(struct ksmbd_share_config_response) +
resp->payload_sz;
}
+ break;
+ }
+ case KSMBD_EVENT_LOGIN_REQUEST_EXT:
+ {
+ struct ksmbd_login_response_ext *resp = entry->response;
+
+ if (resp->ngroups) {
+ msg_sz = sizeof(struct ksmbd_login_response_ext) +
+ resp->ngroups * sizeof(gid_t);
+ }
+ }
}
return entry->msg_sz != msg_sz ? -EINVAL : 0;
@@ -560,6 +593,29 @@ struct ksmbd_login_response *ksmbd_ipc_login_request(const char *account)
return resp;
}
+struct ksmbd_login_response_ext *ksmbd_ipc_login_request_ext(const char *account)
+{
+ struct ksmbd_ipc_msg *msg;
+ struct ksmbd_login_request *req;
+ struct ksmbd_login_response_ext *resp;
+
+ if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ)
+ return NULL;
+
+ msg = ipc_msg_alloc(sizeof(struct ksmbd_login_request));
+ if (!msg)
+ return NULL;
+
+ msg->type = KSMBD_EVENT_LOGIN_REQUEST_EXT;
+ req = (struct ksmbd_login_request *)msg->payload;
+ req->handle = ksmbd_acquire_id(&ipc_ida);
+ strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ);
+ resp = ipc_msg_send_request(msg, req->handle);
+ ipc_msg_handle_free(req->handle);
+ ipc_msg_free(msg);
+ return resp;
+}
+
struct ksmbd_spnego_authen_response *
ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len)
{
diff --git a/fs/smb/server/transport_ipc.h b/fs/smb/server/transport_ipc.h
index 5e5b90a0c187..d9b6737f8cd0 100644
--- a/fs/smb/server/transport_ipc.h
+++ b/fs/smb/server/transport_ipc.h
@@ -12,6 +12,8 @@
struct ksmbd_login_response *
ksmbd_ipc_login_request(const char *account);
+struct ksmbd_login_response_ext *
+ksmbd_ipc_login_request_ext(const char *account);
struct ksmbd_session;
struct ksmbd_share_config;
diff --git a/fs/splice.c b/fs/splice.c
index 06232d7e505f..2898fa1e9e63 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1564,21 +1564,6 @@ static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
return ret;
}
-static int vmsplice_type(struct fd f, int *type)
-{
- if (!fd_file(f))
- return -EBADF;
- if (fd_file(f)->f_mode & FMODE_WRITE) {
- *type = ITER_SOURCE;
- } else if (fd_file(f)->f_mode & FMODE_READ) {
- *type = ITER_DEST;
- } else {
- fdput(f);
- return -EBADF;
- }
- return 0;
-}
-
/*
* Note that vmsplice only really supports true splicing _from_ user memory
* to a pipe, not the other way around. Splicing from user memory is a simple
@@ -1602,21 +1587,25 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t error;
- struct fd f;
int type;
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
- f = fdget(fd);
- error = vmsplice_type(f, &type);
- if (error)
- return error;
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
+ return -EBADF;
+ if (fd_file(f)->f_mode & FMODE_WRITE)
+ type = ITER_SOURCE;
+ else if (fd_file(f)->f_mode & FMODE_READ)
+ type = ITER_DEST;
+ else
+ return -EBADF;
error = import_iovec(type, uiov, nr_segs,
ARRAY_SIZE(iovstack), &iov, &iter);
if (error < 0)
- goto out_fdput;
+ return error;
if (!iov_iter_count(&iter))
error = 0;
@@ -1626,8 +1615,6 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, uiov,
error = vmsplice_to_user(fd_file(f), &iter, flags);
kfree(iov);
-out_fdput:
- fdput(f);
return error;
}
@@ -1635,27 +1622,22 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
int, fd_out, loff_t __user *, off_out,
size_t, len, unsigned int, flags)
{
- struct fd in, out;
- ssize_t error;
-
if (unlikely(!len))
return 0;
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
- error = -EBADF;
- in = fdget(fd_in);
- if (fd_file(in)) {
- out = fdget(fd_out);
- if (fd_file(out)) {
- error = __do_splice(fd_file(in), off_in, fd_file(out), off_out,
+ CLASS(fd, in)(fd_in);
+ if (fd_empty(in))
+ return -EBADF;
+
+ CLASS(fd, out)(fd_out);
+ if (fd_empty(out))
+ return -EBADF;
+
+ return __do_splice(fd_file(in), off_in, fd_file(out), off_out,
len, flags);
- fdput(out);
- }
- fdput(in);
- }
- return error;
}
/*
@@ -2005,25 +1987,19 @@ ssize_t do_tee(struct file *in, struct file *out, size_t len,
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
- struct fd in, out;
- ssize_t error;
-
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
if (unlikely(!len))
return 0;
- error = -EBADF;
- in = fdget(fdin);
- if (fd_file(in)) {
- out = fdget(fdout);
- if (fd_file(out)) {
- error = do_tee(fd_file(in), fd_file(out), len, flags);
- fdput(out);
- }
- fdput(in);
- }
+ CLASS(fd, in)(fdin);
+ if (fd_empty(in))
+ return -EBADF;
- return error;
+ CLASS(fd, out)(fdout);
+ if (fd_empty(out))
+ return -EBADF;
+
+ return do_tee(fd_file(in), fd_file(out), len, flags);
}
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index 22251743fadf..d19d4db74af8 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -30,7 +30,8 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
loff_t start_index = folio->index & ~mask;
loff_t end_index = start_index | mask;
- int i, n, pages, bytes, res = -ENOMEM;
+ loff_t index;
+ int i, pages, bytes, res = -ENOMEM;
struct page **page, *last_page;
struct squashfs_page_actor *actor;
void *pageaddr;
@@ -45,9 +46,9 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
return res;
/* Try to grab all the pages covered by the Squashfs block */
- for (i = 0, n = start_index; n <= end_index; n++) {
- page[i] = (n == folio->index) ? target_page :
- grab_cache_page_nowait(target_page->mapping, n);
+ for (i = 0, index = start_index; index <= end_index; index++) {
+ page[i] = (index == folio->index) ? target_page :
+ grab_cache_page_nowait(target_page->mapping, index);
if (page[i] == NULL)
continue;
diff --git a/fs/stat.c b/fs/stat.c
index 41e598376d7e..0870e969a8a0 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -23,10 +23,46 @@
#include <linux/uaccess.h>
#include <asm/unistd.h>
+#include <trace/events/timestamp.h>
+
#include "internal.h"
#include "mount.h"
/**
+ * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED
+ * @stat: where to store the resulting values
+ * @request_mask: STATX_* values requested
+ * @inode: inode from which to grab the c/mtime
+ *
+ * Given @inode, grab the ctime and mtime out of it and store the result
+ * in @stat. When fetching the value, flag it as QUERIED (if not already)
+ * so the next write will record a distinct timestamp.
+ *
+ * NB: The QUERIED flag is tracked in the ctime, but we set it there even
+ * if only the mtime was requested, as that ensures that the next mtime
+ * change will be distinct.
+ */
+void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode)
+{
+ atomic_t *pcn = (atomic_t *)&inode->i_ctime_nsec;
+
+ /* If neither time was requested, then don't report them */
+ if (!(request_mask & (STATX_CTIME|STATX_MTIME))) {
+ stat->result_mask &= ~(STATX_CTIME|STATX_MTIME);
+ return;
+ }
+
+ stat->mtime = inode_get_mtime(inode);
+ stat->ctime.tv_sec = inode->i_ctime_sec;
+ stat->ctime.tv_nsec = (u32)atomic_read(pcn);
+ if (!(stat->ctime.tv_nsec & I_CTIME_QUERIED))
+ stat->ctime.tv_nsec = ((u32)atomic_fetch_or(I_CTIME_QUERIED, pcn));
+ stat->ctime.tv_nsec &= ~I_CTIME_QUERIED;
+ trace_fill_mg_cmtime(inode, &stat->ctime, &stat->mtime);
+}
+EXPORT_SYMBOL(fill_mg_cmtime);
+
+/**
* generic_fillattr - Fill in the basic attributes from the inode struct
* @idmap: idmap of the mount the inode was found from
* @request_mask: statx request_mask
@@ -58,8 +94,14 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode_get_atime(inode);
- stat->mtime = inode_get_mtime(inode);
- stat->ctime = inode_get_ctime(inode);
+
+ if (is_mgtime(inode)) {
+ fill_mg_cmtime(stat, request_mask, inode);
+ } else {
+ stat->ctime = inode_get_ctime(inode);
+ stat->mtime = inode_get_mtime(inode);
+ }
+
stat->blksize = i_blocksize(inode);
stat->blocks = inode->i_blocks;
@@ -165,7 +207,7 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat,
if (inode->i_op->getattr)
return inode->i_op->getattr(idmap, path, stat,
request_mask,
- query_flags | AT_GETATTR_NOSEC);
+ query_flags);
generic_fillattr(idmap, request_mask, inode, stat);
return 0;
@@ -198,9 +240,6 @@ int vfs_getattr(const struct path *path, struct kstat *stat,
{
int retval;
- if (WARN_ON_ONCE(query_flags & AT_GETATTR_NOSEC))
- return -EPERM;
-
retval = security_inode_getattr(path);
if (retval)
return retval;
@@ -220,18 +259,13 @@ EXPORT_SYMBOL(vfs_getattr);
*/
int vfs_fstat(int fd, struct kstat *stat)
{
- struct fd f;
- int error;
-
- f = fdget_raw(fd);
- if (!fd_file(f))
+ CLASS(fd_raw, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- error = vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0);
- fdput(f);
- return error;
+ return vfs_getattr(&fd_file(f)->f_path, stat, STATX_BASIC_STATS, 0);
}
-int getname_statx_lookup_flags(int flags)
+static int statx_lookup_flags(int flags)
{
int lookup_flags = 0;
@@ -239,8 +273,6 @@ int getname_statx_lookup_flags(int flags)
lookup_flags |= LOOKUP_FOLLOW;
if (!(flags & AT_NO_AUTOMOUNT))
lookup_flags |= LOOKUP_AUTOMOUNT;
- if (flags & AT_EMPTY_PATH)
- lookup_flags |= LOOKUP_EMPTY;
return lookup_flags;
}
@@ -277,7 +309,7 @@ static int vfs_statx_fd(int fd, int flags, struct kstat *stat,
u32 request_mask)
{
CLASS(fd_raw, f)(fd);
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
return vfs_statx_path(&fd_file(f)->f_path, flags, stat, request_mask);
}
@@ -301,7 +333,7 @@ static int vfs_statx(int dfd, struct filename *filename, int flags,
struct kstat *stat, u32 request_mask)
{
struct path path;
- unsigned int lookup_flags = getname_statx_lookup_flags(flags);
+ unsigned int lookup_flags = statx_lookup_flags(flags);
int error;
if (flags & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT | AT_EMPTY_PATH |
@@ -326,18 +358,11 @@ int vfs_fstatat(int dfd, const char __user *filename,
{
int ret;
int statx_flags = flags | AT_NO_AUTOMOUNT;
- struct filename *name;
+ struct filename *name = getname_maybe_null(filename, flags);
- /*
- * Work around glibc turning fstat() into fstatat(AT_EMPTY_PATH)
- *
- * If AT_EMPTY_PATH is set, we expect the common case to be that
- * empty path, and avoid doing all the extra pathname work.
- */
- if (flags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ if (!name && dfd >= 0)
return vfs_fstat(dfd, stat);
- name = getname_flags(filename, getname_statx_lookup_flags(statx_flags));
ret = vfs_statx(dfd, name, statx_flags, stat, STATX_BASIC_STATS);
putname(name);
@@ -774,24 +799,11 @@ SYSCALL_DEFINE5(statx,
struct statx __user *, buffer)
{
int ret;
- unsigned lflags;
- struct filename *name;
+ struct filename *name = getname_maybe_null(filename, flags);
- /*
- * Short-circuit handling of NULL and "" paths.
- *
- * For a NULL path we require and accept only the AT_EMPTY_PATH flag
- * (possibly |'d with AT_STATX flags).
- *
- * However, glibc on 32-bit architectures implements fstatat as statx
- * with the "" pathname and AT_NO_AUTOMOUNT | AT_EMPTY_PATH flags.
- * Supporting this results in the uglification below.
- */
- lflags = flags & ~(AT_NO_AUTOMOUNT | AT_STATX_SYNC_TYPE);
- if (lflags == AT_EMPTY_PATH && vfs_empty_path(dfd, filename))
+ if (!name && dfd >= 0)
return do_statx_fd(dfd, flags & ~AT_NO_AUTOMOUNT, mask, buffer);
- name = getname_flags(filename, getname_statx_lookup_flags(flags));
ret = do_statx(dfd, name, flags, mask, buffer);
putname(name);
diff --git a/fs/statfs.c b/fs/statfs.c
index 9c7bb27e7932..a45ac85e6048 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -114,13 +114,11 @@ retry:
int fd_statfs(int fd, struct kstatfs *st)
{
- struct fd f = fdget_raw(fd);
- int error = -EBADF;
- if (fd_file(f)) {
- error = vfs_statfs(&fd_file(f)->f_path, st);
- fdput(f);
- }
- return error;
+ CLASS(fd_raw, f)(fd);
+
+ if (fd_empty(f))
+ return -EBADF;
+ return vfs_statfs(&fd_file(f)->f_path, st);
}
static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
diff --git a/fs/super.c b/fs/super.c
index 1db230432960..c9c7223bc2a2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1596,13 +1596,14 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
EXPORT_SYMBOL_GPL(setup_bdev_super);
/**
- * get_tree_bdev - Get a superblock based on a single block device
+ * get_tree_bdev_flags - Get a superblock based on a single block device
* @fc: The filesystem context holding the parameters
* @fill_super: Helper to initialise a new superblock
+ * @flags: GET_TREE_BDEV_* flags
*/
-int get_tree_bdev(struct fs_context *fc,
- int (*fill_super)(struct super_block *,
- struct fs_context *))
+int get_tree_bdev_flags(struct fs_context *fc,
+ int (*fill_super)(struct super_block *sb,
+ struct fs_context *fc), unsigned int flags)
{
struct super_block *s;
int error = 0;
@@ -1613,10 +1614,10 @@ int get_tree_bdev(struct fs_context *fc,
error = lookup_bdev(fc->source, &dev);
if (error) {
- errorf(fc, "%s: Can't lookup blockdev", fc->source);
+ if (!(flags & GET_TREE_BDEV_QUIET_LOOKUP))
+ errorf(fc, "%s: Can't lookup blockdev", fc->source);
return error;
}
-
fc->sb_flags |= SB_NOSEC;
s = sget_dev(fc, dev);
if (IS_ERR(s))
@@ -1644,6 +1645,19 @@ int get_tree_bdev(struct fs_context *fc,
fc->root = dget(s->s_root);
return 0;
}
+EXPORT_SYMBOL_GPL(get_tree_bdev_flags);
+
+/**
+ * get_tree_bdev - Get a superblock based on a single block device
+ * @fc: The filesystem context holding the parameters
+ * @fill_super: Helper to initialise a new superblock
+ */
+int get_tree_bdev(struct fs_context *fc,
+ int (*fill_super)(struct super_block *,
+ struct fs_context *))
+{
+ return get_tree_bdev_flags(fc, fill_super, 0);
+}
EXPORT_SYMBOL(get_tree_bdev);
static int test_bdev_super(struct super_block *s, void *data)
diff --git a/fs/sync.c b/fs/sync.c
index 67df255eb189..2955cd4c77a3 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -148,11 +148,11 @@ void emergency_sync(void)
*/
SYSCALL_DEFINE1(syncfs, int, fd)
{
- struct fd f = fdget(fd);
+ CLASS(fd, f)(fd);
struct super_block *sb;
int ret, ret2;
- if (!fd_file(f))
+ if (fd_empty(f))
return -EBADF;
sb = fd_file(f)->f_path.dentry->d_sb;
@@ -162,7 +162,6 @@ SYSCALL_DEFINE1(syncfs, int, fd)
ret2 = errseq_check_and_advance(&sb->s_wb_err, &fd_file(f)->f_sb_err);
- fdput(f);
return ret ? ret : ret2;
}
@@ -205,14 +204,12 @@ EXPORT_SYMBOL(vfs_fsync);
static int do_fsync(unsigned int fd, int datasync)
{
- struct fd f = fdget(fd);
- int ret = -EBADF;
+ CLASS(fd, f)(fd);
- if (fd_file(f)) {
- ret = vfs_fsync(fd_file(f), datasync);
- fdput(f);
- }
- return ret;
+ if (fd_empty(f))
+ return -EBADF;
+
+ return vfs_fsync(fd_file(f), datasync);
}
SYSCALL_DEFINE1(fsync, unsigned int, fd)
@@ -355,16 +352,12 @@ out:
int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
unsigned int flags)
{
- int ret;
- struct fd f;
+ CLASS(fd, f)(fd);
- ret = -EBADF;
- f = fdget(fd);
- if (fd_file(f))
- ret = sync_file_range(fd_file(f), offset, nbytes, flags);
+ if (fd_empty(f))
+ return -EBADF;
- fdput(f);
- return ret;
+ return sync_file_range(fd_file(f), offset, nbytes, flags);
}
SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 137523e0bb21..4c32244b0508 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -394,19 +394,6 @@ static const struct file_operations timerfd_fops = {
.unlocked_ioctl = timerfd_ioctl,
};
-static int timerfd_fget(int fd, struct fd *p)
-{
- struct fd f = fdget(fd);
- if (!fd_file(f))
- return -EBADF;
- if (fd_file(f)->f_op != &timerfd_fops) {
- fdput(f);
- return -EINVAL;
- }
- *p = f;
- return 0;
-}
-
SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
{
int ufd;
@@ -471,7 +458,6 @@ static int do_timerfd_settime(int ufd, int flags,
const struct itimerspec64 *new,
struct itimerspec64 *old)
{
- struct fd f;
struct timerfd_ctx *ctx;
int ret;
@@ -479,15 +465,17 @@ static int do_timerfd_settime(int ufd, int flags,
!itimerspec64_valid(new))
return -EINVAL;
- ret = timerfd_fget(ufd, &f);
- if (ret)
- return ret;
+ CLASS(fd, f)(ufd);
+ if (fd_empty(f))
+ return -EBADF;
+
+ if (fd_file(f)->f_op != &timerfd_fops)
+ return -EINVAL;
+
ctx = fd_file(f)->private_data;
- if (isalarm(ctx) && !capable(CAP_WAKE_ALARM)) {
- fdput(f);
+ if (isalarm(ctx) && !capable(CAP_WAKE_ALARM))
return -EPERM;
- }
timerfd_setup_cancel(ctx, flags);
@@ -535,17 +523,18 @@ static int do_timerfd_settime(int ufd, int flags,
ret = timerfd_setup(ctx, flags, new);
spin_unlock_irq(&ctx->wqh.lock);
- fdput(f);
return ret;
}
static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
{
- struct fd f;
struct timerfd_ctx *ctx;
- int ret = timerfd_fget(ufd, &f);
- if (ret)
- return ret;
+ CLASS(fd, f)(ufd);
+
+ if (fd_empty(f))
+ return -EBADF;
+ if (fd_file(f)->f_op != &timerfd_fops)
+ return -EINVAL;
ctx = fd_file(f)->private_data;
spin_lock_irq(&ctx->wqh.lock);
@@ -567,7 +556,6 @@ static int do_timerfd_gettime(int ufd, struct itimerspec64 *t)
t->it_value = ktime_to_timespec64(timerfd_get_remaining(ctx));
t->it_interval = ktime_to_timespec64(ctx->tintv);
spin_unlock_irq(&ctx->wqh.lock);
- fdput(f);
return 0;
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 1748dff58c3b..cfc614c638da 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -392,6 +392,9 @@ static int tracefs_reconfigure(struct fs_context *fc)
struct tracefs_fs_info *sb_opts = sb->s_fs_info;
struct tracefs_fs_info *new_opts = fc->s_fs_info;
+ if (!new_opts)
+ return 0;
+
sync_filesystem(sb);
/* structure copy of new mount options to sb */
*sb_opts = *new_opts;
@@ -478,14 +481,17 @@ static int tracefs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_op = &tracefs_super_operations;
sb->s_d_op = &tracefs_dentry_operations;
- tracefs_apply_options(sb, false);
-
return 0;
}
static int tracefs_get_tree(struct fs_context *fc)
{
- return get_tree_single(fc, tracefs_fill_super);
+ int err = get_tree_single(fc, tracefs_fill_super);
+
+ if (err)
+ return err;
+
+ return tracefs_reconfigure(fc);
}
static void tracefs_free_fc(struct fs_context *fc)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 291583005dd1..3fb308b6e167 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -19,9 +19,9 @@
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/kthread.h>
-#include <linux/parser.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/seq_file.h>
-#include <linux/mount.h>
#include <linux/math64.h>
#include <linux/writeback.h>
#include "ubifs.h"
@@ -981,177 +981,120 @@ enum {
Opt_auth_key,
Opt_auth_hash_name,
Opt_ignore,
- Opt_err,
};
-static const match_table_t tokens = {
- {Opt_fast_unmount, "fast_unmount"},
- {Opt_norm_unmount, "norm_unmount"},
- {Opt_bulk_read, "bulk_read"},
- {Opt_no_bulk_read, "no_bulk_read"},
- {Opt_chk_data_crc, "chk_data_crc"},
- {Opt_no_chk_data_crc, "no_chk_data_crc"},
- {Opt_override_compr, "compr=%s"},
- {Opt_auth_key, "auth_key=%s"},
- {Opt_auth_hash_name, "auth_hash_name=%s"},
- {Opt_ignore, "ubi=%s"},
- {Opt_ignore, "vol=%s"},
- {Opt_assert, "assert=%s"},
- {Opt_err, NULL},
+static const struct constant_table ubifs_param_compr[] = {
+ { "none", UBIFS_COMPR_NONE },
+ { "lzo", UBIFS_COMPR_LZO },
+ { "zlib", UBIFS_COMPR_ZLIB },
+ { "zstd", UBIFS_COMPR_ZSTD },
+ {}
};
-/**
- * parse_standard_option - parse a standard mount option.
- * @option: the option to parse
- *
- * Normally, standard mount options like "sync" are passed to file-systems as
- * flags. However, when a "rootflags=" kernel boot parameter is used, they may
- * be present in the options string. This function tries to deal with this
- * situation and parse standard options. Returns 0 if the option was not
- * recognized, and the corresponding integer flag if it was.
- *
- * UBIFS is only interested in the "sync" option, so do not check for anything
- * else.
- */
-static int parse_standard_option(const char *option)
-{
+static const struct constant_table ubifs_param_assert[] = {
+ { "report", ASSACT_REPORT },
+ { "read-only", ASSACT_RO },
+ { "panic", ASSACT_PANIC },
+ {}
+};
- pr_notice("UBIFS: parse %s\n", option);
- if (!strcmp(option, "sync"))
- return SB_SYNCHRONOUS;
- return 0;
-}
+static const struct fs_parameter_spec ubifs_fs_param_spec[] = {
+ fsparam_flag ("fast_unmount", Opt_fast_unmount),
+ fsparam_flag ("norm_unmount", Opt_norm_unmount),
+ fsparam_flag ("bulk_read", Opt_bulk_read),
+ fsparam_flag ("no_bulk_read", Opt_no_bulk_read),
+ fsparam_flag ("chk_data_crc", Opt_chk_data_crc),
+ fsparam_flag ("no_chk_data_crc", Opt_no_chk_data_crc),
+ fsparam_enum ("compr", Opt_override_compr, ubifs_param_compr),
+ fsparam_enum ("assert", Opt_assert, ubifs_param_assert),
+ fsparam_string ("auth_key", Opt_auth_key),
+ fsparam_string ("auth_hash_name", Opt_auth_hash_name),
+ fsparam_string ("ubi", Opt_ignore),
+ fsparam_string ("vol", Opt_ignore),
+ {}
+};
+
+struct ubifs_fs_context {
+ struct ubifs_mount_opts mount_opts;
+ char *auth_key_name;
+ char *auth_hash_name;
+ unsigned int no_chk_data_crc:1;
+ unsigned int bulk_read:1;
+ unsigned int default_compr:2;
+ unsigned int assert_action:2;
+};
/**
- * ubifs_parse_options - parse mount parameters.
- * @c: UBIFS file-system description object
- * @options: parameters to parse
- * @is_remount: non-zero if this is FS re-mount
+ * ubifs_parse_param - parse a parameter.
+ * @fc: the filesystem context
+ * @param: the parameter to parse
*
* This function parses UBIFS mount options and returns zero in case success
* and a negative error code in case of failure.
*/
-static int ubifs_parse_options(struct ubifs_info *c, char *options,
- int is_remount)
+static int ubifs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *p;
- substring_t args[MAX_OPT_ARGS];
-
- if (!options)
- return 0;
+ struct ubifs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ bool is_remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE);
+ int opt;
- while ((p = strsep(&options, ","))) {
- int token;
+ opt = fs_parse(fc, ubifs_fs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
+ switch (opt) {
/*
* %Opt_fast_unmount and %Opt_norm_unmount options are ignored.
* We accept them in order to be backward-compatible. But this
* should be removed at some point.
*/
- case Opt_fast_unmount:
- c->mount_opts.unmount_mode = 2;
- break;
- case Opt_norm_unmount:
- c->mount_opts.unmount_mode = 1;
- break;
- case Opt_bulk_read:
- c->mount_opts.bulk_read = 2;
- c->bulk_read = 1;
- break;
- case Opt_no_bulk_read:
- c->mount_opts.bulk_read = 1;
- c->bulk_read = 0;
- break;
- case Opt_chk_data_crc:
- c->mount_opts.chk_data_crc = 2;
- c->no_chk_data_crc = 0;
- break;
- case Opt_no_chk_data_crc:
- c->mount_opts.chk_data_crc = 1;
- c->no_chk_data_crc = 1;
- break;
- case Opt_override_compr:
- {
- char *name = match_strdup(&args[0]);
-
- if (!name)
- return -ENOMEM;
- if (!strcmp(name, "none"))
- c->mount_opts.compr_type = UBIFS_COMPR_NONE;
- else if (!strcmp(name, "lzo"))
- c->mount_opts.compr_type = UBIFS_COMPR_LZO;
- else if (!strcmp(name, "zlib"))
- c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
- else if (!strcmp(name, "zstd"))
- c->mount_opts.compr_type = UBIFS_COMPR_ZSTD;
- else {
- ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready?
- kfree(name);
- return -EINVAL;
- }
- kfree(name);
- c->mount_opts.override_compr = 1;
- c->default_compr = c->mount_opts.compr_type;
- break;
- }
- case Opt_assert:
- {
- char *act = match_strdup(&args[0]);
-
- if (!act)
- return -ENOMEM;
- if (!strcmp(act, "report"))
- c->assert_action = ASSACT_REPORT;
- else if (!strcmp(act, "read-only"))
- c->assert_action = ASSACT_RO;
- else if (!strcmp(act, "panic"))
- c->assert_action = ASSACT_PANIC;
- else {
- ubifs_err(c, "unknown assert action \"%s\"", act);
- kfree(act);
- return -EINVAL;
- }
- kfree(act);
- break;
- }
- case Opt_auth_key:
- if (!is_remount) {
- c->auth_key_name = kstrdup(args[0].from,
- GFP_KERNEL);
- if (!c->auth_key_name)
- return -ENOMEM;
- }
- break;
- case Opt_auth_hash_name:
- if (!is_remount) {
- c->auth_hash_name = kstrdup(args[0].from,
- GFP_KERNEL);
- if (!c->auth_hash_name)
- return -ENOMEM;
- }
- break;
- case Opt_ignore:
- break;
- default:
- {
- unsigned long flag;
- struct super_block *sb = c->vfs_sb;
-
- flag = parse_standard_option(p);
- if (!flag) {
- ubifs_err(c, "unrecognized mount option \"%s\" or missing value",
- p);
- return -EINVAL;
- }
- sb->s_flags |= flag;
- break;
+ case Opt_fast_unmount:
+ ctx->mount_opts.unmount_mode = 2;
+ break;
+ case Opt_norm_unmount:
+ ctx->mount_opts.unmount_mode = 1;
+ break;
+ case Opt_bulk_read:
+ ctx->mount_opts.bulk_read = 2;
+ ctx->bulk_read = 1;
+ break;
+ case Opt_no_bulk_read:
+ ctx->mount_opts.bulk_read = 1;
+ ctx->bulk_read = 0;
+ break;
+ case Opt_chk_data_crc:
+ ctx->mount_opts.chk_data_crc = 2;
+ ctx->no_chk_data_crc = 0;
+ break;
+ case Opt_no_chk_data_crc:
+ ctx->mount_opts.chk_data_crc = 1;
+ ctx->no_chk_data_crc = 1;
+ break;
+ case Opt_override_compr:
+ ctx->mount_opts.compr_type = result.uint_32;
+ ctx->mount_opts.override_compr = 1;
+ ctx->default_compr = ctx->mount_opts.compr_type;
+ break;
+ case Opt_assert:
+ ctx->assert_action = result.uint_32;
+ break;
+ case Opt_auth_key:
+ if (!is_remount) {
+ kfree(ctx->auth_key_name);
+ ctx->auth_key_name = param->string;
+ param->string = NULL;
}
+ break;
+ case Opt_auth_hash_name:
+ if (!is_remount) {
+ kfree(ctx->auth_hash_name);
+ ctx->auth_hash_name = param->string;
+ param->string = NULL;
}
+ break;
+ case Opt_ignore:
+ break;
}
return 0;
@@ -2003,21 +1946,27 @@ static void ubifs_put_super(struct super_block *sb)
mutex_unlock(&c->umount_mutex);
}
-static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
+static int ubifs_reconfigure(struct fs_context *fc)
{
+ struct ubifs_fs_context *ctx = fc->fs_private;
+ struct super_block *sb = fc->root->d_sb;
int err;
struct ubifs_info *c = sb->s_fs_info;
sync_filesystem(sb);
- dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, *flags);
+ dbg_gen("old flags %#lx, new flags %#x", sb->s_flags, fc->sb_flags);
- err = ubifs_parse_options(c, data, 1);
- if (err) {
- ubifs_err(c, "invalid or unknown remount parameter");
- return err;
- }
+ /*
+ * Apply the mount option changes.
+ * auth_key_name and auth_hash_name are ignored on remount.
+ */
+ c->mount_opts = ctx->mount_opts;
+ c->bulk_read = ctx->bulk_read;
+ c->no_chk_data_crc = ctx->no_chk_data_crc;
+ c->default_compr = ctx->default_compr;
+ c->assert_action = ctx->assert_action;
- if (c->ro_mount && !(*flags & SB_RDONLY)) {
+ if (c->ro_mount && !(fc->sb_flags & SB_RDONLY)) {
if (c->ro_error) {
ubifs_msg(c, "cannot re-mount R/W due to prior errors");
return -EROFS;
@@ -2029,7 +1978,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
err = ubifs_remount_rw(c);
if (err)
return err;
- } else if (!c->ro_mount && (*flags & SB_RDONLY)) {
+ } else if (!c->ro_mount && (fc->sb_flags & SB_RDONLY)) {
if (c->ro_error) {
ubifs_msg(c, "cannot re-mount R/O due to prior errors");
return -EROFS;
@@ -2062,14 +2011,13 @@ const struct super_operations ubifs_super_operations = {
.evict_inode = ubifs_evict_inode,
.statfs = ubifs_statfs,
.dirty_inode = ubifs_dirty_inode,
- .remount_fs = ubifs_remount_fs,
.show_options = ubifs_show_options,
.sync_fs = ubifs_sync_fs,
};
/**
* open_ubi - parse UBI device name string and open the UBI device.
- * @name: UBI volume name
+ * @fc: The filesystem context
* @mode: UBI volume open mode
*
* The primary method of mounting UBIFS is by specifying the UBI volume
@@ -2086,15 +2034,13 @@ const struct super_operations ubifs_super_operations = {
* returns UBI volume description object in case of success and a negative
* error code in case of failure.
*/
-static struct ubi_volume_desc *open_ubi(const char *name, int mode)
+static struct ubi_volume_desc *open_ubi(struct fs_context *fc, int mode)
{
struct ubi_volume_desc *ubi;
+ const char *name = fc->source;
int dev, vol;
char *endptr;
- if (!name || !*name)
- return ERR_PTR(-EINVAL);
-
/* First, try to open using the device node path method */
ubi = ubi_open_volume_path(name, mode);
if (!IS_ERR(ubi))
@@ -2102,14 +2048,14 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
/* Try the "nodev" method */
if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
- return ERR_PTR(-EINVAL);
+ goto invalid_source;
/* ubi:NAME method */
if ((name[3] == ':' || name[3] == '!') && name[4] != '\0')
return ubi_open_volume_nm(0, name + 4, mode);
if (!isdigit(name[3]))
- return ERR_PTR(-EINVAL);
+ goto invalid_source;
dev = simple_strtoul(name + 3, &endptr, 0);
@@ -2121,7 +2067,7 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
if (*endptr == '_' && isdigit(endptr[1])) {
vol = simple_strtoul(endptr + 1, &endptr, 0);
if (*endptr != '\0')
- return ERR_PTR(-EINVAL);
+ goto invalid_source;
return ubi_open_volume(dev, vol, mode);
}
@@ -2129,7 +2075,8 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
if ((*endptr == ':' || *endptr == '!') && endptr[1] != '\0')
return ubi_open_volume_nm(dev, ++endptr, mode);
- return ERR_PTR(-EINVAL);
+invalid_source:
+ return ERR_PTR(invalf(fc, "Invalid source name"));
}
static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
@@ -2181,9 +2128,10 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
return c;
}
-static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
+static int ubifs_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct ubifs_info *c = sb->s_fs_info;
+ struct ubifs_fs_context *ctx = fc->fs_private;
struct inode *root;
int err;
@@ -2195,9 +2143,18 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out;
}
- err = ubifs_parse_options(c, data, 0);
- if (err)
- goto out_close;
+ /* Copy in parsed mount options */
+ c->mount_opts = ctx->mount_opts;
+ c->auth_key_name = ctx->auth_key_name;
+ c->auth_hash_name = ctx->auth_hash_name;
+ c->no_chk_data_crc = ctx->no_chk_data_crc;
+ c->bulk_read = ctx->bulk_read;
+ c->default_compr = ctx->default_compr;
+ c->assert_action = ctx->assert_action;
+
+ /* ubifs_info owns auth strings now */
+ ctx->auth_key_name = NULL;
+ ctx->auth_hash_name = NULL;
/*
* UBIFS provides 'backing_dev_info' in order to disable read-ahead. For
@@ -2264,41 +2221,38 @@ out:
return err;
}
-static int sb_test(struct super_block *sb, void *data)
+static int sb_test(struct super_block *sb, struct fs_context *fc)
{
- struct ubifs_info *c1 = data;
+ struct ubifs_info *c1 = fc->s_fs_info;
struct ubifs_info *c = sb->s_fs_info;
return c->vi.cdev == c1->vi.cdev;
}
-static int sb_set(struct super_block *sb, void *data)
-{
- sb->s_fs_info = data;
- return set_anon_super(sb, NULL);
-}
-
-static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
- const char *name, void *data)
+static int ubifs_get_tree(struct fs_context *fc)
{
struct ubi_volume_desc *ubi;
struct ubifs_info *c;
struct super_block *sb;
int err;
- dbg_gen("name %s, flags %#x", name, flags);
+ if (!fc->source || !*fc->source)
+ return invalf(fc, "No source specified");
+
+ dbg_gen("name %s, flags %#x", fc->source, fc->sb_flags);
/*
* Get UBI device number and volume ID. Mount it read-only so far
* because this might be a new mount point, and UBI allows only one
* read-write user at a time.
*/
- ubi = open_ubi(name, UBI_READONLY);
+ ubi = open_ubi(fc, UBI_READONLY);
if (IS_ERR(ubi)) {
- if (!(flags & SB_SILENT))
+ err = PTR_ERR(ubi);
+ if (!(fc->sb_flags & SB_SILENT))
pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
- current->pid, name, (int)PTR_ERR(ubi));
- return ERR_CAST(ubi);
+ current->pid, fc->source, err);
+ return err;
}
c = alloc_ubifs_info(ubi);
@@ -2306,10 +2260,11 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
err = -ENOMEM;
goto out_close;
}
+ fc->s_fs_info = c;
dbg_gen("opened ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
- sb = sget(fs_type, sb_test, sb_set, flags, c);
+ sb = sget_fc(fc, sb_test, set_anon_super_fc);
if (IS_ERR(sb)) {
err = PTR_ERR(sb);
kfree(c);
@@ -2321,12 +2276,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
kfree(c);
/* A new mount point for already mounted UBIFS */
dbg_gen("this ubi volume is already mounted");
- if (!!(flags & SB_RDONLY) != c1->ro_mount) {
+ if (!!(fc->sb_flags & SB_RDONLY) != c1->ro_mount) {
err = -EBUSY;
goto out_deact;
}
} else {
- err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0);
+ err = ubifs_fill_super(sb, fc);
if (err)
goto out_deact;
/* We do not support atime */
@@ -2340,13 +2295,14 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
/* 'fill_super()' opens ubi again so we must close it here */
ubi_close_volume(ubi);
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+ return 0;
out_deact:
deactivate_locked_super(sb);
out_close:
ubi_close_volume(ubi);
- return ERR_PTR(err);
+ return err;
}
static void kill_ubifs_super(struct super_block *s)
@@ -2356,10 +2312,61 @@ static void kill_ubifs_super(struct super_block *s)
kfree(c);
}
+static void ubifs_free_fc(struct fs_context *fc)
+{
+ struct ubifs_fs_context *ctx = fc->fs_private;
+
+ if (ctx) {
+ kfree(ctx->auth_key_name);
+ kfree(ctx->auth_hash_name);
+ kfree(ctx);
+ }
+}
+
+static const struct fs_context_operations ubifs_context_ops = {
+ .free = ubifs_free_fc,
+ .parse_param = ubifs_parse_param,
+ .get_tree = ubifs_get_tree,
+ .reconfigure = ubifs_reconfigure,
+};
+
+static int ubifs_init_fs_context(struct fs_context *fc)
+{
+ struct ubifs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct ubifs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) {
+ /* Initialize for first mount */
+ ctx->no_chk_data_crc = 1;
+ ctx->assert_action = ASSACT_RO;
+ } else {
+ struct ubifs_info *c = fc->root->d_sb->s_fs_info;
+
+ /*
+ * Preserve existing options across remounts.
+ * auth_key_name and auth_hash_name are not remountable.
+ */
+ ctx->mount_opts = c->mount_opts;
+ ctx->bulk_read = c->bulk_read;
+ ctx->no_chk_data_crc = c->no_chk_data_crc;
+ ctx->default_compr = c->default_compr;
+ ctx->assert_action = c->assert_action;
+ }
+
+ fc->ops = &ubifs_context_ops;
+ fc->fs_private = ctx;
+
+ return 0;
+}
+
static struct file_system_type ubifs_fs_type = {
.name = "ubifs",
.owner = THIS_MODULE,
- .mount = ubifs_mount,
+ .init_fs_context = ubifs_init_fs_context,
+ .parameters = ubifs_fs_param_spec,
.kill_sb = kill_ubifs_super,
};
MODULE_ALIAS_FS("ubifs");
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 53c11be2b2c1..194ed3ab945e 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -33,6 +33,29 @@ static u64 ufs_bitmap_search (struct super_block *, struct ufs_cg_private_info *
static unsigned char ufs_fragtable_8fpb[], ufs_fragtable_other[];
static void ufs_clusteracct(struct super_block *, struct ufs_cg_private_info *, unsigned, int);
+static void adjust_free_blocks(struct super_block *sb,
+ struct ufs_cylinder_group *ucg,
+ struct ufs_cg_private_info *ucpi,
+ unsigned fragment, int delta)
+{
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+
+ if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
+ ufs_clusteracct(sb, ucpi, fragment, delta);
+
+ fs32_add(sb, &ucg->cg_cs.cs_nbfree, delta);
+ uspi->cs_total.cs_nbfree += delta;
+ fs32_add(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, delta);
+
+ if (uspi->fs_magic != UFS2_MAGIC) {
+ unsigned cylno = ufs_cbtocylno(fragment);
+
+ fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
+ ufs_cbtorpos(fragment)), delta);
+ fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), delta);
+ }
+}
+
/*
* Free 'count' fragments from fragment number 'fragment'
*/
@@ -43,7 +66,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned cgno, bit, end_bit, bbase, blkmap, i;
- u64 blkno;
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
@@ -51,7 +73,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
UFSD("ENTER, fragment %llu, count %u\n",
(unsigned long long)fragment, count);
- if (ufs_fragnum(fragment) + count > uspi->s_fpg)
+ if (ufs_fragnum(fragment) + count > uspi->s_fpb)
ufs_error (sb, "ufs_free_fragments", "internal error");
mutex_lock(&UFS_SB(sb)->s_lock);
@@ -94,23 +116,11 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
/*
* Trying to reassemble free fragments into block
*/
- blkno = ufs_fragstoblks (bbase);
- if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
+ if (ubh_isblockset(uspi, ucpi, bbase)) {
fs32_sub(sb, &ucg->cg_cs.cs_nffree, uspi->s_fpb);
uspi->cs_total.cs_nffree -= uspi->s_fpb;
fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, uspi->s_fpb);
- if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
- ufs_clusteracct (sb, ucpi, blkno, 1);
- fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
- uspi->cs_total.cs_nbfree++;
- fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
- if (uspi->fs_magic != UFS2_MAGIC) {
- unsigned cylno = ufs_cbtocylno (bbase);
-
- fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
- ufs_cbtorpos(bbase)), 1);
- fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
- }
+ adjust_free_blocks(sb, ucg, ucpi, bbase, 1);
}
ubh_mark_buffer_dirty (USPI_UBH(uspi));
@@ -139,7 +149,6 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count)
struct ufs_cg_private_info * ucpi;
struct ufs_cylinder_group * ucg;
unsigned overflow, cgno, bit, end_bit, i;
- u64 blkno;
sb = inode->i_sb;
uspi = UFS_SB(sb)->s_uspi;
@@ -181,26 +190,12 @@ do_more:
}
for (i = bit; i < end_bit; i += uspi->s_fpb) {
- blkno = ufs_fragstoblks(i);
- if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno)) {
+ if (ubh_isblockset(uspi, ucpi, i)) {
ufs_error(sb, "ufs_free_blocks", "freeing free fragment");
}
- ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
+ ubh_setblock(uspi, ucpi, i);
inode_sub_bytes(inode, uspi->s_fpb << uspi->s_fshift);
- if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
- ufs_clusteracct (sb, ucpi, blkno, 1);
-
- fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
- uspi->cs_total.cs_nbfree++;
- fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nbfree, 1);
-
- if (uspi->fs_magic != UFS2_MAGIC) {
- unsigned cylno = ufs_cbtocylno(i);
-
- fs16_add(sb, &ubh_cg_blks(ucpi, cylno,
- ufs_cbtorpos(i)), 1);
- fs32_add(sb, &ubh_cg_blktot(ucpi, cylno), 1);
- }
+ adjust_free_blocks(sb, ucg, ucpi, i, 1);
}
ubh_mark_buffer_dirty (USPI_UBH(uspi));
@@ -234,13 +229,13 @@ failed:
* situated at the end of file.
*
* We can come here from ufs_writepage or ufs_prepare_write,
- * locked_page is argument of these functions, so we already lock it.
+ * locked_folio is argument of these functions, so we already lock it.
*/
static void ufs_change_blocknr(struct inode *inode, sector_t beg,
unsigned int count, sector_t oldb,
- sector_t newb, struct page *locked_page)
+ sector_t newb, struct folio *locked_folio)
{
- struct folio *folio, *locked_folio = page_folio(locked_page);
+ struct folio *folio;
const unsigned blks_per_page =
1 << (PAGE_SHIFT - inode->i_blkbits);
const unsigned mask = blks_per_page - 1;
@@ -337,7 +332,7 @@ static void ufs_clear_frags(struct inode *inode, sector_t beg, unsigned int n,
u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
u64 goal, unsigned count, int *err,
- struct page *locked_page)
+ struct folio *locked_folio)
{
struct super_block * sb;
struct ufs_sb_private_info * uspi;
@@ -417,7 +412,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
result = ufs_alloc_fragments (inode, cgno, goal, count, err);
if (result) {
ufs_clear_frags(inode, result + oldcount,
- newcount - oldcount, locked_page != NULL);
+ newcount - oldcount, locked_folio != NULL);
*err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
@@ -441,7 +436,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
fragment + count);
read_sequnlock_excl(&UFS_I(inode)->meta_lock);
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
- locked_page != NULL);
+ locked_folio != NULL);
mutex_unlock(&UFS_SB(sb)->s_lock);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
return result;
@@ -462,11 +457,11 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment,
result = ufs_alloc_fragments (inode, cgno, goal, request, err);
if (result) {
ufs_clear_frags(inode, result + oldcount, newcount - oldcount,
- locked_page != NULL);
+ locked_folio != NULL);
mutex_unlock(&UFS_SB(sb)->s_lock);
ufs_change_blocknr(inode, fragment - oldcount, oldcount,
uspi->s_sbbase + tmp,
- uspi->s_sbbase + result, locked_page);
+ uspi->s_sbbase + result, locked_folio);
*err = 0;
write_seqlock(&UFS_I(inode)->meta_lock);
ufs_cpu_to_data_ptr(sb, p, result);
@@ -698,7 +693,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
struct super_block * sb;
struct ufs_sb_private_info * uspi;
struct ufs_cylinder_group * ucg;
- u64 result, blkno;
+ u64 result;
UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
@@ -716,7 +711,7 @@ static u64 ufs_alloccg_block(struct inode *inode,
/*
* If the requested block is available, use it.
*/
- if (ubh_isblockset(UCPI_UBH(ucpi), ucpi->c_freeoff, ufs_fragstoblks(goal))) {
+ if (ubh_isblockset(uspi, ucpi, goal)) {
result = goal;
goto gotit;
}
@@ -729,22 +724,8 @@ norot:
gotit:
if (!try_add_frags(inode, uspi->s_fpb))
return 0;
- blkno = ufs_fragstoblks(result);
- ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
- if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
- ufs_clusteracct (sb, ucpi, blkno, -1);
-
- fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
- uspi->cs_total.cs_nbfree--;
- fs32_sub(sb, &UFS_SB(sb)->fs_cs(ucpi->c_cgx).cs_nbfree, 1);
-
- if (uspi->fs_magic != UFS2_MAGIC) {
- unsigned cylno = ufs_cbtocylno((unsigned)result);
-
- fs16_sub(sb, &ubh_cg_blks(ucpi, cylno,
- ufs_cbtorpos((unsigned)result)), 1);
- fs32_sub(sb, &ubh_cg_blktot(ucpi, cylno), 1);
- }
+ ubh_clrblock(uspi, ucpi, result);
+ adjust_free_blocks(sb, ucg, ucpi, result, -1);
UFSD("EXIT, result %llu\n", (unsigned long long)result);
@@ -863,12 +844,12 @@ static u64 ufs_bitmap_search(struct super_block *sb,
}
static void ufs_clusteracct(struct super_block * sb,
- struct ufs_cg_private_info * ucpi, unsigned blkno, int cnt)
+ struct ufs_cg_private_info * ucpi, unsigned frag, int cnt)
{
- struct ufs_sb_private_info * uspi;
+ struct ufs_sb_private_info * uspi = UFS_SB(sb)->s_uspi;
int i, start, end, forw, back;
+ unsigned blkno = ufs_fragstoblks(frag);
- uspi = UFS_SB(sb)->s_uspi;
if (uspi->s_contigsumsize <= 0)
return;
diff --git a/fs/ufs/cylinder.c b/fs/ufs/cylinder.c
index 1abe5454de47..a2813270c303 100644
--- a/fs/ufs/cylinder.c
+++ b/fs/ufs/cylinder.c
@@ -26,7 +26,7 @@
* Read cylinder group into cache. The memory space for ufs_cg_private_info
* structure is already allocated during ufs_read_super.
*/
-static void ufs_read_cylinder (struct super_block * sb,
+static bool ufs_read_cylinder(struct super_block *sb,
unsigned cgno, unsigned bitmap_nr)
{
struct ufs_sb_info * sbi = UFS_SB(sb);
@@ -46,9 +46,11 @@ static void ufs_read_cylinder (struct super_block * sb,
* We have already the first fragment of cylinder group block in buffer
*/
UCPI_UBH(ucpi)->bh[0] = sbi->s_ucg[cgno];
- for (i = 1; i < UCPI_UBH(ucpi)->count; i++)
- if (!(UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i)))
+ for (i = 1; i < UCPI_UBH(ucpi)->count; i++) {
+ UCPI_UBH(ucpi)->bh[i] = sb_bread(sb, UCPI_UBH(ucpi)->fragment + i);
+ if (!UCPI_UBH(ucpi)->bh[i])
goto failed;
+ }
sbi->s_cgno[bitmap_nr] = cgno;
ucpi->c_cgx = fs32_to_cpu(sb, ucg->cg_cgx);
@@ -67,13 +69,14 @@ static void ufs_read_cylinder (struct super_block * sb,
ucpi->c_clusteroff = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_clusteroff);
ucpi->c_nclusterblks = fs32_to_cpu(sb, ucg->cg_u.cg_44.cg_nclusterblks);
UFSD("EXIT\n");
- return;
+ return true;
failed:
for (j = 1; j < i; j++)
- brelse (sbi->s_ucg[j]);
+ brelse(UCPI_UBH(ucpi)->bh[j]);
sbi->s_cgno[bitmap_nr] = UFS_CGNO_EMPTY;
ufs_error (sb, "ufs_read_cylinder", "can't read cylinder group block %u", cgno);
+ return false;
}
/*
@@ -156,15 +159,14 @@ struct ufs_cg_private_info * ufs_load_cylinder (
UFSD("EXIT (FAILED)\n");
return NULL;
}
- else {
- UFSD("EXIT\n");
- return sbi->s_ucpi[cgno];
- }
} else {
- ufs_read_cylinder (sb, cgno, cgno);
- UFSD("EXIT\n");
- return sbi->s_ucpi[cgno];
+ if (unlikely(!ufs_read_cylinder (sb, cgno, cgno))) {
+ UFSD("EXIT (FAILED)\n");
+ return NULL;
+ }
}
+ UFSD("EXIT\n");
+ return sbi->s_ucpi[cgno];
}
/*
* Cylinder group number cg is in cache but it was not last used,
@@ -195,7 +197,10 @@ struct ufs_cg_private_info * ufs_load_cylinder (
sbi->s_ucpi[j] = sbi->s_ucpi[j-1];
}
sbi->s_ucpi[0] = ucpi;
- ufs_read_cylinder (sb, cgno, 0);
+ if (unlikely(!ufs_read_cylinder (sb, cgno, 0))) {
+ UFSD("EXIT (FAILED)\n");
+ return NULL;
+ }
}
UFSD("EXIT\n");
return sbi->s_ucpi[0];
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index d6e6a2198971..88d0062cfdb9 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -81,10 +81,9 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr)
}
-/* Releases the page */
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct folio *folio, struct inode *inode,
- bool update_times)
+int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+ struct folio *folio, struct inode *inode,
+ bool update_times)
{
loff_t pos = folio_pos(folio) + offset_in_folio(folio, de);
unsigned len = fs16_to_cpu(dir->i_sb, de->d_reclen);
@@ -92,17 +91,19 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
folio_lock(folio);
err = ufs_prepare_chunk(folio, pos, len);
- BUG_ON(err);
+ if (unlikely(err)) {
+ folio_unlock(folio);
+ return err;
+ }
de->d_ino = cpu_to_fs32(dir->i_sb, inode->i_ino);
ufs_set_de_type(dir->i_sb, de, inode->i_mode);
ufs_commit_chunk(folio, pos, len);
- folio_release_kmap(folio, de);
if (update_times)
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
mark_inode_dirty(dir);
- ufs_handle_dirsync(dir);
+ return ufs_handle_dirsync(dir);
}
static bool ufs_check_folio(struct folio *folio, char *kaddr)
@@ -505,8 +506,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
if (de->d_reclen == 0) {
ufs_error(inode->i_sb, __func__,
"zero-length directory entry");
- err = -EIO;
- goto out;
+ return -EIO;
}
pde = de;
de = ufs_next_entry(sb, de);
@@ -516,18 +516,17 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir,
pos = folio_pos(folio) + from;
folio_lock(folio);
err = ufs_prepare_chunk(folio, pos, to - from);
- BUG_ON(err);
+ if (unlikely(err)) {
+ folio_unlock(folio);
+ return err;
+ }
if (pde)
pde->d_reclen = cpu_to_fs16(sb, to - from);
dir->d_ino = 0;
ufs_commit_chunk(folio, pos, to - from);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
mark_inode_dirty(inode);
- err = ufs_handle_dirsync(inode);
-out:
- folio_release_kmap(folio, kaddr);
- UFSD("EXIT\n");
- return err;
+ return ufs_handle_dirsync(inode);
}
int ufs_make_empty(struct inode * inode, struct inode *dir)
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 6558882a89ef..487ad1fc2de6 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -42,4 +42,5 @@ const struct file_operations ufs_file_operations = {
.open = generic_file_open,
.fsync = generic_file_fsync,
.splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
};
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index 5331ae7ebf3e..7dc38fdef2ea 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -220,7 +220,7 @@ changed:
*/
static bool
ufs_extend_tail(struct inode *inode, u64 writes_to,
- int *err, struct page *locked_page)
+ int *err, struct folio *locked_folio)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
@@ -239,7 +239,7 @@ ufs_extend_tail(struct inode *inode, u64 writes_to,
p = ufs_get_direct_data_ptr(uspi, ufsi, block);
tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p),
new_size - (lastfrag & uspi->s_fpbmask), err,
- locked_page);
+ locked_folio);
return tmp != 0;
}
@@ -250,12 +250,11 @@ ufs_extend_tail(struct inode *inode, u64 writes_to,
* @new_fragment: number of new allocated fragment(s)
* @err: we set it if something wrong
* @new: we set it if we allocate new block
- * @locked_page: for ufs_new_fragments()
+ * @locked_folio: for ufs_new_fragments()
*/
-static u64
-ufs_inode_getfrag(struct inode *inode, unsigned index,
+static u64 ufs_inode_getfrag(struct inode *inode, unsigned index,
sector_t new_fragment, int *err,
- int *new, struct page *locked_page)
+ int *new, struct folio *locked_folio)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
struct super_block *sb = inode->i_sb;
@@ -264,11 +263,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
unsigned nfrags = uspi->s_fpb;
void *p;
- /* TODO : to be done for write support
- if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2)
- goto ufs2;
- */
-
p = ufs_get_direct_data_ptr(uspi, ufsi, index);
tmp = ufs_data_ptr_to_cpu(sb, p);
if (tmp)
@@ -288,7 +282,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
goal += uspi->s_fpb;
}
tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment),
- goal, nfrags, err, locked_page);
+ goal, nfrags, err, locked_folio);
if (!tmp) {
*err = -ENOSPC;
@@ -303,21 +297,6 @@ ufs_inode_getfrag(struct inode *inode, unsigned index,
mark_inode_dirty(inode);
out:
return tmp + uspi->s_sbbase;
-
- /* This part : To be implemented ....
- Required only for writing, not required for READ-ONLY.
-ufs2:
-
- u2_block = ufs_fragstoblks(fragment);
- u2_blockoff = ufs_fragnum(fragment);
- p = ufsi->i_u1.u2_i_data + block;
- goal = 0;
-
-repeat2:
- tmp = fs32_to_cpu(sb, *p);
- lastfrag = ufsi->i_lastfrag;
-
- */
}
/**
@@ -329,12 +308,11 @@ repeat2:
* (block will hold this fragment and also uspi->s_fpb-1)
* @err: see ufs_inode_getfrag()
* @new: see ufs_inode_getfrag()
- * @locked_page: see ufs_inode_getfrag()
+ * @locked_folio: see ufs_inode_getfrag()
*/
-static u64
-ufs_inode_getblock(struct inode *inode, u64 ind_block,
- unsigned index, sector_t new_fragment, int *err,
- int *new, struct page *locked_page)
+static u64 ufs_inode_getblock(struct inode *inode, u64 ind_block,
+ unsigned index, sector_t new_fragment, int *err,
+ int *new, struct folio *locked_folio)
{
struct super_block *sb = inode->i_sb;
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
@@ -369,7 +347,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block,
else
goal = bh->b_blocknr + uspi->s_fpb;
tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal,
- uspi->s_fpb, err, locked_page);
+ uspi->s_fpb, err, locked_folio);
if (!tmp)
goto out;
@@ -434,14 +412,14 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff
unsigned tailfrags = lastfrag & uspi->s_fpbmask;
if (tailfrags && fragment >= lastfrag) {
if (!ufs_extend_tail(inode, fragment,
- &err, bh_result->b_page))
+ &err, bh_result->b_folio))
goto out;
}
}
if (depth == 1) {
phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
- &err, &new, bh_result->b_page);
+ &err, &new, bh_result->b_folio);
} else {
int i;
phys64 = ufs_inode_getfrag(inode, offsets[0], fragment,
@@ -450,7 +428,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff
phys64 = ufs_inode_getblock(inode, phys64, offsets[i],
fragment, &err, NULL, NULL);
phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1],
- fragment, &err, &new, bh_result->b_page);
+ fragment, &err, &new, bh_result->b_folio);
}
out:
if (phys64) {
@@ -898,91 +876,84 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count)
#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift)
+/*
+ * used only for truncation down to direct blocks.
+ */
static void ufs_trunc_direct(struct inode *inode)
{
struct ufs_inode_info *ufsi = UFS_I(inode);
- struct super_block * sb;
- struct ufs_sb_private_info * uspi;
- void *p;
- u64 frag1, frag2, frag3, frag4, block1, block2;
+ struct super_block *sb = inode->i_sb;
+ struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
+ unsigned int new_frags, old_frags;
+ unsigned int old_slot, new_slot;
+ unsigned int old_tail, new_tail;
struct to_free ctx = {.inode = inode};
- unsigned i, tmp;
UFSD("ENTER: ino %lu\n", inode->i_ino);
- sb = inode->i_sb;
- uspi = UFS_SB(sb)->s_uspi;
-
- frag1 = DIRECT_FRAGMENT;
- frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
- frag2 = ((frag1 & uspi->s_fpbmask) ? ((frag1 | uspi->s_fpbmask) + 1) : frag1);
- frag3 = frag4 & ~uspi->s_fpbmask;
- block1 = block2 = 0;
- if (frag2 > frag3) {
- frag2 = frag4;
- frag3 = frag4 = 0;
- } else if (frag2 < frag3) {
- block1 = ufs_fragstoblks (frag2);
- block2 = ufs_fragstoblks (frag3);
- }
-
- UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu,"
- " frag3 %llu, frag4 %llu\n", inode->i_ino,
- (unsigned long long)frag1, (unsigned long long)frag2,
- (unsigned long long)block1, (unsigned long long)block2,
- (unsigned long long)frag3, (unsigned long long)frag4);
-
- if (frag1 >= frag2)
- goto next1;
+ new_frags = DIRECT_FRAGMENT;
+ // new_frags = first fragment past the new EOF
+ old_frags = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag);
+ // old_frags = first fragment past the old EOF or covered by indirects
- /*
- * Free first free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic (sb, "ufs_trunc_direct", "internal error");
- frag2 -= frag1;
- frag1 = ufs_fragnum (frag1);
+ if (new_frags >= old_frags) // expanding - nothing to free
+ goto done;
- ufs_free_fragments(inode, tmp + frag1, frag2);
+ old_tail = ufs_fragnum(old_frags);
+ old_slot = ufs_fragstoblks(old_frags);
+ new_tail = ufs_fragnum(new_frags);
+ new_slot = ufs_fragstoblks(new_frags);
-next1:
- /*
- * Free whole blocks
- */
- for (i = block1 ; i < block2; i++) {
- p = ufs_get_direct_data_ptr(uspi, ufsi, i);
- tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (old_slot == new_slot) { // old_tail > 0
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, old_slot);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
if (!tmp)
- continue;
- write_seqlock(&ufsi->meta_lock);
- ufs_data_ptr_clear(uspi, p);
- write_sequnlock(&ufsi->meta_lock);
+ ufs_panic(sb, __func__, "internal error");
+ if (!new_tail) {
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+ }
+ ufs_free_fragments(inode, tmp + new_tail, old_tail - new_tail);
+ } else {
+ unsigned int slot = new_slot;
- free_data(&ctx, tmp, uspi->s_fpb);
- }
+ if (new_tail) {
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ ufs_panic(sb, __func__, "internal error");
- free_data(&ctx, 0, 0);
+ ufs_free_fragments(inode, tmp + new_tail,
+ uspi->s_fpb - new_tail);
+ }
+ while (slot < old_slot) {
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot++);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ continue;
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
- if (frag3 >= frag4)
- goto next3;
+ free_data(&ctx, tmp, uspi->s_fpb);
+ }
- /*
- * Free last free fragments
- */
- p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3));
- tmp = ufs_data_ptr_to_cpu(sb, p);
- if (!tmp )
- ufs_panic(sb, "ufs_truncate_direct", "internal error");
- frag4 = ufs_fragnum (frag4);
- write_seqlock(&ufsi->meta_lock);
- ufs_data_ptr_clear(uspi, p);
- write_sequnlock(&ufsi->meta_lock);
+ free_data(&ctx, 0, 0);
- ufs_free_fragments (inode, tmp, frag4);
- next3:
+ if (old_tail) {
+ void *p = ufs_get_direct_data_ptr(uspi, ufsi, slot);
+ u64 tmp = ufs_data_ptr_to_cpu(sb, p);
+ if (!tmp)
+ ufs_panic(sb, __func__, "internal error");
+ write_seqlock(&ufsi->meta_lock);
+ ufs_data_ptr_clear(uspi, p);
+ write_sequnlock(&ufsi->meta_lock);
+ ufs_free_fragments(inode, tmp, old_tail);
+ }
+ }
+done:
UFSD("EXIT: ino %lu\n", inode->i_ino);
}
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index c8390976ab6a..38a024c8cccd 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -210,20 +210,18 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
struct inode * inode = d_inode(dentry);
struct ufs_dir_entry *de;
struct folio *folio;
- int err = -ENOENT;
+ int err;
de = ufs_find_entry(dir, &dentry->d_name, &folio);
if (!de)
- goto out;
+ return -ENOENT;
err = ufs_delete_entry(dir, de, folio);
- if (err)
- goto out;
-
- inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
- inode_dec_link_count(inode);
- err = 0;
-out:
+ if (!err) {
+ inode_set_ctime_to_ts(inode, inode_get_ctime(dir));
+ inode_dec_link_count(inode);
+ }
+ folio_release_kmap(folio, de);
return err;
}
@@ -253,14 +251,14 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct ufs_dir_entry * dir_de = NULL;
struct folio *old_folio;
struct ufs_dir_entry *old_de;
- int err = -ENOENT;
+ int err;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (!old_de)
- goto out;
+ return -ENOENT;
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
@@ -281,7 +279,10 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
if (!new_de)
goto out_dir;
- ufs_set_link(new_dir, new_de, new_folio, old_inode, 1);
+ err = ufs_set_link(new_dir, new_de, new_folio, old_inode, 1);
+ folio_release_kmap(new_folio, new_de);
+ if (err)
+ goto out_dir;
inode_set_ctime_current(new_inode);
if (dir_de)
drop_nlink(new_inode);
@@ -299,26 +300,20 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
* rename.
*/
inode_set_ctime_current(old_inode);
-
- ufs_delete_entry(old_dir, old_de, old_folio);
mark_inode_dirty(old_inode);
- if (dir_de) {
+ err = ufs_delete_entry(old_dir, old_de, old_folio);
+ if (!err && dir_de) {
if (old_dir != new_dir)
- ufs_set_link(old_inode, dir_de, dir_folio, new_dir, 0);
- else
- folio_release_kmap(dir_folio, dir_de);
+ err = ufs_set_link(old_inode, dir_de, dir_folio,
+ new_dir, 0);
inode_dec_link_count(old_dir);
}
- return 0;
-
-
out_dir:
if (dir_de)
folio_release_kmap(dir_folio, dir_de);
out_old:
folio_release_kmap(old_folio, old_de);
-out:
return err;
}
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index bc625788589c..762699c1bcf6 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -505,7 +505,6 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
{
struct ufs_sb_info *sbi = UFS_SB(sb);
struct ufs_sb_private_info *uspi = sbi->s_uspi;
- struct ufs_buffer_head * ubh;
unsigned char * base, * space;
unsigned size, blks, i;
@@ -521,21 +520,13 @@ static int ufs_read_cylinder_structures(struct super_block *sb)
if (!base)
goto failed;
sbi->s_csp = (struct ufs_csum *)space;
- for (i = 0; i < blks; i += uspi->s_fpb) {
- size = uspi->s_bsize;
- if (i + uspi->s_fpb > blks)
- size = (blks - i) * uspi->s_fsize;
-
- ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
-
- if (!ubh)
+ for (i = 0; i < blks; i++) {
+ struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i);
+ if (!bh)
goto failed;
-
- ubh_ubhcpymem (space, ubh, size);
-
- space += size;
- ubh_brelse (ubh);
- ubh = NULL;
+ memcpy(space, bh->b_data, uspi->s_fsize);
+ space += uspi->s_fsize;
+ brelse (bh);
}
/*
@@ -645,7 +636,6 @@ static void ufs_put_super_internal(struct super_block *sb)
{
struct ufs_sb_info *sbi = UFS_SB(sb);
struct ufs_sb_private_info *uspi = sbi->s_uspi;
- struct ufs_buffer_head * ubh;
unsigned char * base, * space;
unsigned blks, size, i;
@@ -656,18 +646,17 @@ static void ufs_put_super_internal(struct super_block *sb)
size = uspi->s_cssize;
blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
base = space = (char*) sbi->s_csp;
- for (i = 0; i < blks; i += uspi->s_fpb) {
- size = uspi->s_bsize;
- if (i + uspi->s_fpb > blks)
- size = (blks - i) * uspi->s_fsize;
-
- ubh = ubh_bread(sb, uspi->s_csaddr + i, size);
-
- ubh_memcpyubh (ubh, space, size);
- space += size;
- ubh_mark_buffer_uptodate (ubh, 1);
- ubh_mark_buffer_dirty (ubh);
- ubh_brelse (ubh);
+ for (i = 0; i < blks; i++, space += uspi->s_fsize) {
+ struct buffer_head *bh = sb_bread(sb, uspi->s_csaddr + i);
+
+ if (unlikely(!bh)) { // better than an oops...
+ ufs_panic(sb, __func__,
+ "can't write part of cylinder group summary");
+ continue;
+ }
+ memcpy(bh->b_data, space, uspi->s_fsize);
+ mark_buffer_dirty(bh);
+ brelse(bh);
}
for (i = 0; i < sbi->s_cg_loaded; i++) {
ufs_put_cylinder (sb, i);
@@ -1240,11 +1229,7 @@ magic_found:
else
uspi->s_apbshift = uspi->s_bshift - 2;
- uspi->s_2apbshift = uspi->s_apbshift * 2;
- uspi->s_3apbshift = uspi->s_apbshift * 3;
uspi->s_apb = 1 << uspi->s_apbshift;
- uspi->s_2apb = 1 << uspi->s_2apbshift;
- uspi->s_3apb = 1 << uspi->s_3apbshift;
uspi->s_apbmask = uspi->s_apb - 1;
uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS;
uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift;
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index a2c762cb65a0..e7df65dd4351 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -88,10 +88,10 @@ struct ufs_inode_info {
#endif
/* balloc.c */
-extern void ufs_free_fragments (struct inode *, u64, unsigned);
-extern void ufs_free_blocks (struct inode *, u64, unsigned);
-extern u64 ufs_new_fragments(struct inode *, void *, u64, u64,
- unsigned, int *, struct page *);
+void ufs_free_fragments (struct inode *, u64 fragment, unsigned count);
+void ufs_free_blocks (struct inode *, u64 fragment, unsigned count);
+u64 ufs_new_fragments(struct inode *, void *, u64 fragment, u64 goal,
+ unsigned count, int *err, struct folio *);
/* cylinder.c */
extern struct ufs_cg_private_info * ufs_load_cylinder (struct super_block *, unsigned);
@@ -108,8 +108,8 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *, const struct qstr *,
int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct folio *);
int ufs_empty_dir(struct inode *);
struct ufs_dir_entry *ufs_dotdot(struct inode *, struct folio **);
-void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
- struct folio *folio, struct inode *inode, bool update_times);
+int ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
+ struct folio *folio, struct inode *inode, bool update_times);
/* file.c */
extern const struct inode_operations ufs_file_inode_operations;
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index ef9ead44776a..0905f9a16b91 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -775,12 +775,8 @@ struct ufs_sb_private_info {
__u32 s_fpbmask; /* fragments per block mask */
__u32 s_apb; /* address per block */
- __u32 s_2apb; /* address per block^2 */
- __u32 s_3apb; /* address per block^3 */
__u32 s_apbmask; /* address per block mask */
__u32 s_apbshift; /* address per block shift */
- __u32 s_2apbshift; /* address per block shift * 2 */
- __u32 s_3apbshift; /* address per block shift * 3 */
__u32 s_nspfshift; /* number of sector per fragment shift */
__u32 s_nspb; /* number of sector per block */
__u32 s_inopf; /* inodes per fragment */
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 2acf191eb89e..f0e906ab4ddd 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -99,20 +99,6 @@ void ubh_mark_buffer_dirty (struct ufs_buffer_head * ubh)
mark_buffer_dirty (ubh->bh[i]);
}
-void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
-{
- unsigned i;
- if (!ubh)
- return;
- if (flag) {
- for ( i = 0; i < ubh->count; i++ )
- set_buffer_uptodate (ubh->bh[i]);
- } else {
- for ( i = 0; i < ubh->count; i++ )
- clear_buffer_uptodate (ubh->bh[i]);
- }
-}
-
void ubh_sync_block(struct ufs_buffer_head *ubh)
{
if (ubh) {
@@ -146,38 +132,6 @@ int ubh_buffer_dirty (struct ufs_buffer_head * ubh)
return result;
}
-void _ubh_ubhcpymem_(struct ufs_sb_private_info * uspi,
- unsigned char * mem, struct ufs_buffer_head * ubh, unsigned size)
-{
- unsigned len, bhno;
- if (size > (ubh->count << uspi->s_fshift))
- size = ubh->count << uspi->s_fshift;
- bhno = 0;
- while (size) {
- len = min_t(unsigned int, size, uspi->s_fsize);
- memcpy (mem, ubh->bh[bhno]->b_data, len);
- mem += uspi->s_fsize;
- size -= len;
- bhno++;
- }
-}
-
-void _ubh_memcpyubh_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned char * mem, unsigned size)
-{
- unsigned len, bhno;
- if (size > (ubh->count << uspi->s_fshift))
- size = ubh->count << uspi->s_fshift;
- bhno = 0;
- while (size) {
- len = min_t(unsigned int, size, uspi->s_fsize);
- memcpy (ubh->bh[bhno]->b_data, mem, len);
- mem += uspi->s_fsize;
- size -= len;
- bhno++;
- }
-}
-
dev_t
ufs_get_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi)
{
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index bf708b68f150..391bb4f11d74 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -263,14 +263,9 @@ extern struct ufs_buffer_head * ubh_bread_uspi(struct ufs_sb_private_info *, str
extern void ubh_brelse (struct ufs_buffer_head *);
extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
-extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
extern void ubh_sync_block(struct ufs_buffer_head *);
extern void ubh_bforget (struct ufs_buffer_head *);
extern int ubh_buffer_dirty (struct ufs_buffer_head *);
-#define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
-extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struct ufs_buffer_head *, unsigned);
-#define ubh_memcpyubh(ubh,mem,size) _ubh_memcpyubh_(uspi,ubh,mem,size)
-extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned);
/* This functions works with cache pages*/
struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index);
@@ -455,65 +450,69 @@ static inline unsigned _ubh_find_last_zero_bit_(
return (base << uspi->s_bpfshift) + pos - begin;
}
-#define ubh_isblockclear(ubh,begin,block) (!_ubh_isblockset_(uspi,ubh,begin,block))
-
-#define ubh_isblockset(ubh,begin,block) _ubh_isblockset_(uspi,ubh,begin,block)
-static inline int _ubh_isblockset_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+static inline int ubh_isblockset(struct ufs_sb_private_info *uspi,
+ struct ufs_cg_private_info *ucpi, unsigned int frag)
{
+ struct ufs_buffer_head *ubh = UCPI_UBH(ucpi);
+ u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3));
u8 mask;
+
switch (uspi->s_fpb) {
case 8:
- return (*ubh_get_addr (ubh, begin + block) == 0xff);
+ return *p == 0xff;
case 4:
- mask = 0x0f << ((block & 0x01) << 2);
- return (*ubh_get_addr (ubh, begin + (block >> 1)) & mask) == mask;
+ mask = 0x0f << (frag & 4);
+ return (*p & mask) == mask;
case 2:
- mask = 0x03 << ((block & 0x03) << 1);
- return (*ubh_get_addr (ubh, begin + (block >> 2)) & mask) == mask;
+ mask = 0x03 << (frag & 6);
+ return (*p & mask) == mask;
case 1:
- mask = 0x01 << (block & 0x07);
- return (*ubh_get_addr (ubh, begin + (block >> 3)) & mask) == mask;
+ mask = 0x01 << (frag & 7);
+ return (*p & mask) == mask;
}
return 0;
}
-#define ubh_clrblock(ubh,begin,block) _ubh_clrblock_(uspi,ubh,begin,block)
-static inline void _ubh_clrblock_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+static inline void ubh_clrblock(struct ufs_sb_private_info *uspi,
+ struct ufs_cg_private_info *ucpi, unsigned int frag)
{
+ struct ufs_buffer_head *ubh = UCPI_UBH(ucpi);
+ u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3));
+
switch (uspi->s_fpb) {
case 8:
- *ubh_get_addr (ubh, begin + block) = 0x00;
+ *p = 0x00;
return;
case 4:
- *ubh_get_addr (ubh, begin + (block >> 1)) &= ~(0x0f << ((block & 0x01) << 2));
+ *p &= ~(0x0f << (frag & 4));
return;
case 2:
- *ubh_get_addr (ubh, begin + (block >> 2)) &= ~(0x03 << ((block & 0x03) << 1));
+ *p &= ~(0x03 << (frag & 6));
return;
case 1:
- *ubh_get_addr (ubh, begin + (block >> 3)) &= ~(0x01 << ((block & 0x07)));
+ *p &= ~(0x01 << (frag & 7));
return;
}
}
-#define ubh_setblock(ubh,begin,block) _ubh_setblock_(uspi,ubh,begin,block)
-static inline void _ubh_setblock_(struct ufs_sb_private_info * uspi,
- struct ufs_buffer_head * ubh, unsigned begin, unsigned block)
+static inline void ubh_setblock(struct ufs_sb_private_info * uspi,
+ struct ufs_cg_private_info *ucpi, unsigned int frag)
{
+ struct ufs_buffer_head *ubh = UCPI_UBH(ucpi);
+ u8 *p = ubh_get_addr(ubh, ucpi->c_freeoff + (frag >> 3));
+
switch (uspi->s_fpb) {
case 8:
- *ubh_get_addr(ubh, begin + block) = 0xff;
+ *p = 0xff;
return;
case 4:
- *ubh_get_addr(ubh, begin + (block >> 1)) |= (0x0f << ((block & 0x01) << 2));
+ *p |= 0x0f << (frag & 4);
return;
case 2:
- *ubh_get_addr(ubh, begin + (block >> 2)) |= (0x03 << ((block & 0x03) << 1));
+ *p |= 0x03 << (frag & 6);
return;
case 1:
- *ubh_get_addr(ubh, begin + (block >> 3)) |= (0x01 << ((block & 0x07)));
+ *p |= 0x01 << (frag & 7);
return;
}
}
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 8395066341a4..7f7cb14e01ce 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -214,3 +214,29 @@ void utf8_unload(struct unicode_map *um)
}
EXPORT_SYMBOL(utf8_unload);
+/**
+ * utf8_parse_version - Parse a UTF-8 version number from a string
+ *
+ * @version: input string
+ *
+ * Returns the parsed version on success, negative code on error
+ */
+int utf8_parse_version(char *version)
+{
+ substring_t args[3];
+ unsigned int maj, min, rev;
+ static const struct match_token token[] = {
+ {1, "%d.%d.%d"},
+ {0, NULL}
+ };
+
+ if (match_token(version, token, args) != 1)
+ return -EINVAL;
+
+ if (match_int(&args[0], &maj) || match_int(&args[1], &min) ||
+ match_int(&args[2], &rev))
+ return -EINVAL;
+
+ return UNICODE_AGE(maj, min, rev);
+}
+EXPORT_SYMBOL(utf8_parse_version);
diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/utf8-selftest.c
index 600e15efe9ed..5ddaf27b21a6 100644
--- a/fs/unicode/utf8-selftest.c
+++ b/fs/unicode/utf8-selftest.c
@@ -17,9 +17,6 @@
static unsigned int failed_tests;
static unsigned int total_tests;
-/* Tests will be based on this version. */
-#define UTF8_LATEST UNICODE_AGE(12, 1, 0)
-
#define _test(cond, func, line, fmt, ...) do { \
total_tests++; \
if (!cond) { \
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 68cdd89c97a3..7c0bd0b55f88 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -692,6 +692,34 @@ void dup_userfaultfd_complete(struct list_head *fcs)
}
}
+void dup_userfaultfd_fail(struct list_head *fcs)
+{
+ struct userfaultfd_fork_ctx *fctx, *n;
+
+ /*
+ * An error has occurred on fork, we will tear memory down, but have
+ * allocated memory for fctx's and raised reference counts for both the
+ * original and child contexts (and on the mm for each as a result).
+ *
+ * These would ordinarily be taken care of by a user handling the event,
+ * but we are no longer doing so, so manually clean up here.
+ *
+ * mm tear down will take care of cleaning up VMA contexts.
+ */
+ list_for_each_entry_safe(fctx, n, fcs, list) {
+ struct userfaultfd_ctx *octx = fctx->orig;
+ struct userfaultfd_ctx *ctx = fctx->new;
+
+ atomic_dec(&octx->mmap_changing);
+ VM_BUG_ON(atomic_read(&octx->mmap_changing) < 0);
+ userfaultfd_ctx_put(octx);
+ userfaultfd_ctx_put(ctx);
+
+ list_del(&fctx->list);
+ kfree(fctx);
+ }
+}
+
void mremap_userfaultfd_prep(struct vm_area_struct *vma,
struct vm_userfaultfd_ctx *vm_ctx)
{
diff --git a/fs/utimes.c b/fs/utimes.c
index 99b26f792b89..c7c7958e57b2 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -108,18 +108,13 @@ retry:
static int do_utimes_fd(int fd, struct timespec64 *times, int flags)
{
- struct fd f;
- int error;
-
if (flags)
return -EINVAL;
- f = fdget(fd);
- if (!fd_file(f))
+ CLASS(fd, f)(fd);
+ if (fd_empty(f))
return -EBADF;
- error = vfs_utimes(&fd_file(f)->f_path, times);
- fdput(f);
- return error;
+ return vfs_utimes(&fd_file(f)->f_path, times);
}
/*
diff --git a/fs/xattr.c b/fs/xattr.c
index 05ec7e7d9e87..02bee149ad96 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -586,25 +586,32 @@ retry_deleg:
}
EXPORT_SYMBOL_GPL(vfs_removexattr);
+int import_xattr_name(struct xattr_name *kname, const char __user *name)
+{
+ int error = strncpy_from_user(kname->name, name,
+ sizeof(kname->name));
+ if (error == 0 || error == sizeof(kname->name))
+ return -ERANGE;
+ if (error < 0)
+ return error;
+ return 0;
+}
+
/*
* Extended attribute SET operations
*/
-int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
+int setxattr_copy(const char __user *name, struct kernel_xattr_ctx *ctx)
{
int error;
if (ctx->flags & ~(XATTR_CREATE|XATTR_REPLACE))
return -EINVAL;
- error = strncpy_from_user(ctx->kname->name, name,
- sizeof(ctx->kname->name));
- if (error == 0 || error == sizeof(ctx->kname->name))
- return -ERANGE;
- if (error < 0)
+ error = import_xattr_name(ctx->kname, name);
+ if (error)
return error;
- error = 0;
if (ctx->size) {
if (ctx->size > XATTR_SIZE_MAX)
return -E2BIG;
@@ -619,8 +626,8 @@ int setxattr_copy(const char __user *name, struct xattr_ctx *ctx)
return error;
}
-int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
- struct xattr_ctx *ctx)
+static int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct kernel_xattr_ctx *ctx)
{
if (is_posix_acl_xattr(ctx->kname->name))
return do_set_acl(idmap, dentry, ctx->kname->name,
@@ -630,32 +637,32 @@ int do_setxattr(struct mnt_idmap *idmap, struct dentry *dentry,
ctx->kvalue, ctx->size, ctx->flags);
}
-static int path_setxattr(const char __user *pathname,
- const char __user *name, const void __user *value,
- size_t size, int flags, unsigned int lookup_flags)
+int file_setxattr(struct file *f, struct kernel_xattr_ctx *ctx)
+{
+ int error = mnt_want_write_file(f);
+
+ if (!error) {
+ audit_file(f);
+ error = do_setxattr(file_mnt_idmap(f), f->f_path.dentry, ctx);
+ mnt_drop_write_file(f);
+ }
+ return error;
+}
+
+/* unconditionally consumes filename */
+int filename_setxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx)
{
- struct xattr_name kname;
- struct xattr_ctx ctx = {
- .cvalue = value,
- .kvalue = NULL,
- .size = size,
- .kname = &kname,
- .flags = flags,
- };
struct path path;
int error;
- error = setxattr_copy(name, &ctx);
- if (error)
- return error;
-
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
goto out;
error = mnt_want_write(path.mnt);
if (!error) {
- error = do_setxattr(mnt_idmap(path.mnt), path.dentry, &ctx);
+ error = do_setxattr(mnt_idmap(path.mnt), path.dentry, ctx);
mnt_drop_write(path.mnt);
}
path_put(&path);
@@ -665,80 +672,121 @@ retry:
}
out:
+ putname(filename);
+ return error;
+}
+
+static int path_setxattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, const char __user *name,
+ const void __user *value, size_t size, int flags)
+{
+ struct xattr_name kname;
+ struct kernel_xattr_ctx ctx = {
+ .cvalue = value,
+ .kvalue = NULL,
+ .size = size,
+ .kname = &kname,
+ .flags = flags,
+ };
+ struct filename *filename;
+ unsigned int lookup_flags = 0;
+ int error;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags = LOOKUP_FOLLOW;
+
+ error = setxattr_copy(name, &ctx);
+ if (error)
+ return error;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ error = -EBADF;
+ else
+ error = file_setxattr(fd_file(f), &ctx);
+ } else {
+ error = filename_setxattr(dfd, filename, lookup_flags, &ctx);
+ }
kvfree(ctx.kvalue);
return error;
}
+SYSCALL_DEFINE6(setxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+ const char __user *, name, const struct xattr_args __user *, uargs,
+ size_t, usize)
+{
+ struct xattr_args args = {};
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+ if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+ return -EINVAL;
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
+ if (error)
+ return error;
+
+ return path_setxattrat(dfd, pathname, at_flags, name,
+ u64_to_user_ptr(args.value), args.size,
+ args.flags);
+}
+
SYSCALL_DEFINE5(setxattr, const char __user *, pathname,
const char __user *, name, const void __user *, value,
size_t, size, int, flags)
{
- return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW);
+ return path_setxattrat(AT_FDCWD, pathname, 0, name, value, size, flags);
}
SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
const char __user *, name, const void __user *, value,
size_t, size, int, flags)
{
- return path_setxattr(pathname, name, value, size, flags, 0);
+ return path_setxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+ value, size, flags);
}
SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
const void __user *,value, size_t, size, int, flags)
{
- struct xattr_name kname;
- struct xattr_ctx ctx = {
- .cvalue = value,
- .kvalue = NULL,
- .size = size,
- .kname = &kname,
- .flags = flags,
- };
- int error;
-
- CLASS(fd, f)(fd);
- if (!fd_file(f))
- return -EBADF;
-
- audit_file(fd_file(f));
- error = setxattr_copy(name, &ctx);
- if (error)
- return error;
-
- error = mnt_want_write_file(fd_file(f));
- if (!error) {
- error = do_setxattr(file_mnt_idmap(fd_file(f)),
- fd_file(f)->f_path.dentry, &ctx);
- mnt_drop_write_file(fd_file(f));
- }
- kvfree(ctx.kvalue);
- return error;
+ return path_setxattrat(fd, NULL, AT_EMPTY_PATH, name,
+ value, size, flags);
}
/*
* Extended attribute GET operations
*/
-ssize_t
+static ssize_t
do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
- struct xattr_ctx *ctx)
+ struct kernel_xattr_ctx *ctx)
{
ssize_t error;
char *kname = ctx->kname->name;
+ void *kvalue = NULL;
if (ctx->size) {
if (ctx->size > XATTR_SIZE_MAX)
ctx->size = XATTR_SIZE_MAX;
- ctx->kvalue = kvzalloc(ctx->size, GFP_KERNEL);
- if (!ctx->kvalue)
+ kvalue = kvzalloc(ctx->size, GFP_KERNEL);
+ if (!kvalue)
return -ENOMEM;
}
- if (is_posix_acl_xattr(ctx->kname->name))
- error = do_get_acl(idmap, d, kname, ctx->kvalue, ctx->size);
+ if (is_posix_acl_xattr(kname))
+ error = do_get_acl(idmap, d, kname, kvalue, ctx->size);
else
- error = vfs_getxattr(idmap, d, kname, ctx->kvalue, ctx->size);
+ error = vfs_getxattr(idmap, d, kname, kvalue, ctx->size);
if (error > 0) {
- if (ctx->size && copy_to_user(ctx->value, ctx->kvalue, error))
+ if (ctx->size && copy_to_user(ctx->value, kvalue, error))
error = -EFAULT;
} else if (error == -ERANGE && ctx->size >= XATTR_SIZE_MAX) {
/* The file system tried to returned a value bigger
@@ -746,79 +794,114 @@ do_getxattr(struct mnt_idmap *idmap, struct dentry *d,
error = -E2BIG;
}
+ kvfree(kvalue);
return error;
}
-static ssize_t
-getxattr(struct mnt_idmap *idmap, struct dentry *d,
- const char __user *name, void __user *value, size_t size)
+ssize_t file_getxattr(struct file *f, struct kernel_xattr_ctx *ctx)
{
+ audit_file(f);
+ return do_getxattr(file_mnt_idmap(f), f->f_path.dentry, ctx);
+}
+
+/* unconditionally consumes filename */
+ssize_t filename_getxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct kernel_xattr_ctx *ctx)
+{
+ struct path path;
ssize_t error;
+retry:
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
+ if (error)
+ goto out;
+ error = do_getxattr(mnt_idmap(path.mnt), path.dentry, ctx);
+ path_put(&path);
+ if (retry_estale(error, lookup_flags)) {
+ lookup_flags |= LOOKUP_REVAL;
+ goto retry;
+ }
+out:
+ putname(filename);
+ return error;
+}
+
+static ssize_t path_getxattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, const char __user *name,
+ void __user *value, size_t size)
+{
struct xattr_name kname;
- struct xattr_ctx ctx = {
+ struct kernel_xattr_ctx ctx = {
.value = value,
- .kvalue = NULL,
.size = size,
.kname = &kname,
.flags = 0,
};
+ struct filename *filename;
+ ssize_t error;
- error = strncpy_from_user(kname.name, name, sizeof(kname.name));
- if (error == 0 || error == sizeof(kname.name))
- error = -ERANGE;
- if (error < 0)
- return error;
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
- error = do_getxattr(idmap, d, &ctx);
+ error = import_xattr_name(&kname, name);
+ if (error)
+ return error;
- kvfree(ctx.kvalue);
- return error;
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+ return file_getxattr(fd_file(f), &ctx);
+ } else {
+ int lookup_flags = 0;
+ if (!(at_flags & AT_SYMLINK_NOFOLLOW))
+ lookup_flags = LOOKUP_FOLLOW;
+ return filename_getxattr(dfd, filename, lookup_flags, &ctx);
+ }
}
-static ssize_t path_getxattr(const char __user *pathname,
- const char __user *name, void __user *value,
- size_t size, unsigned int lookup_flags)
+SYSCALL_DEFINE6(getxattrat, int, dfd, const char __user *, pathname, unsigned int, at_flags,
+ const char __user *, name, struct xattr_args __user *, uargs, size_t, usize)
{
- struct path path;
- ssize_t error;
-retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ struct xattr_args args = {};
+ int error;
+
+ BUILD_BUG_ON(sizeof(struct xattr_args) < XATTR_ARGS_SIZE_VER0);
+ BUILD_BUG_ON(sizeof(struct xattr_args) != XATTR_ARGS_SIZE_LATEST);
+
+ if (unlikely(usize < XATTR_ARGS_SIZE_VER0))
+ return -EINVAL;
+ if (usize > PAGE_SIZE)
+ return -E2BIG;
+
+ error = copy_struct_from_user(&args, sizeof(args), uargs, usize);
if (error)
return error;
- error = getxattr(mnt_idmap(path.mnt), path.dentry, name, value, size);
- path_put(&path);
- if (retry_estale(error, lookup_flags)) {
- lookup_flags |= LOOKUP_REVAL;
- goto retry;
- }
- return error;
+
+ if (args.flags != 0)
+ return -EINVAL;
+
+ return path_getxattrat(dfd, pathname, at_flags, name,
+ u64_to_user_ptr(args.value), args.size);
}
SYSCALL_DEFINE4(getxattr, const char __user *, pathname,
const char __user *, name, void __user *, value, size_t, size)
{
- return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW);
+ return path_getxattrat(AT_FDCWD, pathname, 0, name, value, size);
}
SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
const char __user *, name, void __user *, value, size_t, size)
{
- return path_getxattr(pathname, name, value, size, 0);
+ return path_getxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name,
+ value, size);
}
SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
void __user *, value, size_t, size)
{
- struct fd f = fdget(fd);
- ssize_t error = -EBADF;
-
- if (!fd_file(f))
- return error;
- audit_file(fd_file(f));
- error = getxattr(file_mnt_idmap(fd_file(f)), fd_file(f)->f_path.dentry,
- name, value, size);
- fdput(f);
- return error;
+ return path_getxattrat(fd, NULL, AT_EMPTY_PATH, name, value, size);
}
/*
@@ -853,47 +936,80 @@ listxattr(struct dentry *d, char __user *list, size_t size)
return error;
}
-static ssize_t path_listxattr(const char __user *pathname, char __user *list,
- size_t size, unsigned int lookup_flags)
+static
+ssize_t file_listxattr(struct file *f, char __user *list, size_t size)
+{
+ audit_file(f);
+ return listxattr(f->f_path.dentry, list, size);
+}
+
+/* unconditionally consumes filename */
+static
+ssize_t filename_listxattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags,
+ char __user *list, size_t size)
{
struct path path;
ssize_t error;
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
- return error;
+ goto out;
error = listxattr(path.dentry, list, size);
path_put(&path);
if (retry_estale(error, lookup_flags)) {
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out:
+ putname(filename);
return error;
}
+static ssize_t path_listxattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, char __user *list,
+ size_t size)
+{
+ struct filename *filename;
+ int lookup_flags;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+ return file_listxattr(fd_file(f), list, size);
+ }
+
+ lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ return filename_listxattr(dfd, filename, lookup_flags, list, size);
+}
+
+SYSCALL_DEFINE5(listxattrat, int, dfd, const char __user *, pathname,
+ unsigned int, at_flags,
+ char __user *, list, size_t, size)
+{
+ return path_listxattrat(dfd, pathname, at_flags, list, size);
+}
+
SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list,
size_t, size)
{
- return path_listxattr(pathname, list, size, LOOKUP_FOLLOW);
+ return path_listxattrat(AT_FDCWD, pathname, 0, list, size);
}
SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
size_t, size)
{
- return path_listxattr(pathname, list, size, 0);
+ return path_listxattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, list, size);
}
SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
{
- struct fd f = fdget(fd);
- ssize_t error = -EBADF;
-
- if (!fd_file(f))
- return error;
- audit_file(fd_file(f));
- error = listxattr(fd_file(f)->f_path.dentry, list, size);
- fdput(f);
- return error;
+ return path_listxattrat(fd, NULL, AT_EMPTY_PATH, list, size);
}
/*
@@ -907,25 +1023,33 @@ removexattr(struct mnt_idmap *idmap, struct dentry *d, const char *name)
return vfs_removexattr(idmap, d, name);
}
-static int path_removexattr(const char __user *pathname,
- const char __user *name, unsigned int lookup_flags)
+static int file_removexattr(struct file *f, struct xattr_name *kname)
+{
+ int error = mnt_want_write_file(f);
+
+ if (!error) {
+ audit_file(f);
+ error = removexattr(file_mnt_idmap(f),
+ f->f_path.dentry, kname->name);
+ mnt_drop_write_file(f);
+ }
+ return error;
+}
+
+/* unconditionally consumes filename */
+static int filename_removexattr(int dfd, struct filename *filename,
+ unsigned int lookup_flags, struct xattr_name *kname)
{
struct path path;
int error;
- char kname[XATTR_NAME_MAX + 1];
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
- if (error < 0)
- return error;
retry:
- error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path);
+ error = filename_lookup(dfd, filename, lookup_flags, &path, NULL);
if (error)
- return error;
+ goto out;
error = mnt_want_write(path.mnt);
if (!error) {
- error = removexattr(mnt_idmap(path.mnt), path.dentry, kname);
+ error = removexattr(mnt_idmap(path.mnt), path.dentry, kname->name);
mnt_drop_write(path.mnt);
}
path_put(&path);
@@ -933,45 +1057,58 @@ retry:
lookup_flags |= LOOKUP_REVAL;
goto retry;
}
+out:
+ putname(filename);
return error;
}
+static int path_removexattrat(int dfd, const char __user *pathname,
+ unsigned int at_flags, const char __user *name)
+{
+ struct xattr_name kname;
+ struct filename *filename;
+ unsigned int lookup_flags;
+ int error;
+
+ if ((at_flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
+ return -EINVAL;
+
+ error = import_xattr_name(&kname, name);
+ if (error)
+ return error;
+
+ filename = getname_maybe_null(pathname, at_flags);
+ if (!filename) {
+ CLASS(fd, f)(dfd);
+ if (fd_empty(f))
+ return -EBADF;
+ return file_removexattr(fd_file(f), &kname);
+ }
+ lookup_flags = (at_flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+ return filename_removexattr(dfd, filename, lookup_flags, &kname);
+}
+
+SYSCALL_DEFINE4(removexattrat, int, dfd, const char __user *, pathname,
+ unsigned int, at_flags, const char __user *, name)
+{
+ return path_removexattrat(dfd, pathname, at_flags, name);
+}
+
SYSCALL_DEFINE2(removexattr, const char __user *, pathname,
const char __user *, name)
{
- return path_removexattr(pathname, name, LOOKUP_FOLLOW);
+ return path_removexattrat(AT_FDCWD, pathname, 0, name);
}
SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
const char __user *, name)
{
- return path_removexattr(pathname, name, 0);
+ return path_removexattrat(AT_FDCWD, pathname, AT_SYMLINK_NOFOLLOW, name);
}
SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
{
- struct fd f = fdget(fd);
- char kname[XATTR_NAME_MAX + 1];
- int error = -EBADF;
-
- if (!fd_file(f))
- return error;
- audit_file(fd_file(f));
-
- error = strncpy_from_user(kname, name, sizeof(kname));
- if (error == 0 || error == sizeof(kname))
- error = -ERANGE;
- if (error < 0)
- return error;
-
- error = mnt_want_write_file(fd_file(f));
- if (!error) {
- error = removexattr(file_mnt_idmap(fd_file(f)),
- fd_file(f)->f_path.dentry, kname);
- mnt_drop_write_file(fd_file(f));
- }
- fdput(f);
- return error;
+ return path_removexattrat(fd, NULL, AT_EMPTY_PATH, name);
}
int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name)
@@ -1005,9 +1142,10 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr;
ssize_t remaining_size = buffer_size;
- int err = 0;
for_each_xattr_handler(handlers, handler) {
+ int err;
+
if (!handler->name || (handler->list && !handler->list(dentry)))
continue;
err = xattr_list_one(&buffer, &remaining_size, handler->name);
@@ -1015,7 +1153,7 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return err;
}
- return err ? err : buffer_size - remaining_size;
+ return buffer_size - remaining_size;
}
EXPORT_SYMBOL(generic_listxattr);
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 5f0494702e0b..5ca8d0106827 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -185,17 +185,20 @@ out:
}
/*
- * Free up the per-ag resources associated with the mount structure.
+ * Free up the per-ag resources within the specified AG range.
*/
void
-xfs_free_perag(
- struct xfs_mount *mp)
+xfs_free_perag_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t first_agno,
+ xfs_agnumber_t end_agno)
+
{
- struct xfs_perag *pag;
xfs_agnumber_t agno;
- for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- pag = xa_erase(&mp->m_perags, agno);
+ for (agno = first_agno; agno < end_agno; agno++) {
+ struct xfs_perag *pag = xa_erase(&mp->m_perags, agno);
+
ASSERT(pag);
XFS_IS_CORRUPT(pag->pag_mount, atomic_read(&pag->pag_ref) != 0);
xfs_defer_drain_free(&pag->pag_intents_drain);
@@ -270,54 +273,37 @@ xfs_agino_range(
return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
}
-/*
- * Free perag within the specified AG range, it is only used to free unused
- * perags under the error handling path.
- */
-void
-xfs_free_unused_perag_range(
+int
+xfs_update_last_ag_size(
struct xfs_mount *mp,
- xfs_agnumber_t agstart,
- xfs_agnumber_t agend)
+ xfs_agnumber_t prev_agcount)
{
- struct xfs_perag *pag;
- xfs_agnumber_t index;
+ struct xfs_perag *pag = xfs_perag_grab(mp, prev_agcount - 1);
- for (index = agstart; index < agend; index++) {
- pag = xa_erase(&mp->m_perags, index);
- if (!pag)
- break;
- xfs_buf_cache_destroy(&pag->pag_bcache);
- xfs_defer_drain_free(&pag->pag_intents_drain);
- kfree(pag);
- }
+ if (!pag)
+ return -EFSCORRUPTED;
+ pag->block_count = __xfs_ag_block_count(mp, prev_agcount - 1,
+ mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks);
+ __xfs_agino_range(mp, pag->block_count, &pag->agino_min,
+ &pag->agino_max);
+ xfs_perag_rele(pag);
+ return 0;
}
int
xfs_initialize_perag(
struct xfs_mount *mp,
- xfs_agnumber_t agcount,
+ xfs_agnumber_t old_agcount,
+ xfs_agnumber_t new_agcount,
xfs_rfsblock_t dblocks,
xfs_agnumber_t *maxagi)
{
struct xfs_perag *pag;
xfs_agnumber_t index;
- xfs_agnumber_t first_initialised = NULLAGNUMBER;
int error;
- /*
- * Walk the current per-ag tree so we don't try to initialise AGs
- * that already exist (growfs case). Allocate and insert all the
- * AGs we don't find ready for initialisation.
- */
- for (index = 0; index < agcount; index++) {
- pag = xfs_perag_get(mp, index);
- if (pag) {
- xfs_perag_put(pag);
- continue;
- }
-
- pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ for (index = old_agcount; index < new_agcount; index++) {
+ pag = kzalloc(sizeof(*pag), GFP_KERNEL);
if (!pag) {
error = -ENOMEM;
goto out_unwind_new_pags;
@@ -353,21 +339,17 @@ xfs_initialize_perag(
/* Active ref owned by mount indicates AG is online. */
atomic_set(&pag->pag_active_ref, 1);
- /* first new pag is fully initialized */
- if (first_initialised == NULLAGNUMBER)
- first_initialised = index;
-
/*
* Pre-calculated geometry
*/
- pag->block_count = __xfs_ag_block_count(mp, index, agcount,
+ pag->block_count = __xfs_ag_block_count(mp, index, new_agcount,
dblocks);
pag->min_block = XFS_AGFL_BLOCK(mp);
__xfs_agino_range(mp, pag->block_count, &pag->agino_min,
&pag->agino_max);
}
- index = xfs_set_inode_alloc(mp, agcount);
+ index = xfs_set_inode_alloc(mp, new_agcount);
if (maxagi)
*maxagi = index;
@@ -381,8 +363,7 @@ out_remove_pag:
out_free_pag:
kfree(pag);
out_unwind_new_pags:
- /* unwind any prior newly initialized pags */
- xfs_free_unused_perag_range(mp, first_initialised, agcount);
+ xfs_free_perag_range(mp, old_agcount, index);
return error;
}
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index d9cccd093b60..9edfe0e96439 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -144,12 +144,13 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA)
__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES)
__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET)
-void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart,
- xfs_agnumber_t agend);
-int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
- xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
+int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t old_agcount,
+ xfs_agnumber_t agcount, xfs_rfsblock_t dcount,
+ xfs_agnumber_t *maxagi);
+void xfs_free_perag_range(struct xfs_mount *mp, xfs_agnumber_t first_agno,
+ xfs_agnumber_t end_agno);
int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
-void xfs_free_perag(struct xfs_mount *mp);
+int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount);
/* Passive AG references */
struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 04f64cf9777e..22bdbb3e9980 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1923,7 +1923,7 @@ restart:
error = -EFSCORRUPTED;
goto error0;
}
- if (flen < bestrlen)
+ if (flen <= bestrlen)
break;
busy = xfs_alloc_compute_aligned(args, fbno, flen,
&rbno, &rlen, &busy_gen);
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 3c40f37e82c7..c962ad64b0c1 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -62,12 +62,12 @@ xfs_trans_ichgtime(
ASSERT(tp);
xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
- tv = current_time(inode);
+ /* If the mtime changes, then ctime must also change */
+ ASSERT(flags & XFS_ICHGTIME_CHG);
+ tv = inode_set_ctime_current(inode);
if (flags & XFS_ICHGTIME_MOD)
inode_set_mtime_to_ts(inode, tv);
- if (flags & XFS_ICHGTIME_CHG)
- inode_set_ctime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_ACCESS)
inode_set_atime_to_ts(inode, tv);
if (flags & XFS_ICHGTIME_CREATE)
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index 49dc38acc66b..4505f4829d53 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -801,7 +801,7 @@ xrep_bmap(
{
struct xrep_bmap *rb;
char *descr;
- unsigned int max_bmbt_recs;
+ xfs_extnum_t max_bmbt_recs;
bool large_extcount;
int error = 0;
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 67478294f11a..155bbaaa496e 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -1084,9 +1084,11 @@ xrep_metadata_inode_forks(
return error;
/* Make sure the attr fork looks ok before we delete it. */
- error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
- if (error)
- return error;
+ if (xfs_inode_hasattr(sc->ip)) {
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
+ if (error)
+ return error;
+ }
/* Clear the reflink flag since metadata never shares. */
if (xfs_is_reflink_inode(sc->ip)) {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 6dead20338e2..559a3a577097 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -116,7 +116,7 @@ xfs_end_ioend(
if (unlikely(error)) {
if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
- xfs_bmap_punch_delalloc_range(ip, offset,
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset,
offset + size);
}
goto done;
@@ -456,7 +456,7 @@ xfs_discard_folio(
* byte of the next folio. Hence the end offset is only dependent on the
* folio itself and not the start offset that is passed in.
*/
- xfs_bmap_punch_delalloc_range(ip, pos,
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos,
folio_pos(folio) + folio_size(folio));
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 053d567c9108..4719ec90029c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -442,11 +442,12 @@ out_unlock_iolock:
void
xfs_bmap_punch_delalloc_range(
struct xfs_inode *ip,
+ int whichfork,
xfs_off_t start_byte,
xfs_off_t end_byte)
{
struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = &ip->i_df;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
struct xfs_bmbt_irec got, del;
@@ -474,11 +475,14 @@ xfs_bmap_punch_delalloc_range(
continue;
}
- xfs_bmap_del_extent_delay(ip, XFS_DATA_FORK, &icur, &got, &del);
+ xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del);
if (!xfs_iext_get_extent(ifp, &icur, &got))
break;
}
+ if (whichfork == XFS_COW_FORK && !ifp->if_bytes)
+ xfs_inode_clear_cowblocks_tag(ip);
+
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
}
@@ -580,7 +584,7 @@ xfs_free_eofblocks(
*/
if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
if (ip->i_delayed_blks) {
- xfs_bmap_punch_delalloc_range(ip,
+ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK,
round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
LLONG_MAX);
}
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index eb0895bfb9da..b29760d36e1a 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -30,7 +30,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap)
}
#endif /* CONFIG_XFS_RT */
-void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork,
xfs_off_t start_byte, xfs_off_t end_byte);
struct kgetbmap {
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index aa4dbda7b536..e8196f5778e2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -2115,6 +2115,13 @@ xfs_alloc_buftarg(
btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
mp, ops);
+ if (bdev_can_atomic_write(btp->bt_bdev)) {
+ btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes(
+ btp->bt_bdev);
+ btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes(
+ btp->bt_bdev);
+ }
+
/*
* When allocating the buftargs we have not yet read the super block and
* thus don't know the file system sector size yet.
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 209a389f2abc..3d56bc7a35cc 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -124,6 +124,10 @@ struct xfs_buftarg {
struct percpu_counter bt_io_count;
struct ratelimit_state bt_ioerror_rl;
+ /* Atomic write unit values */
+ unsigned int bt_bdev_awu_min;
+ unsigned int bt_bdev_awu_max;
+
/* built-in cache, if we're not using the perag one */
struct xfs_buf_cache bt_cache[];
};
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 09e893cf563c..5180cbf5a90b 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -22,6 +22,9 @@
#include "xfs_inode.h"
#include "xfs_dir2.h"
#include "xfs_quota.h"
+#include "xfs_alloc.h"
+#include "xfs_ag.h"
+#include "xfs_sb.h"
/*
* This is the number of entries in the l_buf_cancel_table used during
@@ -685,6 +688,67 @@ xlog_recover_do_inode_buffer(
}
/*
+ * Update the in-memory superblock and perag structures from the primary SB
+ * buffer.
+ *
+ * This is required because transactions running after growfs may require the
+ * updated values to be set in a previous fully committed transaction.
+ */
+static int
+xlog_recover_do_primary_sb_buffer(
+ struct xfs_mount *mp,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f,
+ xfs_lsn_t current_lsn)
+{
+ struct xfs_dsb *dsb = bp->b_addr;
+ xfs_agnumber_t orig_agcount = mp->m_sb.sb_agcount;
+ int error;
+
+ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
+
+ if (orig_agcount == 0) {
+ xfs_alert(mp, "Trying to grow file system without AGs");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Update the in-core super block from the freshly recovered on-disk one.
+ */
+ xfs_sb_from_disk(&mp->m_sb, dsb);
+
+ if (mp->m_sb.sb_agcount < orig_agcount) {
+ xfs_alert(mp, "Shrinking AG count in log recovery not supported");
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Growfs can also grow the last existing AG. In this case we also need
+ * to update the length in the in-core perag structure and values
+ * depending on it.
+ */
+ error = xfs_update_last_ag_size(mp, orig_agcount);
+ if (error)
+ return error;
+
+ /*
+ * Initialize the new perags, and also update various block and inode
+ * allocator settings based on the number of AGs or total blocks.
+ * Because of the latter this also needs to happen if the agcount did
+ * not change.
+ */
+ error = xfs_initialize_perag(mp, orig_agcount, mp->m_sb.sb_agcount,
+ mp->m_sb.sb_dblocks, &mp->m_maxagi);
+ if (error) {
+ xfs_warn(mp, "Failed recovery per-ag init: %d", error);
+ return error;
+ }
+ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
+ return 0;
+}
+
+/*
* V5 filesystems know the age of the buffer on disk being recovered. We can
* have newer objects on disk than we are replaying, and so for these cases we
* don't want to replay the current change as that will make the buffer contents
@@ -967,6 +1031,12 @@ xlog_recover_buf_commit_pass2(
dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
if (!dirty)
goto out_release;
+ } else if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) &&
+ xfs_buf_daddr(bp) == 0) {
+ error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f,
+ current_lsn);
+ if (error)
+ goto out_release;
} else {
xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
}
diff --git a/fs/xfs/xfs_exchrange.c b/fs/xfs/xfs_exchrange.c
index 75cb53f090d1..fa29c8b334d2 100644
--- a/fs/xfs/xfs_exchrange.c
+++ b/fs/xfs/xfs_exchrange.c
@@ -813,8 +813,6 @@ xfs_ioc_exchange_range(
.file2 = file,
};
struct xfs_exchange_range args;
- struct fd file1;
- int error;
if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
@@ -828,14 +826,12 @@ xfs_ioc_exchange_range(
fxr.length = args.length;
fxr.flags = args.flags;
- file1 = fdget(args.file1_fd);
- if (!fd_file(file1))
+ CLASS(fd, file1)(args.file1_fd);
+ if (fd_empty(file1))
return -EBADF;
fxr.file1 = fd_file(file1);
- error = xfs_exchange_range(&fxr);
- fdput(file1);
- return error;
+ return xfs_exchange_range(&fxr);
}
/* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */
@@ -909,8 +905,6 @@ xfs_ioc_commit_range(
struct xfs_commit_range_fresh *kern_f;
struct xfs_inode *ip2 = XFS_I(file_inode(file));
struct xfs_mount *mp = ip2->i_mount;
- struct fd file1;
- int error;
kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness;
@@ -934,12 +928,10 @@ xfs_ioc_commit_range(
fxr.file2_ctime.tv_sec = kern_f->file2_ctime;
fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec;
- file1 = fdget(args.file1_fd);
+ CLASS(fd, file1)(args.file1_fd);
if (fd_empty(file1))
return -EBADF;
fxr.file1 = fd_file(file1);
- error = xfs_exchange_range(&fxr);
- fdput(file1);
- return error;
+ return xfs_exchange_range(&fxr);
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 412b1d71b52b..ca47cae5a40a 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -348,9 +348,82 @@ xfs_file_splice_read(
}
/*
+ * Take care of zeroing post-EOF blocks when they might exist.
+ *
+ * Returns 0 if successful, a negative error for a failure, or 1 if this
+ * function dropped the iolock and reacquired it exclusively and the caller
+ * needs to restart the write sanity checks.
+ */
+static ssize_t
+xfs_file_write_zero_eof(
+ struct kiocb *iocb,
+ struct iov_iter *from,
+ unsigned int *iolock,
+ size_t count,
+ bool *drained_dio)
+{
+ struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host);
+ loff_t isize;
+ int error;
+
+ /*
+ * We need to serialise against EOF updates that occur in IO completions
+ * here. We want to make sure that nobody is changing the size while
+ * we do this check until we have placed an IO barrier (i.e. hold
+ * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
+ * spinlock effectively forms a memory barrier once we have
+ * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
+ * hence be able to correctly determine if we need to run zeroing.
+ */
+ spin_lock(&ip->i_flags_lock);
+ isize = i_size_read(VFS_I(ip));
+ if (iocb->ki_pos <= isize) {
+ spin_unlock(&ip->i_flags_lock);
+ return 0;
+ }
+ spin_unlock(&ip->i_flags_lock);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ return -EAGAIN;
+
+ if (!*drained_dio) {
+ /*
+ * If zeroing is needed and we are currently holding the iolock
+ * shared, we need to update it to exclusive which implies
+ * having to redo all checks before.
+ */
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
+
+ /*
+ * We now have an IO submission barrier in place, but AIO can do
+ * EOF updates during IO completion and hence we now need to
+ * wait for all of them to drain. Non-AIO DIO will have drained
+ * before we are given the XFS_IOLOCK_EXCL, and so for most
+ * cases this wait is a no-op.
+ */
+ inode_dio_wait(VFS_I(ip));
+ *drained_dio = true;
+ return 1;
+ }
+
+ trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
+
+ xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+ error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
+ xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+
+ return error;
+}
+
+/*
* Common pre-write limit and setup checks.
*
- * Called with the iolocked held either shared and exclusive according to
+ * Called with the iolock held either shared or exclusive according to
* @iolock, and returns with it held. Might upgrade the iolock to exclusive
* if called for a direct write beyond i_size.
*/
@@ -360,13 +433,10 @@ xfs_file_write_checks(
struct iov_iter *from,
unsigned int *iolock)
{
- struct file *file = iocb->ki_filp;
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- ssize_t error = 0;
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
size_t count = iov_iter_count(from);
bool drained_dio = false;
- loff_t isize;
+ ssize_t error;
restart:
error = generic_write_checks(iocb, from);
@@ -389,7 +459,7 @@ restart:
* exclusively.
*/
if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
- xfs_iunlock(ip, *iolock);
+ xfs_iunlock(XFS_I(inode), *iolock);
*iolock = XFS_IOLOCK_EXCL;
error = xfs_ilock_iocb(iocb, *iolock);
if (error) {
@@ -400,64 +470,24 @@ restart:
}
/*
- * If the offset is beyond the size of the file, we need to zero any
+ * If the offset is beyond the size of the file, we need to zero all
* blocks that fall between the existing EOF and the start of this
- * write. If zeroing is needed and we are currently holding the iolock
- * shared, we need to update it to exclusive which implies having to
- * redo all checks before.
- *
- * We need to serialise against EOF updates that occur in IO completions
- * here. We want to make sure that nobody is changing the size while we
- * do this check until we have placed an IO barrier (i.e. hold the
- * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
- * spinlock effectively forms a memory barrier once we have the
- * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
- * hence be able to correctly determine if we need to run zeroing.
+ * write.
*
- * We can do an unlocked check here safely as IO completion can only
- * extend EOF. Truncate is locked out at this point, so the EOF can
- * not move backwards, only forwards. Hence we only need to take the
- * slow path and spin locks when we are at or beyond the current EOF.
+ * We can do an unlocked check for i_size here safely as I/O completion
+ * can only extend EOF. Truncate is locked out at this point, so the
+ * EOF can not move backwards, only forwards. Hence we only need to take
+ * the slow path when we are at or beyond the current EOF.
*/
- if (iocb->ki_pos <= i_size_read(inode))
- goto out;
-
- spin_lock(&ip->i_flags_lock);
- isize = i_size_read(inode);
- if (iocb->ki_pos > isize) {
- spin_unlock(&ip->i_flags_lock);
-
- if (iocb->ki_flags & IOCB_NOWAIT)
- return -EAGAIN;
-
- if (!drained_dio) {
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
- }
- /*
- * We now have an IO submission barrier in place, but
- * AIO can do EOF updates during IO completion and hence
- * we now need to wait for all of them to drain. Non-AIO
- * DIO will have drained before we are given the
- * XFS_IOLOCK_EXCL, and so for most cases this wait is a
- * no-op.
- */
- inode_dio_wait(inode);
- drained_dio = true;
+ if (iocb->ki_pos > i_size_read(inode)) {
+ error = xfs_file_write_zero_eof(iocb, from, iolock, count,
+ &drained_dio);
+ if (error == 1)
goto restart;
- }
-
- trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
- error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
if (error)
return error;
- } else
- spin_unlock(&ip->i_flags_lock);
+ }
-out:
return kiocb_modified(iocb);
}
@@ -822,6 +852,20 @@ xfs_file_write_iter(
if (IS_DAX(inode))
return xfs_file_dax_write(iocb, from);
+ if (iocb->ki_flags & IOCB_ATOMIC) {
+ /*
+ * Currently only atomic writing of a single FS block is
+ * supported. It would be possible to atomic write smaller than
+ * a FS block, but there is no requirement to support this.
+ * Note that iomap also does not support this yet.
+ */
+ if (ocount != ip->i_mount->m_sb.sb_blocksize)
+ return -EINVAL;
+ ret = generic_atomic_write_valid(iocb, from);
+ if (ret)
+ return ret;
+ }
+
if (iocb->ki_flags & IOCB_DIRECT) {
/*
* Allow a directio write to fall back to a buffered
@@ -1209,6 +1253,8 @@ xfs_file_open(
if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT;
+ if (xfs_inode_can_atomicwrite(XFS_I(inode)))
+ file->f_mode |= FMODE_CAN_ATOMIC_WRITE;
return generic_file_open(inode, file);
}
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index e3aaa0555597..290ba8887d29 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -64,25 +64,31 @@ xfs_filestream_pick_ag(
struct xfs_perag *pag;
struct xfs_perag *max_pag = NULL;
xfs_extlen_t minlen = *longest;
- xfs_extlen_t free = 0, minfree, maxfree = 0;
+ xfs_extlen_t minfree, maxfree = 0;
xfs_agnumber_t agno;
bool first_pass = true;
- int err;
/* 2% of an AG's blocks must be free for it to be chosen. */
minfree = mp->m_sb.sb_agblocks / 50;
restart:
for_each_perag_wrap(mp, start_agno, agno, pag) {
+ int err;
+
trace_xfs_filestream_scan(pag, pino);
+
*longest = 0;
err = xfs_bmap_longest_free_extent(pag, NULL, longest);
if (err) {
- if (err != -EAGAIN)
- break;
- /* Couldn't lock the AGF, skip this AG. */
- err = 0;
- continue;
+ if (err == -EAGAIN) {
+ /* Couldn't lock the AGF, skip this AG. */
+ err = 0;
+ continue;
+ }
+ xfs_perag_rele(pag);
+ if (max_pag)
+ xfs_perag_rele(max_pag);
+ return err;
}
/* Keep track of the AG with the most free blocks. */
@@ -107,8 +113,9 @@ restart:
!(flags & XFS_PICK_USERDATA) ||
(flags & XFS_PICK_LOWSPACE))) {
/* Break out, retaining the reference on the AG. */
- free = pag->pagf_freeblks;
- break;
+ if (max_pag)
+ xfs_perag_rele(max_pag);
+ goto done;
}
}
@@ -116,57 +123,47 @@ restart:
atomic_dec(&pag->pagf_fstrms);
}
- if (err) {
- xfs_perag_rele(pag);
- if (max_pag)
- xfs_perag_rele(max_pag);
- return err;
+ /*
+ * Allow a second pass to give xfs_bmap_longest_free_extent() another
+ * attempt at locking AGFs that it might have skipped over before we
+ * fail.
+ */
+ if (first_pass) {
+ first_pass = false;
+ goto restart;
}
- if (!pag) {
- /*
- * Allow a second pass to give xfs_bmap_longest_free_extent()
- * another attempt at locking AGFs that it might have skipped
- * over before we fail.
- */
- if (first_pass) {
- first_pass = false;
- goto restart;
- }
+ /*
+ * We must be low on data space, so run a final lowspace optimised
+ * selection pass if we haven't already.
+ */
+ if (!(flags & XFS_PICK_LOWSPACE)) {
+ flags |= XFS_PICK_LOWSPACE;
+ goto restart;
+ }
- /*
- * We must be low on data space, so run a final lowspace
- * optimised selection pass if we haven't already.
- */
- if (!(flags & XFS_PICK_LOWSPACE)) {
- flags |= XFS_PICK_LOWSPACE;
- goto restart;
+ /*
+ * No unassociated AGs are available, so select the AG with the most
+ * free space, regardless of whether it's already in use by another
+ * filestream. If none suit, just use whatever AG we can grab.
+ */
+ if (!max_pag) {
+ for_each_perag_wrap(args->mp, 0, start_agno, pag) {
+ max_pag = pag;
+ break;
}
- /*
- * No unassociated AGs are available, so select the AG with the
- * most free space, regardless of whether it's already in use by
- * another filestream. It none suit, just use whatever AG we can
- * grab.
- */
- if (!max_pag) {
- for_each_perag_wrap(args->mp, 0, start_agno, args->pag)
- break;
- atomic_inc(&args->pag->pagf_fstrms);
- *longest = 0;
- } else {
- pag = max_pag;
- free = maxfree;
- atomic_inc(&pag->pagf_fstrms);
- }
- } else if (max_pag) {
- xfs_perag_rele(max_pag);
+ /* Bail if there are no AGs at all to select from. */
+ if (!max_pag)
+ return -ENOSPC;
}
- trace_xfs_filestream_pick(pag, pino, free);
+ pag = max_pag;
+ atomic_inc(&pag->pagf_fstrms);
+done:
+ trace_xfs_filestream_pick(pag, pino);
args->pag = pag;
return 0;
-
}
static struct xfs_inode *
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 3643cc843f62..b247d895c276 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -87,6 +87,7 @@ xfs_growfs_data_private(
struct xfs_mount *mp, /* mount point for filesystem */
struct xfs_growfs_data *in) /* growfs data input struct */
{
+ xfs_agnumber_t oagcount = mp->m_sb.sb_agcount;
struct xfs_buf *bp;
int error;
xfs_agnumber_t nagcount;
@@ -94,7 +95,6 @@ xfs_growfs_data_private(
xfs_rfsblock_t nb, nb_div, nb_mod;
int64_t delta;
bool lastag_extended = false;
- xfs_agnumber_t oagcount;
struct xfs_trans *tp;
struct aghdr_init_data id = {};
struct xfs_perag *last_pag;
@@ -138,16 +138,14 @@ xfs_growfs_data_private(
if (delta == 0)
return 0;
- oagcount = mp->m_sb.sb_agcount;
- /* allocate the new per-ag structures */
- if (nagcount > oagcount) {
- error = xfs_initialize_perag(mp, nagcount, nb, &nagimax);
- if (error)
- return error;
- } else if (nagcount < oagcount) {
- /* TODO: shrinking the entire AGs hasn't yet completed */
+ /* TODO: shrinking by entire AGs is not yet supported */
+ if (nagcount < oagcount)
return -EINVAL;
- }
+
+ /* allocate the new per-ag structures */
+ error = xfs_initialize_perag(mp, oagcount, nagcount, nb, &nagimax);
+ if (error)
+ return error;
if (delta > 0)
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
@@ -231,7 +229,7 @@ out_trans_cancel:
xfs_trans_cancel(tp);
out_free_unused_perag:
if (nagcount > oagcount)
- xfs_free_unused_perag_range(mp, oagcount, nagcount);
+ xfs_free_perag_range(mp, oagcount, nagcount);
return error;
}
diff --git a/fs/xfs/xfs_handle.c b/fs/xfs/xfs_handle.c
index 49e5e5f04e60..f19fce557354 100644
--- a/fs/xfs/xfs_handle.c
+++ b/fs/xfs/xfs_handle.c
@@ -85,22 +85,23 @@ xfs_find_handle(
int hsize;
xfs_handle_t handle;
struct inode *inode;
- struct fd f = EMPTY_FD;
struct path path;
int error;
struct xfs_inode *ip;
if (cmd == XFS_IOC_FD_TO_HANDLE) {
- f = fdget(hreq->fd);
- if (!fd_file(f))
+ CLASS(fd, f)(hreq->fd);
+
+ if (fd_empty(f))
return -EBADF;
- inode = file_inode(fd_file(f));
+ path = fd_file(f)->f_path;
+ path_get(&path);
} else {
error = user_path_at(AT_FDCWD, hreq->path, 0, &path);
if (error)
return error;
- inode = d_inode(path.dentry);
}
+ inode = d_inode(path.dentry);
ip = XFS_I(inode);
/*
@@ -134,10 +135,7 @@ xfs_find_handle(
error = 0;
out_put:
- if (cmd == XFS_IOC_FD_TO_HANDLE)
- fdput(f);
- else
- path_put(&path);
+ path_put(&path);
return error;
}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index bcc277fc0a83..19dcb569a3e7 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1409,7 +1409,7 @@ xfs_inactive(
if (S_ISREG(VFS_I(ip)->i_mode) &&
(ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 ||
- ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
+ xfs_inode_has_filedata(ip)))
truncate = 1;
if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 97ed912306fd..a2a6b5fd2545 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -292,6 +292,11 @@ static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
}
+static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip)
+{
+ return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0;
+}
+
/*
* Check if an inode has any data in the COW fork. This might be often false
* even for inodes with the reflink flag when there is no pending COW operation.
@@ -327,6 +332,21 @@ static inline bool xfs_inode_has_bigrtalloc(struct xfs_inode *ip)
(XFS_IS_REALTIME_INODE(ip) ? \
(ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp)
+static inline bool
+xfs_inode_can_atomicwrite(
+ struct xfs_inode *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+
+ if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min)
+ return false;
+ if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max)
+ return false;
+
+ return true;
+}
+
/*
* In-core inode flags.
*/
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a20d426ef021..af1bb5db1c59 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -481,7 +481,7 @@ xfs_ioctl_setattr_xflags(
if (rtflag != XFS_IS_REALTIME_INODE(ip)) {
/* Can't change realtime flag if any extents are allocated. */
- if (ip->i_df.if_nextents || ip->i_delayed_blks)
+ if (xfs_inode_has_filedata(ip))
return -EINVAL;
/*
@@ -602,7 +602,7 @@ xfs_ioctl_setattr_check_extsize(
if (!fa->fsx_valid)
return 0;
- if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents &&
+ if (S_ISREG(VFS_I(ip)->i_mode) && xfs_inode_has_filedata(ip) &&
XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize)
return -EINVAL;
@@ -881,41 +881,29 @@ xfs_ioc_swapext(
xfs_swapext_t *sxp)
{
xfs_inode_t *ip, *tip;
- struct fd f, tmp;
- int error = 0;
/* Pull information for the target fd */
- f = fdget((int)sxp->sx_fdtarget);
- if (!fd_file(f)) {
- error = -EINVAL;
- goto out;
- }
+ CLASS(fd, f)((int)sxp->sx_fdtarget);
+ if (fd_empty(f))
+ return -EINVAL;
if (!(fd_file(f)->f_mode & FMODE_WRITE) ||
!(fd_file(f)->f_mode & FMODE_READ) ||
- (fd_file(f)->f_flags & O_APPEND)) {
- error = -EBADF;
- goto out_put_file;
- }
+ (fd_file(f)->f_flags & O_APPEND))
+ return -EBADF;
- tmp = fdget((int)sxp->sx_fdtmp);
- if (!fd_file(tmp)) {
- error = -EINVAL;
- goto out_put_file;
- }
+ CLASS(fd, tmp)((int)sxp->sx_fdtmp);
+ if (fd_empty(tmp))
+ return -EINVAL;
if (!(fd_file(tmp)->f_mode & FMODE_WRITE) ||
!(fd_file(tmp)->f_mode & FMODE_READ) ||
- (fd_file(tmp)->f_flags & O_APPEND)) {
- error = -EBADF;
- goto out_put_tmp_file;
- }
+ (fd_file(tmp)->f_flags & O_APPEND))
+ return -EBADF;
if (IS_SWAPFILE(file_inode(fd_file(f))) ||
- IS_SWAPFILE(file_inode(fd_file(tmp)))) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ IS_SWAPFILE(file_inode(fd_file(tmp))))
+ return -EINVAL;
/*
* We need to ensure that the fds passed in point to XFS inodes
@@ -923,37 +911,22 @@ xfs_ioc_swapext(
* control over what the user passes us here.
*/
if (fd_file(f)->f_op != &xfs_file_operations ||
- fd_file(tmp)->f_op != &xfs_file_operations) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ fd_file(tmp)->f_op != &xfs_file_operations)
+ return -EINVAL;
ip = XFS_I(file_inode(fd_file(f)));
tip = XFS_I(file_inode(fd_file(tmp)));
- if (ip->i_mount != tip->i_mount) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
-
- if (ip->i_ino == tip->i_ino) {
- error = -EINVAL;
- goto out_put_tmp_file;
- }
+ if (ip->i_mount != tip->i_mount)
+ return -EINVAL;
- if (xfs_is_shutdown(ip->i_mount)) {
- error = -EIO;
- goto out_put_tmp_file;
- }
+ if (ip->i_ino == tip->i_ino)
+ return -EINVAL;
- error = xfs_swap_extents(ip, tip, sxp);
+ if (xfs_is_shutdown(ip->i_mount))
+ return -EIO;
- out_put_tmp_file:
- fdput(tmp);
- out_put_file:
- fdput(f);
- out:
- return error;
+ return xfs_swap_extents(ip, tip, sxp);
}
static int
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1e11f48814c0..86da16f54be9 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -707,7 +707,7 @@ imap_needs_cow(
return false;
/* when zeroing we don't have to COW holes or unwritten extents */
- if (flags & IOMAP_ZERO) {
+ if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) {
if (!nimaps ||
imap->br_startblock == HOLESTARTBLOCK ||
imap->br_state == XFS_EXT_UNWRITTEN)
@@ -975,6 +975,7 @@ xfs_buffered_write_iomap_begin(
int allocfork = XFS_DATA_FORK;
int error = 0;
unsigned int lockmode = XFS_ILOCK_EXCL;
+ unsigned int iomap_flags = 0;
u64 seq;
if (xfs_is_shutdown(mp))
@@ -1145,6 +1146,11 @@ xfs_buffered_write_iomap_begin(
}
}
+ /*
+ * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
+ * them out if the write happens to fail.
+ */
+ iomap_flags |= IOMAP_F_NEW;
if (allocfork == XFS_COW_FORK) {
error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
end_fsb - offset_fsb, prealloc_blocks, &cmap,
@@ -1162,19 +1168,11 @@ xfs_buffered_write_iomap_begin(
if (error)
goto out_unlock;
- /*
- * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
- * them out if the write happens to fail.
- */
- seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
- xfs_iunlock(ip, lockmode);
trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
-
found_imap:
- seq = xfs_iomap_inode_sequence(ip, 0);
+ seq = xfs_iomap_inode_sequence(ip, iomap_flags);
xfs_iunlock(ip, lockmode);
- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
convert_delay:
xfs_iunlock(ip, lockmode);
@@ -1188,20 +1186,20 @@ convert_delay:
return 0;
found_cow:
- seq = xfs_iomap_inode_sequence(ip, 0);
if (imap.br_startoff <= offset_fsb) {
- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0,
+ xfs_iomap_inode_sequence(ip, 0));
if (error)
goto out_unlock;
- seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
- xfs_iunlock(ip, lockmode);
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
- IOMAP_F_SHARED, seq);
+ } else {
+ xfs_trim_extent(&cmap, offset_fsb,
+ imap.br_startoff - offset_fsb);
}
- xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
+ iomap_flags |= IOMAP_F_SHARED;
+ seq = xfs_iomap_inode_sequence(ip, iomap_flags);
xfs_iunlock(ip, lockmode);
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, iomap_flags, seq);
out_unlock:
xfs_iunlock(ip, lockmode);
@@ -1215,7 +1213,10 @@ xfs_buffered_write_delalloc_punch(
loff_t length,
struct iomap *iomap)
{
- xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length);
+ xfs_bmap_punch_delalloc_range(XFS_I(inode),
+ (iomap->flags & IOMAP_F_SHARED) ?
+ XFS_COW_FORK : XFS_DATA_FORK,
+ offset, offset + length);
}
static int
@@ -1227,8 +1228,30 @@ xfs_buffered_write_iomap_end(
unsigned flags,
struct iomap *iomap)
{
- iomap_file_buffered_write_punch_delalloc(inode, offset, length, written,
- flags, iomap, &xfs_buffered_write_delalloc_punch);
+ loff_t start_byte, end_byte;
+
+ /* If we didn't reserve the blocks, we're not allowed to punch them. */
+ if (iomap->type != IOMAP_DELALLOC || !(iomap->flags & IOMAP_F_NEW))
+ return 0;
+
+ /* Nothing to do if we've written the entire delalloc extent */
+ start_byte = iomap_last_written_block(inode, offset, written);
+ end_byte = round_up(offset + length, i_blocksize(inode));
+ if (start_byte >= end_byte)
+ return 0;
+
+ /* For zeroing and unshare operations the callers already hold invalidate_lock. */
+ if (flags & (IOMAP_UNSHARE | IOMAP_ZERO)) {
+ rwsem_assert_held_write(&inode->i_mapping->invalidate_lock);
+ iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
+ iomap, xfs_buffered_write_delalloc_punch);
+ } else {
+ filemap_invalidate_lock(inode->i_mapping);
+ iomap_write_delalloc_release(inode, start_byte, end_byte, flags,
+ iomap, xfs_buffered_write_delalloc_punch);
+ filemap_invalidate_unlock(inode->i_mapping);
+ }
+
return 0;
}
@@ -1435,6 +1458,8 @@ xfs_zero_range(
{
struct inode *inode = VFS_I(ip);
+ xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
+
if (IS_DAX(inode))
return dax_zero_range(inode, pos, len, did_zero,
&xfs_dax_write_iomap_ops);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ee79cf161312..4084d26f0d78 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -570,6 +570,20 @@ xfs_stat_blksize(
return max_t(uint32_t, PAGE_SIZE, mp->m_sb.sb_blocksize);
}
+static void
+xfs_get_atomic_write_attr(
+ struct xfs_inode *ip,
+ unsigned int *unit_min,
+ unsigned int *unit_max)
+{
+ if (!xfs_inode_can_atomicwrite(ip)) {
+ *unit_min = *unit_max = 0;
+ return;
+ }
+
+ *unit_min = *unit_max = ip->i_mount->m_sb.sb_blocksize;
+}
+
STATIC int
xfs_vn_getattr(
struct mnt_idmap *idmap,
@@ -597,8 +611,9 @@ xfs_vn_getattr(
stat->gid = vfsgid_into_kgid(vfsgid);
stat->ino = ip->i_ino;
stat->atime = inode_get_atime(inode);
- stat->mtime = inode_get_mtime(inode);
- stat->ctime = inode_get_ctime(inode);
+
+ fill_mg_cmtime(stat, request_mask, inode);
+
stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks);
if (xfs_has_v3inodes(mp)) {
@@ -608,11 +623,6 @@ xfs_vn_getattr(
}
}
- if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) {
- stat->change_cookie = inode_query_iversion(inode);
- stat->result_mask |= STATX_CHANGE_COOKIE;
- }
-
/*
* Note: If you add another clause to set an attribute flag, please
* update attributes_mask below.
@@ -643,6 +653,14 @@ xfs_vn_getattr(
stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
stat->dio_offset_align = bdev_logical_block_size(bdev);
}
+ if (request_mask & STATX_WRITE_ATOMIC) {
+ unsigned int unit_min, unit_max;
+
+ xfs_get_atomic_write_attr(ip, &unit_min,
+ &unit_max);
+ generic_fill_statx_atomic_writes(stat,
+ unit_min, unit_max);
+ }
fallthrough;
default:
stat->blksize = xfs_stat_blksize(ip);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a13bf53fea49..704aaadb61cf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3393,13 +3393,6 @@ xlog_do_recover(
/* re-initialise in-core superblock and geometry structures */
mp->m_features |= xfs_sb_version_to_features(sbp);
xfs_reinit_percpu_counters(mp);
- error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks,
- &mp->m_maxagi);
- if (error) {
- xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
- return error;
- }
- mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
/* Normal transactions can now occur */
clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 1fdd79c5bfa0..25bbcc3f4ee0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -810,8 +810,8 @@ xfs_mountfs(
/*
* Allocate and initialize the per-ag data.
*/
- error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks,
- &mp->m_maxagi);
+ error = xfs_initialize_perag(mp, 0, sbp->sb_agcount,
+ mp->m_sb.sb_dblocks, &mp->m_maxagi);
if (error) {
xfs_warn(mp, "Failed per-ag init: %d", error);
goto out_free_dir;
@@ -1048,7 +1048,7 @@ xfs_mountfs(
xfs_buftarg_drain(mp->m_logdev_targp);
xfs_buftarg_drain(mp->m_ddev_targp);
out_free_perag:
- xfs_free_perag(mp);
+ xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
out_free_dir:
xfs_da_unmount(mp);
out_remove_uuid:
@@ -1129,8 +1129,7 @@ xfs_unmountfs(
xfs_errortag_clearall(mp);
#endif
shrinker_free(mp->m_inodegc_shrinker);
- xfs_free_perag(mp);
-
+ xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount);
xfs_errortag_del(mp);
xfs_error_sysfs_del(mp);
xchk_stats_unregister(mp->m_scrub_stats);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index fbb3a1594c0d..fda75db739b1 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -2063,7 +2063,7 @@ static struct file_system_type xfs_fs_type = {
.init_fs_context = xfs_init_fs_context,
.parameters = xfs_fs_parameters,
.kill_sb = xfs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME,
};
MODULE_ALIAS_FS("xfs");
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ee9f0b1f548d..fcb2bad4f76e 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -691,8 +691,8 @@ DEFINE_FILESTREAM_EVENT(xfs_filestream_lookup);
DEFINE_FILESTREAM_EVENT(xfs_filestream_scan);
TRACE_EVENT(xfs_filestream_pick,
- TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino, xfs_extlen_t free),
- TP_ARGS(pag, ino, free),
+ TP_PROTO(struct xfs_perag *pag, xfs_ino_t ino),
+ TP_ARGS(pag, ino),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_ino_t, ino)
@@ -703,14 +703,9 @@ TRACE_EVENT(xfs_filestream_pick,
TP_fast_assign(
__entry->dev = pag->pag_mount->m_super->s_dev;
__entry->ino = ino;
- if (pag) {
- __entry->agno = pag->pag_agno;
- __entry->streams = atomic_read(&pag->pagf_fstrms);
- } else {
- __entry->agno = NULLAGNUMBER;
- __entry->streams = 0;
- }
- __entry->free = free;
+ __entry->agno = pag->pag_agno;
+ __entry->streams = atomic_read(&pag->pagf_fstrms);
+ __entry->free = pag->pagf_freeblks;
),
TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d",
MAJOR(__entry->dev), MINOR(__entry->dev),