Diffstat (limited to 'fs')
529 files changed, 46486 insertions, 11414 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index bb1b286c49ae..c381499f5416 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -97,10 +97,13 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type) +struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) { struct v9fs_session_info *v9ses; + if (rcu) + return ERR_PTR(-ECHILD); + v9ses = v9fs_inode2v9ses(inode); if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { diff --git a/fs/9p/acl.h b/fs/9p/acl.h index e4f7e882272b..d43c8949e807 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -16,7 +16,7 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); +extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu); extern int v9fs_acl_chmod(struct inode *, struct p9_fid *); extern int v9fs_set_create_acl(struct inode *, struct p9_fid *, struct posix_acl *, struct posix_acl *); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 59c32c9b799f..aab5e6538660 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -121,10 +121,6 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); - /* No mandatory locks */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); @@ -312,10 +308,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); - /* No mandatory locks */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) { filemap_write_and_wait(inode->i_mapping); invalidate_mapping_pages(&inode->i_data, 0, -1); @@ -327,7 +319,6 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) ret = v9fs_file_getlock(filp, fl); else ret = -EINVAL; -out_err: return ret; } @@ -348,10 +339,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", filp, cmd, fl, filp); - /* No mandatory locks */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; - if (!(fl->fl_flags & FL_FLOCK)) goto out_err; @@ -625,12 +612,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); inode = file_inode(vma->vm_file); - - if (!mapping_can_writeback(inode->i_mapping)) - wbc.nr_to_write = 0; - - might_sleep(); - sync_inode(inode, &wbc); + filemap_fdatawrite_wbc(inode->i_mapping, &wbc); } diff --git a/fs/Kconfig b/fs/Kconfig index a7749c126b8e..b11bd4b387e1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -101,16 +101,6 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. -config MANDATORY_FILE_LOCKING - bool "Enable Mandatory file locking" - depends on FILE_LOCKING - default y - help - This option enables appropriately marked files on appropriately - mounted filesystems to support mandatory locking. - - To the best of my knowledge this is dead code that no one cares about.
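[Editor's note] A recurring change in this diff is the new ->get_acl() signature: it now takes a "bool rcu" argument so ACLs can be consulted during RCU-walk path lookup, and an implementation that would need to sleep returns -ECHILD so the VFS retries in ref-walk mode (see the 9p, bad_inode and btrfs hunks). A minimal sketch of the convention for a hypothetical filesystem "foo" (foo_read_acl_from_disk is an illustrative placeholder, not a function from this diff):

static struct posix_acl *foo_get_acl(struct inode *inode, int type, bool rcu)
{
	/* RCU-walk context: sleeping is forbidden, so punt back to ref-walk. */
	if (rcu)
		return ERR_PTR(-ECHILD);

	/* Ref-walk slow path: may block while reading the ACL from storage. */
	return foo_read_acl_from_disk(inode, type);
}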
- source "fs/crypto/Kconfig" source "fs/verity/Kconfig" @@ -358,7 +348,15 @@ config NFS_V4_2_SSC_HELPER source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" + source "fs/cifs/Kconfig" +source "fs/ksmbd/Kconfig" + +config CIFS_COMMON + tristate + default y if CIFS=y + default m if CIFS=m + source "fs/coda/Kconfig" source "fs/afs/Kconfig" source "fs/9p/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index f98f3e691c37..354e2ba3ee67 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -96,7 +96,9 @@ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ obj-$(CONFIG_UNICODE) += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ +obj-$(CONFIG_CIFS_COMMON) += cifs_common/ obj-$(CONFIG_CIFS) += cifs/ +obj-$(CONFIG_SMB_SERVER) += ksmbd/ obj-$(CONFIG_HPFS_FS) += hpfs/ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ diff --git a/fs/afs/flock.c b/fs/afs/flock.c index cb3054c7843e..c4210a3964d8 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -772,10 +772,6 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) fl->fl_type, fl->fl_flags, (long long) fl->fl_start, (long long) fl->fl_end); - /* AFS doesn't support mandatory locks */ - if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if (IS_GETLK(cmd)) return afs_do_getlk(file, fl); @@ -1695,7 +1695,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); req->done = true; - if (iocb->ki_eventfd && eventfd_signal_count()) { + if (iocb->ki_eventfd && eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 48e16144c1f7..12b8fdcc445b 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -121,7 +121,7 @@ static const char *bad_inode_get_link(struct dentry *dentry, return ERR_PTR(-EIO); } -static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type) +static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 9ef4f1fc2cb0..45df6cbccf12 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -35,6 +35,7 @@ #include <linux/uaccess.h> #include <linux/suspend.h> #include "internal.h" +#include "../block/blk.h" struct bdev_inode { struct block_device bdev; @@ -385,7 +386,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, (bdev_logical_block_size(bdev) - 1)) return -EINVAL; - bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); + bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); @@ -513,7 +514,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static __init int blkdev_init(void) { - return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); + return bioset_init(&blkdev_dio_pool, 4, + offsetof(struct blkdev_dio, bio), + BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); } module_init(blkdev_init); @@ -686,7 +689,8 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) return retval; } -int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) { struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); @@ -707,7 +711,6 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, 
int datasync) return error; } -EXPORT_SYMBOL(blkdev_fsync); /** * bdev_read_page() - Start reading a page from a block device @@ -801,7 +804,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb) if (!ei) return NULL; memset(&ei->bdev, 0, sizeof(ei->bdev)); - ei->bdev.bd_bdi = &noop_backing_dev_info; return &ei->vfs_inode; } @@ -812,8 +814,15 @@ static void bdev_free_inode(struct inode *inode) free_percpu(bdev->bd_stats); kfree(bdev->bd_meta_info); - if (!bdev_is_partition(bdev)) + if (!bdev_is_partition(bdev)) { + if (bdev->bd_disk && bdev->bd_disk->bdi) + bdi_put(bdev->bd_disk->bdi); kfree(bdev->bd_disk); + } + + if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(bdev->bd_dev)); + kmem_cache_free(bdev_cachep, BDEV_I(inode)); } @@ -826,16 +835,9 @@ static void init_once(void *data) static void bdev_evict_inode(struct inode *inode) { - struct block_device *bdev = &BDEV_I(inode)->bdev; truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); - /* Detach inode from wb early as bdi_put() may free bdi->wb */ - inode_detach_wb(inode); - if (bdev->bd_bdi != &noop_backing_dev_info) { - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; - } } static const struct super_operations bdev_sops = { @@ -902,9 +904,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev->bd_disk = disk; bdev->bd_partno = partno; bdev->bd_inode = inode; -#ifdef CONFIG_SYSFS - INIT_LIST_HEAD(&bdev->bd_holder_disks); -#endif bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); @@ -921,31 +920,6 @@ void bdev_add(struct block_device *bdev, dev_t dev) insert_inode_hash(bdev->bd_inode); } -static struct block_device *bdget(dev_t dev) -{ - struct inode *inode; - - inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; - return &BDEV_I(inode)->bdev; } - -/** - * bdgrab -- Grab a reference to an already referenced block device - * @bdev: Block device to grab a reference to. - * - * Returns the block_device with an additional reference when successful, - * or NULL if the inode is already being freed.
- */ -struct block_device *bdgrab(struct block_device *bdev) -{ - if (!igrab(bdev->bd_inode)) - return NULL; - return bdev; -} -EXPORT_SYMBOL(bdgrab); - long nr_blockdev_pages(void) { struct inode *inode; @@ -959,12 +933,6 @@ long nr_blockdev_pages(void) return ret; } -void bdput(struct block_device *bdev) -{ - iput(bdev->bd_inode); -} -EXPORT_SYMBOL(bdput); - /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest @@ -1094,148 +1062,6 @@ void bd_abort_claiming(struct block_device *bdev, void *holder) } EXPORT_SYMBOL(bd_abort_claiming); -#ifdef CONFIG_SYSFS -struct bd_holder_disk { - struct list_head list; - struct gendisk *disk; - int refcnt; -}; - -static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, - struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - list_for_each_entry(holder, &bdev->bd_holder_disks, list) - if (holder->disk == disk) - return holder; - return NULL; -} - -static int add_symlink(struct kobject *from, struct kobject *to) -{ - return sysfs_create_link(from, to, kobject_name(to)); -} - -static void del_symlink(struct kobject *from, struct kobject *to) -{ - sysfs_remove_link(from, kobject_name(to)); -} - -/** - * bd_link_disk_holder - create symlinks between holding disk and slave bdev - * @bdev: the claimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * This function creates the following sysfs symlinks. - * - * - from "slaves" directory of the holder @disk to the claimed @bdev - * - from "holders" directory of the @bdev to the holder @disk - * - * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is - * passed to bd_link_disk_holder(), then: - * - * /sys/block/dm-0/slaves/sda --> /sys/block/sda - * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 - * - * The caller must have claimed @bdev before calling this function and - * ensure that both @bdev and @disk are valid during the creation and - * lifetime of these symlinks. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - int ret = 0; - - mutex_lock(&bdev->bd_disk->open_mutex); - - WARN_ON_ONCE(!bdev->bd_holder); - - /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!disk->slave_dir || !bdev->bd_holder_dir)) - goto out_unlock; - - holder = bd_find_holder_disk(bdev, disk); - if (holder) { - holder->refcnt++; - goto out_unlock; - } - - holder = kzalloc(sizeof(*holder), GFP_KERNEL); - if (!holder) { - ret = -ENOMEM; - goto out_unlock; - } - - INIT_LIST_HEAD(&holder->list); - holder->disk = disk; - holder->refcnt = 1; - - ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); - if (ret) - goto out_free; - - ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - if (ret) - goto out_del; - /* - * bdev could be deleted beneath us which would implicitly destroy - * the holder directory. Hold on to it.
- */ - kobject_get(bdev->bd_holder_dir); - - list_add(&holder->list, &bdev->bd_holder_disks); - goto out_unlock; - -out_del: - del_symlink(disk->slave_dir, bdev_kobj(bdev)); -out_free: - kfree(holder); -out_unlock: - mutex_unlock(&bdev->bd_disk->open_mutex); - return ret; -} -EXPORT_SYMBOL_GPL(bd_link_disk_holder); - -/** - * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() - * @bdev: the claimed slave bdev - * @disk: the holding disk - * - * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. - * - * CONTEXT: - * Might sleep. - */ -void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) -{ - struct bd_holder_disk *holder; - - mutex_lock(&bdev->bd_disk->open_mutex); - - holder = bd_find_holder_disk(bdev, disk); - - if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { - del_symlink(disk->slave_dir, bdev_kobj(bdev)); - del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); - kobject_put(bdev->bd_holder_dir); - list_del_init(&holder->list); - kfree(holder); - } - - mutex_unlock(&bdev->bd_disk->open_mutex); -} -EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); -#endif - static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); @@ -1260,11 +1086,8 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) } } - if (!bdev->bd_openers) { + if (!bdev->bd_openers) set_init_blocksize(bdev); - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); - } if (test_bit(GD_NEED_PART_SCAN, &disk->state)) bdev_disk_changed(disk, false); bdev->bd_openers++; @@ -1282,16 +1105,14 @@ static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) static int blkdev_get_part(struct block_device *part, fmode_t mode) { struct gendisk *disk = part->bd_disk; - struct block_device *whole; int ret; if (part->bd_openers) goto done; - whole = bdgrab(disk->part0); - ret = blkdev_get_whole(whole, mode); + ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) - goto out_put_whole; + return ret; ret = -ENXIO; if (!bdev_nr_sectors(part)) @@ -1299,16 +1120,12 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) disk->open_partitions++; set_init_blocksize(part); - if (part->bd_bdi == &noop_backing_dev_info) - part->bd_bdi = bdi_get(disk->queue->backing_dev_info); done: part->bd_openers++; return 0; out_blkdev_put: - blkdev_put_whole(whole, mode); -out_put_whole: - bdput(whole); + blkdev_put_whole(bdev_whole(part), mode); return ret; } @@ -1321,42 +1138,42 @@ static void blkdev_put_part(struct block_device *part, fmode_t mode) blkdev_flush_mapping(part); whole->bd_disk->open_partitions--; blkdev_put_whole(whole, mode); - bdput(whole); } struct block_device *blkdev_get_no_open(dev_t dev) { struct block_device *bdev; - struct gendisk *disk; + struct inode *inode; - bdev = bdget(dev); - if (!bdev) { + inode = ilookup(blockdev_superblock, dev); + if (!inode) { blk_request_module(dev); - bdev = bdget(dev); - if (!bdev) + inode = ilookup(blockdev_superblock, dev); + if (!inode) return NULL; } - disk = bdev->bd_disk; - if (!kobject_get_unless_zero(&disk_to_dev(disk)->kobj)) - goto bdput; - if ((disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) - goto put_disk; - if (!try_module_get(bdev->bd_disk->fops->owner)) - goto put_disk; + /* switch from the inode reference to a device model one: */ + bdev = &BDEV_I(inode)->bdev; + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + bdev = NULL; + iput(inode); + + if (!bdev) + return NULL; + if ((bdev->bd_disk->flags & 
GENHD_FL_HIDDEN) || + !try_module_get(bdev->bd_disk->fops->owner)) { + put_device(&bdev->bd_device); + return NULL; + } + return bdev; -put_disk: - put_disk(disk); -bdput: - bdput(bdev); - return NULL; } void blkdev_put_no_open(struct block_device *bdev) { module_put(bdev->bd_disk->fops->owner); - put_disk(bdev->bd_disk); - bdput(bdev); + put_device(&bdev->bd_device); } /** @@ -1409,7 +1226,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) mutex_lock(&disk->open_mutex); ret = -ENXIO; - if (!(disk->flags & GENHD_FL_UP)) + if (!disk_live(disk)) goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index cec88a66bd6c..3dcf9bcc2326 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -36,6 +36,7 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o +btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index d95eb5c8cb37..0a0d0eccee4e 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -16,13 +16,16 @@ #include "btrfs_inode.h" #include "xattr.h" -struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { int size; const char *name; char *value = NULL; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; @@ -53,7 +56,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) } static int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct posix_acl *acl, int type) + struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; const char *name; @@ -114,12 +118,12 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(&init_user_ns, inode, + ret = posix_acl_update_mode(mnt_userns, inode, &inode->i_mode, &acl); if (ret) return ret; } - ret = __btrfs_set_acl(NULL, inode, acl, type); + ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type); if (ret) inode->i_mode = old_mode; return ret; @@ -140,14 +144,14 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, return ret; if (default_acl) { - ret = __btrfs_set_acl(trans, inode, default_acl, + ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!ret) - ret = __btrfs_set_acl(trans, inode, acl, + ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 78b202d198b8..f735b8798ba1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1211,7 +1211,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, again: head = NULL; - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out; BUG_ON(ret == 0); @@ -1488,14 +1488,14 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct 
btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots, - bool ignore_offset, bool skip_commit_root_sem) + bool skip_commit_root_sem) { int ret; if (!trans && !skip_commit_root_sem) down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots, ignore_offset); + time_seq, roots, false); if (!trans && !skip_commit_root_sem) up_read(&fs_info->commit_root_sem); return ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ff5f07f9940b..ba454032dbe2 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -47,7 +47,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, const u64 *extent_item_pos, bool ignore_offset); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, bool ignore_offset, + u64 time_seq, struct ulist **roots, bool skip_commit_root_sem); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9e7d9d0c763d..a3b830b8410a 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1561,7 +1561,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret) + if (ret && ret != -EAGAIN) btrfs_err(fs_info, "error relocating chunk %llu", bg->start); @@ -2105,11 +2105,22 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->used = em->len; bg->flags = map->type; ret = btrfs_add_block_group_cache(fs_info, bg); + /* + * We may have some valid block group cache added already, in + * that case we skip to the next one. + */ + if (ret == -EEXIST) { + ret = 0; + btrfs_put_block_group(bg); + continue; + } + if (ret) { btrfs_remove_free_space_cache(bg); btrfs_put_block_group(bg); break; } + btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, 0, 0, &space_info); bg->space_info = space_info; @@ -2212,6 +2223,14 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); + /* + * We've hit some error while reading the extent tree, and have + * rescue=ibadroots mount option. + * Try to fill the tree using dummy block groups so that the user can + * continue to mount and grab their data. 
+ */ + if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) + ret = fill_dummy_bgs(info); return ret; } @@ -2244,6 +2263,95 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); } +static int insert_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); + if (ret) + goto out; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +/* + * This function belongs to phase 2. + * + * See the comment at btrfs_chunk_alloc() for details about the chunk allocation + * phases. + */ +static int insert_dev_extents(struct btrfs_trans_handle *trans, + u64 chunk_offset, u64 chunk_size) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_device *device; + struct extent_map *em; + struct map_lookup *map; + u64 dev_offset; + u64 stripe_size; + int i; + int ret = 0; + + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + stripe_size = em->orig_block_len; + + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the + * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, + * resulting in persisting a device extent item with such ID. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + + ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, + stripe_size); + if (ret) + break; + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + free_extent_map(em); + return ret; +} + /* * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of * chunk allocation. 
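[Editor's note] insert_dev_extent() and insert_dev_extents() above take over the work of the old btrfs_finish_chunk_alloc() call (replaced in the following hunk). Per the comments in this diff, chunk allocation is split into two phases: phase 1 reserves device space and creates the in-memory block group, while phase 2 persists the on-disk items from btrfs_create_pending_block_groups() at commit time. A condensed sketch of the phase 2 step, not the exact call chain:

/* Phase 2, run at transaction commit for each block group from phase 1: */
ret = insert_block_group_item(trans, block_group);	/* extent tree item */
if (!ret)
	ret = insert_dev_extents(trans, block_group->start,
				 block_group->length);	/* one item per stripe */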
@@ -2278,8 +2386,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) btrfs_abort_transaction(trans, ret); } - ret = btrfs_finish_chunk_alloc(trans, block_group->start, - block_group->length); + ret = insert_dev_extents(trans, block_group->start, + block_group->length); if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c652e19ad74e..76ee1452c57b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -51,6 +51,13 @@ enum { * the file range, inode's io_tree). */ BTRFS_INODE_NO_DELALLOC_FLUSH, + /* + * Set when we are working on enabling verity for a file. Computing and + * writing the whole Merkle tree can take a while so we want to prevent + * races where two separate tasks attempt to simultaneously start verity + * on the same file. + */ + BTRFS_INODE_VERITY_IN_PROGRESS, }; /* in memory btrfs inode */ @@ -189,8 +196,10 @@ struct btrfs_inode { */ u64 csum_bytes; - /* flags field from the on disk inode */ + /* Backwards incompatible flags, lower half of inode_item::flags */ u32 flags; + /* Read-only compatibility flags, upper half of inode_item::flags */ + u32 ro_flags; /* * Counters to keep track of the number of extent item's we may use due @@ -348,6 +357,22 @@ struct btrfs_dio_private { u8 csums[]; }; +/* + * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two + * separate u32s. These two functions convert between the two representations. + */ +static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) +{ + return (flags | ((u64)ro_flags << 32)); +} + +static inline void btrfs_inode_split_flags(u64 inode_item_flags, + u32 *flags, u32 *ro_flags) +{ + *flags = (u32)inode_item_flags; + *ro_flags = (u32)(inode_item_flags >> 32); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 169508609324..86816088927f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -243,47 +243,6 @@ struct btrfsic_state { u32 datablock_size; }; -static void btrfsic_block_init(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_alloc(void); -static void btrfsic_block_free(struct btrfsic_block *b); -static void btrfsic_block_link_init(struct btrfsic_block_link *n); -static struct btrfsic_block_link *btrfsic_block_link_alloc(void); -static void btrfsic_block_link_free(struct btrfsic_block_link *n); -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); -static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); -static struct btrfsic_block_link 
*btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, - struct btrfsic_dev_state_hashtable *h); -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices); static int btrfsic_process_metablock(struct btrfsic_state *state, struct btrfsic_block *block, struct btrfsic_block_data_ctx *block_ctx, @@ -313,14 +272,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); -static void btrfsic_dump_database(struct btrfsic_state *state); -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - char **datav, unsigned int num_pages); -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char **mapped_datav, - unsigned int num_pages, - struct bio *bio, int *bio_is_patched, - int submit_bio_bh_rw); static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, @@ -1558,10 +1509,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) /* Pages must be unmapped in reverse order */ while (num_pages > 0) { num_pages--; - if (block_ctx->datav[num_pages]) { - kunmap_local(block_ctx->datav[num_pages]); + if (block_ctx->datav[num_pages]) block_ctx->datav[num_pages] = NULL; - } if (block_ctx->pagev[num_pages]) { __free_page(block_ctx->pagev[num_pages]); block_ctx->pagev[num_pages] = NULL; @@ -1638,7 +1587,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, i = j; } for (i = 0; i < num_pages; i++) - block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]); + block_ctx->datav[i] = page_address(block_ctx->pagev[i]); return block_ctx->len; } @@ -2703,7 +2652,7 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_for_each_segment(bvec, bio, iter) { BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = kmap_local_page(bvec.bv_page); + mapped_datav[i] = page_address(bvec.bv_page); i++; if (dev_state->state->print_mask & @@ -2716,9 +2665,6 @@ static void __btrfsic_submit_bio(struct bio *bio) mapped_datav, segs, bio, &bio_is_patched, bio->bi_opf); - /* Unmap in reverse order */ - for (--i; i >= 0; i--) - kunmap_local(mapped_datav[i]); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 30d82cdf128c..7869ad12bc6e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -172,10 +172,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, /* Hash through the page sector by sector */ for (pg_offset = 0; pg_offset < bytes_left; pg_offset += sectorsize) { - kaddr = kmap_atomic(page); + kaddr = 
page_address(page); crypto_shash_digest(shash, kaddr + pg_offset, sectorsize, csum); - kunmap_atomic(kaddr); if (memcmp(&csum, cb_sum, csum_size) != 0) { btrfs_print_data_csum_error(inode, disk_start, @@ -565,6 +564,16 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (isize == 0) return 0; + /* + * For current subpage support, we only support 64K page size, + * which means maximum compressed extent size (128K) is just 2x page + * size. + * This makes readahead less effective, so here we disable readahead for + * subpage for now, until full compressed write is supported. + */ + if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE) + return 0; + end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (last_offset < compressed_end) { @@ -673,6 +682,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct page *page; struct bio *comp_bio; u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; + u64 file_offset; u64 em_len; u64 em_start; struct extent_map *em; @@ -682,15 +692,17 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, em_tree = &BTRFS_I(inode)->extent_tree; + file_offset = bio_first_bvec_all(bio)->bv_offset + + page_offset(bio_first_page_all(bio)); + /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, - page_offset(bio_first_page_all(bio)), - fs_info->sectorsize); + em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); if (!em) return BLK_STS_IOERR; + ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) @@ -721,8 +733,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto fail1; for (pg_index = 0; pg_index < nr_pages; pg_index++) { - cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | - __GFP_HIGHMEM); + cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); if (!cb->compressed_pages[pg_index]) { faili = pg_index - 1; ret = BLK_STS_RESOURCE; @@ -1261,96 +1272,82 @@ void __cold btrfs_exit_compress(void) } /* - * Copy uncompressed data from working buffer to pages. + * Copy decompressed data from working buffer to pages. + * + * @buf: The decompressed data buffer + * @buf_len: The decompressed data length + * @decompressed: Number of bytes that are already decompressed inside the + * compressed extent + * @cb: The compressed extent descriptor + * @orig_bio: The original bio that the caller wants to read for + * + * An easier to understand diagram is below: + * + * |<- orig_bio ->| |<- orig_bio->| + * |<------- full decompressed extent ----->| + * |<----------- @cb range ---->| + * | |<-- @buf_len -->| + * |<--- @decompressed --->| + * + * Note that, @cb can be a subpage of the full decompressed extent, but + * @cb->start always has the same value as the orig_file_offset of the full + * decompressed extent. * - * buf_start is the byte offset from the start of our workspace buffer. + * When reading a compressed extent, we have to read the full compressed extent, + * while @orig_bio may only want part of the range. + * Thus this function will ensure only data covered by @orig_bio will be copied + * to its pages. * - * total_out is the last byte of the buffer + * Return 0 if we have copied all needed contents for @orig_bio. + * Return >0 if we need to continue decompressing.
*/ -int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, - unsigned long total_out, u64 disk_start, - struct bio *bio) +int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed) { - unsigned long buf_offset; - unsigned long current_buf_start; - unsigned long start_byte; - unsigned long prev_start_byte; - unsigned long working_bytes = total_out - buf_start; - unsigned long bytes; - struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter); - - /* - * start byte is the first byte of the page we're currently - * copying into relative to the start of the compressed data. - */ - start_byte = page_offset(bvec.bv_page) - disk_start; - - /* we haven't yet hit data corresponding to this page */ - if (total_out <= start_byte) - return 1; - - /* - * the start of the data we care about is offset into - * the middle of our working buffer - */ - if (total_out > start_byte && buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes -= buf_offset; - } else { - buf_offset = 0; - } - current_buf_start = buf_start; - - /* copy bytes from the working buffer into the pages */ - while (working_bytes > 0) { - bytes = min_t(unsigned long, bvec.bv_len, - PAGE_SIZE - (buf_offset % PAGE_SIZE)); - bytes = min(bytes, working_bytes); - - memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + buf_offset, - bytes); - flush_dcache_page(bvec.bv_page); + struct bio *orig_bio = cb->orig_bio; + /* Offset inside the full decompressed extent */ + u32 cur_offset; + + cur_offset = decompressed; + /* The main loop to do the copy */ + while (cur_offset < decompressed + buf_len) { + struct bio_vec bvec; + size_t copy_len; + u32 copy_start; + /* Offset inside the full decompressed extent */ + u32 bvec_offset; + + bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter); + /* + * cb->start may underflow, but subtracting that value can still + * give us correct offset inside the full decompressed extent. + */ + bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start; - buf_offset += bytes; - working_bytes -= bytes; - current_buf_start += bytes; + /* Haven't reached the bvec range, exit */ + if (decompressed + buf_len <= bvec_offset) + return 1; - /* check if we need to pick another page */ - bio_advance(bio, bytes); - if (!bio->bi_iter.bi_size) - return 0; - bvec = bio_iter_iovec(bio, bio->bi_iter); - prev_start_byte = start_byte; - start_byte = page_offset(bvec.bv_page) - disk_start; + copy_start = max(cur_offset, bvec_offset); + copy_len = min(bvec_offset + bvec.bv_len, + decompressed + buf_len) - copy_start; + ASSERT(copy_len); /* - * We need to make sure we're only adjusting - * our offset into compression working buffer when - * we're switching pages. Otherwise we can incorrectly - * keep copying when we were actually done. + * Extra range check to ensure we didn't go beyond + * @buf + @buf_len. */ - if (start_byte != prev_start_byte) { - /* - * make sure our new page is covered by this - * working buffer - */ - if (total_out <= start_byte) - return 1; + ASSERT(copy_start - decompressed < buf_len); + memcpy_to_page(bvec.bv_page, bvec.bv_offset, + buf + copy_start - decompressed, copy_len); + flush_dcache_page(bvec.bv_page); + cur_offset += copy_len; - /* - * the next page in the biovec might not be adjacent - * to the last page, but it might still be found - * inside this working buffer. 
bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + buf_offset; - } - } + bio_advance(orig_bio, copy_len); + /* Finished the bio */ + if (!orig_bio->bi_iter.bi_size) + return 0; } - return 1; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index c359f20920d0..399be0b435bf 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -86,9 +86,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, unsigned long *total_out); int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); -int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, - unsigned long total_out, u64 disk_start, - struct bio *bio); +int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed); blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int len, u64 disk_start, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c5c08c87e130..84627cbd5b5b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -726,21 +726,21 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, /* * search for key in the extent_buffer. The items start at offset p, - * and they are item_size apart. There are 'max' items in p. + * and they are item_size apart. * * the slot in the array is returned via slot, and it points to * the place where you would insert key if it is not found in * the array. * - * slot may point to max if the key is bigger than all of the keys + * Slot may point to total number of items if the key is bigger than + * all of the keys */ static noinline int generic_bin_search(struct extent_buffer *eb, unsigned long p, int item_size, - const struct btrfs_key *key, - int max, int *slot) + const struct btrfs_key *key, int *slot) { int low = 0; - int high = max; + int high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); @@ -799,15 +799,11 @@ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, if (btrfs_header_level(eb) == 0) return generic_bin_search(eb, offsetof(struct btrfs_leaf, items), - sizeof(struct btrfs_item), - key, btrfs_header_nritems(eb), - slot); + sizeof(struct btrfs_item), key, slot); else return generic_bin_search(eb, offsetof(struct btrfs_node, ptrs), - sizeof(struct btrfs_key_ptr), - key, btrfs_header_nritems(eb), - slot); + sizeof(struct btrfs_key_ptr), key, slot); } static void root_add_used(struct btrfs_root *root, u32 size) @@ -1237,7 +1233,6 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, u64 target; u64 nread = 0; u64 nread_max; - struct extent_buffer *eb; u32 nr; u32 blocksize; u32 nscan = 0; @@ -1266,10 +1261,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, search = btrfs_node_blockptr(node, slot); blocksize = fs_info->nodesize; - eb = find_extent_buffer(fs_info, search); - if (eb) { - free_extent_buffer(eb); - return; + if (path->reada != READA_FORWARD_ALWAYS) { + struct extent_buffer *eb; + + eb = find_extent_buffer(fs_info, search); + if (eb) { + free_extent_buffer(eb); + return; + } } target = search; @@ -2103,6 +2102,27 @@ again: } /* + * Execute search and call btrfs_previous_item to traverse backwards if the item + * was not found. + * + * Return 0 if found, 1 if not found and < 0 if error. 
+ */ +int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + int ret; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret > 0) + ret = btrfs_previous_item(root, path, key->objectid, key->type); + + if (ret == 0) + btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); + + return ret; +} + +/* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. * This is used after shifting pointers to the left, so it stops @@ -4358,16 +4378,6 @@ next: return 1; } -/* - * search the tree again to find a leaf with greater keys - * returns 0 if it found something or 1 if there are no greater leaves. - * returns < 0 on io errors. - */ -int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - return btrfs_next_old_leaf(root, path, 0); -} - int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e5e53e592d4f..dff2c8a3e059 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -281,7 +281,8 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_RO_SUPP \ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ - BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID) + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ + BTRFS_FEATURE_COMPAT_RO_VERITY) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL @@ -1012,8 +1013,6 @@ struct btrfs_fs_info { u64 zoned; }; - /* Max size to emit ZONE_APPEND write command */ - u64 max_zone_append_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; @@ -1484,20 +1483,20 @@ do { \ /* * Inode flags */ -#define BTRFS_INODE_NODATASUM (1 << 0) -#define BTRFS_INODE_NODATACOW (1 << 1) -#define BTRFS_INODE_READONLY (1 << 2) -#define BTRFS_INODE_NOCOMPRESS (1 << 3) -#define BTRFS_INODE_PREALLOC (1 << 4) -#define BTRFS_INODE_SYNC (1 << 5) -#define BTRFS_INODE_IMMUTABLE (1 << 6) -#define BTRFS_INODE_APPEND (1 << 7) -#define BTRFS_INODE_NODUMP (1 << 8) -#define BTRFS_INODE_NOATIME (1 << 9) -#define BTRFS_INODE_DIRSYNC (1 << 10) -#define BTRFS_INODE_COMPRESS (1 << 11) - -#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +#define BTRFS_INODE_NODATASUM (1U << 0) +#define BTRFS_INODE_NODATACOW (1U << 1) +#define BTRFS_INODE_READONLY (1U << 2) +#define BTRFS_INODE_NOCOMPRESS (1U << 3) +#define BTRFS_INODE_PREALLOC (1U << 4) +#define BTRFS_INODE_SYNC (1U << 5) +#define BTRFS_INODE_IMMUTABLE (1U << 6) +#define BTRFS_INODE_APPEND (1U << 7) +#define BTRFS_INODE_NODUMP (1U << 8) +#define BTRFS_INODE_NOATIME (1U << 9) +#define BTRFS_INODE_DIRSYNC (1U << 10) +#define BTRFS_INODE_COMPRESS (1U << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) #define BTRFS_INODE_FLAG_MASK \ (BTRFS_INODE_NODATASUM | \ @@ -1514,6 +1513,10 @@ do { \ BTRFS_INODE_COMPRESS | \ BTRFS_INODE_ROOT_ITEM_INIT) +#define BTRFS_INODE_RO_VERITY (1U << 0) + +#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) + struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; @@ -2781,10 +2784,11 @@ enum btrfs_flush_state { FLUSH_DELAYED_REFS = 4, FLUSH_DELALLOC = 5, FLUSH_DELALLOC_WAIT = 6, - ALLOC_CHUNK = 7, - ALLOC_CHUNK_FORCE = 8, - RUN_DELAYED_IPUTS = 9, - COMMIT_TRANS = 10, + FLUSH_DELALLOC_FULL = 7, + ALLOC_CHUNK = 8, + ALLOC_CHUNK_FORCE = 9, + RUN_DELAYED_IPUTS = 10, + COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, @@ -2901,10 +2905,13 @@ static inline int btrfs_insert_empty_item(struct 
btrfs_trans_handle *trans, return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); } -int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); + +int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path); + static inline int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *p, u64 time_seq) { @@ -2913,6 +2920,18 @@ static inline int btrfs_next_old_item(struct btrfs_root *root, return btrfs_next_old_leaf(root, p, time_seq); return 0; } + +/* + * Search the tree again to find a leaf with greater keys. + * + * Returns 0 if it found something or 1 if there are no greater leaves. + * Returns < 0 on error. + */ +static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + return btrfs_next_old_leaf(root, path, 0); +} + static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); @@ -3145,7 +3164,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root); + struct btrfs_root *parent_root, + struct user_namespace *mnt_userns); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, @@ -3194,10 +3214,10 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); +int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, - u64 end, int uptodate); + u64 end, bool uptodate); extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; extern const struct iomap_dio_ops btrfs_dio_ops; @@ -3686,7 +3706,7 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL -struct posix_acl *btrfs_get_acl(struct inode *inode, int type); +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, @@ -3779,6 +3799,30 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } +/* verity.c */ +#ifdef CONFIG_FS_VERITY + +extern const struct fsverity_operations btrfs_verityops; +int btrfs_drop_verity_items(struct btrfs_inode *inode); + +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + +#else + +static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + 
return 0; +} + +#endif + /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 257c1e18abd4..1e08eb2b27f0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,7 +6,6 @@ #include <linux/slab.h> #include <linux/iversion.h> -#include <linux/sched/mm.h> #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" @@ -672,176 +671,119 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, } /* - * This helper will insert some continuous items into the same leaf according - * to the free space of the leaf. + * Insert a single delayed item or a batch of delayed items that have consecutive + * keys if they exist. */ -static int btrfs_batch_insert_items(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_item *item) +static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *first_item) { - struct btrfs_delayed_item *curr, *next; - int free_space; - int total_size = 0; - struct extent_buffer *leaf; - char *data_ptr; - struct btrfs_key *keys; - u32 *data_size; - struct list_head head; - int slot; + LIST_HEAD(batch); + struct btrfs_delayed_item *curr; + struct btrfs_delayed_item *next; + const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + int total_size; int nitems; - int i; - int ret = 0; - - BUG_ON(!path->nodes[0]); + char *ins_data = NULL; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + int ret; - leaf = path->nodes[0]; - free_space = btrfs_leaf_free_space(leaf); - INIT_LIST_HEAD(&head); + list_add_tail(&first_item->tree_list, &batch); + nitems = 1; + total_size = first_item->data_len + sizeof(struct btrfs_item); + curr = first_item; - next = item; - nitems = 0; + while (true) { + int next_size; - /* - * count the number of the continuous items that we can insert in batch - */ - while (total_size + next->data_len + sizeof(struct btrfs_item) <= - free_space) { - total_size += next->data_len + sizeof(struct btrfs_item); - list_add_tail(&next->tree_list, &head); - nitems++; - - curr = next; next = __btrfs_next_delayed_item(curr); - if (!next) + if (!next || !btrfs_is_continuous_delayed_item(curr, next)) break; - if (!btrfs_is_continuous_delayed_item(curr, next)) + next_size = next->data_len + sizeof(struct btrfs_item); + if (total_size + next_size > max_size) break; - } - if (!nitems) { - ret = 0; - goto out; + list_add_tail(&next->tree_list, &batch); + nitems++; + total_size += next_size; + curr = next; } - keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS); - if (!keys) { - ret = -ENOMEM; - goto out; - } + if (nitems == 1) { + ins_keys = &first_item->key; + ins_sizes = &first_item->data_len; + } else { + int i = 0; - data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS); - if (!data_size) { - ret = -ENOMEM; - goto error; + ins_data = kmalloc(nitems * sizeof(u32) + + nitems * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) { + ret = -ENOMEM; + goto out; + } + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32)); + list_for_each_entry(curr, &batch, tree_list) { + ins_keys[i] = curr->key; + ins_sizes[i] = curr->data_len; + i++; + } } - /* get keys of all the delayed items */ - i = 0; - list_for_each_entry(next, &head, tree_list) { - keys[i] = next->key; - data_size[i] = next->data_len; - i++; - } + ret = 
btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes, + nitems); + if (ret) + goto out; - /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, nitems); + list_for_each_entry(curr, &batch, tree_list) { + char *data_ptr; - /* insert the dir index items */ - slot = path->slots[0]; - list_for_each_entry_safe(curr, next, &head, tree_list) { - data_ptr = btrfs_item_ptr(leaf, slot, char); - write_extent_buffer(leaf, &curr->data, - (unsigned long)data_ptr, - curr->data_len); - slot++; + data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); + write_extent_buffer(path->nodes[0], &curr->data, + (unsigned long)data_ptr, curr->data_len); + path->slots[0]++; + } - btrfs_delayed_item_release_metadata(root, curr); + /* + * Now release our path before releasing the delayed items and their + * metadata reservations, so that we don't block other tasks for more + * time than needed. + */ + btrfs_release_path(path); + list_for_each_entry_safe(curr, next, &batch, tree_list) { list_del(&curr->tree_list); + btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); } - -error: - kfree(data_size); - kfree(keys); out: + kfree(ins_data); return ret; } -/* - * This helper can just do simple insertion that needn't extend item for new - * data, such as directory name index insertion, inode insertion. - */ -static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_item *delayed_item) -{ - struct extent_buffer *leaf; - unsigned int nofs_flag; - char *ptr; - int ret; - - nofs_flag = memalloc_nofs_save(); - ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, - delayed_item->data_len); - memalloc_nofs_restore(nofs_flag); - if (ret < 0 && ret != -EEXIST) - return ret; - - leaf = path->nodes[0]; - - ptr = btrfs_item_ptr(leaf, path->slots[0], char); - - write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, - delayed_item->data_len); - btrfs_mark_buffer_dirty(leaf); - - btrfs_delayed_item_release_metadata(root, delayed_item); - return 0; -} - -/* - * we insert an item first, then if there are some continuous items, we try - * to insert those items into the same leaf. 
- */ static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_root *root, struct btrfs_delayed_node *node) { - struct btrfs_delayed_item *curr, *prev; int ret = 0; -do_again: - mutex_lock(&node->mutex); - curr = __btrfs_first_delayed_insertion_item(node); - if (!curr) - goto insert_end; - - ret = btrfs_insert_delayed_item(trans, root, path, curr); - if (ret < 0) { - btrfs_release_path(path); - goto insert_end; - } + while (ret == 0) { + struct btrfs_delayed_item *curr; - prev = curr; - curr = __btrfs_next_delayed_item(prev); - if (curr && btrfs_is_continuous_delayed_item(prev, curr)) { - /* insert the continuous items into the same leaf */ - path->slots[0]++; - btrfs_batch_insert_items(root, path, curr); + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_insertion_item(node); + if (!curr) { + mutex_unlock(&node->mutex); + break; + } + ret = btrfs_insert_delayed_item(trans, root, path, curr); + mutex_unlock(&node->mutex); } - btrfs_release_delayed_item(prev); - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(path); - mutex_unlock(&node->mutex); - goto do_again; - -insert_end: - mutex_unlock(&node->mutex); return ret; } @@ -914,7 +856,6 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_delayed_item *curr, *prev; - unsigned int nofs_flag; int ret = 0; do_again: @@ -923,9 +864,7 @@ do_again: if (!curr) goto delete_fail; - nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); - memalloc_nofs_restore(nofs_flag); if (ret < 0) goto delete_fail; else if (ret > 0) { @@ -994,7 +933,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; - unsigned int nofs_flag; int mod; int ret; @@ -1007,9 +945,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, else mod = 1; - nofs_flag = memalloc_nofs_save(); ret = btrfs_lookup_inode(trans, root, path, &key, mod); - memalloc_nofs_restore(nofs_flag); if (ret > 0) ret = -ENOENT; if (ret < 0) @@ -1066,9 +1002,7 @@ search: key.type = BTRFS_INODE_EXTREF_KEY; key.offset = -1; - nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - memalloc_nofs_restore(nofs_flag); if (ret < 0) goto err_out; ASSERT(ret); @@ -1711,6 +1645,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, struct inode *inode) { + u64 flags; + btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); @@ -1723,7 +1659,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode_peek_iversion(inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_flags(inode_item, flags); btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, @@ -1781,7 +1719,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) btrfs_stack_inode_sequence(inode_item)); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); - BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); 
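/*
 * [Editor's note] The change just below uses the helpers introduced in the
 * btrfs_inode.h hunk earlier: inode_item::flags stays a single u64 on disk,
 * while the in-memory inode now keeps ->flags (backwards-incompatible, low
 * 32 bits) and ->ro_flags (read-only compatible, high 32 bits, currently
 * only BTRFS_INODE_RO_VERITY). An illustrative round trip, with example
 * values not taken from this diff:
 *
 *	u64 on_disk = btrfs_inode_combine_flags(BTRFS_INODE_NODATASUM,
 *						BTRFS_INODE_RO_VERITY);
 *	u32 flags, ro_flags;
 *	btrfs_inode_split_flags(on_disk, &flags, &ro_flags);
 *	now flags == BTRFS_INODE_NODATASUM and ro_flags == BTRFS_INODE_RO_VERITY
 */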
+ btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), + &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 98b63ebed539..f1274d5c3805 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -170,6 +170,25 @@ out_free: return 0; } +static struct btrfs_dir_item *btrfs_lookup_match_dir( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, const char *name, + int name_len, int mod) +{ + const int ins_len = (mod < 0 ? -1 : 0); + const int cow = (mod != 0); + int ret; + + ret = btrfs_search_slot(trans, root, key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + + return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); +} + /* * lookup a directory item based on name. 'dir' is the objectid * we're searching in, and 'mod' tells us if you plan on deleting the @@ -181,23 +200,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, const char *name, int name_len, int mod) { - int ret; struct btrfs_key key; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; + struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return di; } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, @@ -211,7 +225,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, int slot; struct btrfs_path *path; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -220,20 +233,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - - /* return back any errors */ - if (ret < 0) - goto out; + di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + /* Nothing found, we're safe */ + if (ret == -ENOENT) { + ret = 0; + goto out; + } - /* nothing found, we're safe */ - if (ret > 0) { - ret = 0; - goto out; + if (ret < 0) + goto out; } /* we found an item, look for our name in the item */ - di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len); if (di) { /* our exact name was found */ ret = -EEXIST; @@ -274,21 +287,13 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, u64 objectid, const char *name, int name_len, int mod) { - int ret; struct btrfs_key key; - int ins_len = mod < 0 ? 
-1 : 0; - int cow = mod != 0; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = objectid; - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); } struct btrfs_dir_item * @@ -345,21 +350,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, const char *name, u16 name_len, int mod) { - int ret; struct btrfs_key key; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; + struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) + + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return di; } /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a59ab7b9aea0..2f9515dccce0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3392,11 +3392,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - /* For 4K sector size support, it's only read-only */ - if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) { - if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) { + if (sectorsize != PAGE_SIZE) { + btrfs_warn(fs_info, + "read-write for sector size %u with page size %lu is experimental", + sectorsize, PAGE_SIZE); + } + if (sectorsize != PAGE_SIZE) { + if (btrfs_super_incompat_flags(fs_info->super_copy) & + BTRFS_FEATURE_INCOMPAT_RAID56) { btrfs_err(fs_info, - "subpage sectorsize %u only supported read-only for page size %lu", + "RAID56 is not yet supported for sector size %u with page size %lu", sectorsize, PAGE_SIZE); err = -EINVAL; goto fail_alloc; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 268ce58d4569..fc3da7585fb7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -153,7 +153,7 @@ search_again: else key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out_free; @@ -5950,9 +5950,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_block_group *cache = NULL; struct btrfs_device *device; - struct list_head *devices; u64 group_trimmed; u64 range_end = U64_MAX; u64 start; @@ -6016,9 +6016,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) btrfs_warn(fs_info, "failed to trim %llu block group(s), last error %d", bg_failed, bg_ret); - mutex_lock(&fs_info->fs_devices->device_list_mutex); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) continue; @@ -6031,7 +6031,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) trimmed += group_trimmed; } - 
mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); if (dev_failed) btrfs_warn(fs_info, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9e81d25dea70..aaddd7225348 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -13,6 +13,7 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/cleancache.h> +#include <linux/fsverity.h> #include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" @@ -172,6 +173,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio->bi_private = NULL; + /* Caller should ensure the bio has at least some range added */ + ASSERT(bio->bi_iter.bi_size); if (is_data_inode(tree->private_data)) ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, bio_flags); @@ -2245,18 +2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, return bitset; } -/* - * helper function to set a given page up to date if all the - * extents in the tree for that page are up to date - */ -static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) - SetPageUptodate(page); -} - int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec) @@ -2688,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) start + len <= page_offset(page) + PAGE_SIZE); if (uptodate) { - btrfs_page_set_uptodate(fs_info, page, start, len); + if (fsverity_active(page->mapping->host) && + !PageError(page) && + !PageUptodate(page) && + start < i_size_read(page->mapping->host) && + !fsverity_verify_page(page)) { + btrfs_page_set_error(fs_info, page, start, len); + } else { + btrfs_page_set_uptodate(fs_info, page, start, len); + } } else { btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_set_error(fs_info, page, start, len); @@ -2779,7 +2778,7 @@ next: void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { struct btrfs_inode *inode; - int uptodate = (err == 0); + const bool uptodate = (err == 0); int ret = 0; ASSERT(page && page->mapping); @@ -2787,8 +2786,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); if (!uptodate) { - ClearPageUptodate(page); - SetPageError(page); + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 len; + + ASSERT(end + 1 - start <= U32_MAX); + len = end + 1 - start; + + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); ret = err < 0 ? 
err : -EIO; mapping_set_error(page->mapping, ret); } @@ -3097,7 +3102,7 @@ readpage_ok: /* Update page status and unlock */ end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); + start, end, PageUptodate(page)); } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -3153,11 +3158,13 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) return bio; } -struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; struct btrfs_io_bio *btrfs_bio; + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + /* this will never fail when it's backed by a bioset */ bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); @@ -3181,20 +3188,22 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * @size: portion of page that we want to write * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @bio_flags: flags of the current bio to see if we can merge them - * @return: true if page was added, false otherwise * * Attempt to add a page to bio considering stripe alignment etc. * - * Return true if successfully page added. Otherwise, return false. + * Return >= 0 for the number of bytes added to the bio. + * Can return 0 if the current bio is already at stripe/zone boundary. + * Return <0 for error. */ -static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, - u64 disk_bytenr, unsigned int size, - unsigned int pg_offset, - unsigned long bio_flags) +static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + struct page *page, + u64 disk_bytenr, unsigned int size, + unsigned int pg_offset, + unsigned long bio_flags) { struct bio *bio = bio_ctrl->bio; u32 bio_size = bio->bi_iter.bi_size; + u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig; int ret; @@ -3203,29 +3212,36 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, /* The limit should be calculated when bio_ctrl->bio is allocated */ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); if (bio_ctrl->bio_flags != bio_flags) - return false; + return 0; if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; if (!contig) - return false; + return 0; - if (bio_size + size > bio_ctrl->len_to_oe_boundary || - bio_size + size > bio_ctrl->len_to_stripe_boundary) - return false; + real_size = min(bio_ctrl->len_to_oe_boundary, + bio_ctrl->len_to_stripe_boundary) - bio_size; + real_size = min(real_size, size); + + /* + * If real_size is 0, never call bio_add_*_page(), as even size is 0, + * bio will still execute its endio function on the page! 
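	 *
	 * A worked example with hypothetical numbers: if len_to_oe_boundary
	 * is 128K, len_to_stripe_boundary is 64K and the bio already holds
	 * 60K, then real_size = min(64K, 128K) - 60K = 4K, further clamped
	 * by @size, so at most 4K more is added before the caller has to
	 * submit the bio at the boundary.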
+ */ + if (real_size == 0) + return 0; if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = bio_add_zone_append_page(bio, page, size, pg_offset); + ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); else - ret = bio_add_page(bio, page, size, pg_offset); + ret = bio_add_page(bio, page, real_size, pg_offset); - return ret == size; + return ret; } static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode) + struct btrfs_inode *inode, u64 file_offset) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_io_geometry geom; @@ -3266,9 +3282,8 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, return 0; } - ASSERT(fs_info->max_zone_append_size > 0); /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, logical); + ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (!ordered) { bio_ctrl->len_to_oe_boundary = U32_MAX; return 0; @@ -3280,6 +3295,62 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, return 0; } +static int alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + struct writeback_control *wbc, + unsigned int opf, + bio_end_io_t end_io_func, + u64 disk_bytenr, u32 offset, u64 file_offset, + unsigned long bio_flags) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio *bio; + int ret; + + /* + * For compressed page range, its disk_bytenr is always @disk_bytenr + * passed in, no matter if we have added any range into previous bio. + */ + if (bio_flags & EXTENT_BIO_COMPRESSED) + bio = btrfs_bio_alloc(disk_bytenr); + else + bio = btrfs_bio_alloc(disk_bytenr + offset); + bio_ctrl->bio = bio; + bio_ctrl->bio_flags = bio_flags; + bio->bi_end_io = end_io_func; + bio->bi_private = &inode->io_tree; + bio->bi_write_hint = inode->vfs_inode.i_write_hint; + bio->bi_opf = opf; + ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); + if (ret < 0) + goto error; + if (wbc) { + struct block_device *bdev; + + bdev = fs_info->fs_devices->latest_bdev; + bio_set_dev(bio, bdev); + wbc_init_bio(wbc, bio); + } + if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *device; + + device = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto error; + } + + btrfs_io_bio(bio)->device = device; + } + return 0; +error: + bio_ctrl->bio = NULL; + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + return ret; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting @@ -3305,61 +3376,67 @@ static int submit_extent_page(unsigned int opf, bool force_bio_submit) { int ret = 0; - struct bio *bio; - size_t io_size = min_t(size_t, size, PAGE_SIZE); struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct extent_io_tree *tree = &inode->io_tree; - struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned int cur = pg_offset; ASSERT(bio_ctrl); ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); - if (bio_ctrl->bio) { - bio = bio_ctrl->bio; - if (force_bio_submit || - !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size, - pg_offset, bio_flags)) { - ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags); + if (force_bio_submit && bio_ctrl->bio) { + ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); + bio_ctrl->bio = NULL; + if (ret < 0) + return ret; + } + + while (cur < pg_offset + size) { + u32 
offset = cur - pg_offset; + int added; + + /* Allocate new bio if needed */ + if (!bio_ctrl->bio) { + ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, + end_io_func, disk_bytenr, offset, + page_offset(page) + cur, + bio_flags); + if (ret < 0) + return ret; + } + /* + * We must go through btrfs_bio_add_page() to ensure each + * page range won't cross various boundaries. + */ + if (bio_flags & EXTENT_BIO_COMPRESSED) + added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, + size - offset, pg_offset + offset, + bio_flags); + else + added = btrfs_bio_add_page(bio_ctrl, page, + disk_bytenr + offset, size - offset, + pg_offset + offset, bio_flags); + + /* Metadata page range should never be split */ + if (!is_data_inode(&inode->vfs_inode)) + ASSERT(added == 0 || added == size - offset); + + /* At least we added some page, update the account */ + if (wbc && added) + wbc_account_cgroup_owner(wbc, page, added); + + /* We have reached boundary, submit right now */ + if (added < size - offset) { + /* The bio should contain some page(s) */ + ASSERT(bio_ctrl->bio->bi_iter.bi_size); + ret = submit_one_bio(bio_ctrl->bio, mirror_num, + bio_ctrl->bio_flags); bio_ctrl->bio = NULL; if (ret < 0) return ret; - } else { - if (wbc) - wbc_account_cgroup_owner(wbc, page, io_size); - return 0; } + cur += added; } - - bio = btrfs_bio_alloc(disk_bytenr); - bio_add_page(bio, page, io_size, pg_offset); - bio->bi_end_io = end_io_func; - bio->bi_private = tree; - bio->bi_write_hint = page->mapping->host->i_write_hint; - bio->bi_opf = opf; - if (wbc) { - struct block_device *bdev; - - bdev = fs_info->fs_devices->latest_bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, page, io_size); - } - if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; - - device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size); - if (IS_ERR(device)) - return PTR_ERR(device); - - btrfs_io_bio(bio)->device = device; - } - - bio_ctrl->bio = bio; - bio_ctrl->bio_flags = bio_flags; - ret = calc_bio_boundaries(bio_ctrl, inode); - - return ret; + return 0; } static int attach_extent_buffer_page(struct extent_buffer *eb, @@ -3488,7 +3565,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; ret = set_page_extent_mapped(page); @@ -3519,6 +3595,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } begin_page_read(fs_info, page); while (cur <= end) { + unsigned long this_bio_flag = 0; bool force_bio_submit = false; u64 disk_bytenr; @@ -3627,7 +3704,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* the get_extent function already copied into the page */ if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { - check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, true, cur, iosize); cur = cur + iosize; @@ -3722,14 +3798,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); if (ret) { - SetPageError(page); - /* - * btrfs_run_delalloc_range should return < 0 for error - * but just in case, we use > 0 here meaning the IO is - * started, so we don't want to return > 0 unless - * things are going well. - */ - return ret < 0 ? 
ret : -EIO; + btrfs_page_set_error(inode->root->fs_info, page, + page_offset(page), PAGE_SIZE); + return ret; } /* * delalloc_end is already one less than the total length, so @@ -3829,9 +3900,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - u64 cur = start; + u64 cur = page_offset(page); + u64 end = cur + PAGE_SIZE - 1; u64 extent_offset; u64 block_start; struct extent_map *em; @@ -3841,7 +3911,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; - ret = btrfs_writepage_cow_fixup(page, start, end); + ret = btrfs_writepage_cow_fixup(page); if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); @@ -3865,7 +3935,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (cur >= i_size) { btrfs_writepage_endio_finish_ordered(inode, page, cur, - end, 1); + end, true); + /* + * This range is beyond i_size, thus we don't need to + * bother writing back. + * But we still need to clear the dirty subpage bit, or + * the next time the page gets dirtied, we will try to + * writeback the sectors with subpage dirty bits, + * causing writeback without ordered extent. + */ + btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur); break; } @@ -3915,7 +3994,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, nr++; else btrfs_writepage_endio_finish_ordered(inode, - page, cur, cur + iosize - 1, 1); + page, cur, cur + iosize - 1, true); + btrfs_page_clear_dirty(fs_info, page, cur, iosize); cur += iosize; continue; } @@ -3951,6 +4031,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, cur += iosize; nr++; } + /* + * If we finish without problem, we should not only clear page dirty, + * but also empty subpage dirty bits + */ + if (!ret) + btrfs_page_assert_not_dirty(fs_info, page); *nr_ret = nr; return ret; } @@ -3981,7 +4067,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, WARN_ON(!PageLocked(page)); - ClearPageError(page); + btrfs_page_clear_error(btrfs_sb(inode->i_sb), page, + page_offset(page), PAGE_SIZE); pg_offset = offset_in_page(i_size); if (page->index > end_index || @@ -4022,10 +4109,39 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (PageError(page)) { - ret = ret < 0 ? ret : -EIO; + /* + * Here we used to have a check for PageError() and then set @ret and + * call end_extent_writepage(). + * + * But in fact setting @ret here will cause different error paths + * between subpage and regular sectorsize. + * + * For regular page size, we never submit current page, but only add + * current page to current bio. + * The bio submission can only happen in next page. + * Thus if we hit the PageError() branch, @ret is already set to + * non-zero value and will not get updated for regular sectorsize. + * + * But for subpage case, it's possible we submit part of current page, + * thus can get PageError() set by submitted bio of the same page, + * while our @ret is still 0. + * + * So here we unify the behavior and don't set @ret. + * Error can still be properly passed to higher layer as page will + * be set error, here we just don't handle the IO failure. + * + * NOTE: This is just a hotfix for subpage. 
+ * The root fix will be properly ending ordered extent when we hit + * an error during writeback. + * + * But that needs a bigger refactoring, as we not only need to grab the + * submitted OE, but also need to know exactly at which bytenr we hit + * the error. + * Currently the full page based __extent_writepage_io() is not + * capable of that. + */ + if (PageError(page)) end_extent_writepage(page, ret, start, page_end); - } unlock_page(page); ASSERT(ret <= 0); return ret; @@ -4984,7 +5100,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, ret = __extent_writepage(page, &wbc_writepages, &epd); else { btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), - page, start, start + PAGE_SIZE - 1, 1); + page, start, start + PAGE_SIZE - 1, true); unlock_page(page); } put_page(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 62027f551b44..53abdc280451 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -280,7 +280,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); -struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index df6631eefc65..2673c6ba7a4e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -233,7 +233,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 objectid, u64 offset, int mod) { - int ret; struct btrfs_key file_key; int ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; @@ -241,8 +240,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, file_key.objectid = objectid; file_key.offset = offset; file_key.type = BTRFS_EXTENT_DATA_KEY; - ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); - return ret; + + return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); } /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ee34497500e1..7ff577005d0f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -16,6 +16,7 @@ #include <linux/btrfs.h> #include <linux/uio.h> #include <linux/iversion.h> +#include <linux/fsverity.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1340,7 +1341,18 @@ static int prepare_uptodate_page(struct inode *inode, unlock_page(page); return -EIO; } - if (page->mapping != inode->i_mapping) { + + /* + * Since btrfs_readpage() will unlock the page before it + * returns, there is a window where btrfs_releasepage() can be + * called to release the page. Here we check both inode + * mapping and PagePrivate() to make sure the page was not + * released. + * + * The private flag check is essential for subpage as we need + * to store extra bitmap using page->private. 
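		 *
		 * A sketch of the assumed interleaving, using only calls that
		 * already appear in this function and in __btrfs_releasepage():
		 *
		 *   reader                      reclaim
		 *   btrfs_readpage()
		 *     unlock_page(page)
		 *                               btrfs_releasepage() drops
		 *                               page->private
		 *   lock_page(page)
		 *   mapping intact, PagePrivate() now false -> return -EAGAIN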
+ */ + if (page->mapping != inode->i_mapping || !PagePrivate(page)) { unlock_page(page); return -EAGAIN; } @@ -3604,7 +3616,13 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) static int btrfs_file_open(struct inode *inode, struct file *filp) { + int ret; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + + ret = fsverity_file_open(inode, filp); + if (ret) + return ret; return generic_file_open(inode, filp); } @@ -3633,6 +3651,9 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + if (fsverity_active(inode)) + return 0; + if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) return 0; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2131ae5b9ed7..da0eee7c9e5f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -344,19 +344,13 @@ fail: static void readahead_cache(struct inode *inode) { - struct file_ra_state *ra; + struct file_ra_state ra; unsigned long last_index; - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return; - - file_ra_state_init(ra, inode->i_mapping); + file_ra_state_init(&ra, inode->i_mapping); last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); - - kfree(ra); + page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); } static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, @@ -2544,6 +2538,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; + const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); spin_lock(&ctl->tree_lock); if (!used) @@ -2573,9 +2568,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, /* All the region is now unusable. Mark it as unused and reclaim */ if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); - } else if (block_group->zone_unusable >= - div_factor_fine(block_group->length, - fs_info->bg_reclaim_threshold)) { + } else if (bg_reclaim_threshold && + block_group->zone_unusable >= + div_factor_fine(block_group->length, bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } @@ -2652,8 +2647,11 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, * btrfs_pin_extent_for_log_replay() when replaying the log. * Advance the pointer not to overwrite the tree-log nodes. 
 	 */
-	if (block_group->alloc_offset < offset + bytes)
-		block_group->alloc_offset = offset + bytes;
+	if (block_group->start + block_group->alloc_offset <
+	    offset + bytes) {
+		block_group->alloc_offset =
+			offset + bytes - block_group->start;
+	}
 
 	return 0;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0117d867ecf8..487533c35ddb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -32,6 +32,7 @@
 #include <linux/sched/mm.h>
 #include <linux/iomap.h>
 #include <asm/unaligned.h>
+#include <linux/fsverity.h>
 #include "misc.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -286,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
 		cur_size = min_t(unsigned long, compressed_size,
				 PAGE_SIZE);
 
-		kaddr = kmap_atomic(cpage);
+		kaddr = page_address(cpage);
 		write_extent_buffer(leaf, kaddr, ptr, cur_size);
-		kunmap_atomic(kaddr);
 
 		i++;
 		ptr += cur_size;
@@ -490,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow,
  */
 static inline bool inode_can_compress(struct btrfs_inode *inode)
 {
+	/* Subpage doesn't support compression yet */
+	if (inode->root->fs_info->sectorsize < PAGE_SIZE)
+		return false;
 	if (inode->flags & BTRFS_INODE_NODATACOW ||
	    inode->flags & BTRFS_INODE_NODATASUM)
 		return false;
@@ -629,7 +632,7 @@ again:
 	 * inode has not been flagged as nocompress. This flag can
 	 * change at any time if we discover bad compression ratios.
 	 */
-	if (nr_pages > 1 && inode_need_compress(BTRFS_I(inode), start, end)) {
+	if (inode_need_compress(BTRFS_I(inode), start, end)) {
 		WARN_ON(pages);
 		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
 		if (!pages) {
@@ -682,7 +685,11 @@ again:
 		}
 	}
 cont:
-	if (start == 0) {
+	/*
+	 * Check cow_file_range() for why we don't even try to create an
+	 * inline extent for the subpage case.
+	 */
+	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
 		/* lets try to make an inline extent */
 		if (ret || total_in < actual_end) {
 			/* we didn't compress the entire range, try
@@ -973,7 +980,7 @@ retry:
 
 		p->mapping = inode->vfs_inode.i_mapping;
 		btrfs_writepage_endio_finish_ordered(inode, p, start,
-						     end, 0);
+						     end, false);
 
 		p->mapping = NULL;
 		extent_clear_unlock_delalloc(inode, start, end, NULL, 0,
@@ -1080,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode,
 
 	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
 
-	if (start == 0) {
+	/*
+	 * Due to the page size limit, for subpage we can only trigger the
+	 * writeback for the dirty sectors of the page, which means data
+	 * writeback is doing more writeback than what we want.
+	 *
+	 * This is especially unexpected for some call sites like fallocate,
+	 * where we only increase i_size after everything is done.
+	 * This means we can trigger inline extent creation even when we
+	 * didn't want to.  So here we skip inline extent creation completely.
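+	 *
+	 * Illustration with made-up numbers: with 4K sectors on a 64K page,
+	 * writing back one dirty 4K sector at file offset 0 would satisfy
+	 * the old "start == 0" check and create an inline extent for what
+	 * is really just a small piece of the file.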
+ */ + if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL); @@ -1290,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; - /* atomic_sub_return implies a barrier */ - if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < - 5 * SZ_1M) - cond_wake_up_nomb(&fs_info->async_submit_wait); - /* * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to @@ -1303,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work) */ if (async_chunk->inode) submit_compressed_extents(async_chunk); + + /* atomic_sub_return implies a barrier */ + if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < + 5 * SZ_1M) + cond_wake_up_nomb(&fs_info->async_submit_wait); } static noinline void async_cow_free(struct btrfs_work *work) @@ -1946,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ret = cow_file_range_async(inode, wbc, locked_page, start, end, page_started, nr_written); } + ASSERT(ret <= 0); if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); @@ -2285,7 +2303,6 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, struct extent_map *split_mid = NULL; struct extent_map *split_post = NULL; int ret = 0; - int modified; unsigned long flags; /* Sanity check */ @@ -2315,11 +2332,12 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, ASSERT(em->len == len); ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); + ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(!list_empty(&em->list)); flags = em->flags; clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &flags); - modified = !list_empty(&em->list); /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; @@ -2333,7 +2351,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, split_pre->compress_type = em->compress_type; split_pre->generation = em->generation; - replace_extent_mapping(em_tree, em, split_pre, modified); + replace_extent_mapping(em_tree, em, split_pre, 1); /* * Now we only have an extent_map at: @@ -2353,7 +2371,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, split_mid->flags = flags; split_mid->compress_type = em->compress_type; split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, modified); + add_extent_mapping(em_tree, split_mid, 1); } if (post) { @@ -2367,7 +2385,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, split_post->flags = flags; split_post->compress_type = em->compress_type; split_post->generation = em->generation; - add_extent_mapping(em_tree, split_post, modified); + add_extent_mapping(em_tree, split_post, 1); } /* Once for us */ @@ -2770,7 +2788,7 @@ out_page: * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. 
*/ -int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3171,7 +3189,7 @@ static void finish_ordered_fn(struct btrfs_work *work) void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, - u64 end, int uptodate) + u64 end, bool uptodate) { trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); @@ -3257,25 +3275,44 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, return 0; } - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + /* + * For subpage case, above PageChecked is not safe as it's not subpage + * compatible. + * But for now only cow fixup and compressed read utilize PageChecked + * flag, while in this context we can easily use io_bio->csum to + * determine if we really need to do csum verification. + * + * So for now, just exit if io_bio->csum is NULL, as it means it's + * compressed read, and its compressed data csum has already been + * verified. + */ + if (io_bio->csum == NULL) return 0; - if (!root->fs_info->csum_root) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && - test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); + if (!root->fs_info->csum_root) return 0; - } ASSERT(page_offset(page) <= start && end <= page_offset(page) + PAGE_SIZE - 1); for (pg_off = offset_in_page(start); pg_off < offset_in_page(end); pg_off += sectorsize, bio_offset += sectorsize) { + u64 file_offset = pg_off + page_offset(page); int ret; + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, 1, NULL)) { + /* Skip the range without csum for data reloc inode */ + clear_extent_bits(io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM); + continue; + } ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, page_offset(page) + pg_off); if (ret < 0) { @@ -3520,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* * If we have an inode with links, there are a couple of - * possibilities. Old kernels (before v3.12) used to create an + * possibilities: + * + * 1. We were halfway through creating fsverity metadata for the + * file. In that case, the orphan item represents incomplete + * fsverity metadata which must be cleaned up with + * btrfs_drop_verity_items and deleting the orphan item. + + * 2. Old kernels (before v3.12) used to create an * orphan item for truncate indicating that there were possibly * extent items past i_size that needed to be deleted. In v3.12, * truncate was changed to update i_size in sync with the extent @@ -3538,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * but either way, we can delete the orphan item. 
*/ if (ret == -ENOENT || inode->i_nlink) { - if (!ret) + if (!ret) { + ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); + if (ret) + goto out; + } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3728,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode, rdev = btrfs_inode_rdev(leaf, inode_item); BTRFS_I(inode)->index_cnt = (u64)-1; - BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), + &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); cache_index: /* @@ -3859,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_map_token token; + u64 flags; btrfs_init_map_token(&token, leaf); @@ -3894,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); - btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } @@ -5088,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, int ret; /* - * Still need to make sure the inode looks like it's been updated so - * that any holes get logged if we fsync. + * If NO_HOLES is enabled, we don't need to do anything. + * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() + * or btrfs_update_inode() will be called, which guarantee that the next + * fsync will know this inode was changed and needs to be logged. */ - if (btrfs_fs_incompat(fs_info, NO_HOLES)) { - inode->last_trans = fs_info->generation; - inode->last_sub_trans = root->log_transid; - inode->last_log_commit = root->last_log_commit; + if (btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; - } /* * 1 - for the one we're dropping @@ -5342,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(mnt_userns, dentry, attr); if (err) return err; @@ -5353,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } if (attr->ia_valid) { - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(mnt_userns, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(&init_user_ns, inode, - inode->i_mode); + err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); } return err; @@ -5522,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); if (!root) { + fsverity_cleanup_inode(inode); clear_inode(inode); return; } @@ -5604,6 +5654,7 @@ no_delete: * to retry these periodically in the future. 
*/ btrfs_remove_delayed_node(BTRFS_I(inode)); + fsverity_cleanup_inode(inode); clear_inode(inode); } @@ -6370,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct user_namespace *mnt_userns, struct inode *dir, const char *name, int name_len, u64 ref_objectid, u64 objectid, @@ -6479,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail_unlock; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(mnt_userns, inode, dir, mode); inode_set_bytes(inode, 0); inode->i_mtime = current_time(inode); @@ -6664,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_unlock; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, - mode, &index); + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; @@ -6728,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_unlock; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, - mode, &index); + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; @@ -6873,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_fail; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -8194,9 +8247,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return dip; } -static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, +static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { + struct inode *inode = iter->inode; const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & @@ -8206,13 +8260,13 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, u64 start_sector; int async_submit = 0; u64 submit_len; - int clone_offset = 0; - int clone_len; + u64 clone_offset = 0; + u64 clone_len; u64 logical; int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iomap->private; + struct btrfs_dio_data *dio_data = iter->iomap.private; struct extent_map *em = NULL; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); @@ -8255,9 +8309,9 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, status = errno_to_blk_status(ret); goto out_err_em; } - ASSERT(geom.len <= INT_MAX); - clone_len = min_t(int, submit_len, geom.len); + clone_len = min(submit_len, geom.len); + ASSERT(clone_len <= UINT_MAX); /* * This will never fail as it's passing GPF_NOFS and @@ -8401,11 +8455,47 @@ static void btrfs_readahead(struct 
readahead_control *rac) extent_readahead(rac); } +/* + * For releasepage() and invalidatepage() we have a race window where + * end_page_writeback() is called but the subpage spinlock is not yet released. + * If we continue to release/invalidate the page, we could cause use-after-free + * for subpage spinlock. So this function is to spin and wait for subpage + * spinlock. + */ +static void wait_subpage_spinlock(struct page *page) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + /* + * This may look insane as we just acquire the spinlock and release it, + * without doing anything. But we just want to make sure no one is + * still holding the subpage spinlock. + * And since the page is not dirty nor writeback, and we have page + * locked, the only possible way to hold a spinlock is from the endio + * function to clear page writeback. + * + * Here we just acquire the spinlock so that all existing callers + * should exit and we're safe to release/invalidate the page. + */ + spin_lock_irq(&subpage->lock); + spin_unlock_irq(&subpage->lock); +} + static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) + + if (ret == 1) { + wait_subpage_spinlock(page); clear_page_extent_mapped(page); + } return ret; } @@ -8469,6 +8559,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * do double ordered extent accounting on the same page. */ wait_on_page_writeback(page); + wait_subpage_spinlock(page); /* * For subpage case, we have call sites like @@ -8557,7 +8648,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, spin_unlock_irq(&inode->ordered_tree.lock); if (btrfs_dec_test_ordered_pending(inode, &ordered, - cur, range_end + 1 - cur, 1)) { + cur, range_end + 1 - cur)) { btrfs_finish_ordered_io(ordered); /* * The ordered extent has finished, now we're again @@ -8938,7 +9029,8 @@ out: */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root) + struct btrfs_root *parent_root, + struct user_namespace *mnt_userns) { struct inode *inode; int err; @@ -8949,7 +9041,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, if (err < 0) return err; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino, + inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, + ino, ino, S_IFDIR | (~current_umask() & S_IRWXUGO), &index); if (IS_ERR(inode)) @@ -8993,6 +9086,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->ro_flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->dir_index = 0; @@ -9174,6 +9268,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; u32 bi_flags = BTRFS_I(inode)->flags; + u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; @@ -9186,13 +9281,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, stat->attributes |= STATX_ATTR_IMMUTABLE; if (bi_flags & BTRFS_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + if (bi_ro_flags & BTRFS_INODE_RO_VERITY) + stat->attributes |= STATX_ATTR_VERITY; 
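+	/* The verity bit matches what ext4 and f2fs report via statx() */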
stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(mnt_userns, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -9226,8 +9323,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, bool dest_log_pinned = false; bool need_abort = false; - /* we only allow rename subvolume link between subvolumes */ - if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) + /* + * For non-subvolumes allow exchange only within one subvolume, in the + * same inode namespace. Two subvolumes (represented as directory) can + * be exchanged as they're a logical link and have a fixed inode number. + */ + if (root != dest && + (old_ino != BTRFS_FIRST_FREE_OBJECTID || + new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; /* close the race window with snapshot create/destroy ioctl */ @@ -9274,8 +9377,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(root); - root_log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9292,8 +9393,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(dest); - dest_log_pinned = true; ret = btrfs_insert_inode_ref(trans, root, old_dentry->d_name.name, old_dentry->d_name.len, @@ -9324,6 +9423,29 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode), 1); } + /* + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. + * + * We pin the logs even if at this precise moment none of the inodes was + * logged before. This is because right after we checked for that, some + * other task fsyncing some other inode not involved with this rename + * operation could log that one of our inodes exists. + * + * We don't need to pin the logs before the above calls to + * btrfs_insert_inode_ref(), since those don't ever need to change a log. 
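	 *
	 * The matching unpin happens on the way out: the out_fail path ends
	 * the log transaction for every root that was pinned here.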
+ */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_pin_log_trans(root); + root_log_pinned = true; + } + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; + } + /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9405,8 +9527,7 @@ out_fail: if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - (new_inode && - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) + btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) btrfs_set_log_full_commit(trans); if (root_log_pinned) { @@ -9430,6 +9551,7 @@ out_notrans: static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry) { @@ -9442,7 +9564,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, if (ret) return ret; - inode = btrfs_new_inode(trans, root, dir, + inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), @@ -9479,9 +9601,10 @@ out: return ret; } -static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int btrfs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; @@ -9576,8 +9699,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(root); - log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9601,6 +9722,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { + /* + * Now pin the log. We do it to ensure that no other task can + * sync the log while we are in progress with the rename, as + * that could result in an inconsistency in case any of the + * inodes that are part of this rename operation were logged + * before. + * + * We pin the log even if at this precise moment none of the + * inodes was logged before. This is because right after we + * checked for that, some other task fsyncing some other inode + * not involved with this rename operation could log that one of + * our inodes exists. + * + * We don't need to pin the logs before the above call to + * btrfs_insert_inode_ref(), since that does not need to change + * a log. 
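		 *
		 * Subvolume renames take the full-log-commit branch above
		 * instead, so they never reach this pin.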
+ */ + btrfs_pin_log_trans(root); + log_pinned = true; ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, @@ -9654,8 +9794,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (flags & RENAME_WHITEOUT) { - ret = btrfs_whiteout_for_rename(trans, root, old_dir, - old_dentry); + ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, + old_dir, old_dentry); if (ret) { btrfs_abort_transaction(trans, ret); @@ -9705,7 +9845,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di return btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); + return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + new_dentry, flags); } struct btrfs_delalloc_work { @@ -9802,11 +9943,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = sync_inode(inode, wbc); - if (!ret && - test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = sync_inode(inode, wbc); + ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; @@ -9941,9 +10078,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_unlock; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), - objectid, S_IFLNK|S_IRWXUGO, &index); + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, + S_IFLNK | S_IRWXUGO, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; @@ -10267,7 +10405,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns, if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } - return generic_permission(&init_user_ns, inode, mask); + return generic_permission(mnt_userns, inode, mask); } static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, @@ -10292,7 +10430,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (ret) goto out; - inode = btrfs_new_inode(trans, root, dir, NULL, 0, + inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { ret = PTR_ERR(inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0ba98e08a029..41524f9aeac3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -27,6 +27,7 @@ #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/fileattr.h> +#include <linux/fsverity.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -103,9 +104,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. 
*/ -static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags) +static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) { unsigned int iflags = 0; + u32 flags = binode->flags; + u32 ro_flags = binode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -121,6 +124,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags) iflags |= FS_DIRSYNC_FL; if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; + if (ro_flags & BTRFS_INODE_RO_VERITY) + iflags |= FS_VERITY_FL; if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; @@ -148,10 +153,12 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) new_fl |= S_NOATIME; if (binode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; + if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + new_fl |= S_VERITY; set_mask_bits(&inode->i_flags, - S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, - new_fl); + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | + S_VERITY, new_fl); } /* @@ -200,7 +207,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); - fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags)); + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); return 0; } @@ -224,7 +231,7 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, return -EOPNOTSUPP; fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(binode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; @@ -492,8 +499,8 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) return 1; } -static noinline int create_subvol(struct inode *dir, - struct dentry *dentry, +static noinline int create_subvol(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *name, int namelen, struct btrfs_qgroup_inherit *inherit) { @@ -638,7 +645,7 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - ret = btrfs_create_subvol_root(trans, new_root, root); + ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns); btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ @@ -830,7 +837,8 @@ free_pending: * nfs_async_unlink(). 
*/ -static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) +static int btrfs_may_delete(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *victim, int isdir) { int error; @@ -840,12 +848,12 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(&init_user_ns, dir, d_inode(victim)) || + if (check_sticky(mnt_userns, dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; @@ -864,13 +872,16 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) } /* copy of may_create in fs/namei.c() */ -static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +static inline int btrfs_may_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) + return -EOVERFLOW; + return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); } /* @@ -879,6 +890,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * inside this filesystem so it's quite a bit simpler. */ static noinline int btrfs_mksubvol(const struct path *parent, + struct user_namespace *mnt_userns, const char *name, int namelen, struct btrfs_root *snap_src, bool readonly, @@ -893,12 +905,12 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one_len(name, parent->dentry, namelen); + dentry = lookup_one(mnt_userns, name, parent->dentry, namelen); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; - error = btrfs_may_create(dir, dentry); + error = btrfs_may_create(mnt_userns, dir, dentry); if (error) goto out_dput; @@ -920,7 +932,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(dir, dentry, name, namelen, inherit); + error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -934,6 +946,7 @@ out_unlock: } static noinline int btrfs_mksnapshot(const struct path *parent, + struct user_namespace *mnt_userns, const char *name, int namelen, struct btrfs_root *root, bool readonly, @@ -963,7 +976,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - ret = btrfs_mksubvol(parent, name, namelen, + ret = btrfs_mksubvol(parent, mnt_userns, name, namelen, root, readonly, inherit); out: if (snapshot_force_cow) @@ -1792,6 +1805,7 @@ out_drop: } static noinline int __btrfs_ioctl_snap_create(struct file *file, + struct user_namespace *mnt_userns, const char *name, unsigned long fd, int subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -1819,8 +1833,8 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, readonly, inherit); + ret = btrfs_mksubvol(&file->f_path, mnt_userns, name, 
+ namelen, NULL, readonly, inherit); } else { struct fd src = fdget(fd); struct inode *src_inode; @@ -1834,16 +1848,17 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; - } else if (!inode_owner_or_capable(&init_user_ns, src_inode)) { + } else if (!inode_owner_or_capable(mnt_userns, src_inode)) { /* * Subvolume creation is not restricted, but snapshots * are limited to own subvolumes only */ ret = -EPERM; } else { - ret = btrfs_mksnapshot(&file->f_path, name, namelen, - BTRFS_I(src_inode)->root, - readonly, inherit); + ret = btrfs_mksnapshot(&file->f_path, mnt_userns, + name, namelen, + BTRFS_I(src_inode)->root, + readonly, inherit); } fdput(src); } @@ -1867,8 +1882,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, - subvol, false, NULL); + ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + vol_args->name, vol_args->fd, subvol, + false, NULL); kfree(vol_args); return ret; @@ -1926,8 +1942,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, } } - ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, - subvol, readonly, inherit); + ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + vol_args->name, vol_args->fd, subvol, + readonly, inherit); if (ret) goto free_inherit; free_inherit: @@ -1971,7 +1988,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(file_mnt_user_ns(file), inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -2382,23 +2399,16 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out; else if (ret > 0) { - ret = btrfs_previous_item(root, path, dirid, - BTRFS_INODE_REF_KEY); - if (ret < 0) - goto out; - else if (ret > 0) { - ret = -ENOENT; - goto out; - } + ret = -ENOENT; + goto out; } l = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(l, &key, slot); iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(l, iref); @@ -2429,7 +2439,8 @@ out: return ret; } -static int btrfs_search_path_in_tree_user(struct inode *inode, +static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, + struct inode *inode, struct btrfs_ioctl_ino_lookup_user_args *args) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -2473,23 +2484,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { + ret = btrfs_search_backwards(root, &key, path); + if (ret < 0) + goto out_put; + else if (ret > 0) { + ret = -ENOENT; goto out_put; - } else if (ret > 0) { - ret = btrfs_previous_item(root, path, dirid, - BTRFS_INODE_REF_KEY); - if (ret < 0) { - goto out_put; - } else if (ret > 0) { - ret = -ENOENT; - goto out_put; - } } leaf = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(leaf, iref); @@ -2527,7 +2531,7 @@ static int 
btrfs_search_path_in_tree_user(struct inode *inode, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(&init_user_ns, temp_inode, + ret = inode_permission(mnt_userns, temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { @@ -2669,7 +2673,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) return -EACCES; } - ret = btrfs_search_path_in_tree_user(inode, args); + ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args); if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2905,6 +2909,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args = NULL; struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; + struct user_namespace *mnt_userns = file_mnt_user_ns(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; int err = 0; @@ -2932,6 +2937,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out; } else { + struct inode *old_dir; + if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out; @@ -2968,6 +2975,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = PTR_ERR(parent); goto out_drop_write; } + old_dir = dir; dir = d_inode(parent); /* @@ -2978,6 +2986,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ destroy_parent = true; + /* + * On idmapped mounts, deletion via subvolid is + * restricted to subvolumes that are immediate + * ancestors of the inode referenced by the file + * descriptor in the ioctl. Otherwise the idmapping + * could potentially be abused to delete subvolumes + * anywhere in the filesystem the user wouldn't be able + * to delete without an idmapped mount. 
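 *
 * Worked example (editorial, hypothetical paths): if the ioctl fd
 * refers to /mnt/a but the given subvolid resolves to a subvolume
 * whose parent directory is /mnt/b, the parent lookup leaves
 * old_dir != dir; on an idmapped mount (mnt_userns != &init_user_ns)
 * the ioctl then fails with -EOPNOTSUPP rather than letting the
 * idmapping reach subvolumes elsewhere in the filesystem.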
+ */ + if (old_dir != dir && mnt_userns != &init_user_ns) { + err = -EOPNOTSUPP; + goto free_parent; + } + subvol_name_ptr = btrfs_get_subvol_name_from_objectid( fs_info, vol_args2->subvolid); if (IS_ERR(subvol_name_ptr)) { @@ -3016,7 +3038,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) goto free_subvol_name; - dentry = lookup_one_len(subvol_name, parent, subvol_namelen); + dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock_dir; @@ -3058,14 +3080,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (root == dest) goto out_dput; - err = inode_permission(&init_user_ns, inode, - MAY_WRITE | MAY_EXEC); + err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; } /* check if subvolume may be deleted by a user */ - err = btrfs_may_delete(dir, dentry, 1); + err = btrfs_may_delete(mnt_userns, dir, dentry, 1); if (err) goto out_dput; @@ -3103,7 +3124,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_ioctl_defrag_range_args *range; + struct btrfs_ioctl_defrag_range_args range = {0}; int ret; ret = mnt_want_write_file(file); @@ -3115,6 +3136,12 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } + /* Subpage defrag will be supported in later commits */ + if (root->fs_info->sectorsize < PAGE_SIZE) { + ret = -ENOTTY; + goto out; + } + switch (inode->i_mode & S_IFMT) { case S_IFDIR: if (!capable(CAP_SYS_ADMIN)) { @@ -3135,33 +3162,24 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } - range = kzalloc(sizeof(*range), GFP_KERNEL); - if (!range) { - ret = -ENOMEM; - goto out; - } - if (argp) { - if (copy_from_user(range, argp, - sizeof(*range))) { + if (copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; - kfree(range); goto out; } /* compression requires us to start the IO */ - if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { - range->flags |= BTRFS_DEFRAG_RANGE_START_IO; - range->extent_thresh = (u32)-1; + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + range.flags |= BTRFS_DEFRAG_RANGE_START_IO; + range.extent_thresh = (u32)-1; } } else { /* the rest are all set to zero by kzalloc */ - range->len = (u64)-1; + range.len = (u64)-1; } ret = btrfs_defrag_file(file_inode(file), file, - range, BTRFS_OLDEST_GENERATION, 0); + &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; - kfree(range); break; default: ret = -EINVAL; @@ -4404,25 +4422,20 @@ drop_write: static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_quota_rescan_args *qsa; + struct btrfs_ioctl_quota_rescan_args qsa = {0}; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - qsa = kzalloc(sizeof(*qsa), GFP_KERNEL); - if (!qsa) - return -ENOMEM; - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - qsa->flags = 1; - qsa->progress = fs_info->qgroup_rescan_progress.objectid; + qsa.flags = 1; + qsa.progress = fs_info->qgroup_rescan_progress.objectid; } - if (copy_to_user(arg, qsa, sizeof(*qsa))) + if (copy_to_user(arg, &qsa, sizeof(qsa))) ret = -EFAULT; - kfree(qsa); return ret; } @@ -4436,6 +4449,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, } static long _btrfs_ioctl_set_received_subvol(struct 
file *file, + struct user_namespace *mnt_userns, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); @@ -4447,7 +4461,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -4552,7 +4566,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, args64->rtime.nsec = args32->rtime.nsec; args64->flags = args32->flags; - ret = _btrfs_ioctl_set_received_subvol(file, args64); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64); if (ret) goto out; @@ -4586,7 +4600,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, if (IS_ERR(sa)) return PTR_ERR(sa); - ret = _btrfs_ioctl_set_received_subvol(file, sa); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa); if (ret) goto out; @@ -5013,6 +5027,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_subvol_rootref(file, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); + case FS_IOC_ENABLE_VERITY: + return fsverity_ioctl_enable(file, (const void __user *)argp); + case FS_IOC_MEASURE_VERITY: + return fsverity_ioctl_measure(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index cd042c7567a4..c25dfd1a8a54 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -14,6 +14,7 @@ #include <linux/lzo.h> #include <linux/refcount.h> #include "compression.h" +#include "ctree.h" #define LZO_LEN 4 @@ -140,18 +141,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = 0; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = page_address(in_page); /* * store the size of all chunks of compressed data in * the first 4 bytes */ - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); out_offset = LZO_LEN; tot_out = LZO_LEN; pages[0] = out_page; @@ -209,19 +210,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, if (out_len == 0 && tot_in >= len) break; - kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages++] = out_page; pg_bytes_left = PAGE_SIZE; @@ -243,12 +243,11 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, break; bytes_left = len - tot_in; - kunmap(in_page); put_page(in_page); start += PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = page_address(in_page); in_len = min(bytes_left, PAGE_SIZE); } @@ -258,164 +257,130 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* store the size of all chunks of compressed data */ - sizes_ptr = kmap_local_page(pages[0]); + sizes_ptr = page_address(pages[0]); write_compress_length(sizes_ptr, tot_out); - kunmap_local(sizes_ptr); ret = 0; *total_out = tot_out; *total_in = tot_in; out: *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); 
- } return ret; } +/* + * Copy the compressed segment payload into @dest. + * + * For the payload there will be no padding, just need to do page switching. + */ +static void copy_compressed_segment(struct compressed_bio *cb, + char *dest, u32 len, u32 *cur_in) +{ + u32 orig_in = *cur_in; + + while (*cur_in < orig_in + len) { + struct page *cur_page; + u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), + orig_in + len - *cur_in); + + ASSERT(copy_len); + cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + + memcpy(dest + *cur_in - orig_in, + page_address(cur_page) + offset_in_page(*cur_in), + copy_len); + + *cur_in += copy_len; + } +} + int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); - int ret = 0, ret2; - char *data_in; - unsigned long page_in_index = 0; - size_t srclen = cb->compressed_len; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); - unsigned long buf_start; - unsigned long buf_offset = 0; - unsigned long bytes; - unsigned long working_bytes; - size_t in_len; - size_t out_len; - const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); - unsigned long in_offset; - unsigned long in_page_bytes_left; - unsigned long tot_in; - unsigned long tot_out; - unsigned long tot_len; - char *buf; - bool may_late_unmap, need_unmap; - struct page **pages_in = cb->compressed_pages; - u64 disk_start = cb->start; - struct bio *orig_bio = cb->orig_bio; + const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + int ret; + /* Compressed data length, can be unaligned */ + u32 len_in; + /* Offset inside the compressed data */ + u32 cur_in = 0; + /* Bytes decompressed so far */ + u32 cur_out = 0; + + len_in = read_compress_length(page_address(cb->compressed_pages[0])); + cur_in += LZO_LEN; - data_in = kmap(pages_in[0]); - tot_len = read_compress_length(data_in); /* - * Compressed data header check. + * LZO header length check * - * The real compressed size can't exceed the maximum extent length, and - * all pages should be used (whole unused page with just the segment - * header is not possible). If this happens it means the compressed - * extent is corrupted. + * The total length should not exceed the maximum extent length, + * and all sectors should be used. + * If this happens, it means the compressed extent is corrupted. */ - if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) || - tot_len < srclen - PAGE_SIZE) { - ret = -EUCLEAN; - goto done; + if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || + round_up(len_in, sectorsize) < cb->compressed_len) { + btrfs_err(fs_info, + "invalid lzo header, lzo len %u compressed len %u", + len_in, cb->compressed_len); + return -EUCLEAN; } - tot_in = LZO_LEN; - in_offset = LZO_LEN; - in_page_bytes_left = PAGE_SIZE - LZO_LEN; - - tot_out = 0; - - while (tot_in < tot_len) { - in_len = read_compress_length(data_in + in_offset); - in_page_bytes_left -= LZO_LEN; - in_offset += LZO_LEN; - tot_in += LZO_LEN; + /* Go through each lzo segment */ + while (cur_in < len_in) { + struct page *cur_page; + /* Length of the compressed segment */ + u32 seg_len; + u32 sector_bytes_left; + size_t out_len = lzo1x_worst_compress(sectorsize); /* - * Segment header check. - * - * The segment length must not exceed the maximum LZO - * compression size, nor the total compressed size. + * We should always have enough space for one segment header + * inside current sector. 
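 *
 * Editorial sketch: the on-disk format pads segments so that a 4 byte
 * (LZO_LEN) header never straddles a sector, which is what the loop
 * relies on when it skips the tail padding at the end of an iteration:
 *
 *	sector_bytes_left = sectorsize - (cur_in % sectorsize);
 *	if (sector_bytes_left < LZO_LEN)
 *		cur_in += sector_bytes_left;	/* skip the padding zeros */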
*/ - if (in_len > max_segment_len || tot_in + in_len > tot_len) { - ret = -EUCLEAN; - goto done; - } - - tot_in += in_len; - working_bytes = in_len; - may_late_unmap = need_unmap = false; - - /* fast path: avoid using the working buffer */ - if (in_page_bytes_left >= in_len) { - buf = data_in + in_offset; - bytes = in_len; - may_late_unmap = true; - goto cont; - } - - /* copy bytes from the pages into the working buffer */ - buf = workspace->cbuf; - buf_offset = 0; - while (working_bytes) { - bytes = min(working_bytes, in_page_bytes_left); - - memcpy(buf + buf_offset, data_in + in_offset, bytes); - buf_offset += bytes; -cont: - working_bytes -= bytes; - in_page_bytes_left -= bytes; - in_offset += bytes; - - /* check if we need to pick another page */ - if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) - || in_page_bytes_left == 0) { - tot_in += in_page_bytes_left; - - if (working_bytes == 0 && tot_in >= tot_len) - break; - - if (page_in_index + 1 >= total_pages_in) { - ret = -EIO; - goto done; - } - - if (may_late_unmap) - need_unmap = true; - else - kunmap(pages_in[page_in_index]); - - data_in = kmap(pages_in[++page_in_index]); - - in_page_bytes_left = PAGE_SIZE; - in_offset = 0; - } - } - - out_len = max_segment_len; - ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, - &out_len); - if (need_unmap) - kunmap(pages_in[page_in_index - 1]); + ASSERT(cur_in / sectorsize == + (cur_in + LZO_LEN - 1) / sectorsize); + cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; + ASSERT(cur_page); + seg_len = read_compress_length(page_address(cur_page) + + offset_in_page(cur_in)); + cur_in += LZO_LEN; + + /* Copy the compressed segment payload into workspace */ + copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); + + /* Decompress the data */ + ret = lzo1x_decompress_safe(workspace->cbuf, seg_len, + workspace->buf, &out_len); if (ret != LZO_E_OK) { - pr_warn("BTRFS: decompress failed\n"); + btrfs_err(fs_info, "failed to decompress"); ret = -EIO; - break; + goto out; } - buf_start = tot_out; - tot_out += out_len; + /* Copy the data into inode pages */ + ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out); + cur_out += out_len; - ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, - tot_out, disk_start, orig_bio); - if (ret2 == 0) - break; + /* All data read, exit */ + if (ret == 0) + goto out; + ret = 0; + + /* Check if the sector has enough space for a segment header */ + sector_bytes_left = sectorsize - (cur_in % sectorsize); + if (sector_bytes_left >= LZO_LEN) + continue; + + /* Skip the padding zeros */ + cur_in += sector_bytes_left; } -done: - kunmap(pages_in[page_in_index]); +out: if (!ret) - zero_fill_bio(orig_bio); + zero_fill_bio(cb->orig_bio); return ret; } @@ -466,7 +431,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = kmap_local_page(dest_page); + kaddr = page_address(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); /* @@ -476,7 +441,6 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, */ if (bytes < destlen) memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5c0f8481e25e..edb65abf0393 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -446,7 +446,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, * Will be also used to store the 
finished ordered extent. * @file_offset: File offset for the finished IO * @io_size: Length of the finish IO range - * @uptodate: If the IO finishes without problem * * Return true if the ordered extent is finished in the range, and update * @cached. @@ -457,7 +456,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, */ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate) + u64 file_offset, u64 io_size) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; @@ -486,8 +485,6 @@ have_entry: entry->bytes_left, io_size); entry->bytes_left -= io_size; - if (!uptodate) - set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index b2d88aba8420..4194e960ff61 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -177,7 +177,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate); + u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 0fa121171ca1..db680f5be745 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1733,7 +1733,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, ASSERT(trans != NULL); ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, - false, true); + true); if (ret < 0) { trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; btrfs_warn(trans->fs_info, @@ -2651,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, - &record->old_roots, false, false); + &record->old_roots, false); if (ret < 0) goto cleanup; } @@ -2667,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) * current root. It's safe inside commit_transaction(). 
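 *
 * (Editorial note: the penultimate boolean of btrfs_find_all_roots(),
 * false at every call site touched in this file, has been dropped, so
 * the call now reads:)
 *
 *	ret = btrfs_find_all_roots(trans, fs_info, record->bytenr,
 *				   BTRFS_SEQ_LAST, &new_roots, false);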
*/ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false); + record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); if (ret < 0) goto cleanup; if (qgroup_to_skip) { @@ -3201,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, num_bytes = found.offset; ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots, false, false); + &roots, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 244d499ebc72..d8d268ca8aa7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1035,7 +1035,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; @@ -1054,7 +1054,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) for (; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; @@ -1636,10 +1636,10 @@ struct btrfs_plug_cb { static int plug_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, - plug_list); - struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, - plug_list); + const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; @@ -2300,7 +2300,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) if (rbio->stripe_pages[index]) continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; @@ -2350,14 +2350,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!need_check) goto writeback; - p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + p_page = alloc_page(GFP_NOFS); if (!p_page) goto cleanup; SetPageUptodate(p_page); if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + q_page = alloc_page(GFP_NOFS); if (!q_page) { __free_page(p_page); goto cleanup; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 8e026de74c44..d2062d5f71dd 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -264,8 +264,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, struct block_entry *be = NULL, *exist; struct root_entry *re = NULL; - re = kzalloc(sizeof(struct root_entry), GFP_KERNEL); - be = kzalloc(sizeof(struct block_entry), GFP_KERNEL); + re = kzalloc(sizeof(struct root_entry), GFP_NOFS); + be = kzalloc(sizeof(struct block_entry), GFP_NOFS); if (!be || !re) { kfree(re); kfree(be); @@ -313,7 +313,7 @@ static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root, struct root_entry *re; struct ref_entry *ref = NULL, *exist; - ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL); + ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS); if (!ref) return -ENOMEM; @@ -358,7 +358,7 @@ static int add_shared_data_ref(struct btrfs_fs_info *fs_info, struct block_entry *be; struct ref_entry *ref; - ref = 
kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); if (!ref) return -ENOMEM; be = add_block_entry(fs_info, bytenr, num_bytes, 0); @@ -393,7 +393,7 @@ static int add_extent_data_ref(struct btrfs_fs_info *fs_info, u64 offset = btrfs_extent_data_ref_offset(leaf, dref); u32 num_refs = btrfs_extent_data_ref_count(leaf, dref); - ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); if (!ref) return -ENOMEM; be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fc831597cb22..914d403b4415 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -24,6 +24,7 @@ #include "block-group.h" #include "backref.h" #include "misc.h" +#include "subpage.h" /* * Relocation overview @@ -2781,10 +2782,70 @@ static noinline_for_stack int prealloc_file_extent_cluster( u64 num_bytes; int nr; int ret = 0; + u64 i_size = i_size_read(&inode->vfs_inode); u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset = prealloc_start; + /* + * For subpage case, previous i_size may not be aligned to PAGE_SIZE. + * This means the range [i_size, PAGE_END + 1) is filled with zeros by + * btrfs_do_readpage() call of previously relocated file cluster. + * + * If the current cluster starts in the above range, btrfs_do_readpage() + * will skip the read, and relocate_one_page() will later writeback + * the padding zeros as new data, causing data corruption. + * + * Here we have to manually invalidate the range (i_size, PAGE_END + 1). + */ + if (!IS_ALIGNED(i_size, PAGE_SIZE)) { + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + struct page *page; + + ASSERT(sectorsize < PAGE_SIZE); + ASSERT(IS_ALIGNED(i_size, sectorsize)); + + /* + * Subpage can't handle page with DIRTY but without UPTODATE + * bit as it can lead to the following deadlock: + * + * btrfs_readpage() + * | Page already *locked* + * |- btrfs_lock_and_flush_ordered_range() + * |- btrfs_start_ordered_extent() + * |- extent_write_cache_pages() + * |- lock_page() + * We try to lock the page we already hold. + * + * Here we just writeback the whole data reloc inode, so that + * we will be ensured to have no dirty range in the page, and + * are safe to clear the uptodate bits. + * + * This shouldn't cause too much overhead, as we need to write + * the data back anyway. + */ + ret = filemap_write_and_wait(mapping); + if (ret < 0) + return ret; + + clear_extent_bits(&inode->io_tree, i_size, + round_up(i_size, PAGE_SIZE) - 1, + EXTENT_UPTODATE); + page = find_lock_page(mapping, i_size >> PAGE_SHIFT); + /* + * If page is freed we don't need to do anything then, as we + * will re-read the whole page anyway. 
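 *
 * Worked example (editorial): with 64K pages and a 4K sectorsize, a
 * previous cluster that left i_size at 40K had [40K, 64K) zero-filled
 * by btrfs_do_readpage(). Before relocating a cluster starting inside
 * that range, the UPTODATE state of [40K, 64K) is cleared (extent bits
 * above, subpage bitmap below) so the zeros are re-read as needed
 * rather than written back as data.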
+ */ + if (page) { + btrfs_subpage_clear_uptodate(fs_info, page, i_size, + round_up(i_size, PAGE_SIZE) - i_size); + unlock_page(page); + put_page(page); + } + } + BUG_ON(cluster->start != cluster->boundary[0]); ret = btrfs_alloc_data_chunk_ondemand(inode, prealloc_end + 1 - prealloc_start); @@ -2886,19 +2947,149 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); -static int relocate_file_extent_cluster(struct inode *inode, - struct file_extent_cluster *cluster) +static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, + int cluster_nr) +{ + /* Last extent, use cluster end directly */ + if (cluster_nr >= cluster->nr - 1) + return cluster->end; + + /* Use next boundary start*/ + return cluster->boundary[cluster_nr + 1] - 1; +} + +static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, + struct file_extent_cluster *cluster, + int *cluster_nr, unsigned long page_index) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 offset = BTRFS_I(inode)->index_cnt; + const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + struct page *page; u64 page_start; u64 page_end; + u64 cur; + int ret; + + ASSERT(page_index <= last_index); + page = find_lock_page(inode->i_mapping, page_index); + if (!page) { + page_cache_sync_readahead(inode->i_mapping, ra, NULL, + page_index, last_index + 1 - page_index); + page = find_or_create_page(inode->i_mapping, page_index, mask); + if (!page) + return -ENOMEM; + } + ret = set_page_extent_mapped(page); + if (ret < 0) + goto release_page; + + if (PageReadahead(page)) + page_cache_async_readahead(inode->i_mapping, ra, NULL, page, + page_index, last_index + 1 - page_index); + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + ret = -EIO; + goto release_page; + } + } + + page_start = page_offset(page); + page_end = page_start + PAGE_SIZE - 1; + + /* + * Start from the cluster, as for subpage case, the cluster can start + * inside the page. + */ + cur = max(page_start, cluster->boundary[*cluster_nr] - offset); + while (cur <= page_end) { + u64 extent_start = cluster->boundary[*cluster_nr] - offset; + u64 extent_end = get_cluster_boundary_end(cluster, + *cluster_nr) - offset; + u64 clamped_start = max(page_start, extent_start); + u64 clamped_end = min(page_end, extent_end); + u32 clamped_len = clamped_end + 1 - clamped_start; + + /* Reserve metadata for this range */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), + clamped_len); + if (ret) + goto release_page; + + /* Mark the range delalloc and dirty for later writeback */ + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, + clamped_end, 0, NULL); + if (ret) { + clear_extent_bits(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + clamped_len, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), + clamped_len); + goto release_page; + } + btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); + + /* + * Set the boundary if it's inside the page. + * Data relocation requires the destination extents to have the + * same size as the source. + * EXTENT_BOUNDARY bit prevents current extent from being merged + * with previous extent. 
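 *
 * Worked example (editorial): a cluster with boundaries at 0 and 1M
 * must be written back as two extents of the source sizes. Marking the
 * first sector of the second extent, [1M, 1M + sectorsize), with
 * EXTENT_BOUNDARY stops delalloc from merging it with the range in
 * front of it.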
+ */ + if (in_range(cluster->boundary[*cluster_nr] - offset, + page_start, PAGE_SIZE)) { + u64 boundary_start = cluster->boundary[*cluster_nr] - + offset; + u64 boundary_end = boundary_start + + fs_info->sectorsize - 1; + + set_extent_bits(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY); + } + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); + cur += clamped_len; + + /* Crossed extent end, go to next extent */ + if (cur >= extent_end) { + (*cluster_nr)++; + /* Just finished the last extent of the cluster, exit. */ + if (*cluster_nr >= cluster->nr) + break; + } + } + unlock_page(page); + put_page(page); + + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(fs_info); + if (btrfs_should_cancel_balance(fs_info)) + ret = -ECANCELED; + return ret; + +release_page: + unlock_page(page); + put_page(page); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; - struct page *page; struct file_ra_state *ra; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - int nr = 0; + int cluster_nr = 0; int ret = 0; if (!cluster->nr) @@ -2919,109 +3110,14 @@ static int relocate_file_extent_cluster(struct inode *inode, if (ret) goto out; - index = (cluster->start - offset) >> PAGE_SHIFT; last_index = (cluster->end - offset) >> PAGE_SHIFT; - while (index <= last_index) { - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - PAGE_SIZE); - if (ret) - goto out; - - page = find_lock_page(inode->i_mapping, index); - if (!page) { - page_cache_sync_readahead(inode->i_mapping, - ra, NULL, index, - last_index + 1 - index); - page = find_or_create_page(inode->i_mapping, index, - mask); - if (!page) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); - ret = -ENOMEM; - goto out; - } - } - ret = set_page_extent_mapped(page); - if (ret < 0) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - unlock_page(page); - put_page(page); - goto out; - } - - if (PageReadahead(page)) { - page_cache_async_readahead(inode->i_mapping, - ra, NULL, page, index, - last_index + 1 - index); - } - - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); - ret = -EIO; - goto out; - } - } - - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - - lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); - - if (nr < cluster->nr && - page_start + offset == cluster->boundary[nr]) { - set_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end, - EXTENT_BOUNDARY); - nr++; - } - - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, - page_end, 0, NULL); - if (ret) { - unlock_page(page); - put_page(page); - btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); - - clear_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end, - EXTENT_LOCKED | EXTENT_BOUNDARY); - goto out; - - } - set_page_dirty(page); - - 
unlock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end); - unlock_page(page); - put_page(page); - - index++; - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - balance_dirty_pages_ratelimited(inode->i_mapping); - btrfs_throttle(fs_info); - if (btrfs_should_cancel_balance(fs_info)) { - ret = -ECANCELED; - goto out; - } - } - WARN_ON(nr != cluster->nr); + for (index = (cluster->start - offset) >> PAGE_SHIFT; + index <= last_index && !ret; index++) + ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); if (btrfs_is_zoned(fs_info) && !ret) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret == 0) + WARN_ON(cluster_nr != cluster->nr); out: kfree(ra); return ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6ac37ae6c811..72f9b865e847 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1198,7 +1198,7 @@ struct backref_ctx { static int __clone_root_cmp_bsearch(const void *key, const void *elt) { u64 root = (u64)(uintptr_t)key; - struct clone_root *cr = (struct clone_root *)elt; + const struct clone_root *cr = elt; if (root < cr->root->root_key.objectid) return -1; @@ -1209,8 +1209,8 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt) static int __clone_root_cmp_sort(const void *e1, const void *e2) { - struct clone_root *cr1 = (struct clone_root *)e1; - struct clone_root *cr2 = (struct clone_root *)e2; + const struct clone_root *cr1 = e1; + const struct clone_root *cr2 = e2; if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) return -1; @@ -1307,7 +1307,7 @@ static int find_extent_clone(struct send_ctx *sctx, u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; - struct backref_ctx *backref_ctx = NULL; + struct backref_ctx backref_ctx = {0}; struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; @@ -1322,12 +1322,6 @@ static int find_extent_clone(struct send_ctx *sctx, /* We only use this path under the commit sem */ tmp_path->need_commit_sem = 0; - backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL); - if (!backref_ctx) { - ret = -ENOMEM; - goto out; - } - if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. @@ -1392,12 +1386,12 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root->found_refs = 0; } - backref_ctx->sctx = sctx; - backref_ctx->found = 0; - backref_ctx->cur_objectid = ino; - backref_ctx->cur_offset = data_offset; - backref_ctx->found_itself = 0; - backref_ctx->extent_len = num_bytes; + backref_ctx.sctx = sctx; + backref_ctx.found = 0; + backref_ctx.cur_objectid = ino; + backref_ctx.cur_offset = data_offset; + backref_ctx.found_itself = 0; + backref_ctx.extent_len = num_bytes; /* * The last extent of a file may be too large due to page alignment. @@ -1405,7 +1399,7 @@ static int find_extent_clone(struct send_ctx *sctx, * __iterate_backrefs work. */ if (data_offset + num_bytes >= ino_size) - backref_ctx->extent_len = ino_size - data_offset; + backref_ctx.extent_len = ino_size - data_offset; /* * Now collect all backrefs. @@ -1416,12 +1410,12 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = 0; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, - backref_ctx, false); + &backref_ctx, false); if (ret < 0) goto out; - if (!backref_ctx->found_itself) { + if (!backref_ctx.found_itself) { /* found a bug in backref code? 
*/ ret = -EIO; btrfs_err(fs_info, @@ -1434,7 +1428,7 @@ static int find_extent_clone(struct send_ctx *sctx, "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", data_offset, ino, num_bytes, logical); - if (!backref_ctx->found) + if (!backref_ctx.found) btrfs_debug(fs_info, "no clones found"); cur_clone_root = NULL; @@ -1458,7 +1452,6 @@ static int find_extent_clone(struct send_ctx *sctx, out: btrfs_free_path(tmp_path); - kfree(backref_ctx); return ret; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f79bf85f2439..5ada02e0e629 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -493,6 +493,11 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, long time_left; int loops; + delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); + if (delalloc_bytes == 0 && ordered_bytes == 0) + return; + /* Calc the number of the pages we need flush for space reservation */ if (to_reclaim == U64_MAX) { items = U64_MAX; @@ -500,22 +505,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, /* * to_reclaim is set to however much metadata we need to * reclaim, but reclaiming that much data doesn't really track - * exactly, so increase the amount to reclaim by 2x in order to - * make sure we're flushing enough delalloc to hopefully reclaim - * some metadata reservations. + * exactly. What we really want to do is reclaim full inode's + * worth of reservations, however that's not available to us + * here. We will take a fraction of the delalloc bytes for our + * flushing loops and hope for the best. Delalloc will expand + * the amount we write to cover an entire dirty extent, which + * will reclaim the metadata reservation for that range. If + * it's not enough subsequent flush stages will be more + * aggressive. */ + to_reclaim = max(to_reclaim, delalloc_bytes >> 3); items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; - to_reclaim = items * EXTENT_SIZE_PER_ITEM; } trans = (struct btrfs_trans_handle *)current->journal_info; - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); - if (delalloc_bytes == 0 && ordered_bytes == 0) - return; - /* * If we are doing more ordered than delalloc we need to just wait on * ordered extents, otherwise we'll waste time trying to flush delalloc @@ -528,9 +532,49 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, while ((delalloc_bytes || ordered_bytes) && loops < 3) { u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; long nr_pages = min_t(u64, temp, LONG_MAX); + int async_pages; btrfs_start_delalloc_roots(fs_info, nr_pages, true); + /* + * We need to make sure any outstanding async pages are now + * processed before we continue. This is because things like + * sync_inode() try to be smart and skip writing if the inode is + * marked clean. We don't use filemap_fwrite for flushing + * because we want to control how many pages we write out at a + * time, thus this is the only safe way to make sure we've + * waited for outstanding compressed workers to have started + * their jobs and thus have ordered extents set up properly. + * + * This exists because we do not want to wait for each + * individual inode to finish its async work, we simply want to + * start the IO on everybody, and then come back here and wait + * for all of the async work to catch up. 
Once we're done with + * that we know we'll have ordered extents for everything and we + * can decide if we wait for that or not. + * + * If we choose to replace this in the future, make absolutely + * sure that the proper waiting is being done in the async case, + * as there have been bugs in that area before. + */ + async_pages = atomic_read(&fs_info->async_delalloc_pages); + if (!async_pages) + goto skip_async; + + /* + * We don't want to wait forever, if we wrote less pages in this + * loop than we have outstanding, only wait for that number of + * pages, otherwise we can wait for all async pages to finish + * before continuing. + */ + if (async_pages > nr_pages) + async_pages -= nr_pages; + else + async_pages = 0; + wait_event(fs_info->async_submit_wait, + atomic_read(&fs_info->async_delalloc_pages) <= + async_pages); +skip_async: loops++; if (wait_ordered && !trans) { btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); @@ -595,8 +639,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: + case FLUSH_DELALLOC_FULL: + if (state == FLUSH_DELALLOC_FULL) + num_bytes = U64_MAX; shrink_delalloc(fs_info, space_info, num_bytes, - state == FLUSH_DELALLOC_WAIT, for_preempt); + state != FLUSH_DELALLOC, for_preempt); break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: @@ -686,7 +733,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, { u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 98); + u64 thresh = div_factor_fine(space_info->total_bytes, 90); u64 used; /* If we're just plain full then async reclaim just slows us down. */ @@ -694,6 +741,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, global_rsv_size) >= thresh) return false; + used = space_info->bytes_may_use + space_info->bytes_pinned; + + /* The total flushable belongs to the global rsv, don't flush. */ + if (global_rsv_size >= used) + return false; + + /* + * 128MiB is 1/4 of the maximum global rsv size. If we have less than + * that devoted to other reservations then there's no sense in flushing, + * we don't have a lot of things that need flushing. + */ + if (used - global_rsv_size <= SZ_128M) + return false; + /* * We have tickets queued, bail so we don't compete with the async * flushers. @@ -824,6 +885,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; + trace_btrfs_fail_all_tickets(fs_info, space_info); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); __btrfs_dump_space_info(fs_info, space_info); @@ -905,6 +968,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } /* + * We do not want to empty the system of delalloc unless we're + * under heavy pressure, so allow one trip through the flushing + * logic before we start doing a FLUSH_DELALLOC_FULL. + */ + if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles) + flush_state++; + + /* * We don't want to force a chunk allocation until we've tried * pretty hard to reclaim space. Think of the case where we * freed up a bunch of space and so have a lot of pinned space @@ -1067,7 +1138,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * so if we now have space to allocate do the force chunk allocation. 
*/ static const enum btrfs_flush_state data_flush_states[] = { - FLUSH_DELALLOC_WAIT, + FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, ALLOC_CHUNK_FORCE, @@ -1156,6 +1227,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELAYED_REFS, FLUSH_DELALLOC, FLUSH_DELALLOC_WAIT, + FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, }; diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 8260f8bb3ff0..f429256f56db 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -73,7 +73,7 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ } \ token->kaddr = page_address(token->eb->pages[idx]); \ token->offset = idx << PAGE_SHIFT; \ - if (oip + size <= PAGE_SIZE) \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ return get_unaligned_le##bits(token->kaddr + oip); \ \ memcpy(lebytes, token->kaddr + oip, part); \ @@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (oip + size <= PAGE_SIZE) \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ return get_unaligned_le##bits(kaddr + oip); \ \ memcpy(lebytes, kaddr + oip, part); \ @@ -124,7 +124,7 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ } \ token->kaddr = page_address(token->eb->pages[idx]); \ token->offset = idx << PAGE_SHIFT; \ - if (oip + size <= PAGE_SIZE) { \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ put_unaligned_le##bits(val, token->kaddr + oip); \ return; \ } \ @@ -146,7 +146,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (oip + size <= PAGE_SIZE) { \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ put_unaligned_le##bits(val, kaddr + oip); \ return; \ } \ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 640bcd21bf28..cb10e56ee31e 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -435,8 +435,10 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); subpage->writeback_bitmap &= ~tmp; - if (subpage->writeback_bitmap == 0) + if (subpage->writeback_bitmap == 0) { + ASSERT(PageWriteback(page)); end_page_writeback(page); + } spin_unlock_irqrestore(&subpage->lock, flags); } @@ -559,3 +561,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, PageWriteback); IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, PageOrdered); + +/* + * Make sure not only the page dirty bit is cleared, but also subpage dirty bit + * is cleared. 
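 *
 * Usage sketch (editorial, hypothetical call site): a caller that has
 * just cleared dirty state can assert that both levels agree:
 *
 *	clear_page_dirty_for_io(page);
 *	btrfs_page_assert_not_dirty(fs_info, page);
 *
 * The helper returns immediately unless CONFIG_BTRFS_ASSERT is set,
 * and only consults the subpage bitmap when sectorsize < PAGE_SIZE.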
+ */ +void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + ASSERT(!PageDirty(page)); + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->private); + ASSERT(subpage->dirty_bitmap == 0); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 4d7aca85d915..0120948f37a1 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -126,4 +126,7 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); +void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct page *page); + #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d07b18b2b250..537d90bf5d84 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1201,21 +1201,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) { goto err; } else if (ret > 0) { - ret = btrfs_previous_item(root, path, subvol_objectid, - BTRFS_ROOT_BACKREF_KEY); - if (ret < 0) { - goto err; - } else if (ret > 0) { - ret = -ENOENT; - goto err; - } + ret = -ENOENT; + goto err; } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); subvol_objectid = key.offset; root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -1248,21 +1241,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + ret = btrfs_search_backwards(fs_root, &key, path); if (ret < 0) { goto err; } else if (ret > 0) { - ret = btrfs_previous_item(fs_root, path, dirid, - BTRFS_INODE_REF_KEY); - if (ret < 0) { - goto err; - } else if (ret > 0) { - ret = -ENOENT; - goto err; - } + ret = -ENOENT; + goto err; } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); dirid = key.offset; inode_ref = btrfs_item_ptr(path->nodes[0], @@ -1353,6 +1339,9 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_op = &btrfs_super_ops; sb->s_d_op = &btrfs_dentry_operations; sb->s_export_op = &btrfs_export_ops; +#ifdef CONFIG_FS_VERITY + sb->s_vop = &btrfs_verityops; +#endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; #ifdef CONFIG_BTRFS_FS_POSIX_ACL @@ -2041,13 +2030,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto restore; } - if (fs_info->sectorsize < PAGE_SIZE) { - btrfs_warn(fs_info, - "read-write mount is not yet allowed for sectorsize %u page size %lu", - fs_info->sectorsize, PAGE_SIZE); - ret = -EINVAL; - goto restore; - } /* * NOTE: when remounting with a change that does writes, don't @@ -2096,16 +2078,15 @@ restore: } /* Used to sort the devices by max_avail(descending sort) */ -static inline int btrfs_cmp_device_free_bytes(const void *dev_info1, - const void *dev_info2) +static int btrfs_cmp_device_free_bytes(const void *a, const void *b) { - if (((struct btrfs_device_info *)dev_info1)->max_avail > - ((struct btrfs_device_info *)dev_info2)->max_avail) + const struct btrfs_device_info *dev_info1 = a; + const struct btrfs_device_info *dev_info2 = b; + + if (dev_info1->max_avail > dev_info2->max_avail) return -1; - else if (((struct btrfs_device_info 
*)dev_info1)->max_avail < - ((struct btrfs_device_info *)dev_info2)->max_avail) + else if (dev_info1->max_avail < dev_info2->max_avail) return 1; - else return 0; } @@ -2381,7 +2362,7 @@ static struct file_system_type btrfs_root_fs_type = { .name = "btrfs", .mount = btrfs_mount_root, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("btrfs"); @@ -2572,6 +2553,11 @@ static void __init btrfs_print_mod_info(void) #else ", zoned=no" #endif +#ifdef CONFIG_FS_VERITY + ", fsverity=yes" +#else + ", fsverity=no" +#endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9d1d140118ff..25a6f587852b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -22,6 +22,26 @@ #include "block-group.h" #include "qgroup.h" +/* + * Structure name Path + * -------------------------------------------------------------------------- + * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features + * btrfs_supported_feature_attrs /sys/fs/btrfs/features and + * /sys/fs/btrfs/<uuid>/features + * btrfs_attrs /sys/fs/btrfs/<uuid> + * devid_attrs /sys/fs/btrfs/<uuid>/devinfo/<devid> + * allocation_attrs /sys/fs/btrfs/<uuid>/allocation + * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid> + * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type> + * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile> + * + * When built with BTRFS_CONFIG_DEBUG: + * + * btrfs_debug_feature_attrs /sys/fs/btrfs/debug + * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug + * discard_debug_attrs /sys/fs/btrfs/<uuid>/debug/discard + */ + struct btrfs_feature_attr { struct kobj_attribute kobj_attr; enum btrfs_feature_set feature_set; @@ -267,7 +287,17 @@ BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif +#ifdef CONFIG_FS_VERITY +BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); +#endif +/* + * Features which depend on feature bits and may differ between each fs. + * + * /sys/fs/btrfs/features - all available features implemeted by this version + * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or + * can be changed on a mounted filesystem. + */ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), BTRFS_FEAT_ATTR_PTR(default_subvol), @@ -285,16 +315,12 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(zoned), #endif +#ifdef CONFIG_FS_VERITY + BTRFS_FEAT_ATTR_PTR(verity), +#endif NULL }; -/* - * Features which depend on feature bits and may differ between each fs. - * - * /sys/fs/btrfs/features lists all available features of this kernel while - * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or - * can be changed online. 
- */ static const struct attribute_group btrfs_feature_attr_group = { .name = "features", .is_visible = btrfs_feature_visible, @@ -366,6 +392,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; + /* 4K sector size is also supported with 64K page size */ + if (PAGE_SIZE == SZ_64K) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K); + /* Otherwise, only sectorsize == PAGE_SIZE is supported */ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE); @@ -374,6 +404,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, BTRFS_ATTR(static_feature, supported_sectorsizes, supported_sectorsizes_show); +/* + * Features which only depend on kernel version. + * + * These are listed in /sys/fs/btrfs/features along with + * btrfs_supported_feature_attrs. + */ static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), @@ -383,12 +419,6 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = { NULL }; -/* - * Features which only depend on kernel version. - * - * These are listed in /sys/fs/btrfs/features along with - * btrfs_feature_attr_group - */ static const struct attribute_group btrfs_static_feature_attr_group = { .name = "features", .attrs = btrfs_supported_static_feature_attrs, @@ -547,6 +577,11 @@ static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, btrfs_discard_max_discard_size_store); +/* + * Per-filesystem debugging of discard (when mounted with discard=async). + * + * Path: /sys/fs/btrfs/<uuid>/debug/discard/ + */ static const struct attribute *discard_debug_attrs[] = { BTRFS_ATTR_PTR(discard, discardable_bytes), BTRFS_ATTR_PTR(discard, discardable_extents), @@ -560,15 +595,19 @@ static const struct attribute *discard_debug_attrs[] = { }; /* - * Runtime debugging exported via sysfs + * Per-filesystem runtime debugging exported via sysfs. * - * /sys/fs/btrfs/debug - applies to module or all filesystems - * /sys/fs/btrfs/UUID - applies only to the given filesystem + * Path: /sys/fs/btrfs/UUID/debug/ */ static const struct attribute *btrfs_debug_mount_attrs[] = { NULL, }; +/* + * Runtime debugging exported via sysfs, applies to all mounted filesystems. + * + * Path: /sys/fs/btrfs/debug + */ static struct attribute *btrfs_debug_feature_attrs[] = { NULL }; @@ -637,6 +676,11 @@ static ssize_t raid_bytes_show(struct kobject *kobj, return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } +/* + * Allocation information about block group profiles. + * + * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/ + */ static struct attribute *raid_attrs[] = { BTRFS_ATTR_PTR(raid, total_bytes), BTRFS_ATTR_PTR(raid, used_bytes), @@ -676,6 +720,11 @@ SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +/* + * Allocation information about block group types. + * + * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/ + */ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, flags), BTRFS_ATTR_PTR(space_info, total_bytes), @@ -703,6 +752,11 @@ static struct kobj_type space_info_ktype = { .default_groups = space_info_groups, }; +/* + * Allocation information about block groups.
+ * + * Path: /sys/fs/btrfs/<uuid>/allocation/ + */ static const struct attribute *allocation_attrs[] = { BTRFS_ATTR_PTR(allocation, global_rsv_reserved), BTRFS_ATTR_PTR(allocation, global_rsv_size), @@ -974,7 +1028,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); ssize_t ret; - ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold); + ret = scnprintf(buf, PAGE_SIZE, "%d\n", + READ_ONCE(fs_info->bg_reclaim_threshold)); return ret; } @@ -991,16 +1046,21 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, if (ret) return ret; - if (thresh <= 50 || thresh > 100) + if (thresh != 0 && (thresh <= 50 || thresh > 100)) return -EINVAL; - fs_info->bg_reclaim_threshold = thresh; + WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); return len; } BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); +/* + * Per-filesystem information and stats. + * + * Path: /sys/fs/btrfs/<uuid>/ + */ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -1510,6 +1570,11 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, } BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); +/* + * Information about one device. + * + * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/ + */ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, in_fs_metadata), @@ -1799,6 +1864,11 @@ QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA); QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS); QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC); +/* + * Qgroup information. + * + * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/ + */ static struct attribute *qgroup_attrs[] = { BTRFS_ATTR_PTR(qgroup, referenced), BTRFS_ATTR_PTR(qgroup, exclusive), diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 98b5aaba46f1..19ba7d5b7d8f 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -223,8 +223,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, * we can only call btrfs_qgroup_account_extent() directly to test * quota. 
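The bg_reclaim_threshold hunk above moves the unlocked reader/writer pair onto READ_ONCE()/WRITE_ONCE(), since the sysfs store and the reclaim code race on a plain int with no lock. A minimal sketch of that annotation pattern, using a hypothetical struct rather than btrfs code:

	#include <linux/compiler.h>

	struct example { int threshold; };

	/* Reader side: fetch the value exactly once, no tearing/refetching. */
	static int example_read(const struct example *e)
	{
		return READ_ONCE(e->threshold);
	}

	/* Writer side: publish the new value as a single plain store. */
	static void example_write(struct example *e, int val)
	{
		WRITE_ONCE(e->threshold, val);
	}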
*/ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -236,8 +235,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -260,8 +258,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -272,8 +269,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -324,8 +320,7 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -337,8 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -359,8 +353,7 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -372,8 +365,7 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -400,8 +392,7 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -413,8 +404,7 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a8b2e0d2c025..7733e8ac0a69 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -24,6 +24,7 @@ #include "compression.h" #include "volumes.h" #include "misc.h" +#include "btrfs_inode.h" /* * Error message should follow the following 
format: @@ -873,13 +874,22 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, } } - if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || - (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || + if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && + sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) || + (type & BTRFS_BLOCK_GROUP_RAID1 && + num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID1C3 && + num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID1C4 && + num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID5 && + num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID6 && + num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) || + (type & BTRFS_BLOCK_GROUP_DUP && + num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && - num_stripes != 1))) { + num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { chunk_err(leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, @@ -999,6 +1009,8 @@ static int check_inode_item(struct extent_buffer *leaf, u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); u32 mode; int ret; + u32 flags; + u32 ro_flags; ret = check_inode_key(leaf, key, slot); if (unlikely(ret < 0)) @@ -1054,11 +1066,17 @@ static int check_inode_item(struct extent_buffer *leaf, btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } - if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) { + btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags); + if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) { inode_item_err(leaf, slot, - "unknown flags detected: 0x%llx", - btrfs_inode_flags(leaf, iitem) & - ~BTRFS_INODE_FLAG_MASK); + "unknown incompat flags detected: 0x%x", flags); + return -EUCLEAN; + } + if (unlikely(!sb_rdonly(fs_info->sb) && + (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) { + inode_item_err(leaf, slot, + "unknown ro-compat flags detected on writeable mount: 0x%x", + ro_flags); return -EUCLEAN; } return 0; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e6430ac9bbe8..f7efc26aa82a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -753,7 +753,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); - if (ret == 0) { + if (ret < 0) { + goto out; + } else if (ret == 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, ins.objectid, ins.offset, 0); @@ -3039,8 +3041,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, list_del_init(&ctx->list); ctx->log_ret = error; } - - INIT_LIST_HEAD(&root->log_ctxs[index]); } /* @@ -3328,10 +3328,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - mutex_lock(&root->log_mutex); - if (root->last_log_commit < log_transid) - root->last_log_commit = log_transid; - mutex_unlock(&root->log_mutex); + /* + * We know there can only be one task here, since we have not yet set + * root->log_commit[index1] to 0 and any task attempting to sync the + * log must wait for the previous log transaction to commit if it's + * still in progress or wait for the 
current log transaction commit if + someone else already started it. We use <= and not < because the + * first log transaction has an ID of 0. + */ + ASSERT(root->last_log_commit <= log_transid); + root->last_log_commit = log_transid; out_wake_log_root: mutex_lock(&log_root_tree->log_mutex); @@ -3417,14 +3423,10 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* - * Check if an inode was logged in the current transaction. We can't always rely - * on an inode's logged_trans value, because it's an in-memory only field and - * therefore not persisted. This means that its value is lost if the inode gets - * evicted and loaded again from disk (in which case it has a value of 0, and - * certainly it is smaller then any possible transaction ID), when that happens - * the full_sync flag is set in the inode's runtime flags, so on that case we - * assume eviction happened and ignore the logged_trans value, assuming the - * worst case, that the inode was logged before in the current transaction. + * Check if an inode was logged in the current transaction. This may often + * return some false positives, because logged_trans is an in-memory only field, + * not persisted anywhere. This is meant to be used in contexts where a false + * positive has no functional consequences. */ static bool inode_logged(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) @@ -3432,8 +3434,17 @@ static bool inode_logged(struct btrfs_trans_handle *trans, if (inode->logged_trans == trans->transid) return true; - if (inode->last_trans == trans->transid && - test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + /* + * The inode's logged_trans is always 0 when we load it (because it is + * not persisted in the inode item or elsewhere). So if it is 0 and the + * inode was last modified in the current transaction, the inode may + * have been logged before in the current transaction, then evicted and + * loaded again in the current transaction - or it may never have been + * logged in the current transaction. Since we can not be sure, we have to + * assume it was, otherwise our callers can leave an inconsistent log.
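Putting the fast path above together with the condition that closes this comment, the heuristic reads as the following standalone predicate (a sketch with the struct fields passed in explicitly):

	#include <stdbool.h>
	#include <stdint.h>

	/* Sketch of the inode_logged() heuristic described above. */
	static bool inode_logged_sketch(uint64_t logged_trans, uint64_t last_trans,
					uint64_t transid, bool log_recovering)
	{
		if (logged_trans == transid)
			return true;
		/*
		 * logged_trans == 0 means the inode was (re)loaded in this
		 * transaction, so assume the worst: it may have been logged
		 * before eviction.
		 */
		return logged_trans == 0 && last_trans == transid && !log_recovering;
	}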
+ */ + if (inode->logged_trans == 0 && + inode->last_trans == trans->transid && !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) return true; @@ -3913,6 +3924,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, u64 logged_isize) { struct btrfs_map_token token; + u64 flags; btrfs_init_map_token(&token, leaf); @@ -3962,20 +3974,49 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); - btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } static int log_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - struct btrfs_inode *inode) + struct btrfs_inode *inode, bool inode_item_dropped) { struct btrfs_inode_item *inode_item; int ret; - ret = btrfs_insert_empty_item(trans, log, path, - &inode->location, sizeof(*inode_item)); - if (ret && ret != -EEXIST) + /* + * If we are doing a fast fsync and the inode was logged before in the + * current transaction, then we know the inode was previously logged and + * it exists in the log tree. For performance reasons, in this case use + * btrfs_search_slot() directly with ins_len set to 0 so that we never + * attempt a write lock on the leaf's parent, which adds unnecessary lock + * contention in case there are concurrent fsyncs for other inodes of the + * same subvolume. Using btrfs_insert_empty_item() when the inode item + * already exists can also result in unnecessarily splitting a leaf. + */ + if (!inode_item_dropped && inode->logged_trans == trans->transid) { + ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); + ASSERT(ret <= 0); + if (ret > 0) + ret = -ENOENT; + } else { + /* + * This means it is the first fsync in the current transaction, + * so the inode item is not in the log and we need to insert it. + * We can never get -EEXIST because we are only called for a fast + * fsync and in case an inode eviction happens after the inode was + * logged before in the current transaction, when we load again + * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime + * flags and set ->logged_trans to 0. + */ + ret = btrfs_insert_empty_item(trans, log, path, &inode->location, + sizeof(*inode_item)); + ASSERT(ret != -EEXIST); + } + if (ret) return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -4160,7 +4201,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, static int extent_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct extent_map *em1, *em2; + const struct extent_map *em1, *em2; em1 = list_entry(a, struct extent_map, list); em2 = list_entry(b, struct extent_map, list); @@ -5053,8 +5094,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, /* * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of - * the inode is not updated when we only log that it exists and - * it has the full sync bit set (see btrfs_log_inode()). + * the inode is not updated when we only log that it exists (see + * btrfs_log_inode()). 
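btrfs_inode_combine_flags() above, and the btrfs_inode_split_flags() call in the tree-checker hunk earlier, pack the two 32-bit flag sets into the single on-disk u64 flags field. A sketch of a packing consistent with those call sites; the high/low split is an assumption here, the authoritative helpers live in the btrfs headers:

	#include <stdint.h>

	/* Assumed packing: incompat flags low 32 bits, ro-compat flags high. */
	static inline uint64_t combine_flags(uint32_t flags, uint32_t ro_flags)
	{
		return (uint64_t)ro_flags << 32 | flags;
	}

	static inline void split_flags(uint64_t inode_item_flags,
				       uint32_t *flags, uint32_t *ro_flags)
	{
		*flags = (uint32_t)inode_item_flags;
		*ro_flags = (uint32_t)(inode_item_flags >> 32);
	}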
*/ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -5299,6 +5340,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool need_log_inode_item = true; bool xattrs_logged = false; bool recursive_logging = false; + bool inode_item_dropped = true; path = btrfs_alloc_path(); if (!path) @@ -5433,6 +5475,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (inode_only == LOG_INODE_ALL) fast_search = true; + inode_item_dropped = false; goto log_extents; } @@ -5466,7 +5509,7 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode); + err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); if (err) goto out_unlock; /* @@ -5573,6 +5616,13 @@ static bool need_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { /* + * If a directory was not modified, no dentries added or removed, we can + * and should avoid logging it. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) + return false; + + /* * If this inode does not have new/updated/deleted xattrs since the last * time it was logged and is flagged as logged in the current transaction, * we can skip logging it. As for new/deleted names, those are updated in diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c new file mode 100644 index 000000000000..28d443d3ef93 --- /dev/null +++ b/fs/btrfs/verity.c @@ -0,0 +1,811 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/posix_acl_xattr.h> +#include <linux/iversion.h> +#include <linux/fsverity.h> +#include <linux/sched/mm.h> +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" + +/* + * Implementation of the interface defined in struct fsverity_operations. + * + * The main question is how and where to store the verity descriptor and the + * Merkle tree. We store both in dedicated btree items in the filesystem tree, + * together with the rest of the inode metadata. This means we'll need to do + * extra work to encrypt them once encryption is supported in btrfs, but btrfs + * has a lot of careful code around i_size and it seems better to make a new key + * type than try and adjust all of our expectations for i_size. + * + * Note that this differs from the implementation in ext4 and f2fs, where + * this data is stored as if it were in the file, but past EOF. However, btrfs + * does not have a widespread mechanism for caching opaque metadata pages, so we + * do pretend that the Merkle tree pages themselves are past EOF for the + * purposes of caching them (as opposed to creating a virtual inode). + * + * fs verity items are stored under two different key types on disk. + * The descriptor items: + * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ] + * + * At offset 0, we store a btrfs_verity_descriptor_item which tracks the + * size of the descriptor item and some extra data for encryption. + * Starting at offset 1, these hold the generic fs verity descriptor. + * The latter are opaque to btrfs, we just read and write them as a blob for + * the higher level verity code. The most common descriptor size is 256 bytes. 
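The offset-0 descriptor item mentioned above is read back into a struct btrfs_verity_descriptor_item by btrfs_get_verity_descriptor() later in this file. A sketch of the layout those accessors imply; the authoritative definition is in the btrfs on-disk format headers:

	#include <linux/types.h>

	/* Inferred sketch of the offset-0 descriptor item. */
	struct btrfs_verity_descriptor_item_sketch {
		/* Size of the fs-verity descriptor blob stored at offsets >= 1 */
		__le64 size;
		/* Must be zero; btrfs_get_verity_descriptor() rejects nonzero */
		__le64 reserved[2];
		/* Held for the future encryption support mentioned above */
		__u8 encryption;
	} __attribute__((packed));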
+ * + * The merkle tree items: + * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ] + * + * These also start at offset 0, and correspond to the merkle tree bytes. + * So when fsverity asks for page 0 of the merkle tree, we pull up one page + * starting at offset 0 for this key type. These are also opaque to btrfs, + * we're blindly storing whatever fsverity sends down. + * + * Another important consideration is the fact that the Merkle tree data scales + * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's + * ~1/127th the size) so for large files, writing the tree can be a lengthy + * operation. For that reason, we guard the whole enable verity operation + * (between begin_enable_verity and end_enable_verity) with an orphan item. + * Again, because the data can be pretty large, it's quite possible that we + * could run out of space writing it, so we try our best to handle errors by + * stopping and rolling back rather than aborting the victim transaction. + */ + +#define MERKLE_START_ALIGN 65536 + +/* + * Compute the logical file offset where we cache the Merkle tree. + * + * @inode: inode of the verity file + * + * For the purposes of caching the Merkle tree pages, as required by + * fs-verity, it is convenient to do size computations in terms of a file + * offset, rather than in terms of page indices. + * + * Use 64K to be sure it's past the last page in the file, even with 64K pages. + * That rounding operation itself can overflow loff_t, so we do it in u64 and + * check. + * + * Returns the file offset on success, negative error code on failure. + */ +static loff_t merkle_file_pos(const struct inode *inode) +{ + u64 sz = inode->i_size; + u64 rounded = round_up(sz, MERKLE_START_ALIGN); + + if (rounded > inode->i_sb->s_maxbytes) + return -EFBIG; + + return rounded; +} + +/* + * Drop all the items for this inode with this key_type. + * + * @inode: inode to drop items for + * @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM or + * BTRFS_VERITY_MERKLE_ITEM) + * + * Before doing a verity enable we cleanup any existing verity items. + * This is also used to clean up if a verity enable failed half way through. + * + * Returns number of dropped items on success, negative error code on failure. + */ +static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = inode->root; + struct btrfs_path *path; + struct btrfs_key key; + int count = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + /* 1 for the item being dropped */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* + * Walk backwards through all the items until we find one that + * isn't from our key type or objectid + */ + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = 0; + /* No more keys of this type, we're done */ + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + btrfs_end_transaction(trans); + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + /* No more keys of this type, we're done */ + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + + /* + * This shouldn't be a performance sensitive function because + * it's not used as part of truncate. 
If it ever becomes + * perf sensitive, change this to walk forward and bulk delete + * items + */ + ret = btrfs_del_items(trans, root, path, path->slots[0], 1); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + count++; + btrfs_release_path(path); + btrfs_end_transaction(trans); + } + ret = count; + btrfs_end_transaction(trans); +out: + btrfs_free_path(path); + return ret; +} + +/* + * Drop all verity items + * + * @inode: inode to drop verity items for + * + * In most contexts where we are dropping verity items, we want to do it for all + * the types of verity items, not a particular one. + * + * Returns: 0 on success, negative error code on failure. + */ +int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + int ret; + + ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY); + if (ret < 0) + return ret; + ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY); + if (ret < 0) + return ret; + + return 0; +} + +/* + * Insert and write inode items with a given key type and offset. + * + * @inode: inode to insert for + * @key_type: key type to insert + * @offset: item offset to insert at + * @src: source data to write + * @len: length of source data to write + * + * Write len bytes from src into items of up to 2K length. + * The inserted items will have key (ino, key_type, offset + off) where off is + * consecutively increasing from 0 up to the last item ending at offset + len. + * + * Returns 0 on success and a negative error code on failure. + */ +static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + const char *src, u64 len) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long copy_bytes; + unsigned long src_offset = 0; + void *data; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (len > 0) { + /* 1 for the new item being inserted */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = offset; + + /* + * Insert 2K at a time mostly to be friendly for smaller leaf + * size filesystems + */ + copy_bytes = min_t(u64, len, 2048); + + ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes); + if (ret) { + btrfs_end_transaction(trans); + break; + } + + leaf = path->nodes[0]; + + data = btrfs_item_ptr(leaf, path->slots[0], void); + write_extent_buffer(leaf, src + src_offset, + (unsigned long)data, copy_bytes); + offset += copy_bytes; + src_offset += copy_bytes; + len -= copy_bytes; + + btrfs_release_path(path); + btrfs_end_transaction(trans); + } + + btrfs_free_path(path); + return ret; +} + +/* + * Read inode items of the given key type and offset from the btree. + * + * @inode: inode to read items of + * @key_type: key type to read + * @offset: item offset to read from + * @dest: Buffer to read into. This parameter has slightly tricky + * semantics. If it is NULL, the function will not do any copying + * and will just return the size of all the items up to len bytes. + * If dest_page is passed, then the function will kmap_local the + * page and ignore dest, but it must still be non-NULL to avoid the + * counting-only behavior. + * @len: length in bytes to read + * @dest_page: copy into this page instead of the dest buffer + * + * Helper function to read items from the btree. This returns the number of + * bytes read or < 0 for errors. 
We can return short reads if the items don't + * exist on disk or aren't big enough to fill the desired length. Supports + * reading into a provided buffer (dest) or into the page cache. + * + * Returns number of bytes read or a negative error code on failure. + */ +static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + char *dest, u64 len, struct page *dest_page) +{ + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 item_end; + u64 copy_end; + int copied = 0; + u32 copy_offset; + unsigned long copy_bytes; + unsigned long dest_offset = 0; + void *data; + char *kaddr = dest; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (dest_page) + path->reada = READA_FORWARD; + + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (len > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + + item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset; + + if (copied > 0) { + /* + * Once we've copied something, we want all of the items + * to be sequential + */ + if (key.offset != offset) + break; + } else { + /* + * Our initial offset might be in the middle of an + * item. Make sure it all makes sense. + */ + if (key.offset > offset) + break; + if (item_end <= offset) + break; + } + + /* dest = NULL to just sum all the item lengths */ + if (!dest) + copy_end = item_end; + else + copy_end = min(offset + len, item_end); + + /* Number of bytes in this item we want to copy */ + copy_bytes = copy_end - offset; + + /* Offset from the start of item for copying */ + copy_offset = offset - key.offset; + + if (dest) { + if (dest_page) + kaddr = kmap_local_page(dest_page); + + data = btrfs_item_ptr(leaf, path->slots[0], void); + read_extent_buffer(leaf, kaddr + dest_offset, + (unsigned long)data + copy_offset, + copy_bytes); + + if (dest_page) + kunmap_local(kaddr); + } + + offset += copy_bytes; + dest_offset += copy_bytes; + len -= copy_bytes; + copied += copy_bytes; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + /* + * We've reached the last slot in this leaf and we need + * to go to the next leaf. + */ + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + break; + } + } + } +out: + btrfs_free_path(path); + if (!ret) + ret = copied; + return ret; +} + +/* + * Delete an fsverity orphan + * + * @trans: transaction to do the delete in + * @inode: inode whose orphan item to delete + * + * Capture verity orphan specific logic that is repeated in the couple of places + * we delete verity orphans. Specifically, handling ENOENT and ignoring inodes + * with 0 links. + * + * Returns zero on success or a negative error code on failure. + */ +static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + int ret; + + /* + * If the inode has no links, it is either already unlinked, or was + * created with O_TMPFILE. In either case, it should have an orphan from + * that other operation.
Rather than reference count the orphans, we + * simply ignore them here, because we only invoke the verity path in + * the orphan logic when i_nlink is 1. + */ + if (!inode->vfs_inode.i_nlink) + return 0; + + ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); + if (ret == -ENOENT) + ret = 0; + return ret; +} + +/* + * Rollback in-progress verity if we encounter an error. + * + * @inode: inode verity had an error for + * + * We try to handle recoverable errors while enabling verity by rolling it back + * and just failing the operation, rather than having an fs level error no + * matter what. However, any error in rollback is unrecoverable. + * + * Returns 0 on success, negative error code on failure. + */ +static int rollback_verity(struct btrfs_inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = inode->root; + int ret; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); + clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + ret = btrfs_drop_verity_items(inode); + if (ret) { + btrfs_handle_fs_error(root->fs_info, ret, + "failed to drop verity items in rollback %llu", + (u64)inode->vfs_inode.i_ino); + goto out; + } + + /* + * 1 for updating the inode flag + * 1 for deleting the orphan + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_handle_fs_error(root->fs_info, ret, + "failed to start transaction in verity rollback %llu", + (u64)inode->vfs_inode.i_ino); + goto out; + } + inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + ret = del_orphan(trans, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_end_transaction(trans); +out: + return ret; +} + +/* + * Finalize making the file a valid verity file + * + * @inode: inode to be marked as verity + * @desc: contents of the verity descriptor to write (not NULL) + * @desc_size: size of the verity descriptor + * + * Do the actual work of finalizing verity after successfully writing the Merkle + * tree: + * + * - write out the descriptor items + * - mark the inode with the verity flag + * - delete the orphan item + * - mark the ro compat bit + * - clear the in progress bit + * + * Returns 0 on success, negative error code on failure. 
+ */ +static int finish_verity(struct btrfs_inode *inode, const void *desc, + size_t desc_size) +{ + struct btrfs_trans_handle *trans = NULL; + struct btrfs_root *root = inode->root; + struct btrfs_verity_descriptor_item item; + int ret; + + /* Write out the descriptor item */ + memset(&item, 0, sizeof(item)); + btrfs_set_stack_verity_descriptor_size(&item, desc_size); + ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0, + (const char *)&item, sizeof(item)); + if (ret) + goto out; + + /* Write out the descriptor itself */ + ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1, + desc, desc_size); + if (ret) + goto out; + + /* + * 1 for updating the inode flag + * 1 for deleting the orphan + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + inode->ro_flags |= BTRFS_INODE_RO_VERITY; + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto end_trans; + ret = del_orphan(trans, inode); + if (ret) + goto end_trans; + clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + btrfs_set_fs_compat_ro(root->fs_info, VERITY); +end_trans: + btrfs_end_transaction(trans); +out: + return ret; + +} + +/* + * fsverity op that begins enabling verity. + * + * @filp: file to enable verity on + * + * Begin enabling fsverity for the file. We drop any existing verity items, add + * an orphan and set the in progress bit. + * + * Returns 0 on success, negative error code on failure. + */ +static int btrfs_begin_enable_verity(struct file *filp) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(filp)); + struct btrfs_root *root = inode->root; + struct btrfs_trans_handle *trans; + int ret; + + ASSERT(inode_is_locked(file_inode(filp))); + + if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) + return -EBUSY; + + /* + * This should almost never do anything, but theoretically, it's + * possible that we failed to enable verity on a file, then were + * interrupted or failed while rolling back, failed to cleanup the + * orphan, and finally attempt to enable verity again. + */ + ret = btrfs_drop_verity_items(inode); + if (ret) + return ret; + + /* 1 for the orphan item */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_orphan_add(trans, inode); + if (!ret) + set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + btrfs_end_transaction(trans); + + return ret; +} + +/* + * fsverity op that ends enabling verity. + * + * @filp: file we are finishing enabling verity on + * @desc: verity descriptor to write out (NULL in error conditions) + * @desc_size: size of the verity descriptor (variable with signatures) + * @merkle_tree_size: size of the merkle tree in bytes + * + * If desc is null, then the VFS is signaling an error occurred during verity + * enable, and we should try to rollback. Otherwise, attempt to finish verity. + * + * Returns 0 on success, negative error code on error.
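For context, the begin/end pair in this file is driven by the generic fs-verity ioctl; a userspace sketch using only the standard uapi, not part of this patch:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fsverity.h>

	/* FS_IOC_ENABLE_VERITY triggers ->begin_enable_verity, the Merkle
	 * tree writes, and then ->end_enable_verity shown in this file. */
	int enable_verity(const char *path)
	{
		struct fsverity_enable_arg arg = {
			.version = 1,
			.hash_algorithm = FS_VERITY_HASH_ALG_SHA256,
			.block_size = 4096,
		};
		int fd = open(path, O_RDONLY);
		int ret;

		if (fd < 0)
			return -1;
		ret = ioctl(fd, FS_IOC_ENABLE_VERITY, &arg);
		close(fd);
		return ret;
	}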
+ */ +static int btrfs_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(filp)); + int ret = 0; + int rollback_ret; + + ASSERT(inode_is_locked(file_inode(filp))); + + if (desc == NULL) + goto rollback; + + ret = finish_verity(inode, desc, desc_size); + if (ret) + goto rollback; + return ret; + +rollback: + rollback_ret = rollback_verity(inode); + if (rollback_ret) + btrfs_err(inode->root->fs_info, + "failed to rollback verity items: %d", rollback_ret); + return ret; +} + +/* + * fsverity op that gets the struct fsverity_descriptor. + * + * @inode: inode to get the descriptor of + * @buf: output buffer for the descriptor contents + * @buf_size: size of the output buffer. 0 to query the size + * + * fsverity does a two pass setup for reading the descriptor, in the first pass + * it calls with buf_size = 0 to query the size of the descriptor, and then in + * the second pass it actually reads the descriptor off disk. + * + * Returns the size on success or a negative error code on failure. + */ +static int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + u64 true_size; + int ret = 0; + struct btrfs_verity_descriptor_item item; + + memset(&item, 0, sizeof(item)); + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0, + (char *)&item, sizeof(item), NULL); + if (ret < 0) + return ret; + + if (item.reserved[0] != 0 || item.reserved[1] != 0) + return -EUCLEAN; + + true_size = btrfs_stack_verity_descriptor_size(&item); + if (true_size > INT_MAX) + return -EUCLEAN; + + if (buf_size == 0) + return true_size; + if (buf_size < true_size) + return -ERANGE; + + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1, + buf, buf_size, NULL); + if (ret < 0) + return ret; + if (ret != true_size) + return -EIO; + + return true_size; +} + +/* + * fsverity op that reads and caches a merkle tree page. + * + * @inode: inode to read a merkle tree page for + * @index: page index relative to the start of the merkle tree + * @num_ra_pages: number of pages to readahead. Optional, we ignore it + * + * The Merkle tree is stored in the filesystem btree, but its pages are cached + * with a logical position past EOF in the inode's mapping. + * + * Returns the page we read, or an ERR_PTR on error. + */ +static struct page *btrfs_read_merkle_tree_page(struct inode *inode, + pgoff_t index, + unsigned long num_ra_pages) +{ + struct page *page; + u64 off = (u64)index << PAGE_SHIFT; + loff_t merkle_pos = merkle_file_pos(inode); + int ret; + + if (merkle_pos < 0) + return ERR_PTR(merkle_pos); + if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE) + return ERR_PTR(-EFBIG); + index += merkle_pos >> PAGE_SHIFT; +again: + page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); + if (page) { + if (PageUptodate(page)) + return page; + + lock_page(page); + /* + * We only insert uptodate pages, so !Uptodate has to be + * an error + */ + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } + unlock_page(page); + return page; + } + + page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); + if (!page) + return ERR_PTR(-ENOMEM); + + /* + * Merkle item keys are indexed from byte 0 in the merkle tree. 
+ * They have the form: + * + * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset in bytes ] + */ + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, + page_address(page), PAGE_SIZE, page); + if (ret < 0) { + put_page(page); + return ERR_PTR(ret); + } + if (ret < PAGE_SIZE) + memzero_page(page, ret, PAGE_SIZE - ret); + + SetPageUptodate(page); + ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS); + + if (!ret) { + /* Inserted and ready for fsverity */ + unlock_page(page); + } else { + put_page(page); + /* Did someone race us into inserting this page? */ + if (ret == -EEXIST) + goto again; + page = ERR_PTR(ret); + } + return page; +} + +/* + * fsverity op that writes a Merkle tree block into the btree. + * + * @inode: inode to write a Merkle tree block for + * @buf: Merkle tree data block to write + * @index: index of the block in the Merkle tree + * @log_blocksize: log base 2 of the Merkle tree block size + * + * Note that the block size could be different from the page size, so it is not + * safe to assume that index is a page index. + * + * Returns 0 on success or negative error code on failure. + */ +static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + u64 off = index << log_blocksize; + u64 len = 1ULL << log_blocksize; + loff_t merkle_pos = merkle_file_pos(inode); + + if (merkle_pos < 0) + return merkle_pos; + if (merkle_pos > inode->i_sb->s_maxbytes - off - len) + return -EFBIG; + + return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, + off, buf, len); +} + +const struct fsverity_operations btrfs_verityops = { + .begin_enable_verity = btrfs_begin_enable_verity, + .end_enable_verity = btrfs_end_enable_verity, + .get_verity_descriptor = btrfs_get_verity_descriptor, + .read_merkle_tree_page = btrfs_read_merkle_tree_page, + .write_merkle_tree_block = btrfs_write_merkle_tree_block, +}; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 70f94b75f25a..ec3a874165de 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -38,7 +38,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .sub_stripes = 2, .dev_stripes = 1, .devs_max = 0, /* 0 == as many as possible */ - .devs_min = 4, + .devs_min = 2, .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, @@ -103,7 +103,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, - .devs_min = 2, + .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, @@ -153,6 +153,32 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; +/* + * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which + * can be used as index to access btrfs_raid_array[].
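Moving this helper out of line (it is deleted from volumes.h later in this diff) is what lets callers such as the tree-checker hunk earlier pull per-profile limits from the table instead of hard-coding them. A usage sketch against the helper defined just below, assuming the btrfs headers are in scope:

	/* Usage sketch: derive a per-profile limit via the index helper. */
	static int min_devs_for_profile(u64 bg_flags)
	{
		const enum btrfs_raid_types index =
			btrfs_bg_flags_to_raid_index(bg_flags);

		return btrfs_raid_array[index].devs_min;
	}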
+ */ +enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID10) + return BTRFS_RAID_RAID10; + else if (flags & BTRFS_BLOCK_GROUP_RAID1) + return BTRFS_RAID_RAID1; + else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) + return BTRFS_RAID_RAID1C3; + else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) + return BTRFS_RAID_RAID1C4; + else if (flags & BTRFS_BLOCK_GROUP_DUP) + return BTRFS_RAID_DUP; + else if (flags & BTRFS_BLOCK_GROUP_RAID0) + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; + + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ +} + const char *btrfs_bg_type_to_raid_name(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); @@ -404,44 +430,6 @@ void __exit btrfs_cleanup_fs_uuids(void) } } -/* - * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. - * Returned struct is not linked onto any lists and must be destroyed using - * btrfs_free_device. - */ -static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) -{ - struct btrfs_device *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return ERR_PTR(-ENOMEM); - - /* - * Preallocate a bio that's always going to be used for flushing device - * barriers and matches the device lifespan - */ - dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); - if (!dev->flush_bio) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - - INIT_LIST_HEAD(&dev->dev_list); - INIT_LIST_HEAD(&dev->dev_alloc_list); - INIT_LIST_HEAD(&dev->post_commit_list); - - atomic_set(&dev->reada_in_flight, 0); - atomic_set(&dev->dev_stats_ccnt, 0); - btrfs_device_data_ordered_init(dev); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - extent_io_tree_init(fs_info, &dev->alloc_state, - IO_TREE_DEVICE_ALLOC_STATE, NULL); - - return dev; -} - static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { @@ -1130,6 +1118,9 @@ static void btrfs_close_one_device(struct btrfs_device *device) fs_devices->rw_devices--; } + if (device->devid == BTRFS_DEV_REPLACE_DEVID) + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) fs_devices->missing_devices--; @@ -1228,7 +1219,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, static int devid_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct btrfs_device *dev1, *dev2; + const struct btrfs_device *dev1, *dev2; dev1 = list_entry(a, struct btrfs_device, dev_list); dev2 = list_entry(b, struct btrfs_device, dev_list); @@ -1598,14 +1589,9 @@ again: key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out; - if (ret > 0) { - ret = btrfs_previous_item(root, path, key.objectid, key.type); - if (ret < 0) - goto out; - } while (1) { l = path->nodes[0]; @@ -1759,48 +1745,6 @@ out: return ret; } -static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_offset, u64 start, u64 num_bytes) -{ - int ret; - struct btrfs_path *path; - struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_root *root = fs_info->dev_root; - struct btrfs_dev_extent *extent; - struct extent_buffer *leaf; - struct 
btrfs_key key; - - WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); - WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = device->devid; - key.offset = start; - key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*extent)); - if (ret) - goto out; - - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_dev_extent); - btrfs_set_dev_extent_chunk_tree(leaf, extent, - BTRFS_CHUNK_TREE_OBJECTID); - btrfs_set_dev_extent_chunk_objectid(leaf, extent, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); - btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - - btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - return ret; -} - static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree; @@ -2003,12 +1947,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, if (!(all_avail & btrfs_raid_array[i].bg_flag)) continue; - if (num_devices < btrfs_raid_array[i].devs_min) { - int ret = btrfs_raid_array[i].mindev_error; - - if (ret) - return ret; - } + if (num_devices < btrfs_raid_array[i].devs_min) + return btrfs_raid_array[i].mindev_error; } return 0; @@ -2137,7 +2077,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (IS_ERR(device)) { if (PTR_ERR(device) == -ENOENT && - strcmp(device_path, "missing") == 0) + device_path && strcmp(device_path, "missing") == 0) ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = PTR_ERR(device); @@ -3622,10 +3562,7 @@ static u64 calc_data_stripes(u64 type, int num_stripes) const int ncopies = btrfs_raid_array[index].ncopies; const int nparity = btrfs_raid_array[index].nparity; - if (nparity) - return num_stripes - nparity; - else - return num_stripes / ncopies; + return (num_stripes - nparity) / ncopies; } /* [pstart, pend) */ @@ -4025,6 +3962,13 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return true; + if (fs_info->sectorsize < PAGE_SIZE && + bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { + btrfs_err(fs_info, + "RAID56 is not yet supported for sectorsize %u with page size %lu", + fs_info->sectorsize, PAGE_SIZE); + return false; + } /* Profile is valid and does not have bits outside of the allowed set */ if (alloc_profile_is_valid(bargs->target, 1) && (bargs->target & ~allowed) == 0) @@ -5464,56 +5408,6 @@ out: } /* - * This function, btrfs_finish_chunk_alloc(), belongs to phase 2. - * - * See the comment at btrfs_chunk_alloc() for details about the chunk allocation - * phases. 
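The unified calc_data_stripes() above is behavior-preserving because, in btrfs_raid_array, parity profiles carry ncopies == 1 and mirrored profiles carry nparity == 0, so one formula covers both old branches. A standalone check of the arithmetic, with table values assumed as used elsewhere in this series:

	#include <assert.h>

	/* Mirror of the unified formula above. */
	static int data_stripes(int num_stripes, int nparity, int ncopies)
	{
		return (num_stripes - nparity) / ncopies;
	}

	int main(void)
	{
		assert(data_stripes(6, 2, 1) == 4); /* RAID6: 2 parity stripes */
		assert(data_stripes(5, 1, 1) == 4); /* RAID5: 1 parity stripe */
		assert(data_stripes(4, 0, 2) == 2); /* RAID10: 2 copies */
		assert(data_stripes(2, 0, 2) == 1); /* RAID1/DUP: 2 copies */
		return 0;
	}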
- */ -int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, - u64 chunk_offset, u64 chunk_size) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; - u64 dev_offset; - u64 stripe_size; - int i; - int ret = 0; - - em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); - if (IS_ERR(em)) - return PTR_ERR(em); - - map = em->map_lookup; - stripe_size = em->orig_block_len; - - /* - * Take the device list mutex to prevent races with the final phase of - * a device replace operation that replaces the device object associated - * with the map's stripes, because the device object's id can change - * at any time during that final phase of the device replace operation - * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the - * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, - * resulting in persisting a device extent item with such ID. - */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - for (i = 0; i < map->num_stripes; i++) { - device = map->stripes[i].dev; - dev_offset = map->stripes[i].physical; - - ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, - dev_offset, stripe_size); - if (ret) - break; - } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - - free_extent_map(em); - return ret; -} - -/* * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system * chunks. @@ -6923,9 +6817,31 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - dev = __alloc_device(fs_info); - if (IS_ERR(dev)) - return dev; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + /* + * Preallocate a bio that's always going to be used for flushing device + * barriers and matches the device lifespan + */ + dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); + if (!dev->flush_bio) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&dev->dev_list); + INIT_LIST_HEAD(&dev->dev_alloc_list); + INIT_LIST_HEAD(&dev->post_commit_list); + + atomic_set(&dev->reada_in_flight, 0); + atomic_set(&dev->dev_stats_ccnt, 0); + btrfs_device_data_ordered_init(dev); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + extent_io_tree_init(fs_info, &dev->alloc_state, + IO_TREE_DEVICE_ALLOC_STATE, NULL); if (devid) tmp = *devid; @@ -6961,15 +6877,7 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) { - int index = btrfs_bg_flags_to_raid_index(type); - int ncopies = btrfs_raid_array[index].ncopies; - const int nparity = btrfs_raid_array[index].nparity; - int data_stripes; - - if (nparity) - data_stripes = num_stripes - nparity; - else - data_stripes = num_stripes / ncopies; + const int data_stripes = calc_data_stripes(type, num_stripes); return div_u64(chunk_len, data_stripes); } @@ -8144,7 +8052,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) goto out; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_item(root, path); + ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; /* No dev extents at all? 
Not good */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 55a8ba244716..b082250b42e0 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -508,8 +508,6 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); -int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, - u64 chunk_offset, u64 chunk_size); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); @@ -568,32 +566,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } -/* - * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which - * can be used as index to access btrfs_raid_array[]. - */ -static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) -{ - if (flags & BTRFS_BLOCK_GROUP_RAID10) - return BTRFS_RAID_RAID10; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - return BTRFS_RAID_RAID1; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) - return BTRFS_RAID_RAID1C3; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) - return BTRFS_RAID_RAID1C4; - else if (flags & BTRFS_BLOCK_GROUP_DUP) - return BTRFS_RAID_DUP; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - return BTRFS_RAID_RAID0; - else if (flags & BTRFS_BLOCK_GROUP_RAID5) - return BTRFS_RAID_RAID5; - else if (flags & BTRFS_BLOCK_GROUP_RAID6) - return BTRFS_RAID_RAID6; - - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ -} - void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); @@ -603,6 +575,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct block_device *bdev, const char *device_path); +enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags); int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c3fa7d3fa770..8afa90074891 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -121,12 +121,12 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,26 +148,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); - } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = page_address(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); - } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = page_address(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -196,18 +192,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - kunmap(out_page); if (nr_pages == 
nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -234,18 +229,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ - kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -264,13 +258,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); - } return ret; } @@ -286,10 +275,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - u64 disk_start = cb->start; - struct bio *orig_bio = cb->orig_bio; - data_in = kmap(pages_in[page_in_index]); + data_in = page_address(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -311,7 +298,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); - kunmap(pages_in[page_in_index]); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -326,9 +312,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (buf_start == total_out) break; - ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, - total_out, disk_start, - orig_bio); + ret2 = btrfs_decompress_buf2page(workspace->buf, + total_out - buf_start, cb, buf_start); if (ret2 == 0) { ret = 0; goto done; @@ -339,17 +324,16 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - kunmap(pages_in[page_in_index]); + page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = kmap(pages_in[page_in_index]); + data_in = page_address(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; - workspace->strm.avail_in = min(tmp, - PAGE_SIZE); + workspace->strm.avail_in = min(tmp, PAGE_SIZE); } } if (ret != Z_STREAM_END) @@ -358,10 +342,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; done: zlib_inflateEnd(&workspace->strm); - if (data_in) - kunmap(pages_in[page_in_index]); if (!ret) - zero_fill_bio(orig_bio); + zero_fill_bio(cb->orig_bio); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 907c2cc45c9c..47af1ab3bf12 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -245,7 +245,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) goto out; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_item(root, path); + 
ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; /* No dev extents at all? Not good */ @@ -296,7 +296,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; - struct request_queue *queue = bdev_get_queue(bdev); sector_t nr_sectors; sector_t sector = 0; struct blk_zone *zones = NULL; @@ -348,19 +347,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); - zone_info->max_zone_append_size = - (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT; zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) { - btrfs_err(fs_info, "zoned: device %pg does not support zone append", - bdev); - ret = -EINVAL; - goto out; - } - zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; @@ -529,7 +519,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 zoned_devices = 0; u64 nr_devices = 0; u64 zone_size = 0; - u64 max_zone_append_size = 0; const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; @@ -565,11 +554,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) ret = -EINVAL; goto out; } - if (!max_zone_append_size || - (zone_info->max_zone_append_size && - zone_info->max_zone_append_size < max_zone_append_size)) - max_zone_append_size = - zone_info->max_zone_append_size; } nr_devices++; } @@ -619,7 +603,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; - fs_info->max_zone_append_size = max_zone_append_size; fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; /* @@ -1318,9 +1301,6 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!btrfs_is_zoned(fs_info)) return false; - if (!fs_info->max_zone_append_size) - return false; - if (!is_data_inode(&inode->vfs_inode)) return false; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index b0ae2608cb6b..4b299705bb12 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -22,7 +22,6 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; - u64 max_zone_append_size; u32 nr_zones; unsigned long *seq_zones; unsigned long *empty_zones; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 3e26b466476a..56dce9f00988 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -399,19 +399,19 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = page_address(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -446,19 +446,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += 
PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -473,13 +472,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; - kunmap(in_page); put_page(in_page); start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = page_address(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -506,19 +504,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -534,12 +531,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, out: *out_pages = nr_pages; /* Cleanup */ - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); - } - if (out_page) - kunmap(out_page); return ret; } @@ -547,8 +540,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); struct page **pages_in = cb->compressed_pages; - u64 disk_start = cb->start; - struct bio *orig_bio = cb->orig_bio; size_t srclen = cb->compressed_len; ZSTD_DStream *stream; int ret = 0; @@ -565,7 +556,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = page_address(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -589,7 +580,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->out_buf.pos = 0; ret = btrfs_decompress_buf2page(workspace->out_buf.dst, - buf_start, total_out, disk_start, orig_bio); + total_out - buf_start, cb, buf_start); if (ret == 0) break; @@ -601,23 +592,21 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - kunmap(pages_in[page_in_index++]); + page_in_index++; if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = page_address(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } } ret = 0; - zero_fill_bio(orig_bio); + zero_fill_bio(cb->orig_bio); done: - if (workspace->in_buf.src) - kunmap(pages_in[page_in_index]); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 
6290c3afdba4..ab7573d72dd7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1912,7 +1912,7 @@ EXPORT_SYMBOL(page_zero_new_buffers); static void iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, - struct iomap *iomap) + const struct iomap *iomap) { loff_t offset = block << inode->i_blkbits; @@ -1966,7 +1966,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, } int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap) + get_block_t *get_block, const struct iomap *iomap) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; @@ -3268,33 +3268,6 @@ out: EXPORT_SYMBOL(try_to_free_buffers); /* - * There are no bdflush tunables left. But distributions are - * still running obsolete flush daemons, so we terminate them here. - * - * Use of bdflush() is deprecated and will be removed in a future kernel. - * The `flush-X' kernel threads fully replace bdflush daemons and this call. - */ -SYSCALL_DEFINE2(bdflush, int, func, long, data) -{ - static int msg_count; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (msg_count < 5) { - msg_count++; - printk(KERN_INFO - "warning: process `%s' used the obsolete bdflush" - " system call\n", current->comm); - printk(KERN_INFO "Fix your initscripts?\n"); - } - - if (func == 1) - do_exit(0); - return 0; -} - -/* * Buffer-head allocation */ static struct kmem_cache *bh_cachep __read_mostly; diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index ff9ca55a9ae9..6827b40f7ddc 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -19,22 +19,3 @@ config CACHEFILES_DEBUG caching on files module. If this is set, the debugging output may be enabled by setting bits in /sys/modules/cachefiles/parameter/debug or by including a debugging specifier in /etc/cachefilesd.conf. - -config CACHEFILES_HISTOGRAM - bool "Gather latency information on CacheFiles" - depends on CACHEFILES && PROC_FS - help - - This option causes latency information to be gathered on CacheFiles - operation and exported through file: - - /proc/fs/cachefiles/histogram - - The generation of this histogram adds a certain amount of overhead to - execution as there are a number of points at which data is gathered, - and on a multi-CPU system these may be on cachelines that keep - bouncing between CPUs. On the other hand, the histogram may be - useful for debugging purposes. Saying 'N' here is recommended. - - See Documentation/filesystems/caching/cachefiles.rst for more - information. 
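
For context on the CACHEFILES_HISTOGRAM removal above: the deleted option gathered per-operation latency by bucketing jiffies deltas into a HZ-sized array. A minimal sketch of that pattern, modelled on the cachefiles_hist() helper this series removes from fs/cachefiles/internal.h below (the example_* names are illustrative only, not kernel API):

#include <linux/atomic.h>
#include <linux/jiffies.h>

/* One bucket per tick; anything >= 1s of latency lands in the last bucket. */
static atomic_t example_latency_histogram[HZ];

static inline void example_hist(atomic_t histogram[], unsigned long start_jif)
{
	unsigned long jif = jiffies - start_jif;	/* elapsed ticks */

	if (jif >= HZ)
		jif = HZ - 1;
	atomic_inc(&histogram[jif]);
}

Call sites took start = jiffies before lookup_one_len(), vfs_mkdir() or vfs_create() and fed the delta in afterwards; those are exactly the instrumentation points stripped out of fs/cachefiles/namei.c further down.
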
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 2227dc2d5498..02fd17731769 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -15,6 +15,4 @@ cachefiles-y := \ security.o \ xattr.o -cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o - obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 38bb7764b454..d463d89f5db8 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -108,8 +108,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) atomic_set(&fsdef->usage, 1); fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; - _debug("- fsdef %p", fsdef); - /* look up the directory at the root of the cache */ ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); if (ret < 0) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index da3948fdb615..da28ac1fa225 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -33,7 +33,7 @@ static struct fscache_object *cachefiles_alloc_object( cache = container_of(_cache, struct cachefiles_cache, cache); - _enter("{%s},%p,", cache->cache.identifier, cookie); + _enter("{%s},%x,", cache->cache.identifier, cookie->debug_id); lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp); if (!lookup_data) @@ -96,7 +96,7 @@ static struct fscache_object *cachefiles_alloc_object( lookup_data->key = key; object->lookup_data = lookup_data; - _leave(" = %p [%p]", &object->fscache, lookup_data); + _leave(" = %x [%p]", object->fscache.debug_id, lookup_data); return &object->fscache; nomem_key: @@ -379,7 +379,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) const struct cred *saved_cred; int ret; - _enter("%p", _cache); + _enter("%s", _cache->tag->name); cache = container_of(_cache, struct cachefiles_cache, cache); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 4ed83aa5253b..0a511c36dab8 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -181,31 +181,6 @@ extern int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename); /* - * proc.c - */ -#ifdef CONFIG_CACHEFILES_HISTOGRAM -extern atomic_t cachefiles_lookup_histogram[HZ]; -extern atomic_t cachefiles_mkdir_histogram[HZ]; -extern atomic_t cachefiles_create_histogram[HZ]; - -extern int __init cachefiles_proc_init(void); -extern void cachefiles_proc_cleanup(void); -static inline -void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) -{ - unsigned long jif = jiffies - start_jif; - if (jif >= HZ) - jif = HZ - 1; - atomic_inc(&histogram[jif]); -} - -#else -#define cachefiles_proc_init() (0) -#define cachefiles_proc_cleanup() do {} while (0) -#define cachefiles_hist(hist, start_jif) do {} while (0) -#endif - -/* * rdwr.c */ extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index b13fb45fc3f3..fac2e8e7b533 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -70,7 +70,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres, _enter("%pD,%li,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, - i_size_read(file->f_inode)); + i_size_read(file_inode(file))); /* If the caller asked us to seek for data before doing the read, then * we should do that now. If we find a gap, we fill it with zeros. 
@@ -194,7 +194,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, _enter("%pD,%li,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, - i_size_read(file->f_inode)); + i_size_read(file_inode(file))); ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); if (!ki) @@ -410,7 +410,7 @@ int cachefiles_begin_read_operation(struct netfs_read_request *rreq, rreq->cache_resources.cache_priv = op; rreq->cache_resources.cache_priv2 = file; rreq->cache_resources.ops = &cachefiles_netfs_cache_ops; - rreq->cookie_debug_id = object->fscache.debug_id; + rreq->cache_resources.debug_id = object->fscache.debug_id; _leave(""); return 0; diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index be96f5fc5cac..7f94efc97e23 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -150,6 +150,6 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) key[len++] = 0; key[len] = 0; - _leave(" = %p %d", key, len); + _leave(" = %s %d", key, len); return key; } diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index ddf0cd58d60c..9c8d34c49b12 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -69,15 +69,9 @@ static int __init cachefiles_init(void) goto error_object_jar; } - ret = cachefiles_proc_init(); - if (ret < 0) - goto error_proc; - pr_info("Loaded\n"); return 0; -error_proc: - kmem_cache_destroy(cachefiles_object_jar); error_object_jar: misc_deregister(&cachefiles_dev); error_dev: @@ -94,7 +88,6 @@ static void __exit cachefiles_exit(void) { pr_info("Unloading\n"); - cachefiles_proc_cleanup(); kmem_cache_destroy(cachefiles_object_jar); misc_deregister(&cachefiles_dev); } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7bf0732ae25c..a9aca5ab5970 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -39,18 +39,18 @@ void __cachefiles_printk_object(struct cachefiles_object *object, pr_err("%sops=%u inp=%u exc=%u\n", prefix, object->fscache.n_ops, object->fscache.n_in_progress, object->fscache.n_exclusive); - pr_err("%sparent=%p\n", - prefix, object->fscache.parent); + pr_err("%sparent=%x\n", + prefix, object->fscache.parent ? object->fscache.parent->debug_id : 0); spin_lock(&object->fscache.lock); cookie = object->fscache.cookie; if (cookie) { - pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n", + pr_err("%scookie=%x [pr=%x nd=%p fl=%lx]\n", prefix, - object->fscache.cookie, - object->fscache.cookie->parent, - object->fscache.cookie->netfs_data, - object->fscache.cookie->flags); + cookie->debug_id, + cookie->parent ? cookie->parent->debug_id : 0, + cookie->netfs_data, + cookie->flags); pr_err("%skey=[%u] '", prefix, cookie->key_len); k = (cookie->key_len <= sizeof(cookie->inline_key)) ? 
cookie->inline_key : cookie->key; @@ -110,7 +110,7 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, /* found the dentry for */ found_dentry: - kdebug("preemptive burial: OBJ%x [%s] %p", + kdebug("preemptive burial: OBJ%x [%s] %pd", object->fscache.debug_id, object->fscache.state->name, dentry); @@ -140,7 +140,7 @@ static int cachefiles_mark_object_active(struct cachefiles_cache *cache, struct rb_node **_p, *_parent = NULL; struct dentry *dentry; - _enter(",%p", object); + _enter(",%x", object->fscache.debug_id); try_again: write_lock(&cache->active_lock); @@ -298,8 +298,6 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, _enter(",'%pd','%pd'", dir, rep); - _debug("remove %p from %p", rep, dir); - /* non-directories can just be unlinked */ if (!d_is_dir(rep)) { _debug("unlink stale object"); @@ -446,7 +444,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, struct dentry *dir; int ret; - _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); + _enter(",OBJ%x{%pd}", object->fscache.debug_id, object->dentry); ASSERT(object->dentry); ASSERT(d_backing_inode(object->dentry)); @@ -496,11 +494,10 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, struct dentry *dir, *next = NULL; struct inode *inode; struct path path; - unsigned long start; const char *name; int ret, nlen; - _enter("OBJ%x{%p},OBJ%x,%s,", + _enter("OBJ%x{%pd},OBJ%x,%s,", parent->fscache.debug_id, parent->dentry, object->fscache.debug_id, key); @@ -535,9 +532,7 @@ lookup_again: inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - start = jiffies; next = lookup_one_len(name, dir, nlen); - cachefiles_hist(cachefiles_lookup_histogram, start); if (IS_ERR(next)) { trace_cachefiles_lookup(object, next, NULL); goto lookup_error; @@ -545,7 +540,7 @@ lookup_again: inode = d_backing_inode(next); trace_cachefiles_lookup(object, next, inode); - _debug("next -> %p %s", next, inode ? "positive" : "negative"); + _debug("next -> %pd %s", next, inode ? 
"positive" : "negative"); if (!key) object->new = !inode; @@ -568,9 +563,7 @@ lookup_again: ret = security_path_mkdir(&path, next, 0); if (ret < 0) goto create_error; - start = jiffies; ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0); - cachefiles_hist(cachefiles_mkdir_histogram, start); if (!key) trace_cachefiles_mkdir(object, next, ret); if (ret < 0) @@ -583,8 +576,8 @@ lookup_again: } ASSERT(d_backing_inode(next)); - _debug("mkdir -> %p{%p{ino=%lu}}", - next, d_backing_inode(next), d_backing_inode(next)->i_ino); + _debug("mkdir -> %pd{ino=%lu}", + next, d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next)) { pr_err("inode %lu is not a directory\n", @@ -604,18 +597,16 @@ lookup_again: ret = security_path_mknod(&path, next, S_IFREG, 0); if (ret < 0) goto create_error; - start = jiffies; ret = vfs_create(&init_user_ns, d_inode(dir), next, S_IFREG, true); - cachefiles_hist(cachefiles_create_histogram, start); trace_cachefiles_create(object, next, ret); if (ret < 0) goto create_error; ASSERT(d_backing_inode(next)); - _debug("create -> %p{%p{ino=%lu}}", - next, d_backing_inode(next), d_backing_inode(next)->i_ino); + _debug("create -> %pd{ino=%lu}", + next, d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next) && !d_is_reg(next) @@ -765,7 +756,6 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, const char *dirname) { struct dentry *subdir; - unsigned long start; struct path path; int ret; @@ -775,16 +765,14 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, inode_lock(d_inode(dir)); retry: - start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); - cachefiles_hist(cachefiles_lookup_histogram, start); if (IS_ERR(subdir)) { if (PTR_ERR(subdir) == -ENOMEM) goto nomem_d_alloc; goto lookup_error; } - _debug("subdir -> %p %s", + _debug("subdir -> %pd %s", subdir, d_backing_inode(subdir) ? "positive" : "negative"); /* we need to create the subdir if it doesn't exist yet */ @@ -810,10 +798,8 @@ retry: } ASSERT(d_backing_inode(subdir)); - _debug("mkdir -> %p{%p{ino=%lu}}", - subdir, - d_backing_inode(subdir), - d_backing_inode(subdir)->i_ino); + _debug("mkdir -> %pd{ino=%lu}", + subdir, d_backing_inode(subdir)->i_ino); } inode_unlock(d_inode(dir)); @@ -876,7 +862,6 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, struct cachefiles_object *object; struct rb_node *_n; struct dentry *victim; - unsigned long start; int ret; //_enter(",%pd/,%s", @@ -885,13 +870,11 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, /* look up the victim */ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); - cachefiles_hist(cachefiles_lookup_histogram, start); if (IS_ERR(victim)) goto lookup_error; - //_debug("victim -> %p %s", + //_debug("victim -> %pd %s", // victim, d_backing_inode(victim) ? "positive" : "negative"); /* if the object is no longer there then we probably retired the object @@ -922,7 +905,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, read_unlock(&cache->active_lock); - //_leave(" = %p", victim); + //_leave(" = %pd", victim); return victim; object_in_use: @@ -968,7 +951,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - _debug("victim -> %p %s", + _debug("victim -> %pd %s", victim, d_backing_inode(victim) ? "positive" : "negative"); /* okay... 
the victim is not being used so we can cull it diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c deleted file mode 100644 index 6e67aea0f24e..000000000000 --- a/fs/cachefiles/proc.c +++ /dev/null @@ -1,114 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* CacheFiles statistics - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include "internal.h" - -atomic_t cachefiles_lookup_histogram[HZ]; -atomic_t cachefiles_mkdir_histogram[HZ]; -atomic_t cachefiles_create_histogram[HZ]; - -/* - * display the latency histogram - */ -static int cachefiles_histogram_show(struct seq_file *m, void *v) -{ - unsigned long index; - unsigned x, y, z, t; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n"); - return 0; - case 2: - seq_puts(m, "===== ===== ========= ========= =========\n"); - return 0; - default: - index = (unsigned long) v - 3; - x = atomic_read(&cachefiles_lookup_histogram[index]); - y = atomic_read(&cachefiles_mkdir_histogram[index]); - z = atomic_read(&cachefiles_create_histogram[index]); - if (x == 0 && y == 0 && z == 0) - return 0; - - t = (index * 1000) / HZ; - - seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z); - return 0; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos) -{ - if ((unsigned long long)*_pos >= HZ + 2) - return NULL; - if (*_pos == 0) - *_pos = 1; - return (void *)(unsigned long) *_pos; -} - -/* - * move to the next line - */ -static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return (unsigned long long)*pos > HZ + 2 ? 
- NULL : (void *)(unsigned long) *pos; -} - -/* - * clean up after reading - */ -static void cachefiles_histogram_stop(struct seq_file *m, void *v) -{ -} - -static const struct seq_operations cachefiles_histogram_ops = { - .start = cachefiles_histogram_start, - .stop = cachefiles_histogram_stop, - .next = cachefiles_histogram_next, - .show = cachefiles_histogram_show, -}; - -/* - * initialise the /proc/fs/cachefiles/ directory - */ -int __init cachefiles_proc_init(void) -{ - _enter(""); - - if (!proc_mkdir("fs/cachefiles", NULL)) - goto error_dir; - - if (!proc_create_seq("fs/cachefiles/histogram", S_IFREG | 0444, NULL, - &cachefiles_histogram_ops)) - goto error_histogram; - - _leave(" = 0"); - return 0; - -error_histogram: - remove_proc_entry("fs/cachefiles", NULL); -error_dir: - _leave(" = -ENOMEM"); - return -ENOMEM; -} - -/* - * clean up the /proc/fs/cachefiles/ directory - */ -void cachefiles_proc_cleanup(void) -{ - remove_proc_entry("fs/cachefiles/histogram", NULL); - remove_proc_entry("fs/cachefiles", NULL); -} diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index a591b5e09637..9e82de668595 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -36,7 +36,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) else snprintf(type, 3, "%02x", object->fscache.cookie->def->type); - _enter("%p{%s}", object, type); + _enter("%x{%s}", object->fscache.debug_id, type); /* attempt to install a type label directly */ ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type, @@ -134,7 +134,7 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, if (!dentry) return -ESTALE; - _enter("%p,#%d", object, auxdata->len); + _enter("%x,#%d", object->fscache.debug_id, auxdata->len); /* attempt to install the cache metadata directly */ _debug("SET #%u", auxdata->len); diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 529af59d9fd3..f4fc8e0b847c 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -29,7 +29,7 @@ static inline void ceph_set_cached_acl(struct inode *inode, spin_unlock(&ci->i_ceph_lock); } -struct posix_acl *ceph_get_acl(struct inode *inode, int type) +struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu) { int size; unsigned int retry_cnt = 0; @@ -37,6 +37,9 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a1e2813731d1..7e7a897ae0d3 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1395,9 +1395,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } else { struct address_space *mapping = inode->i_mapping; - struct page *page = find_or_create_page(mapping, 0, - mapping_gfp_constraint(mapping, - ~__GFP_FS)); + struct page *page; + + filemap_invalidate_lock_shared(mapping); + page = find_or_create_page(mapping, 0, + mapping_gfp_constraint(mapping, ~__GFP_FS)); if (!page) { ret = VM_FAULT_OOM; goto out_inline; @@ -1418,6 +1420,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: + filemap_invalidate_unlock_shared(mapping); dout("filemap_fault %p %llu read inline data ret %x\n", inode, off, ret); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7bdefd0c789a..39db97f149b9 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1743,7 +1743,11 @@ int __ceph_mark_dirty_caps(struct 
ceph_inode_info *ci, int mask, struct ceph_cap_flush *ceph_alloc_cap_flush(void) { - return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + struct ceph_cap_flush *cf; + + cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + cf->is_capsnap = false; + return cf; } void ceph_free_cap_flush(struct ceph_cap_flush *cf) @@ -1778,7 +1782,7 @@ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, prev->wake = true; wake = false; } - list_del(&cf->g_list); + list_del_init(&cf->g_list); return wake; } @@ -1793,7 +1797,7 @@ static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, prev->wake = true; wake = false; } - list_del(&cf->i_list); + list_del_init(&cf->i_list); return wake; } @@ -2352,7 +2356,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { - if (!cf->caps) { + if (cf->is_capsnap) { last_snap_flush = cf->tid; break; } @@ -2371,7 +2375,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, first_tid = cf->tid + 1; - if (cf->caps) { + if (!cf->is_capsnap) { struct cap_msg_args arg; dout("kick_flushing_caps %p cap %p tid %llu %s\n", @@ -3516,7 +3520,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, cleaned = cf->caps; /* Is this a capsnap? */ - if (cf->caps == 0) + if (cf->is_capsnap) continue; if (cf->tid <= flush_tid) { @@ -3589,8 +3593,9 @@ out: while (!list_empty(&to_remove)) { cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); - list_del(&cf->i_list); - ceph_free_cap_flush(cf); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); } if (wake_ci) @@ -4150,11 +4155,19 @@ bad: /* * Delayed work handler to process end of delayed cap release LRU list. + * + * If new caps are added to the list while processing it, these won't get + * processed in this run. In this case, the ci->i_hold_caps_max will be + * returned so that the work can be scheduled accordingly. */ -void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) +unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) { struct inode *inode; struct ceph_inode_info *ci; + struct ceph_mount_options *opt = mdsc->fsc->mount_options; + unsigned long delay_max = opt->caps_wanted_delay_max * HZ; + unsigned long loop_start = jiffies; + unsigned long delay = 0; dout("check_delayed_caps\n"); spin_lock(&mdsc->cap_delay_lock); @@ -4162,6 +4175,11 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) ci = list_first_entry(&mdsc->cap_delay_list, struct ceph_inode_info, i_cap_delay_list); + if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { + dout("%s caps added recently. 
Exiting loop", __func__); + delay = ci->i_hold_caps_max; + break; + } if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && time_before(jiffies, ci->i_hold_caps_max)) break; @@ -4177,6 +4195,8 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) } } spin_unlock(&mdsc->cap_delay_lock); + + return delay; } /* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d1755ac1d964..e1d605a02d4a 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2088,6 +2088,7 @@ static long ceph_fallocate(struct file *file, int mode, if (ret < 0) goto unlock; + filemap_invalidate_lock(inode->i_mapping); ceph_zero_pagecache_range(inode, offset, length); ret = ceph_zero_objects(inode, offset, length); @@ -2100,6 +2101,7 @@ static long ceph_fallocate(struct file *file, int mode, if (dirty) __mark_inode_dirty(inode, dirty); } + filemap_invalidate_unlock(inode->i_mapping); ceph_put_cap_refs(ci, got); unlock: diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index fa8a847743d0..bdeb271f47d9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -240,9 +240,6 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - /* No mandatory locks */ - if (__mandatory_lock(file->f_mapping->host) && fl->fl_type != F_UNLCK) - return -ENOLCK; dout("ceph_lock, fl_owner: %p\n", fl->fl_owner); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 9db1b39df773..0b69aec23e5c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1616,7 +1616,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&mdsc->cap_dirty_lock); list_for_each_entry(cf, &to_remove, i_list) - list_del(&cf->g_list); + list_del_init(&cf->g_list); if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited( @@ -1668,8 +1668,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); - list_del(&cf->i_list); - ceph_free_cap_flush(cf); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); } wake_up_all(&ci->i_cap_wq); @@ -4490,22 +4491,29 @@ void inc_session_sequence(struct ceph_mds_session *s) } /* - * delayed work -- periodically trim expired leases, renew caps with mds + * delayed work -- periodically trim expired leases, renew caps with mds. If + * the @delay parameter is set to 0 or if it's more than 5 secs, the default + * workqueue delay value of 5 secs will be used. 
*/ -static void schedule_delayed(struct ceph_mds_client *mdsc) +static void schedule_delayed(struct ceph_mds_client *mdsc, unsigned long delay) { - int delay = 5; - unsigned hz = round_jiffies_relative(HZ * delay); - schedule_delayed_work(&mdsc->delayed_work, hz); + unsigned long max_delay = HZ * 5; + + /* 5 secs default delay */ + if (!delay || (delay > max_delay)) + delay = max_delay; + schedule_delayed_work(&mdsc->delayed_work, + round_jiffies_relative(delay)); } static void delayed_work(struct work_struct *work) { - int i; struct ceph_mds_client *mdsc = container_of(work, struct ceph_mds_client, delayed_work.work); + unsigned long delay; int renew_interval; int renew_caps; + int i; dout("mdsc delayed_work\n"); @@ -4545,7 +4553,7 @@ static void delayed_work(struct work_struct *work) } mutex_unlock(&mdsc->mutex); - ceph_check_delayed_caps(mdsc); + delay = ceph_check_delayed_caps(mdsc); ceph_queue_cap_reclaim_work(mdsc); @@ -4553,7 +4561,7 @@ static void delayed_work(struct work_struct *work) maybe_recover_session(mdsc); - schedule_delayed(mdsc); + schedule_delayed(mdsc, delay); } int ceph_mdsc_init(struct ceph_fs_client *fsc) @@ -5030,7 +5038,7 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) mdsc->mdsmap->m_epoch); mutex_unlock(&mdsc->mutex); - schedule_delayed(mdsc); + schedule_delayed(mdsc, 0); return; bad_unlock: diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index abd9af7727ad..3c444b9cb17b 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -394,9 +394,11 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->possible_max_rank; i++) - kfree(m->m_info[i].export_targets); - kfree(m->m_info); + if (m->m_info) { + for (i = 0; i < m->possible_max_rank; i++) + kfree(m->m_info[i].export_targets); + kfree(m->m_info); + } kfree(m->m_data_pg_pools); kfree(m); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 4ac0606dcbd4..15105f9da3fd 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -67,19 +67,19 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc, { lockdep_assert_held(&mdsc->snap_rwsem); - dout("get_realm %p %d -> %d\n", realm, - atomic_read(&realm->nref), atomic_read(&realm->nref)+1); /* - * since we _only_ increment realm refs or empty the empty - * list with snap_rwsem held, adjusting the empty list here is - * safe. we do need to protect against concurrent empty list - * additions, however. + * The 0->1 and 1->0 transitions must take the snap_empty_lock + * atomically with the refcount change. Go ahead and bump the + * nref here, unless it's 0, in which case we take the spinlock + * and then do the increment and remove it from the list. */ - if (atomic_inc_return(&realm->nref) == 1) { - spin_lock(&mdsc->snap_empty_lock); + if (atomic_inc_not_zero(&realm->nref)) + return; + + spin_lock(&mdsc->snap_empty_lock); + if (atomic_inc_return(&realm->nref) == 1) list_del_init(&realm->empty_item); - spin_unlock(&mdsc->snap_empty_lock); - } + spin_unlock(&mdsc->snap_empty_lock); } static void __insert_snap_realm(struct rb_root *root, @@ -208,28 +208,28 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc, { lockdep_assert_held_write(&mdsc->snap_rwsem); - dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, - atomic_read(&realm->nref), atomic_read(&realm->nref)-1); + /* + * We do not require the snap_empty_lock here, as any caller that + * increments the value must hold the snap_rwsem. 
+ */ if (atomic_dec_and_test(&realm->nref)) __destroy_snap_realm(mdsc, realm); } /* - * caller needn't hold any locks + * See comments in ceph_get_snap_realm. Caller needn't hold any locks. */ void ceph_put_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm) { - dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, - atomic_read(&realm->nref), atomic_read(&realm->nref)-1); - if (!atomic_dec_and_test(&realm->nref)) + if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock)) return; if (down_write_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&mdsc->snap_empty_lock); __destroy_snap_realm(mdsc, realm); up_write(&mdsc->snap_rwsem); } else { - spin_lock(&mdsc->snap_empty_lock); list_add(&realm->empty_item, &mdsc->snap_empty); spin_unlock(&mdsc->snap_empty_lock); } @@ -487,6 +487,9 @@ static void ceph_queue_cap_snap(struct ceph_inode_info *ci) pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode); return; } + capsnap->cap_flush.is_capsnap = true; + INIT_LIST_HEAD(&capsnap->cap_flush.i_list); + INIT_LIST_HEAD(&capsnap->cap_flush.g_list); spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6b6332a5c113..c30258f95e37 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -182,8 +182,9 @@ struct ceph_cap { struct ceph_cap_flush { u64 tid; - int caps; /* 0 means capsnap */ + int caps; bool wake; /* wake up flush waiters when finish ? */ + bool is_capsnap; /* true means capsnap */ struct list_head g_list; // global struct list_head i_list; // per inode }; @@ -1087,7 +1088,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); /* acl.c */ #ifdef CONFIG_CEPH_FS_POSIX_ACL -struct posix_acl *ceph_get_acl(struct inode *, int); +struct posix_acl *ceph_get_acl(struct inode *, int, bool); int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, @@ -1167,7 +1168,7 @@ extern void ceph_flush_snaps(struct ceph_inode_info *ci, extern bool __ceph_should_report_size(struct ceph_inode_info *ci); extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session); -extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); +extern unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); extern int ceph_drop_caps_for_unlink(struct inode *inode); extern int ceph_encode_inode_release(void **p, struct inode *inode, diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 7364950a9ef4..3b7e3b9e4fd2 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -4,19 +4,16 @@ config CIFS depends on INET select NLS select CRYPTO - select CRYPTO_MD4 select CRYPTO_MD5 select CRYPTO_SHA256 select CRYPTO_SHA512 select CRYPTO_CMAC select CRYPTO_HMAC - select CRYPTO_LIB_ARC4 select CRYPTO_AEAD2 select CRYPTO_CCM select CRYPTO_GCM select CRYPTO_ECB select CRYPTO_AES - select CRYPTO_LIB_DES select KEYS select DNS_RESOLVER select ASN1 @@ -85,33 +82,6 @@ config CIFS_ALLOW_INSECURE_LEGACY If unsure, say Y. -config CIFS_WEAK_PW_HASH - bool "Support legacy servers which use weaker LANMAN security" - depends on CIFS && CIFS_ALLOW_INSECURE_LEGACY - help - Modern CIFS servers including Samba and most Windows versions - (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) - security mechanisms. 
These hash the password more securely - than the mechanisms used in the older LANMAN version of the - SMB protocol but LANMAN based authentication is needed to - establish sessions with some old SMB servers. - - Enabling this option allows the cifs module to mount to older - LANMAN based servers such as OS/2 and Windows 95, but such - mounts may be less secure than mounts using NTLM or more recent - security mechanisms if you are on a public network. Unless you - have a need to access old SMB servers (and are on a private - network) you probably want to say N. Even if this support - is enabled in the kernel build, LANMAN authentication will not be - used automatically. At runtime LANMAN mounts are disabled but - can be set to required (or optional) either in - /proc/fs/cifs (see Documentation/admin-guide/cifs/usage.rst for - more detail) or via an option on the mount command. This support - is disabled by default in order to reduce the possibility of a - downgrade attack. - - If unsure, say N. - config CIFS_UPCALL bool "Kerberos/SPNEGO advanced session setup" depends on CIFS diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 8857ac7e7a14..51a824fc926a 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -250,9 +250,6 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY seq_printf(m, ",ALLOW_INSECURE_LEGACY"); #endif -#ifdef CONFIG_CIFS_WEAK_PW_HASH - seq_printf(m, ",WEAK_PW_HASH"); -#endif #ifdef CONFIG_CIFS_POSIX seq_printf(m, ",CIFS_POSIX"); #endif @@ -929,14 +926,6 @@ cifs_security_flags_handle_must_flags(unsigned int *flags) *flags = CIFSSEC_MUST_NTLMSSP; else if ((*flags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) *flags = CIFSSEC_MUST_NTLMV2; - else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM) - *flags = CIFSSEC_MUST_NTLM; - else if (CIFSSEC_MUST_LANMAN && - (*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN) - *flags = CIFSSEC_MUST_LANMAN; - else if (CIFSSEC_MUST_PLNTXT && - (*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT) - *flags = CIFSSEC_MUST_PLNTXT; *flags |= signflags; } diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index 93b47818c6c2..12bde7bfda86 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -147,8 +147,6 @@ static int cifs_swn_send_register_message(struct cifs_swn_reg *swnreg) goto nlmsg_fail; } break; - case LANMAN: - case NTLM: case NTLMv2: case RawNTLMSSP: ret = cifs_swn_auth_info_ntlm(swnreg->tcon, skb); diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 9bd03a231032..171ad8b42107 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -358,14 +358,9 @@ cifs_strndup_from_utf16(const char *src, const int maxlen, if (!dst) return NULL; cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage, - NO_MAP_UNI_RSVD); + NO_MAP_UNI_RSVD); } else { - len = strnlen(src, maxlen); - len++; - dst = kmalloc(len, GFP_KERNEL); - if (!dst) - return NULL; - strlcpy(dst, src, len); + dst = kstrndup(src, maxlen, GFP_KERNEL); } return dst; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index ecf15d845dbd..6679e07e533e 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -22,7 +22,7 @@ #include <linux/random.h> #include <linux/highmem.h> #include <linux/fips.h> -#include <crypto/arc4.h> +#include "../cifs_common/arc4.h" #include <crypto/aead.h> int __cifs_calc_signature(struct smb_rqst *rqst, @@ -250,87 +250,6 @@ int cifs_verify_signature(struct smb_rqst *rqst, } -/* first calculate 24 bytes ntlm response and then 16 
byte session key */ -int setup_ntlm_response(struct cifs_ses *ses, const struct nls_table *nls_cp) -{ - int rc = 0; - unsigned int temp_len = CIFS_SESS_KEY_SIZE + CIFS_AUTH_RESP_SIZE; - char temp_key[CIFS_SESS_KEY_SIZE]; - - if (!ses) - return -EINVAL; - - ses->auth_key.response = kmalloc(temp_len, GFP_KERNEL); - if (!ses->auth_key.response) - return -ENOMEM; - - ses->auth_key.len = temp_len; - - rc = SMBNTencrypt(ses->password, ses->server->cryptkey, - ses->auth_key.response + CIFS_SESS_KEY_SIZE, nls_cp); - if (rc) { - cifs_dbg(FYI, "%s Can't generate NTLM response, error: %d\n", - __func__, rc); - return rc; - } - - rc = E_md4hash(ses->password, temp_key, nls_cp); - if (rc) { - cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n", - __func__, rc); - return rc; - } - - rc = mdfour(ses->auth_key.response, temp_key, CIFS_SESS_KEY_SIZE); - if (rc) - cifs_dbg(FYI, "%s Can't generate NTLM session key, error: %d\n", - __func__, rc); - - return rc; -} - -#ifdef CONFIG_CIFS_WEAK_PW_HASH -int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, - char *lnm_session_key) -{ - int i, len; - int rc; - char password_with_pad[CIFS_ENCPWD_SIZE] = {0}; - - if (password) { - for (len = 0; len < CIFS_ENCPWD_SIZE; len++) - if (!password[len]) - break; - - memcpy(password_with_pad, password, len); - } - - if (!encrypt && global_secflags & CIFSSEC_MAY_PLNTXT) { - memcpy(lnm_session_key, password_with_pad, - CIFS_ENCPWD_SIZE); - return 0; - } - - /* calculate old style session key */ - /* calling toupper is less broken than repeatedly - calling nls_toupper would be since that will never - work for UTF8, but neither handles multibyte code pages - but the only alternative would be converting to UCS-16 (Unicode) - (using a routine something like UniStrupr) then - uppercasing and then converting back from Unicode - which - would only worth doing it if we knew it were utf8. Basically - utf8 and other multibyte codepages each need their own strupper - function since a byte at a time will ont work. */ - - for (i = 0; i < CIFS_ENCPWD_SIZE; i++) - password_with_pad[i] = toupper(password_with_pad[i]); - - rc = SMBencrypt(password_with_pad, cryptkey, lnm_session_key); - - return rc; -} -#endif /* CIFS_WEAK_PW_HASH */ - /* Build a proper attribute value/target info pairs blob. * Fill in netbios and dns domain name and workstation name * and client time (total five av pairs and + one end of fields indicator. 
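
The removals above drop the weak NTLM and LANMAN response generation altogether, leaving NTLMv2, raw NTLMSSP and Kerberos as the only session-setup flavours. The hunk that follows then switches calc_seckey() from the kernel crypto ARC4 (whose CRYPTO_LIB_ARC4 select was dropped from Kconfig earlier) to a cifs-local copy; the call shape is unchanged, only the cifs_ prefix is new. A sketch of the resulting key-wrap step, assuming the arc4_ctx allocation that is not visible in the hunk:

#include "../cifs_common/arc4.h"	/* cifs-local RC4 copy used by this series */

/* Hypothetical wrapper around the key-wrap step of calc_seckey(); the
 * field and constant names are the ones visible in the hunk below. */
static void example_wrap_seckey(struct cifs_ses *ses,
				struct arc4_ctx *ctx_arc4, u8 *sec_key)
{
	cifs_arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE);
	cifs_arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key,
			CIFS_CPHTXT_SIZE);
	/* the freshly generated secondary key/nonce becomes the session key */
	memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE);
}
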
@@ -780,9 +699,9 @@ calc_seckey(struct cifs_ses *ses) return -ENOMEM; } - arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); - arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key, - CIFS_CPHTXT_SIZE); + cifs_arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); + cifs_arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key, + CIFS_CPHTXT_SIZE); /* make secondary_key/nonce as session key */ memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 64b71c4e2a9d..8c20bfa187ac 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -399,7 +399,6 @@ cifs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - cifs_fscache_release_inode_cookie(inode); } static void @@ -438,15 +437,9 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses) seq_puts(s, ",sec="); switch (ses->sectype) { - case LANMAN: - seq_puts(s, "lanman"); - break; case NTLMv2: seq_puts(s, "ntlmv2"); break; - case NTLM: - seq_puts(s, "ntlm"); - break; case Kerberos: seq_puts(s, "krb5"); break; @@ -1755,7 +1748,6 @@ MODULE_DESCRIPTION MODULE_VERSION(CIFS_VERSION); MODULE_SOFTDEP("ecb"); MODULE_SOFTDEP("hmac"); -MODULE_SOFTDEP("md4"); MODULE_SOFTDEP("md5"); MODULE_SOFTDEP("nls"); MODULE_SOFTDEP("aes"); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c0bfc2f01030..c068f7d8d879 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -114,8 +114,6 @@ enum statusEnum { enum securityEnum { Unspecified = 0, /* not specified */ - LANMAN, /* Legacy LANMAN auth */ - NTLM, /* Legacy NTLM012 auth with NTLM hash */ NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO, NTLMv2 hash */ Kerberos, /* Kerberos via SPNEGO */ @@ -634,7 +632,6 @@ struct TCP_Server_Info { struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ -#define CIFS_NEGFLAVOR_LANMAN 0 /* wct == 13, LANMAN */ #define CIFS_NEGFLAVOR_UNENCAP 1 /* wct == 17, but no ext_sec */ #define CIFS_NEGFLAVOR_EXTENDED 2 /* wct == 17, ext_sec bit set */ char negflavor; /* NEGOTIATE response flavor */ @@ -1611,6 +1608,11 @@ struct dfs_info3_param { int ttl; }; +struct file_list { + struct list_head list; + struct cifsFileInfo *cfile; +}; + /* * common struct for holding inode info when searching for or updating an * inode with new info @@ -1729,16 +1731,8 @@ static inline bool is_retryable_error(int error) /* Security Flags: indicate type of session setup needed */ #define CIFSSEC_MAY_SIGN 0x00001 -#define CIFSSEC_MAY_NTLM 0x00002 #define CIFSSEC_MAY_NTLMV2 0x00004 #define CIFSSEC_MAY_KRB5 0x00008 -#ifdef CONFIG_CIFS_WEAK_PW_HASH -#define CIFSSEC_MAY_LANMAN 0x00010 -#define CIFSSEC_MAY_PLNTXT 0x00020 -#else -#define CIFSSEC_MAY_LANMAN 0 -#define CIFSSEC_MAY_PLNTXT 0 -#endif /* weak passwords */ #define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ #define CIFSSEC_MAY_NTLMSSP 0x00080 /* raw ntlmssp with ntlmv2 */ @@ -1746,32 +1740,19 @@ static inline bool is_retryable_error(int error) /* note that only one of the following can be set so the result of setting MUST flags more than once will be to require use of the stronger protocol */ -#define CIFSSEC_MUST_NTLM 0x02002 #define CIFSSEC_MUST_NTLMV2 0x04004 #define CIFSSEC_MUST_KRB5 0x08008 -#ifdef CONFIG_CIFS_WEAK_PW_HASH -#define CIFSSEC_MUST_LANMAN 0x10010 -#define CIFSSEC_MUST_PLNTXT 0x20020 -#ifdef CONFIG_CIFS_UPCALL -#define CIFSSEC_MASK 
0xBF0BF /* allows weak security but also krb5 */ -#else -#define CIFSSEC_MASK 0xB70B7 /* current flags supported if weak */ -#endif /* UPCALL */ -#else /* do not allow weak pw hash */ -#define CIFSSEC_MUST_LANMAN 0 -#define CIFSSEC_MUST_PLNTXT 0 #ifdef CONFIG_CIFS_UPCALL #define CIFSSEC_MASK 0x8F08F /* flags supported if no weak allowed */ #else #define CIFSSEC_MASK 0x87087 /* flags supported if no weak allowed */ #endif /* UPCALL */ -#endif /* WEAK_PW_HASH */ #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ #define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) -#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) -#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) +#define CIFSSEC_MAX (CIFSSEC_MUST_NTLMV2) +#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) /* ***************************************************************** * All constants go here @@ -1935,10 +1916,6 @@ static inline char *get_security_type_str(enum securityEnum sectype) return "Kerberos"; case NTLMv2: return "NTLMv2"; - case NTLM: - return "NTLM"; - case LANMAN: - return "LANMAN"; default: return "Unknown"; } diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index f6e235001358..dc920e206336 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -14,13 +14,7 @@ #include <asm/unaligned.h> #include "smbfsctl.h" -#ifdef CONFIG_CIFS_WEAK_PW_HASH -#define LANMAN_PROT 0 -#define LANMAN2_PROT 1 -#define CIFS_PROT 2 -#else #define CIFS_PROT 0 -#endif #define POSIX_PROT (CIFS_PROT+1) #define BAD_PROT 0xFFFF @@ -505,30 +499,8 @@ typedef struct negotiate_req { unsigned char DialectsArray[1]; } __attribute__((packed)) NEGOTIATE_REQ; -/* Dialect index is 13 for LANMAN */ - #define MIN_TZ_ADJ (15 * 60) /* minimum grid for timezones in seconds */ -typedef struct lanman_neg_rsp { - struct smb_hdr hdr; /* wct = 13 */ - __le16 DialectIndex; - __le16 SecurityMode; - __le16 MaxBufSize; - __le16 MaxMpxCount; - __le16 MaxNumberVcs; - __le16 RawMode; - __le32 SessionKey; - struct { - __le16 Time; - __le16 Date; - } __attribute__((packed)) SrvTime; - __le16 ServerTimeZone; - __le16 EncryptionKeyLength; - __le16 Reserved; - __u16 ByteCount; - unsigned char EncryptionKey[1]; -} __attribute__((packed)) LANMAN_NEG_RSP; - #define READ_RAW_ENABLE 1 #define WRITE_RAW_ENABLE 2 #define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index e0def0f0714b..f9740c21ca3d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -498,19 +498,12 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int SMBNTencrypt(unsigned char *, unsigned char *, unsigned char *, - const struct nls_table *); -extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void cifs_crypto_secmech_release(struct TCP_Server_Info *server); extern int calc_seckey(struct cifs_ses *); extern int generate_smb30signingkey(struct cifs_ses *); extern int generate_smb311signingkey(struct cifs_ses *); -#ifdef CONFIG_CIFS_WEAK_PW_HASH -extern int calc_lanman_hash(const char *password, const char *cryptkey, - bool encrypt, char 
*lnm_session_key); -#endif /* CIFS_WEAK_PW_HASH */ extern int CIFSSMBCopy(unsigned int xid, struct cifs_tcon *source_tcon, const char *fromName, @@ -547,11 +540,8 @@ extern int check_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, const unsigned char *path); -extern int mdfour(unsigned char *, unsigned char *, int); extern int E_md4hash(const unsigned char *passwd, unsigned char *p16, const struct nls_table *codepage); -extern int SMBencrypt(unsigned char *passwd, const unsigned char *c8, - unsigned char *p24); extern int cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 65d1a65bfc37..a8e41c1e80ca 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -42,10 +42,6 @@ static struct { int index; char *name; } protocols[] = { -#ifdef CONFIG_CIFS_WEAK_PW_HASH - {LANMAN_PROT, "\2LM1.2X002"}, - {LANMAN2_PROT, "\2LANMAN2.1"}, -#endif /* weak password hashing for legacy clients */ {CIFS_PROT, "\2NT LM 0.12"}, {POSIX_PROT, "\2POSIX 2"}, {BAD_PROT, "\2"} @@ -55,10 +51,6 @@ static struct { int index; char *name; } protocols[] = { -#ifdef CONFIG_CIFS_WEAK_PW_HASH - {LANMAN_PROT, "\2LM1.2X002"}, - {LANMAN2_PROT, "\2LANMAN2.1"}, -#endif /* weak password hashing for legacy clients */ {CIFS_PROT, "\2NT LM 0.12"}, {BAD_PROT, "\2"} }; @@ -66,17 +58,9 @@ static struct { /* define the number of elements in the cifs dialect array */ #ifdef CONFIG_CIFS_POSIX -#ifdef CONFIG_CIFS_WEAK_PW_HASH -#define CIFS_NUM_PROT 4 -#else #define CIFS_NUM_PROT 2 -#endif /* CIFS_WEAK_PW_HASH */ #else /* not posix */ -#ifdef CONFIG_CIFS_WEAK_PW_HASH -#define CIFS_NUM_PROT 3 -#else #define CIFS_NUM_PROT 1 -#endif /* CONFIG_CIFS_WEAK_PW_HASH */ #endif /* CIFS_POSIX */ /* @@ -475,89 +459,6 @@ cifs_enable_signing(struct TCP_Server_Info *server, bool mnt_sign_required) return 0; } -#ifdef CONFIG_CIFS_WEAK_PW_HASH -static int -decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) -{ - __s16 tmp; - struct lanman_neg_rsp *rsp = (struct lanman_neg_rsp *)pSMBr; - - if (server->dialect != LANMAN_PROT && server->dialect != LANMAN2_PROT) - return -EOPNOTSUPP; - - server->sec_mode = le16_to_cpu(rsp->SecurityMode); - server->maxReq = min_t(unsigned int, - le16_to_cpu(rsp->MaxMpxCount), - cifs_max_pending); - set_credits(server, server->maxReq); - server->maxBuf = le16_to_cpu(rsp->MaxBufSize); - /* set up max_read for readpages check */ - server->max_read = server->maxBuf; - /* even though we do not use raw we might as well set this - accurately, in case we ever find a need for it */ - if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { - server->max_rw = 0xFF00; - server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; - } else { - server->max_rw = 0;/* do not need to use raw anyway */ - server->capabilities = CAP_MPX_MODE; - } - tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); - if (tmp == -1) { - /* OS/2 often does not set timezone therefore - * we must use server time to calc time zone. - * Could deviate slightly from the right zone. - * Smallest defined timezone difference is 15 minutes - * (i.e. Nepal). Rounding up/down is done to match - * this requirement. 
- */ - int val, seconds, remain, result; - struct timespec64 ts; - time64_t utc = ktime_get_real_seconds(); - ts = cnvrtDosUnixTm(rsp->SrvTime.Date, - rsp->SrvTime.Time, 0); - cifs_dbg(FYI, "SrvTime %lld sec since 1970 (utc: %lld) diff: %lld\n", - ts.tv_sec, utc, - utc - ts.tv_sec); - val = (int)(utc - ts.tv_sec); - seconds = abs(val); - result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; - remain = seconds % MIN_TZ_ADJ; - if (remain >= (MIN_TZ_ADJ / 2)) - result += MIN_TZ_ADJ; - if (val < 0) - result = -result; - server->timeAdj = result; - } else { - server->timeAdj = (int)tmp; - server->timeAdj *= 60; /* also in seconds */ - } - cifs_dbg(FYI, "server->timeAdj: %d seconds\n", server->timeAdj); - - - /* BB get server time for time conversions and add - code to use it and timezone since this is not UTC */ - - if (rsp->EncryptionKeyLength == - cpu_to_le16(CIFS_CRYPTO_KEY_SIZE)) { - memcpy(server->cryptkey, rsp->EncryptionKey, - CIFS_CRYPTO_KEY_SIZE); - } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { - return -EIO; /* need cryptkey unless plain text */ - } - - cifs_dbg(FYI, "LANMAN negotiated\n"); - return 0; -} -#else -static inline int -decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) -{ - cifs_dbg(VFS, "mount failed, cifs module not built with CIFS_WEAK_PW_HASH support\n"); - return -EOPNOTSUPP; -} -#endif - static bool should_set_ext_sec_flag(enum securityEnum sectype) { @@ -626,16 +527,12 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) server->dialect = le16_to_cpu(pSMBr->DialectIndex); cifs_dbg(FYI, "Dialect: %d\n", server->dialect); /* Check wct = 1 error case */ - if ((pSMBr->hdr.WordCount < 13) || (server->dialect == BAD_PROT)) { + if ((pSMBr->hdr.WordCount <= 13) || (server->dialect == BAD_PROT)) { /* core returns wct = 1, but we do not ask for core - otherwise small wct just comes when dialect index is -1 indicating we could not negotiate a common dialect */ rc = -EOPNOTSUPP; goto neg_err_exit; - } else if (pSMBr->hdr.WordCount == 13) { - server->negflavor = CIFS_NEGFLAVOR_LANMAN; - rc = decode_lanman_negprot_rsp(server, pSMBr); - goto signing_check; } else if (pSMBr->hdr.WordCount != 17) { /* unknown wct */ rc = -EOPNOTSUPP; @@ -677,7 +574,6 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) server->capabilities &= ~CAP_EXTENDED_SECURITY; } -signing_check: if (!rc) rc = cifs_enable_signing(server, ses->sign); neg_err_exit: @@ -2101,6 +1997,7 @@ cifs_writev_complete(struct work_struct *work) else if (wdata->result < 0) SetPageError(page); end_page_writeback(page); + cifs_readpage_to_fscache(inode, page); put_page(page); } if (wdata->result != -EAGAIN) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 3781eee9360a..0db344807ef1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3684,38 +3684,6 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses, *bcc_ptr = 0; /* password is null byte */ bcc_ptr++; /* skip password */ /* already aligned so no need to do it below */ - } else { - pSMB->PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - /* BB FIXME add code to fail this if NTLMv2 or Kerberos - specified as required (when that support is added to - the vfs in the future) as only NTLM or the much - weaker LANMAN (which we do not send by default) is accepted - by Samba (not sure whether other servers allow - NTLMv2 password here) */ -#ifdef CONFIG_CIFS_WEAK_PW_HASH - if ((global_secflags & CIFSSEC_MAY_LANMAN) && - (ses->sectype == LANMAN)) - calc_lanman_hash(tcon->password, ses->server->cryptkey, - 
ses->server->sec_mode & - SECMODE_PW_ENCRYPT ? true : false, - bcc_ptr); - else -#endif /* CIFS_WEAK_PW_HASH */ - rc = SMBNTencrypt(tcon->password, ses->server->cryptkey, - bcc_ptr, nls_codepage); - if (rc) { - cifs_dbg(FYI, "%s Can't generate NTLM rsp. Error: %d\n", - __func__, rc); - cifs_buf_release(smb_buffer); - return rc; - } - - bcc_ptr += CIFS_AUTH_RESP_SIZE; - if (ses->capabilities & CAP_UNICODE) { - /* must align unicode strings */ - *bcc_ptr = 0; /* null byte password */ - bcc_ptr++; - } } if (ses->server->sign) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 79402ca0ddfa..5f8a302ffcb2 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -100,7 +100,7 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0; - s = dentry_path_raw(direntry, page, PAGE_SIZE); + s = dentry_path_raw(direntry, page, PATH_MAX); if (IS_ERR(s)) return s; if (!s[1]) // for root we want "", not "/" diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a72840a88f1..d0216472f1c6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -377,6 +377,8 @@ static void cifsFileInfo_put_final(struct cifsFileInfo *cifs_file) struct cifsLockInfo *li, *tmp; struct super_block *sb = inode->i_sb; + cifs_fscache_release_inode_cookie(inode); + /* * Delete any outstanding lock records. We'll lose them when the file * is closed anyway. @@ -882,8 +884,10 @@ int cifs_close(struct inode *inode, struct file *file) if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && cinode->lease_granted && dclose) { - if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) + if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { inode->i_ctime = inode->i_mtime = current_time(inode); + cifs_fscache_update_inode_cookie(inode); + } spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); if (cfile->deferred_close_scheduled && @@ -4170,6 +4174,10 @@ static vm_fault_t cifs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; + struct file *file = vmf->vma->vm_file; + struct inode *inode = file_inode(file); + + cifs_fscache_wait_on_page_write(inode, page); lock_page(page); return VM_FAULT_LOCKED; @@ -4235,13 +4243,16 @@ cifs_readv_complete(struct work_struct *work) (rdata->result == -EAGAIN && got_bytes)) { flush_dcache_page(page); SetPageUptodate(page); - } + } else + SetPageError(page); unlock_page(page); if (rdata->result == 0 || (rdata->result == -EAGAIN && got_bytes)) cifs_readpage_to_fscache(rdata->mapping->host, page); + else + cifs_fscache_uncache_page(rdata->mapping->host, page); got_bytes -= min_t(unsigned int, PAGE_SIZE, got_bytes); @@ -4848,34 +4859,33 @@ void cifs_oplock_break(struct work_struct *work) oplock_break_ack: /* - * releasing stale oplock after recent reconnect of smb session using - * a now incorrect file handle is not a data integrity issue but do - * not bother sending an oplock release if session to server still is - * disconnected since oplock already released by the server - */ - if (!cfile->oplock_break_cancelled) { - rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid, - cinode); - cifs_dbg(FYI, "Oplock release rc = %d\n", rc); - } - /* * When oplock break is received and there are no active * file handles but cached, then schedule deferred close immediately. * So, new open will not use cached handle. 
*/ spin_lock(&CIFS_I(inode)->deferred_lock); is_deferred = cifs_is_deferred_close(cfile, &dclose); + spin_unlock(&CIFS_I(inode)->deferred_lock); if (is_deferred && cfile->deferred_close_scheduled && delayed_work_pending(&cfile->deferred)) { - /* - * If there is no pending work, mod_delayed_work queues new work. - * So, Increase the ref count to avoid use-after-free. - */ - if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) - cifsFileInfo_get(cfile); + if (cancel_delayed_work(&cfile->deferred)) { + _cifsFileInfo_put(cfile, false, false); + goto oplock_break_done; + } } - spin_unlock(&CIFS_I(inode)->deferred_lock); + /* + * releasing stale oplock after recent reconnect of smb session using + * a now incorrect file handle is not a data integrity issue but do + * not bother sending an oplock release if session to server still is + * disconnected since oplock already released by the server + */ + if (!cfile->oplock_break_cancelled) { + rc = tcon->ses->server->ops->oplock_response(tcon, &cfile->fid, + cinode); + cifs_dbg(FYI, "Oplock release rc = %d\n", rc); + } +oplock_break_done: _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index eed59bc1d913..3109def8e199 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -57,12 +57,9 @@ static const match_table_t cifs_secflavor_tokens = { { Opt_sec_krb5p, "krb5p" }, { Opt_sec_ntlmsspi, "ntlmsspi" }, { Opt_sec_ntlmssp, "ntlmssp" }, - { Opt_ntlm, "ntlm" }, - { Opt_sec_ntlmi, "ntlmi" }, { Opt_sec_ntlmv2, "nontlm" }, { Opt_sec_ntlmv2, "ntlmv2" }, { Opt_sec_ntlmv2i, "ntlmv2i" }, - { Opt_sec_lanman, "lanman" }, { Opt_sec_none, "none" }, { Opt_sec_err, NULL } @@ -221,23 +218,12 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c case Opt_sec_ntlmssp: ctx->sectype = RawNTLMSSP; break; - case Opt_sec_ntlmi: - ctx->sign = true; - fallthrough; - case Opt_ntlm: - ctx->sectype = NTLM; - break; case Opt_sec_ntlmv2i: ctx->sign = true; fallthrough; case Opt_sec_ntlmv2: ctx->sectype = NTLMv2; break; -#ifdef CONFIG_CIFS_WEAK_PW_HASH - case Opt_sec_lanman: - ctx->sectype = LANMAN; - break; -#endif case Opt_sec_none: ctx->nullauth = 1; break; @@ -1266,10 +1252,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->posix_paths = 1; break; case Opt_unix: - if (result.negated) + if (result.negated) { + if (ctx->linux_ext == 1) + pr_warn_once("conflicting posix mount options specified\n"); ctx->linux_ext = 0; - else ctx->no_linux_ext = 1; + } else { + if (ctx->no_linux_ext == 1) + pr_warn_once("conflicting posix mount options specified\n"); + ctx->linux_ext = 1; + ctx->no_linux_ext = 0; + } break; case Opt_nocase: ctx->nocase = 1; diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index b6243972edf3..a42ba71d7a81 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -47,11 +47,8 @@ enum cifs_sec_param { Opt_sec_krb5p, Opt_sec_ntlmsspi, Opt_sec_ntlmssp, - Opt_ntlm, - Opt_sec_ntlmi, Opt_sec_ntlmv2, Opt_sec_ntlmv2i, - Opt_sec_lanman, Opt_sec_none, Opt_sec_err diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index dd625033cd6b..fab47fa7df74 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -176,29 +176,34 @@ void cifs_fscache_release_inode_cookie(struct inode *inode) auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); + /* fscache_relinquish_cookie does not seem to update auxdata */ + 
fscache_update_cookie(cifsi->fscache, &auxdata); fscache_relinquish_cookie(cifsi->fscache, &auxdata, false); cifsi->fscache = NULL; } } -static void cifs_fscache_disable_inode_cookie(struct inode *inode) +void cifs_fscache_update_inode_cookie(struct inode *inode) { + struct cifs_fscache_inode_auxdata auxdata; struct cifsInodeInfo *cifsi = CIFS_I(inode); if (cifsi->fscache) { + memset(&auxdata, 0, sizeof(auxdata)); + auxdata.eof = cifsi->server_eof; + auxdata.last_write_time_sec = cifsi->vfs_inode.i_mtime.tv_sec; + auxdata.last_change_time_sec = cifsi->vfs_inode.i_ctime.tv_sec; + auxdata.last_write_time_nsec = cifsi->vfs_inode.i_mtime.tv_nsec; + auxdata.last_change_time_nsec = cifsi->vfs_inode.i_ctime.tv_nsec; + cifs_dbg(FYI, "%s: (0x%p)\n", __func__, cifsi->fscache); - fscache_uncache_all_inode_pages(cifsi->fscache, inode); - fscache_relinquish_cookie(cifsi->fscache, NULL, true); - cifsi->fscache = NULL; + fscache_update_cookie(cifsi->fscache, &auxdata); } } void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) { - if ((filp->f_flags & O_ACCMODE) != O_RDONLY) - cifs_fscache_disable_inode_cookie(inode); - else - cifs_fscache_enable_inode_cookie(inode); + cifs_fscache_enable_inode_cookie(inode); } void cifs_fscache_reset_inode_cookie(struct inode *inode) @@ -310,6 +315,8 @@ void __cifs_readpage_to_fscache(struct inode *inode, struct page *page) struct cifsInodeInfo *cifsi = CIFS_I(inode); int ret; + WARN_ON(!cifsi->fscache); + cifs_dbg(FYI, "%s: (fsc: %p, p: %p, i: %p)\n", __func__, cifsi->fscache, page, inode); ret = fscache_write_page(cifsi->fscache, page, @@ -334,3 +341,21 @@ void __cifs_fscache_invalidate_page(struct page *page, struct inode *inode) fscache_wait_on_page_write(cookie, page); fscache_uncache_page(cookie, page); } + +void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifsi->fscache; + + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); + fscache_wait_on_page_write(cookie, page); +} + +void __cifs_fscache_uncache_page(struct inode *inode, struct page *page) +{ + struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct fscache_cookie *cookie = cifsi->fscache; + + cifs_dbg(FYI, "%s: (0x%p/0x%p)\n", __func__, page, cookie); + fscache_uncache_page(cookie, page); +} diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 3d55cb2ef055..82e856b9cf89 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -55,10 +55,13 @@ extern void cifs_fscache_get_super_cookie(struct cifs_tcon *); extern void cifs_fscache_release_super_cookie(struct cifs_tcon *); extern void cifs_fscache_release_inode_cookie(struct inode *); +extern void cifs_fscache_update_inode_cookie(struct inode *inode); extern void cifs_fscache_set_inode_cookie(struct inode *, struct file *); extern void cifs_fscache_reset_inode_cookie(struct inode *); extern void __cifs_fscache_invalidate_page(struct page *, struct inode *); +extern void __cifs_fscache_wait_on_page_write(struct inode *inode, struct page *page); +extern void __cifs_fscache_uncache_page(struct inode *inode, struct page *page); extern int cifs_fscache_release_page(struct page *page, gfp_t gfp); extern int __cifs_readpage_from_fscache(struct inode *, struct page *); extern int __cifs_readpages_from_fscache(struct inode *, @@ -76,6 +79,20 @@ static inline void cifs_fscache_invalidate_page(struct page *page, __cifs_fscache_invalidate_page(page, inode); } +static inline void cifs_fscache_wait_on_page_write(struct inode 
*inode, + struct page *page) +{ + if (PageFsCache(page)) + __cifs_fscache_wait_on_page_write(inode, page); +} + +static inline void cifs_fscache_uncache_page(struct inode *inode, + struct page *page) +{ + if (PageFsCache(page)) + __cifs_fscache_uncache_page(inode, page); +} + static inline int cifs_readpage_from_fscache(struct inode *inode, struct page *page) { @@ -123,6 +140,7 @@ static inline void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) {} static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {} +static inline void cifs_fscache_update_inode_cookie(struct inode *inode) {} static inline void cifs_fscache_set_inode_cookie(struct inode *inode, struct file *filp) {} static inline void cifs_fscache_reset_inode_cookie(struct inode *inode) {} @@ -133,6 +151,11 @@ static inline int cifs_fscache_release_page(struct page *page, gfp_t gfp) static inline void cifs_fscache_invalidate_page(struct page *page, struct inode *inode) {} +static inline void cifs_fscache_wait_on_page_write(struct inode *inode, + struct page *page) {} +static inline void cifs_fscache_uncache_page(struct inode *inode, + struct page *page) {} + static inline int cifs_readpage_from_fscache(struct inode *inode, struct page *page) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b96b253e7635..50c01cff4c84 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1625,7 +1625,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto unlink_out; } - cifs_close_all_deferred_files(tcon); + cifs_close_deferred_file(CIFS_I(inode)); if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = CIFSPOSIXDelFile(xid, tcon, full_path, @@ -2084,6 +2084,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, FILE_UNIX_BASIC_INFO *info_buf_target; unsigned int xid; int rc, tmprc; + int retry_count = 0; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -2113,10 +2114,24 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, goto cifs_rename_exit; } - cifs_close_all_deferred_files(tcon); + cifs_close_deferred_file(CIFS_I(d_inode(source_dentry))); + if (d_inode(target_dentry) != NULL) + cifs_close_deferred_file(CIFS_I(d_inode(target_dentry))); + rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); + if (rc == -EACCES) { + while (retry_count < 3) { + cifs_close_all_deferred_files(tcon); + rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, + to_name); + if (rc != -EACCES) + break; + retry_count++; + } + } + /* * No-replace is the natural behavior for CIFS, so skip unlink hacks. 
*/ @@ -2282,6 +2297,7 @@ cifs_revalidate_mapping(struct inode *inode) { int rc; unsigned long *flags = &CIFS_I(inode)->flags; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); /* swapfiles are not supposed to be shared */ if (IS_SWAPFILE(inode)) @@ -2293,11 +2309,16 @@ cifs_revalidate_mapping(struct inode *inode) return rc; if (test_and_clear_bit(CIFS_INO_INVALID_MAPPING, flags)) { + /* for cache=singleclient, do not invalidate */ + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE) + goto skip_invalidate; + rc = cifs_invalidate_mapping(inode); if (rc) set_bit(CIFS_INO_INVALID_MAPPING, flags); } +skip_invalidate: clear_bit_unlock(CIFS_INO_LOCK, flags); smp_mb__after_atomic(); wake_up_bit(flags, CIFS_INO_LOCK); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 844abeb2b48f..9469f1cf0b46 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -723,13 +723,31 @@ void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *cfile = NULL; - struct cifs_deferred_close *dclose; + struct file_list *tmp_list, *tmp_next_list; + struct list_head file_head; + + if (cifs_inode == NULL) + return; + INIT_LIST_HEAD(&file_head); + spin_lock(&cifs_inode->open_file_lock); list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { - spin_lock(&cifs_inode->deferred_lock); - if (cifs_is_deferred_close(cfile, &dclose)) - mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); - spin_unlock(&cifs_inode->deferred_lock); + if (delayed_work_pending(&cfile->deferred)) { + if (cancel_delayed_work(&cfile->deferred)) { + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); + if (tmp_list == NULL) + continue; + tmp_list->cfile = cfile; + list_add_tail(&tmp_list->list, &file_head); + } + } + } + spin_unlock(&cifs_inode->open_file_lock); + + list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) { + _cifsFileInfo_put(tmp_list->cfile, true, false); + list_del(&tmp_list->list); + kfree(tmp_list); } } @@ -738,20 +756,30 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) { struct cifsFileInfo *cfile; struct list_head *tmp; + struct file_list *tmp_list, *tmp_next_list; + struct list_head file_head; + INIT_LIST_HEAD(&file_head); spin_lock(&tcon->open_file_lock); list_for_each(tmp, &tcon->openFileList) { cfile = list_entry(tmp, struct cifsFileInfo, tlist); if (delayed_work_pending(&cfile->deferred)) { - /* - * If there is no pending work, mod_delayed_work queues new work. - * So, Increase the ref count to avoid use-after-free. 
- */ - if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) - cifsFileInfo_get(cfile); + if (cancel_delayed_work(&cfile->deferred)) { + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); + if (tmp_list == NULL) + continue; + tmp_list->cfile = cfile; + list_add_tail(&tmp_list->list, &file_head); + } } } spin_unlock(&tcon->open_file_lock); + + list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) { + _cifsFileInfo_put(tmp_list->cfile, true, false); + list_del(&tmp_list->list); + kfree(tmp_list); + } } /* parses DFS referral V3 structure diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index bfee176b901d..54d77c99e21c 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -369,7 +369,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, */ static int -initiate_cifs_search(const unsigned int xid, struct file *file, +_initiate_cifs_search(const unsigned int xid, struct file *file, const char *full_path) { __u16 search_flags; @@ -451,6 +451,27 @@ error_exit: return rc; } +static int +initiate_cifs_search(const unsigned int xid, struct file *file, + const char *full_path) +{ + int rc, retry_count = 0; + + do { + rc = _initiate_cifs_search(xid, file, full_path); + /* + * If we don't have enough credits to start reading the + * directory just try again after a short wait. + */ + if (rc != -EDEADLK) + break; + + usleep_range(512, 2048); + } while (retry_count++ < 5); + + return rc; +} + /* return length of unicode string in bytes */ static int cifs_unicode_bytelen(const char *str) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index c5785fd3f52e..118403fbeda2 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -799,30 +799,16 @@ cifs_select_sectype(struct TCP_Server_Info *server, enum securityEnum requested) } case CIFS_NEGFLAVOR_UNENCAP: switch (requested) { - case NTLM: case NTLMv2: return requested; case Unspecified: if (global_secflags & CIFSSEC_MAY_NTLMV2) return NTLMv2; - if (global_secflags & CIFSSEC_MAY_NTLM) - return NTLM; break; default: break; } - fallthrough; /* to attempt LANMAN authentication next */ - case CIFS_NEGFLAVOR_LANMAN: - switch (requested) { - case LANMAN: - return requested; - case Unspecified: - if (global_secflags & CIFSSEC_MAY_LANMAN) - return LANMAN; - fallthrough; - default: - return Unspecified; - } + fallthrough; default: return Unspecified; } @@ -877,7 +863,7 @@ sess_alloc_buffer(struct sess_data *sess_data, int wct) return 0; out_free_smb_buf: - kfree(smb_buf); + cifs_small_buf_release(smb_buf); sess_data->iov[0].iov_base = NULL; sess_data->iov[0].iov_len = 0; sess_data->buf0_type = CIFS_NO_BUFFER; @@ -947,230 +933,6 @@ sess_sendreceive(struct sess_data *sess_data) return rc; } -/* - * LANMAN and plaintext are less secure and off by default. - * So we make this explicitly be turned on in kconfig (in the - * build) and turned on at runtime (changed from the default) - * in proc/fs/cifs or via mount parm. Unfortunately this is - * needed for old Win (e.g.
Win95), some obscure NAS and OS/2 - */ -#ifdef CONFIG_CIFS_WEAK_PW_HASH -static void -sess_auth_lanman(struct sess_data *sess_data) -{ - int rc = 0; - struct smb_hdr *smb_buf; - SESSION_SETUP_ANDX *pSMB; - char *bcc_ptr; - struct cifs_ses *ses = sess_data->ses; - char lnm_session_key[CIFS_AUTH_RESP_SIZE]; - __u16 bytes_remaining; - - /* lanman 2 style sessionsetup */ - /* wct = 10 */ - rc = sess_alloc_buffer(sess_data, 10); - if (rc) - goto out; - - pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; - bcc_ptr = sess_data->iov[2].iov_base; - (void)cifs_ssetup_hdr(ses, pSMB); - - pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; - - if (ses->user_name != NULL) { - /* no capabilities flags in old lanman negotiation */ - pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE); - - /* Calculate hash with password and copy into bcc_ptr. - * Encryption Key (stored as in cryptkey) gets used if the - * security mode bit in Negotiate Protocol response states - * to use challenge/response method (i.e. Password bit is 1). - */ - rc = calc_lanman_hash(ses->password, ses->server->cryptkey, - ses->server->sec_mode & SECMODE_PW_ENCRYPT ? - true : false, lnm_session_key); - if (rc) - goto out; - - memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - } else { - pSMB->old_req.PasswordLength = 0; - } - - /* - * can not sign if LANMAN negotiated so no need - * to calculate signing key? but what if server - * changed to do higher than lanman dialect and - * we reconnected would we ever calc signing_key? - */ - - cifs_dbg(FYI, "Negotiating LANMAN setting up strings\n"); - /* Unicode not allowed for LANMAN dialects */ - ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); - - sess_data->iov[2].iov_len = (long) bcc_ptr - - (long) sess_data->iov[2].iov_base; - - rc = sess_sendreceive(sess_data); - if (rc) - goto out; - - pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; - smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; - - /* lanman response has a word count of 3 */ - if (smb_buf->WordCount != 3) { - rc = -EIO; - cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto out; - } - - if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) - cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? 
*/ - - ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ - cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - - bytes_remaining = get_bcc(smb_buf); - bcc_ptr = pByteArea(smb_buf); - - /* BB check if Unicode and decode strings */ - if (bytes_remaining == 0) { - /* no string area to decode, do nothing */ - } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { - /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { - ++bcc_ptr; - --bytes_remaining; - } - decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, - sess_data->nls_cp); - } else { - decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, - sess_data->nls_cp); - } - - rc = sess_establish_session(sess_data); -out: - sess_data->result = rc; - sess_data->func = NULL; - sess_free_buffer(sess_data); -} - -#endif - -static void -sess_auth_ntlm(struct sess_data *sess_data) -{ - int rc = 0; - struct smb_hdr *smb_buf; - SESSION_SETUP_ANDX *pSMB; - char *bcc_ptr; - struct cifs_ses *ses = sess_data->ses; - __u32 capabilities; - __u16 bytes_remaining; - - /* old style NTLM sessionsetup */ - /* wct = 13 */ - rc = sess_alloc_buffer(sess_data, 13); - if (rc) - goto out; - - pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; - bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); - - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - if (ses->user_name != NULL) { - pSMB->req_no_secext.CaseInsensitivePasswordLength = - cpu_to_le16(CIFS_AUTH_RESP_SIZE); - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(CIFS_AUTH_RESP_SIZE); - - /* calculate ntlm response and session key */ - rc = setup_ntlm_response(ses, sess_data->nls_cp); - if (rc) { - cifs_dbg(VFS, "Error %d during NTLM authentication\n", - rc); - goto out; - } - - /* copy ntlm response */ - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - memcpy(bcc_ptr, ses->auth_key.response + CIFS_SESS_KEY_SIZE, - CIFS_AUTH_RESP_SIZE); - bcc_ptr += CIFS_AUTH_RESP_SIZE; - } else { - pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; - pSMB->req_no_secext.CaseSensitivePasswordLength = 0; - } - - if (ses->capabilities & CAP_UNICODE) { - /* unicode strings must be word aligned */ - if (sess_data->iov[0].iov_len % 2) { - *bcc_ptr = 0; - bcc_ptr++; - } - unicode_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); - } else { - ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); - } - - - sess_data->iov[2].iov_len = (long) bcc_ptr - - (long) sess_data->iov[2].iov_base; - - rc = sess_sendreceive(sess_data); - if (rc) - goto out; - - pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; - smb_buf = (struct smb_hdr *)sess_data->iov[0].iov_base; - - if (smb_buf->WordCount != 3) { - rc = -EIO; - cifs_dbg(VFS, "bad word count %d\n", smb_buf->WordCount); - goto out; - } - - if (le16_to_cpu(pSMB->resp.Action) & GUEST_LOGIN) - cifs_dbg(FYI, "Guest login\n"); /* BB mark SesInfo struct? 
*/ - - ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ - cifs_dbg(FYI, "UID = %llu\n", ses->Suid); - - bytes_remaining = get_bcc(smb_buf); - bcc_ptr = pByteArea(smb_buf); - - /* BB check if Unicode and decode strings */ - if (bytes_remaining == 0) { - /* no string area to decode, do nothing */ - } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { - /* unicode string area must be word-aligned */ - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { - ++bcc_ptr; - --bytes_remaining; - } - decode_unicode_ssetup(&bcc_ptr, bytes_remaining, ses, - sess_data->nls_cp); - } else { - decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses, - sess_data->nls_cp); - } - - rc = sess_establish_session(sess_data); -out: - sess_data->result = rc; - sess_data->func = NULL; - sess_free_buffer(sess_data); - kfree(ses->auth_key.response); - ses->auth_key.response = NULL; -} - static void sess_auth_ntlmv2(struct sess_data *sess_data) { @@ -1675,21 +1437,6 @@ static int select_sec(struct cifs_ses *ses, struct sess_data *sess_data) } switch (type) { - case LANMAN: - /* LANMAN and plaintext are less secure and off by default. - * So we make this explicitly be turned on in kconfig (in the - * build) and turned on at runtime (changed from the default) - * in proc/fs/cifs or via mount parm. Unfortunately this is - * needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ -#ifdef CONFIG_CIFS_WEAK_PW_HASH - sess_data->func = sess_auth_lanman; - break; -#else - return -EOPNOTSUPP; -#endif - case NTLM: - sess_data->func = sess_auth_ntlm; - break; case NTLMv2: sess_data->func = sess_auth_ntlmv2; break; diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index cea39bcecbab..181514b8770d 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/smb2/smb2maperror.c * * Functions which do error mapping of SMB2 status codes to POSIX errors * diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 2dfd0d8297eb..ddc0e8f97872 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -3590,6 +3590,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, return rc; } + filemap_invalidate_lock(inode->i_mapping); /* * We implement the punch hole through ioctl, so we need remove the page * caches first, otherwise the data may be inconsistent with the server. 
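[Ed. note] The two smb3_punch_hole() hunks here bracket the page-cache purge and the server-side zeroing call with the mapping's new invalidate lock, so a concurrent fault cannot repopulate the range with stale data before the server has punched the hole. A minimal sketch of that locking pattern, assuming a hypothetical send_zero_data_fsctl() wrapper for the SMB3 round trip (filemap_invalidate_lock()/unlock() and truncate_pagecache_range() are the real kernel APIs used in the hunks):

/* Sketch only: the lock/purge/punch/unlock ordering the hunks establish. */
static long punch_hole_sketch(struct inode *inode, loff_t offset, loff_t len)
{
	long rc;

	filemap_invalidate_lock(inode->i_mapping);	/* block new faults on the mapping */
	truncate_pagecache_range(inode, offset, offset + len - 1); /* drop cached pages */
	rc = send_zero_data_fsctl(inode, offset, len);	/* hypothetical server call */
	filemap_invalidate_unlock(inode->i_mapping);
	return rc;
}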
@@ -3607,6 +3608,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, sizeof(struct file_zero_data_information), CIFSMaxBufSize, NULL, NULL); free_xid(xid); + filemap_invalidate_unlock(inode->i_mapping); return rc; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 781d14e5f2af..b6d2e3591927 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2426,7 +2426,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) memcpy(aclptr, &acl, sizeof(struct cifs_acl)); buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); - *len = ptr - (__u8 *)buf; + *len = roundup(ptr - (__u8 *)buf, 8); return buf; } diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 39a938443e3e..10047cc55286 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -18,13 +18,13 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/random.h> -#include <crypto/des.h> #include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "cifspdu.h" #include "cifsglob.h" #include "cifs_debug.h" #include "cifsproto.h" +#include "../cifs_common/md4.h" #ifndef false #define false 0 @@ -38,126 +38,29 @@ #define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8) #define SSVAL(buf,pos,val) SSVALX((buf),(pos),((__u16)(val))) -static void -str_to_key(unsigned char *str, unsigned char *key) -{ - int i; - - key[0] = str[0] >> 1; - key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2); - key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3); - key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4); - key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5); - key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6); - key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7); - key[7] = str[6] & 0x7F; - for (i = 0; i < 8; i++) - key[i] = (key[i] << 1); -} - -static int -smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) -{ - unsigned char key2[8]; - struct des_ctx ctx; - - str_to_key(key, key2); - - if (fips_enabled) { - cifs_dbg(VFS, "FIPS compliance enabled: DES not permitted\n"); - return -ENOENT; - } - - des_expand_key(&ctx, key2, DES_KEY_SIZE); - des_encrypt(&ctx, out, in); - memzero_explicit(&ctx, sizeof(ctx)); - - return 0; -} - -static int -E_P16(unsigned char *p14, unsigned char *p16) -{ - int rc; - unsigned char sp8[8] = - { 0x4b, 0x47, 0x53, 0x21, 0x40, 0x23, 0x24, 0x25 }; - - rc = smbhash(p16, sp8, p14); - if (rc) - return rc; - rc = smbhash(p16 + 8, sp8, p14 + 7); - return rc; -} - -static int -E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24) -{ - int rc; - - rc = smbhash(p24, c8, p21); - if (rc) - return rc; - rc = smbhash(p24 + 8, c8, p21 + 7); - if (rc) - return rc; - rc = smbhash(p24 + 16, c8, p21 + 14); - return rc; -} - /* produce a md4 message digest from data of length n bytes */ -int +static int mdfour(unsigned char *md4_hash, unsigned char *link_str, int link_len) { int rc; - struct crypto_shash *md4 = NULL; - struct sdesc *sdescmd4 = NULL; - - rc = cifs_alloc_hash("md4", &md4, &sdescmd4); - if (rc) - goto mdfour_err; + struct md4_ctx mctx; - rc = crypto_shash_init(&sdescmd4->shash); + rc = cifs_md4_init(&mctx); if (rc) { - cifs_dbg(VFS, "%s: Could not init md4 shash\n", __func__); + cifs_dbg(VFS, "%s: Could not init MD4\n", __func__); goto mdfour_err; } - rc = crypto_shash_update(&sdescmd4->shash, link_str, link_len); + rc = cifs_md4_update(&mctx, link_str, link_len); if (rc) { - cifs_dbg(VFS, "%s: Could not update with link_str\n", __func__); + cifs_dbg(VFS, "%s: Could not update MD4\n", __func__); goto mdfour_err; } - 
rc = crypto_shash_final(&sdescmd4->shash, md4_hash); + rc = cifs_md4_final(&mctx, md4_hash); if (rc) - cifs_dbg(VFS, "%s: Could not generate md4 hash\n", __func__); - -mdfour_err: - cifs_free_hash(&md4, &sdescmd4); - return rc; -} - -/* - This implements the X/Open SMB password encryption - It takes a password, a 8 byte "crypt key" and puts 24 bytes of - encrypted password into p24 */ -/* Note that password must be uppercased and null terminated */ -int -SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24) -{ - int rc; - unsigned char p14[14], p16[16], p21[21]; + cifs_dbg(VFS, "%s: Could not finalize MD4\n", __func__); - memset(p14, '\0', 14); - memset(p16, '\0', 16); - memset(p21, '\0', 21); - - memcpy(p14, passwd, 14); - rc = E_P16(p14, p16); - if (rc) - return rc; - - memcpy(p21, p16, 16); - rc = E_P24(p21, c8, p24); +mdfour_err: return rc; } @@ -186,25 +89,3 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16, return rc; } - -/* Does the NT MD4 hash then des encryption. */ -int -SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24, - const struct nls_table *codepage) -{ - int rc; - unsigned char p16[16], p21[21]; - - memset(p16, '\0', 16); - memset(p21, '\0', 21); - - rc = E_md4hash(passwd, p16, codepage); - if (rc) { - cifs_dbg(FYI, "%s Can't generate NT hash, error: %d\n", - __func__, rc); - return rc; - } - memcpy(p21, p16, 16); - rc = E_P24(p21, c8, p24); - return rc; -} diff --git a/fs/cifs_common/Makefile b/fs/cifs_common/Makefile new file mode 100644 index 000000000000..6fedd2f88a25 --- /dev/null +++ b/fs/cifs_common/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# Makefile for Linux filesystem routines that are shared by client and server. +# + +obj-$(CONFIG_CIFS_COMMON) += cifs_arc4.o +obj-$(CONFIG_CIFS_COMMON) += cifs_md4.o diff --git a/fs/cifs_common/arc4.h b/fs/cifs_common/arc4.h new file mode 100644 index 000000000000..12e71ec033a1 --- /dev/null +++ b/fs/cifs_common/arc4.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Common values for ARC4 Cipher Algorithm + */ + +#ifndef _CRYPTO_ARC4_H +#define _CRYPTO_ARC4_H + +#include <linux/types.h> + +#define ARC4_MIN_KEY_SIZE 1 +#define ARC4_MAX_KEY_SIZE 256 +#define ARC4_BLOCK_SIZE 1 + +struct arc4_ctx { + u32 S[256]; + u32 x, y; +}; + +int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len); +void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len); + +#endif /* _CRYPTO_ARC4_H */ diff --git a/fs/cifs_common/cifs_arc4.c b/fs/cifs_common/cifs_arc4.c new file mode 100644 index 000000000000..b964cc682944 --- /dev/null +++ b/fs/cifs_common/cifs_arc4.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Cryptographic API + * + * ARC4 Cipher Algorithm + * + * Jon Oberheide <jon@oberheide.org> + */ + +#include <linux/module.h> +#include "arc4.h" + +MODULE_LICENSE("GPL"); + +int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) +{ + int i, j = 0, k = 0; + + ctx->x = 1; + ctx->y = 0; + + for (i = 0; i < 256; i++) + ctx->S[i] = i; + + for (i = 0; i < 256; i++) { + u32 a = ctx->S[i]; + + j = (j + in_key[k] + a) & 0xff; + ctx->S[i] = ctx->S[j]; + ctx->S[j] = a; + if (++k >= key_len) + k = 0; + } + + return 0; +} +EXPORT_SYMBOL_GPL(cifs_arc4_setkey); + +void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len) +{ + u32 *const S = ctx->S; + u32 x, y, a, b; + u32 ty, ta, tb; + + if (len == 0) + return; + + x = ctx->x; + y 
= ctx->y; + + a = S[x]; + y = (y + a) & 0xff; + b = S[y]; + + do { + S[y] = a; + a = (a + b) & 0xff; + S[x] = b; + x = (x + 1) & 0xff; + ta = S[x]; + ty = (y + ta) & 0xff; + tb = S[ty]; + *out++ = *in++ ^ S[a]; + if (--len == 0) + break; + y = ty; + a = ta; + b = tb; + } while (true); + + ctx->x = x; + ctx->y = y; +} +EXPORT_SYMBOL_GPL(cifs_arc4_crypt); + +static int __init +init_cifs_common(void) +{ + return 0; +} +static void __init +exit_cifs_common(void) +{ +} + +module_init(init_cifs_common) +module_exit(exit_cifs_common) diff --git a/fs/cifs_common/cifs_md4.c b/fs/cifs_common/cifs_md4.c new file mode 100644 index 000000000000..50f78cfc6ce9 --- /dev/null +++ b/fs/cifs_common/cifs_md4.c @@ -0,0 +1,197 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Cryptographic API. + * + * MD4 Message Digest Algorithm (RFC1320). + * + * Implementation derived from Andrew Tridgell and Steve French's + * CIFS MD4 implementation, and the cryptoapi implementation + * originally based on the public domain implementation written + * by Colin Plumb in 1993. + * + * Copyright (c) Andrew Tridgell 1997-1998. + * Modified by Steve French (sfrench@us.ibm.com) 2002 + * Copyright (c) Cryptoapi developers. + * Copyright (c) 2002 David S. Miller (davem@redhat.com) + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/string.h> +#include <linux/types.h> +#include <asm/byteorder.h> +#include "md4.h" + +MODULE_LICENSE("GPL"); + +static inline u32 lshift(u32 x, unsigned int s) +{ + x &= 0xFFFFFFFF; + return ((x << s) & 0xFFFFFFFF) | (x >> (32 - s)); +} + +static inline u32 F(u32 x, u32 y, u32 z) +{ + return (x & y) | ((~x) & z); +} + +static inline u32 G(u32 x, u32 y, u32 z) +{ + return (x & y) | (x & z) | (y & z); +} + +static inline u32 H(u32 x, u32 y, u32 z) +{ + return x ^ y ^ z; +} + +#define ROUND1(a,b,c,d,k,s) (a = lshift(a + F(b,c,d) + k, s)) +#define ROUND2(a,b,c,d,k,s) (a = lshift(a + G(b,c,d) + k + (u32)0x5A827999,s)) +#define ROUND3(a,b,c,d,k,s) (a = lshift(a + H(b,c,d) + k + (u32)0x6ED9EBA1,s)) + +static void md4_transform(u32 *hash, u32 const *in) +{ + u32 a, b, c, d; + + a = hash[0]; + b = hash[1]; + c = hash[2]; + d = hash[3]; + + ROUND1(a, b, c, d, in[0], 3); + ROUND1(d, a, b, c, in[1], 7); + ROUND1(c, d, a, b, in[2], 11); + ROUND1(b, c, d, a, in[3], 19); + ROUND1(a, b, c, d, in[4], 3); + ROUND1(d, a, b, c, in[5], 7); + ROUND1(c, d, a, b, in[6], 11); + ROUND1(b, c, d, a, in[7], 19); + ROUND1(a, b, c, d, in[8], 3); + ROUND1(d, a, b, c, in[9], 7); + ROUND1(c, d, a, b, in[10], 11); + ROUND1(b, c, d, a, in[11], 19); + ROUND1(a, b, c, d, in[12], 3); + ROUND1(d, a, b, c, in[13], 7); + ROUND1(c, d, a, b, in[14], 11); + ROUND1(b, c, d, a, in[15], 19); + + ROUND2(a, b, c, d, in[0], 3); + ROUND2(d, a, b, c, in[4], 5); + ROUND2(c, d, a, b, in[8], 9); + ROUND2(b, c, d, a, in[12], 13); + ROUND2(a, b, c, d, in[1], 3); + ROUND2(d, a, b, c, in[5], 5); + ROUND2(c, d, a, b, in[9], 9); + ROUND2(b, c, d, a, in[13], 13); + ROUND2(a, b, c, d, in[2], 3); + ROUND2(d, a, b, c, in[6], 5); + ROUND2(c, d, a, b, in[10], 9); + ROUND2(b, c, d, a, in[14], 13); + ROUND2(a, b, c, d, in[3], 3); + ROUND2(d, a, b, c, in[7], 5); + ROUND2(c, d, a, b, in[11], 9); + ROUND2(b, c, d, a, in[15], 13); + + ROUND3(a, b, c, d, in[0], 3); + ROUND3(d, a, b, c, in[8], 9); + ROUND3(c, d, a, b, in[4], 11); + ROUND3(b, c, d, a, in[12], 15); + ROUND3(a, b, c, d, in[2], 3); + ROUND3(d, a, b, c, in[10], 9); + ROUND3(c, d, a, b, in[6], 11); + 
ROUND3(b, c, d, a, in[14], 15); + ROUND3(a, b, c, d, in[1], 3); + ROUND3(d, a, b, c, in[9], 9); + ROUND3(c, d, a, b, in[5], 11); + ROUND3(b, c, d, a, in[13], 15); + ROUND3(a, b, c, d, in[3], 3); + ROUND3(d, a, b, c, in[11], 9); + ROUND3(c, d, a, b, in[7], 11); + ROUND3(b, c, d, a, in[15], 15); + + hash[0] += a; + hash[1] += b; + hash[2] += c; + hash[3] += d; +} + +static inline void md4_transform_helper(struct md4_ctx *ctx) +{ + le32_to_cpu_array(ctx->block, ARRAY_SIZE(ctx->block)); + md4_transform(ctx->hash, ctx->block); +} + +int cifs_md4_init(struct md4_ctx *mctx) +{ + memset(mctx, 0, sizeof(struct md4_ctx)); + mctx->hash[0] = 0x67452301; + mctx->hash[1] = 0xefcdab89; + mctx->hash[2] = 0x98badcfe; + mctx->hash[3] = 0x10325476; + mctx->byte_count = 0; + + return 0; +} +EXPORT_SYMBOL_GPL(cifs_md4_init); + +int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len) +{ + const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f); + + mctx->byte_count += len; + + if (avail > len) { + memcpy((char *)mctx->block + (sizeof(mctx->block) - avail), + data, len); + return 0; + } + + memcpy((char *)mctx->block + (sizeof(mctx->block) - avail), + data, avail); + + md4_transform_helper(mctx); + data += avail; + len -= avail; + + while (len >= sizeof(mctx->block)) { + memcpy(mctx->block, data, sizeof(mctx->block)); + md4_transform_helper(mctx); + data += sizeof(mctx->block); + len -= sizeof(mctx->block); + } + + memcpy(mctx->block, data, len); + + return 0; +} +EXPORT_SYMBOL_GPL(cifs_md4_update); + +int cifs_md4_final(struct md4_ctx *mctx, u8 *out) +{ + const unsigned int offset = mctx->byte_count & 0x3f; + char *p = (char *)mctx->block + offset; + int padding = 56 - (offset + 1); + + *p++ = 0x80; + if (padding < 0) { + memset(p, 0x00, padding + sizeof(u64)); + md4_transform_helper(mctx); + p = (char *)mctx->block; + padding = 56; + } + + memset(p, 0, padding); + mctx->block[14] = mctx->byte_count << 3; + mctx->block[15] = mctx->byte_count >> 29; + le32_to_cpu_array(mctx->block, (sizeof(mctx->block) - + sizeof(u64)) / sizeof(u32)); + md4_transform(mctx->hash, mctx->block); + cpu_to_le32_array(mctx->hash, ARRAY_SIZE(mctx->hash)); + memcpy(out, mctx->hash, sizeof(mctx->hash)); + memset(mctx, 0, sizeof(*mctx)); + + return 0; +} +EXPORT_SYMBOL_GPL(cifs_md4_final); diff --git a/fs/cifs_common/md4.h b/fs/cifs_common/md4.h new file mode 100644 index 000000000000..5337becc699a --- /dev/null +++ b/fs/cifs_common/md4.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0+ */ +/* + * Common values for the MD4 Message Digest Algorithm (RFC 1320) + */ + +#ifndef _CIFS_MD4_H +#define _CIFS_MD4_H + +#include <linux/types.h> + +#define MD4_DIGEST_SIZE 16 +#define MD4_HMAC_BLOCK_SIZE 64 +#define MD4_BLOCK_WORDS 16 +#define MD4_HASH_WORDS 4 + +struct md4_ctx { + u32 hash[MD4_HASH_WORDS]; + u32 block[MD4_BLOCK_WORDS]; + u64 byte_count; +}; + + +int cifs_md4_init(struct md4_ctx *mctx); +int cifs_md4_update(struct md4_ctx *mctx, const u8 *data, unsigned int len); +int cifs_md4_final(struct md4_ctx *mctx, u8 *out); + +#endif /* _CIFS_MD4_H */ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ac5e0c0e9181..1466b5d01cbb 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -45,7 +45,7 @@ static void configfs_d_iput(struct dentry * dentry, /* * Set sd->s_dentry to null only when this dentry is the one * that is going to be killed. Otherwise configfs_d_iput may - * run just after configfs_attach_attr and set sd->s_dentry to + * run just after configfs_lookup and set sd->s_dentry to * NULL even if it's still in use.
*/ if (sd->s_dentry == dentry) @@ -417,44 +417,16 @@ static void configfs_remove_dir(struct config_item * item) dput(dentry); } - -/* attaches attribute's configfs_dirent to the dentry corresponding to the - * attribute file - */ -static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) -{ - struct configfs_attribute * attr = sd->s_element; - struct inode *inode; - - spin_lock(&configfs_dirent_lock); - dentry->d_fsdata = configfs_get(sd); - sd->s_dentry = dentry; - spin_unlock(&configfs_dirent_lock); - - inode = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG); - if (IS_ERR(inode)) { - configfs_put(sd); - return PTR_ERR(inode); - } - if (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) { - inode->i_size = 0; - inode->i_fop = &configfs_bin_file_operations; - } else { - inode->i_size = PAGE_SIZE; - inode->i_fop = &configfs_file_operations; - } - d_add(dentry, inode); - return 0; -} - static struct dentry * configfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; struct configfs_dirent * sd; - int found = 0; - int err; + struct inode *inode = NULL; + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); /* * Fake invisibility if dir belongs to a group/default groups hierarchy @@ -464,36 +436,39 @@ static struct dentry * configfs_lookup(struct inode *dir, * not complete their initialization, since the dentries of the * attributes won't be instantiated. */ - err = -ENOENT; if (!configfs_dirent_is_ready(parent_sd)) - goto out; + return ERR_PTR(-ENOENT); + spin_lock(&configfs_dirent_lock); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (sd->s_type & CONFIGFS_NOT_PINNED) { - const unsigned char * name = configfs_get_name(sd); + if ((sd->s_type & CONFIGFS_NOT_PINNED) && + !strcmp(configfs_get_name(sd), dentry->d_name.name)) { + struct configfs_attribute *attr = sd->s_element; + umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; - if (strcmp(name, dentry->d_name.name)) - continue; + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; + spin_unlock(&configfs_dirent_lock); - found = 1; - err = configfs_attach_attr(sd, dentry); - break; + inode = configfs_create(dentry, mode); + if (IS_ERR(inode)) { + configfs_put(sd); + return ERR_CAST(inode); + } + if (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) { + inode->i_size = 0; + inode->i_fop = &configfs_bin_file_operations; + } else { + inode->i_size = PAGE_SIZE; + inode->i_fop = &configfs_file_operations; + } + goto done; } } - - if (!found) { - /* - * If it doesn't exist and it isn't a NOT_PINNED item, - * it must be negative. - */ - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - d_add(dentry, NULL); - return NULL; - } - -out: - return ERR_PTR(err); + spin_unlock(&configfs_dirent_lock); +done: + d_add(dentry, inode); + return NULL; } /* diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 5a0be9985bae..0ad32150611e 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -177,28 +177,22 @@ out: return retval; } -/* Fill [buffer, buffer + pos) with data coming from @from. */ -static int fill_write_buffer(struct configfs_buffer *buffer, loff_t pos, +/* Fill @buffer with data coming from @from. 
*/ +static int fill_write_buffer(struct configfs_buffer *buffer, struct iov_iter *from) { - loff_t to_copy; int copied; - u8 *to; if (!buffer->page) buffer->page = (char *)__get_free_pages(GFP_KERNEL, 0); if (!buffer->page) return -ENOMEM; - to_copy = SIMPLE_ATTR_SIZE - 1 - pos; - if (to_copy <= 0) - return 0; - to = buffer->page + pos; - copied = copy_from_iter(to, to_copy, from); + copied = copy_from_iter(buffer->page, SIMPLE_ATTR_SIZE - 1, from); buffer->needs_read_fill = 1; /* if buf is assumed to contain a string, terminate it by \0, * so e.g. sscanf() can scan the string easily */ - to[copied] = 0; + buffer->page[copied] = 0; return copied ? : -EFAULT; } @@ -227,10 +221,10 @@ static ssize_t configfs_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct configfs_buffer *buffer = file->private_data; - ssize_t len; + int len; mutex_lock(&buffer->mutex); - len = fill_write_buffer(buffer, iocb->ki_pos, from); + len = fill_write_buffer(buffer, from); if (len > 0) len = flush_write_buffer(file, buffer, len); if (len > 0) diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index d00455440d08..eb538c28df94 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -26,7 +26,7 @@ * it to find the directory entry again if requested. Naively, that would just * mean using the ciphertext filenames. However, since the ciphertext filenames * can contain illegal characters ('\0' and '/'), they must be encoded in some - * way. We use base64. But that can cause names to exceed NAME_MAX (255 + * way. We use base64url. But that can cause names to exceed NAME_MAX (255 * bytes), so we also need to use a strong hash to abbreviate long names. * * The filesystem may also need another kind of hash, the "dirhash", to quickly @@ -38,7 +38,7 @@ * casefolded directories use this type of dirhash. At least in these cases, * each no-key name must include the name's dirhash too. * - * To meet all these requirements, we base64-encode the following + * To meet all these requirements, we base64url-encode the following * variable-length structure. It contains the dirhash, or 0's if the filesystem * didn't provide one; up to 149 bytes of the ciphertext name; and for * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes. @@ -52,15 +52,19 @@ struct fscrypt_nokey_name { u32 dirhash[2]; u8 bytes[149]; u8 sha256[SHA256_DIGEST_SIZE]; -}; /* 189 bytes => 252 bytes base64-encoded, which is <= NAME_MAX (255) */ +}; /* 189 bytes => 252 bytes base64url-encoded, which is <= NAME_MAX (255) */ /* - * Decoded size of max-size nokey name, i.e. a name that was abbreviated using + * Decoded size of max-size no-key name, i.e. a name that was abbreviated using * the strong hash and thus includes the 'sha256' field. This isn't simply * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included. 
*/ #define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) +/* Encoded size of max-size no-key name */ +#define FSCRYPT_NOKEY_NAME_MAX_ENCODED \ + FSCRYPT_BASE64URL_CHARS(FSCRYPT_NOKEY_NAME_MAX) + static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -175,62 +179,82 @@ static int fname_decrypt(const struct inode *inode, return 0; } -static const char lookup_table[65] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +static const char base64url_table[65] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; -#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) +#define FSCRYPT_BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** - * base64_encode() - base64-encode some bytes - * @src: the bytes to encode - * @len: number of bytes to encode - * @dst: (output) the base64-encoded string. Not NUL-terminated. + * fscrypt_base64url_encode() - base64url-encode some binary data + * @src: the binary data to encode + * @srclen: the length of @src in bytes + * @dst: (output) the base64url-encoded string. Not NUL-terminated. * - * Encodes the input string using characters from the set [A-Za-z0-9+,]. - * The encoded string is roughly 4/3 times the size of the input string. + * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL + * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used, + * as it's unneeded and not required by the RFC. base64url is used instead of + * base64 to avoid the '/' character, which isn't allowed in filenames. * - * Return: length of the encoded string + * Return: the length of the resulting base64url-encoded string in bytes. + * This will be equal to FSCRYPT_BASE64URL_CHARS(srclen). */ -static int base64_encode(const u8 *src, int len, char *dst) +static int fscrypt_base64url_encode(const u8 *src, int srclen, char *dst) { - int i, bits = 0, ac = 0; + u32 ac = 0; + int bits = 0; + int i; char *cp = dst; - for (i = 0; i < len; i++) { - ac += src[i] << bits; + for (i = 0; i < srclen; i++) { + ac = (ac << 8) | src[i]; bits += 8; do { - *cp++ = lookup_table[ac & 0x3f]; - ac >>= 6; bits -= 6; + *cp++ = base64url_table[(ac >> bits) & 0x3f]; } while (bits >= 6); } if (bits) - *cp++ = lookup_table[ac & 0x3f]; + *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f]; return cp - dst; } -static int base64_decode(const char *src, int len, u8 *dst) +/** + * fscrypt_base64url_decode() - base64url-decode a string + * @src: the string to decode. Doesn't need to be NUL-terminated. + * @srclen: the length of @src in bytes + * @dst: (output) the decoded binary data + * + * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with + * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't + * accepted, nor are non-encoding characters such as whitespace. + * + * This implementation hasn't been optimized for performance. + * + * Return: the length of the resulting decoded binary data in bytes, + * or -1 if the string isn't a valid base64url string. 
+ */ +static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst) { - int i, bits = 0, ac = 0; - const char *p; - u8 *cp = dst; + u32 ac = 0; + int bits = 0; + int i; + u8 *bp = dst; + + for (i = 0; i < srclen; i++) { + const char *p = strchr(base64url_table, src[i]); - for (i = 0; i < len; i++) { - p = strchr(lookup_table, src[i]); if (p == NULL || src[i] == 0) - return -2; - ac += (p - lookup_table) << bits; + return -1; + ac = (ac << 6) | (p - base64url_table); bits += 6; if (bits >= 8) { - *cp++ = ac & 0xff; - ac >>= 8; bits -= 8; + *bp++ = (u8)(ac >> bits); } } - if (ac) + if (ac & ((1 << bits) - 1)) return -1; - return cp - dst; + return bp - dst; } bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, @@ -263,10 +287,8 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { - const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX); - u32 max_presented_len; - - max_presented_len = max(max_encoded_len, max_encrypted_len); + u32 max_presented_len = max_t(u32, FSCRYPT_NOKEY_NAME_MAX_ENCODED, + max_encrypted_len); crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); if (!crypto_str->name) @@ -342,7 +364,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, offsetof(struct fscrypt_nokey_name, bytes)); BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) != offsetof(struct fscrypt_nokey_name, sha256)); - BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX); + BUILD_BUG_ON(FSCRYPT_NOKEY_NAME_MAX_ENCODED > NAME_MAX); nokey_name.dirhash[0] = hash; nokey_name.dirhash[1] = minor_hash; @@ -358,7 +380,8 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } - oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name); + oname->len = fscrypt_base64url_encode((const u8 *)&nokey_name, size, + oname->name); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -432,14 +455,15 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * user-supplied name */ - if (iname->len > BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX)) + if (iname->len > FSCRYPT_NOKEY_NAME_MAX_ENCODED) return -ENOENT; fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = base64_decode(iname->name, iname->len, fname->crypto_buf.name); + ret = fscrypt_base64url_decode(iname->name, iname->len, + fname->crypto_buf.name); if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) || (ret > offsetof(struct fscrypt_nokey_name, sha256) && ret != FSCRYPT_NOKEY_NAME_MAX)) { diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index a73b0376e6f3..af74599ae1cf 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -384,3 +384,47 @@ err_kfree: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(fscrypt_get_symlink); + +/** + * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks + * @path: the path for the encrypted symlink being queried + * @stat: the struct being filled with the symlink's attributes + * + * Override st_size of encrypted symlinks to be the length of the decrypted + * symlink target (or the no-key encoded symlink target, if the key is + * unavailable) rather than the length of the encrypted symlink target. This is + * necessary for st_size to match the symlink target that userspace actually + * sees. POSIX requires this, and some userspace programs depend on it. 
+ * + * This requires reading the symlink target from disk if needed, setting up the + * inode's encryption key if possible, and then decrypting or encoding the + * symlink target. This makes lstat() more heavyweight than is normally the + * case. However, decrypted symlink targets will be cached in ->i_link, so + * usually the symlink won't have to be read and decrypted again later if/when + * it is actually followed, readlink() is called, or lstat() is called again. + * + * Return: 0 on success, -errno on failure + */ +int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); + const char *link; + DEFINE_DELAYED_CALL(done); + + /* + * To get the symlink target that userspace will see (whether it's the + * decrypted target or the no-key encoded target), we can just get it in + * the same way the VFS does during path resolution and readlink(). + */ + link = READ_ONCE(inode->i_link); + if (!link) { + link = inode->i_op->get_link(dentry, inode, &done); + if (IS_ERR(link)) + return PTR_ERR(link); + } + stat->size = strlen(link); + do_delayed_call(&done); + return 0; +} +EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr); diff --git a/fs/d_path.c b/fs/d_path.c index 23a53f7b5c71..cd60c7535181 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -22,13 +22,57 @@ static char *extract_string(struct prepend_buffer *p) return ERR_PTR(-ENAMETOOLONG); } -static void prepend(struct prepend_buffer *p, const char *str, int namelen) +static bool prepend_char(struct prepend_buffer *p, unsigned char c) { - p->len -= namelen; - if (likely(p->len >= 0)) { - p->buf -= namelen; - memcpy(p->buf, str, namelen); + if (likely(p->len > 0)) { + p->len--; + *--p->buf = c; + return true; + } + p->len = -1; + return false; +} + +/* + * The source of the prepend data can be an optimistic load + * of a dentry name and length. And because we don't hold any + * locks, the length and the pointer to the name may not be + * in sync if a concurrent rename happens, and the kernel + * copy might fault as a result. + * + * The end result will correct itself when we check the + * rename sequence count, but we need to be able to handle + * the fault gracefully. + */ +static bool prepend_copy(void *dst, const void *src, int len) +{ + if (unlikely(copy_from_kernel_nofault(dst, src, len))) { + memset(dst, 'x', len); + return false; } + return true; +} + +static bool prepend(struct prepend_buffer *p, const char *str, int namelen) +{ + // Already overflowed? + if (p->len < 0) + return false; + + // Will overflow? + if (p->len < namelen) { + // Fill as much as possible from the end of the name + str += namelen - p->len; + p->buf -= p->len; + prepend_copy(p->buf, str, p->len); + p->len = -1; + return false; + } + + // Fits fully + p->len -= namelen; + p->buf -= namelen; + return prepend_copy(p->buf, str, namelen); } /** @@ -40,32 +84,21 @@ static void prepend(struct prepend_buffer *p, const char *str, int namelen) * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be a mismatch between length and pointer. - * The length cannot be trusted, we need to copy it byte-by-byte until - * the length is reached or a null byte is found. It also prepends "/" at
The sequence number check at the caller will * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * - * Load acquire is needed to make sure that we see that terminating NUL. + * Load acquire is needed to make sure that we see the new name data even + * if we might get the length wrong. */ static bool prepend_name(struct prepend_buffer *p, const struct qstr *name) { const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); - char *s; - p->len -= dlen + 1; - if (unlikely(p->len < 0)) - return false; - s = p->buf -= dlen + 1; - *s++ = '/'; - while (dlen--) { - char c = *dname++; - if (!c) - break; - *s++ = c; - } - return true; + return prepend(p, dname, dlen) && prepend_char(p, '/'); } static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, @@ -158,7 +191,7 @@ restart: b = *p; if (b.len == p->len) - prepend(&b, "/", 1); + prepend_char(&b, '/'); *p = b; return error; @@ -186,7 +219,7 @@ char *__d_path(const struct path *path, { DECLARE_BUFFER(b, buf, buflen); - prepend(&b, "", 1); + prepend_char(&b, 0); if (unlikely(prepend_path(path, root, &b) > 0)) return NULL; return extract_string(&b); @@ -198,7 +231,7 @@ char *d_absolute_path(const struct path *path, struct path root = {}; DECLARE_BUFFER(b, buf, buflen); - prepend(&b, "", 1); + prepend_char(&b, 0); if (unlikely(prepend_path(path, &root, &b) > 1)) return ERR_PTR(-EINVAL); return extract_string(&b); @@ -255,7 +288,7 @@ char *d_path(const struct path *path, char *buf, int buflen) if (unlikely(d_unlinked(path->dentry))) prepend(&b, " (deleted)", 11); else - prepend(&b, "", 1); + prepend_char(&b, 0); prepend_path(path, &root, &b); rcu_read_unlock(); @@ -290,7 +323,7 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) /* these dentries are never renamed, so d_lock is not needed */ prepend(&b, " (deleted)", 11); prepend(&b, dentry->d_name.name, dentry->d_name.len); - prepend(&b, "/", 1); + prepend_char(&b, '/'); return extract_string(&b); } @@ -324,7 +357,7 @@ restart: } done_seqretry(&rename_lock, seq); if (b.len == p->len) - prepend(&b, "/", 1); + prepend_char(&b, '/'); return extract_string(&b); } @@ -332,7 +365,7 @@ char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, buflen); - prepend(&b, "", 1); + prepend_char(&b, 0); return __dentry_path(dentry, &b); } EXPORT_SYMBOL(dentry_path_raw); @@ -344,7 +377,7 @@ char *dentry_path(const struct dentry *dentry, char *buf, int buflen) if (unlikely(d_unlinked(dentry))) prepend(&b, "//deleted", 10); else - prepend(&b, "", 1); + prepend_char(&b, 0); return __dentry_path(dentry, &b); } @@ -397,7 +430,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) unsigned len; DECLARE_BUFFER(b, page, PATH_MAX); - prepend(&b, "", 1); + prepend_char(&b, 0); if (unlikely(prepend_path(&pwd, &root, &b) > 0)) prepend(&b, "(unreachable)", 13); rcu_read_unlock(); @@ -722,7 +722,7 @@ static int copy_cow_page_dax(struct block_device *bdev, struct dax_device *dax_d return rc; id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(PAGE_SIZE), &kaddr, NULL); + rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, NULL); if (rc < 0) { dax_read_unlock(id); return rc; @@ -1005,12 +1005,12 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) +static sector_t 
dax_iomap_sector(const struct iomap *iomap, loff_t pos) { return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; } -static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, +static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, pfn_t *pfnp) { const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1066,6 +1066,66 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, return ret; } +#ifdef CONFIG_FS_DAX_PMD +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + const struct iomap *iomap, void **entry) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + unsigned long pmd_addr = vmf->address & PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = mapping->host; + pgtable_t pgtable = NULL; + struct page *zero_page; + spinlock_t *ptl; + pmd_t pmd_entry; + pfn_t pfn; + + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); + + if (unlikely(!zero_page)) + goto fallback; + + pfn = page_to_pfn_t(zero_page); + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, + DAX_PMD | DAX_ZERO_PAGE, false); + + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (!pmd_none(*(vmf->pmd))) { + spin_unlock(ptl); + goto fallback; + } + + if (pgtable) { + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + mm_inc_nr_ptes(vma->vm_mm); + } + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); + pmd_entry = pmd_mkhuge(pmd_entry); + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); + spin_unlock(ptl); + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); + return VM_FAULT_NOPAGE; + +fallback: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); + return VM_FAULT_FALLBACK; +} +#else +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + const struct iomap *iomap, void **entry) +{ + return VM_FAULT_FALLBACK; +} +#endif /* CONFIG_FS_DAX_PMD */ + s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) { sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); @@ -1103,20 +1163,21 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) return size; } -static loff_t -dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t dax_iomap_iter(const struct iomap_iter *iomi, + struct iov_iter *iter) { + const struct iomap *iomap = &iomi->iomap; + loff_t length = iomap_length(iomi); + loff_t pos = iomi->pos; struct block_device *bdev = iomap->bdev; struct dax_device *dax_dev = iomap->dax_dev; - struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; size_t xfer; int id; if (iov_iter_rw(iter) == READ) { - end = min(end, i_size_read(inode)); + end = min(end, i_size_read(iomi->inode)); if (pos >= end) return 0; @@ -1133,7 +1194,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * written by write(2) is visible in mmap. 
 */
 	if (iomap->flags & IOMAP_F_NEW) {
-		invalidate_inode_pages2_range(inode->i_mapping,
+		invalidate_inode_pages2_range(iomi->inode->i_mapping,
 					      pos >> PAGE_SHIFT,
 					      (end - 1) >> PAGE_SHIFT);
 	}
 
@@ -1209,31 +1270,29 @@ ssize_t
 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops)
 {
-	struct address_space *mapping = iocb->ki_filp->f_mapping;
-	struct inode *inode = mapping->host;
-	loff_t pos = iocb->ki_pos, ret = 0, done = 0;
-	unsigned flags = 0;
+	struct iomap_iter iomi = {
+		.inode = iocb->ki_filp->f_mapping->host,
+		.pos = iocb->ki_pos,
+		.len = iov_iter_count(iter),
+	};
+	loff_t done = 0;
+	int ret;
 
 	if (iov_iter_rw(iter) == WRITE) {
-		lockdep_assert_held_write(&inode->i_rwsem);
-		flags |= IOMAP_WRITE;
+		lockdep_assert_held_write(&iomi.inode->i_rwsem);
+		iomi.flags |= IOMAP_WRITE;
 	} else {
-		lockdep_assert_held(&inode->i_rwsem);
+		lockdep_assert_held(&iomi.inode->i_rwsem);
 	}
 
 	if (iocb->ki_flags & IOCB_NOWAIT)
-		flags |= IOMAP_NOWAIT;
+		iomi.flags |= IOMAP_NOWAIT;
 
-	while (iov_iter_count(iter)) {
-		ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
-				iter, dax_iomap_actor);
-		if (ret <= 0)
-			break;
-		pos += ret;
-		done += ret;
-	}
+	while ((ret = iomap_iter(&iomi, ops)) > 0)
+		iomi.processed = dax_iomap_iter(&iomi, iter);
 
-	iocb->ki_pos += done;
+	done = iomi.pos - iocb->ki_pos;
+	iocb->ki_pos = iomi.pos;
 	return done ? done : ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
@@ -1250,44 +1309,146 @@ static vm_fault_t dax_fault_return(int error)
  * flushed on write-faults (non-cow), but not read-faults.
  */
 static bool dax_fault_is_synchronous(unsigned long flags,
-		struct vm_area_struct *vma, struct iomap *iomap)
+		struct vm_area_struct *vma, const struct iomap *iomap)
 {
 	return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
 		(iomap->flags & IOMAP_F_DIRTY);
 }
 
+/*
+ * When handling a synchronous page fault and the inode needs an fsync, we can
+ * insert the PTE/PMD into the page tables only after that fsync has happened.
+ * Skip the insertion for now and return the pfn so that the caller can insert
+ * it after the fsync is done.
+ */
+static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
+{
+	if (WARN_ON_ONCE(!pfnp))
+		return VM_FAULT_SIGBUS;
+	*pfnp = pfn;
+	return VM_FAULT_NEEDDSYNC;
+}
+
+static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
+		const struct iomap_iter *iter)
+{
+	sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos);
+	unsigned long vaddr = vmf->address;
+	vm_fault_t ret;
+	int error = 0;
+
+	switch (iter->iomap.type) {
+	case IOMAP_HOLE:
+	case IOMAP_UNWRITTEN:
+		clear_user_highpage(vmf->cow_page, vaddr);
+		break;
+	case IOMAP_MAPPED:
+		error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev,
+					  sector, vmf->cow_page, vaddr);
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		error = -EIO;
+		break;
+	}
+
+	if (error)
+		return dax_fault_return(error);
+
+	__SetPageUptodate(vmf->cow_page);
+	ret = finish_fault(vmf);
+	if (!ret)
+		return VM_FAULT_DONE_COW;
+	return ret;
+}
+
+/**
+ * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
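The dax_iomap_rw() conversion above is one instance of the iomap_iter() calling convention that replaces iomap_apply() throughout this series. Reduced to its skeleton (a sketch, not a complete kernel function; process_one_extent() is a hypothetical stand-in for dax_iomap_iter()):

	struct iomap_iter iter = {
		.inode = inode,
		.pos   = pos,
		.len   = len,
		.flags = flags,
	};
	int ret;

	/*
	 * Each successful iomap_iter() call maps one extent at iter.pos,
	 * invoking ->iomap_begin()/->iomap_end() internally.  The loop body
	 * consumes up to iomap_length(&iter) bytes and reports progress (or
	 * a negative errno) via iter.processed; iomap_iter() then advances
	 * iter.pos/iter.len and decides whether to iterate again.
	 */
	while ((ret = iomap_iter(&iter, ops)) > 0)
		iter.processed = process_one_extent(&iter);
	return ret;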
+ * @vmf: vm fault instance + * @iter: iomap iter + * @pfnp: pfn to be returned + * @xas: the dax mapping tree of a file + * @entry: an unlocked dax entry to be inserted + * @pmd: distinguish whether it is a pmd fault + */ +static vm_fault_t dax_fault_iter(struct vm_fault *vmf, + const struct iomap_iter *iter, pfn_t *pfnp, + struct xa_state *xas, void **entry, bool pmd) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + const struct iomap *iomap = &iter->iomap; + size_t size = pmd ? PMD_SIZE : PAGE_SIZE; + loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; + bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap); + unsigned long entry_flags = pmd ? DAX_PMD : 0; + int err = 0; + pfn_t pfn; + + if (!pmd && vmf->cow_page) + return dax_fault_cow_page(vmf, iter); + + /* if we are reading UNWRITTEN and HOLE, return a hole. */ + if (!write && + (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { + if (!pmd) + return dax_load_hole(xas, mapping, entry, vmf); + return dax_pmd_load_hole(xas, vmf, iomap, entry); + } + + if (iomap->type != IOMAP_MAPPED) { + WARN_ON_ONCE(1); + return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; + } + + err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn); + if (err) + return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err); + + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags, + write && !sync); + + if (sync) + return dax_fault_synchronous_pfnp(pfnp, pfn); + + /* insert PMD pfn */ + if (pmd) + return vmf_insert_pfn_pmd(vmf, pfn, write); + + /* insert PTE pfn */ + if (write) + return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); +} + static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE(xas, &mapping->i_pages, vmf->pgoff); - struct inode *inode = mapping->host; - unsigned long vaddr = vmf->address; - loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; - unsigned flags = IOMAP_FAULT; - int error, major = 0; - bool write = vmf->flags & FAULT_FLAG_WRITE; - bool sync; + struct iomap_iter iter = { + .inode = mapping->host, + .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, + .len = PAGE_SIZE, + .flags = IOMAP_FAULT, + }; vm_fault_t ret = 0; void *entry; - pfn_t pfn; + int error; - trace_dax_pte_fault(inode, vmf, ret); + trace_dax_pte_fault(iter.inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) { + if (iter.pos >= i_size_read(iter.inode)) { ret = VM_FAULT_SIGBUS; goto out; } - if (write && !vmf->cow_page) - flags |= IOMAP_WRITE; + if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + iter.flags |= IOMAP_WRITE; entry = grab_mapping_entry(&xas, mapping, 0); if (xa_is_internal(entry)) { @@ -1306,234 +1467,103 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto unlock_entry; } - /* - * Note that we don't bother to use iomap_apply here: DAX required - * the file system block size to be equal the page size, which means - * that we never have to deal with more than a single extent here. 
- */ - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); - if (iomap_errp) - *iomap_errp = error; - if (error) { - ret = dax_fault_return(error); - goto unlock_entry; - } - if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - error = -EIO; /* fs corruption? */ - goto error_finish_iomap; - } - - if (vmf->cow_page) { - sector_t sector = dax_iomap_sector(&iomap, pos); - - switch (iomap.type) { - case IOMAP_HOLE: - case IOMAP_UNWRITTEN: - clear_user_highpage(vmf->cow_page, vaddr); - break; - case IOMAP_MAPPED: - error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev, - sector, vmf->cow_page, vaddr); - break; - default: - WARN_ON_ONCE(1); - error = -EIO; - break; + while ((error = iomap_iter(&iter, ops)) > 0) { + if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { + iter.processed = -EIO; /* fs corruption? */ + continue; } - if (error) - goto error_finish_iomap; - - __SetPageUptodate(vmf->cow_page); - ret = finish_fault(vmf); - if (!ret) - ret = VM_FAULT_DONE_COW; - goto finish_iomap; - } - - sync = dax_fault_is_synchronous(flags, vma, &iomap); - - switch (iomap.type) { - case IOMAP_MAPPED: - if (iomap.flags & IOMAP_F_NEW) { + ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false); + if (ret != VM_FAULT_SIGBUS && + (iter.iomap.flags & IOMAP_F_NEW)) { count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + ret |= VM_FAULT_MAJOR; } - error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); - if (error < 0) - goto error_finish_iomap; - - entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, - 0, write && !sync); - /* - * If we are doing synchronous page fault and inode needs fsync, - * we can insert PTE into page tables only after that happens. - * Skip insertion for now and return the pfn so that caller can - * insert it after fsync is done. - */ - if (sync) { - if (WARN_ON_ONCE(!pfnp)) { - error = -EIO; - goto error_finish_iomap; - } - *pfnp = pfn; - ret = VM_FAULT_NEEDDSYNC | major; - goto finish_iomap; - } - trace_dax_insert_mapping(inode, vmf, entry); - if (write) - ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn); - else - ret = vmf_insert_mixed(vma, vaddr, pfn); - - goto finish_iomap; - case IOMAP_UNWRITTEN: - case IOMAP_HOLE: - if (!write) { - ret = dax_load_hole(&xas, mapping, &entry, vmf); - goto finish_iomap; - } - fallthrough; - default: - WARN_ON_ONCE(1); - error = -EIO; - break; + if (!(ret & VM_FAULT_ERROR)) + iter.processed = PAGE_SIZE; } - error_finish_iomap: - ret = dax_fault_return(error); - finish_iomap: - if (ops->iomap_end) { - int copied = PAGE_SIZE; + if (iomap_errp) + *iomap_errp = error; + if (!ret && error) + ret = dax_fault_return(error); - if (ret & VM_FAULT_ERROR) - copied = 0; - /* - * The fault is done by now and there's no way back (other - * thread may be already happily using PTE we have installed). - * Just ignore error from ->iomap_end since we cannot do much - * with it. 
- */ - ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); - } - unlock_entry: +unlock_entry: dax_unlock_entry(&xas, entry); - out: - trace_dax_pte_fault_done(inode, vmf, ret); - return ret | major; +out: + trace_dax_pte_fault_done(iter.inode, vmf, ret); + return ret; } #ifdef CONFIG_FS_DAX_PMD -static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, - struct iomap *iomap, void **entry) +static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, + pgoff_t max_pgoff) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; - struct vm_area_struct *vma = vmf->vma; - struct inode *inode = mapping->host; - pgtable_t pgtable = NULL; - struct page *zero_page; - spinlock_t *ptl; - pmd_t pmd_entry; - pfn_t pfn; - - zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); - - if (unlikely(!zero_page)) - goto fallback; + bool write = vmf->flags & FAULT_FLAG_WRITE; - pfn = page_to_pfn_t(zero_page); - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, - DAX_PMD | DAX_ZERO_PAGE, false); + /* + * Make sure that the faulting address's PMD offset (color) matches + * the PMD offset from the start of the file. This is necessary so + * that a PMD range in the page table overlaps exactly with a PMD + * range in the page cache. + */ + if ((vmf->pgoff & PG_PMD_COLOUR) != + ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) + return true; - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } + /* Fall back to PTEs if we're going to COW */ + if (write && !(vmf->vma->vm_flags & VM_SHARED)) + return true; - ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); - if (!pmd_none(*(vmf->pmd))) { - spin_unlock(ptl); - goto fallback; - } + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vmf->vma->vm_start) + return true; + if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end) + return true; - if (pgtable) { - pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - mm_inc_nr_ptes(vma->vm_mm); - } - pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); - pmd_entry = pmd_mkhuge(pmd_entry); - set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); - spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); - return VM_FAULT_NOPAGE; + /* If the PMD would extend beyond the file size */ + if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff) + return true; -fallback: - if (pgtable) - pte_free(vma->vm_mm, pgtable); - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); - return VM_FAULT_FALLBACK; + return false; } static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); - unsigned long pmd_addr = vmf->address & PMD_MASK; - bool write = vmf->flags & FAULT_FLAG_WRITE; - bool sync; - unsigned int iomap_flags = (write ? 
IOMAP_WRITE : 0) | IOMAP_FAULT; - struct inode *inode = mapping->host; - vm_fault_t result = VM_FAULT_FALLBACK; - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; + struct iomap_iter iter = { + .inode = mapping->host, + .len = PMD_SIZE, + .flags = IOMAP_FAULT, + }; + vm_fault_t ret = VM_FAULT_FALLBACK; pgoff_t max_pgoff; void *entry; - loff_t pos; int error; - pfn_t pfn; + + if (vmf->flags & FAULT_FLAG_WRITE) + iter.flags |= IOMAP_WRITE; /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ - max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - - trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); - - /* - * Make sure that the faulting address's PMD offset (color) matches - * the PMD offset from the start of the file. This is necessary so - * that a PMD range in the page table overlaps exactly with a PMD - * range in the page cache. - */ - if ((vmf->pgoff & PG_PMD_COLOUR) != - ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) - goto fallback; + max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE); - /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) - goto fallback; - - /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) - goto fallback; - if ((pmd_addr + PMD_SIZE) > vma->vm_end) - goto fallback; + trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0); if (xas.xa_index >= max_pgoff) { - result = VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; goto out; } - /* If the PMD would extend beyond the file size */ - if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) + if (dax_fault_check_fallback(vmf, &xas, max_pgoff)) goto fallback; /* @@ -1544,7 +1574,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, */ entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); if (xa_is_internal(entry)) { - result = xa_to_internal(entry); + ret = xa_to_internal(entry); goto fallback; } @@ -1556,88 +1586,30 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, */ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && !pmd_devmap(*vmf->pmd)) { - result = 0; + ret = 0; goto unlock_entry; } - /* - * Note that we don't use iomap_apply here. We aren't doing I/O, only - * setting up a mapping, so really we're using iomap_begin() as a way - * to look up our filesystem block. - */ - pos = (loff_t)xas.xa_index << PAGE_SHIFT; - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, - &srcmap); - if (error) - goto unlock_entry; - - if (iomap.offset + iomap.length < pos + PMD_SIZE) - goto finish_iomap; - - sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap); - - switch (iomap.type) { - case IOMAP_MAPPED: - error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); - if (error < 0) - goto finish_iomap; - - entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, - DAX_PMD, write && !sync); - - /* - * If we are doing synchronous page fault and inode needs fsync, - * we can insert PMD into page tables only after that happens. - * Skip insertion for now and return the pfn so that caller can - * insert it after fsync is done. 
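For context on the synchronous-fault path that dax_fault_synchronous_pfnp() now centralizes: the filesystem's fault handler is the caller that consumes the deferred pfn. A sketch modeled on the ext4/xfs handlers follows; fs_dax_fault and fs_iomap_ops are hypothetical names, while dax_iomap_fault() and dax_finish_sync_fault() are the real entry points:

static vm_fault_t fs_dax_fault(struct vm_fault *vmf, enum page_entry_size pe_size)
{
	pfn_t pfn;
	vm_fault_t ret;

	/* may return VM_FAULT_NEEDDSYNC for a write fault on a MAP_SYNC vma */
	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &fs_iomap_ops);
	if (ret & VM_FAULT_NEEDDSYNC) {
		/*
		 * dax_finish_sync_fault() fsyncs the faulted range, making
		 * the metadata behind the pfn durable, and only then inserts
		 * the deferred PTE/PMD.
		 */
		ret = dax_finish_sync_fault(vmf, pe_size, pfn);
	}
	return ret;
}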
- */ - if (sync) { - if (WARN_ON_ONCE(!pfnp)) - goto finish_iomap; - *pfnp = pfn; - result = VM_FAULT_NEEDDSYNC; - goto finish_iomap; - } + iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT; + while ((error = iomap_iter(&iter, ops)) > 0) { + if (iomap_length(&iter) < PMD_SIZE) + continue; /* actually breaks out of the loop */ - trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); - result = vmf_insert_pfn_pmd(vmf, pfn, write); - break; - case IOMAP_UNWRITTEN: - case IOMAP_HOLE: - if (WARN_ON_ONCE(write)) - break; - result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); - break; - default: - WARN_ON_ONCE(1); - break; + ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); + if (ret != VM_FAULT_FALLBACK) + iter.processed = PMD_SIZE; } - finish_iomap: - if (ops->iomap_end) { - int copied = PMD_SIZE; - - if (result == VM_FAULT_FALLBACK) - copied = 0; - /* - * The fault is done by now and there's no way back (other - * thread may be already happily using PMD we have installed). - * Just ignore error from ->iomap_end since we cannot do much - * with it. - */ - ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, - &iomap); - } - unlock_entry: +unlock_entry: dax_unlock_entry(&xas, entry); - fallback: - if (result == VM_FAULT_FALLBACK) { - split_huge_pmd(vma, vmf->pmd, vmf->address); +fallback: + if (ret == VM_FAULT_FALLBACK) { + split_huge_pmd(vmf->vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: - trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); - return result; + trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret); + return ret; } #else static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index df00231d3ecc..7d162b0efbf0 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -179,8 +179,10 @@ static int open_proxy_open(struct inode *inode, struct file *filp) if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && - real_fops->owner->state == MODULE_STATE_GOING) + real_fops->owner->state == MODULE_STATE_GOING) { + r = -ENXIO; goto out; + } #endif /* Huh? Module did not clean up after itself at exit? */ @@ -314,8 +316,10 @@ static int full_proxy_open(struct inode *inode, struct file *filp) if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && - real_fops->owner->state == MODULE_STATE_GOING) + real_fops->owner->state == MODULE_STATE_GOING) { + r = -ENXIO; goto out; + } #endif /* Huh? Module did not cleanup after itself at exit? 
 */
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index 10c36ae1a8f9..45ebbe602bbf 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -85,8 +85,10 @@ int dlm_recover_directory(struct dlm_ls *ls)
 		for (;;) {
 			int left;
 			error = dlm_recovery_stopped(ls);
-			if (error)
+			if (error) {
+				error = -EINTR;
 				goto out_free;
+			}
 
 			error = dlm_rcom_names(ls, memb->nodeid,
 					       last_name, last_len);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 91d1ca3a121a..5f57538b5d45 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -468,7 +468,7 @@ struct dlm_rcom {
 struct dlm_opt_header {
 	uint16_t	t_type;
 	uint16_t	t_length;
-	uint32_t	o_pad;
+	uint32_t	t_pad;		/* needs to be 8 byte aligned */
 	char		t_value[];
 };
 
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d71aba8c3e64..10eddfa6c3d7 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -498,7 +498,7 @@ static int new_lockspace(const char *name, const char *cluster,
 	ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
 				    DLM_LSFL_NEWEXCL));
 
-	size = dlm_config.ci_rsbtbl_size;
+	size = READ_ONCE(dlm_config.ci_rsbtbl_size);
 	ls->ls_rsbtbl_size = size;
 
 	ls->ls_rsbtbl = vmalloc(array_size(size, sizeof(struct dlm_rsbtable)));
@@ -793,6 +793,7 @@ static int release_lockspace(struct dlm_ls *ls, int force)
 
 	if (ls_count == 1) {
 		dlm_scand_stop();
+		dlm_clear_members(ls);
 		dlm_midcomms_shutdown();
 	}
 
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 0ea9ae35da0b..8f715c620e1f 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -84,9 +84,7 @@ struct connection {
 	struct list_head writequeue;  /* List of outgoing writequeue_entries */
 	spinlock_t writequeue_lock;
 	atomic_t writequeue_cnt;
-	void (*connect_action) (struct connection *);	/* What to do to connect */
-	void (*shutdown_action)(struct connection *con); /* What to do to shutdown */
-	bool (*eof_condition)(struct connection *con); /* What to do to eof check */
+	struct mutex wq_alloc;
 	int retries;
 #define MAX_CONNECT_RETRIES 3
 	struct hlist_node list;
@@ -145,6 +143,24 @@ struct dlm_node_addr {
 	struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT];
 };
 
+struct dlm_proto_ops {
+	bool try_new_addr;
+	const char *name;
+	int proto;
+
+	int (*connect)(struct connection *con, struct socket *sock,
+		       struct sockaddr *addr, int addr_len);
+	void (*sockopts)(struct socket *sock);
+	int (*bind)(struct socket *sock);
+	int (*listen_validate)(void);
+	void (*listen_sockopts)(struct socket *sock);
+	int (*listen_bind)(struct socket *sock);
+	/* What to do to shutdown */
+	void (*shutdown_action)(struct connection *con);
+	/* What to do to eof check */
+	bool (*eof_condition)(struct connection *con);
+};
+
 static struct listen_sock_callbacks {
 	void (*sk_error_report)(struct sock *);
 	void (*sk_data_ready)(struct sock *);
@@ -168,12 +184,26 @@ static struct hlist_head connection_hash[CONN_HASH_SIZE];
 static DEFINE_SPINLOCK(connections_lock);
 DEFINE_STATIC_SRCU(connections_srcu);
 
+static const struct dlm_proto_ops *dlm_proto_ops;
+
 static void process_recv_sockets(struct work_struct *work);
 static void process_send_sockets(struct work_struct *work);
-static void sctp_connect_to_sock(struct connection *con);
-static void tcp_connect_to_sock(struct connection *con);
-static void dlm_tcp_shutdown(struct connection *con);
 
+/* must be called while holding writequeue_lock */
+static struct writequeue_entry *con_next_wq(struct connection *con)
+{
+	struct writequeue_entry *e;
+
+	if (list_empty(&con->writequeue))
+		return NULL;
+
+	e = list_first_entry(&con->writequeue, struct writequeue_entry,
+			     list);
+	if (e->len == 0)
+ return NULL; + + return e; +} static struct connection *__find_con(int nodeid, int r) { @@ -208,20 +238,6 @@ static int dlm_con_init(struct connection *con, int nodeid) INIT_WORK(&con->rwork, process_recv_sockets); init_waitqueue_head(&con->shutdown_wait); - switch (dlm_config.ci_protocol) { - case DLM_PROTO_TCP: - con->connect_action = tcp_connect_to_sock; - con->shutdown_action = dlm_tcp_shutdown; - con->eof_condition = tcp_eof_condition; - break; - case DLM_PROTO_SCTP: - con->connect_action = sctp_connect_to_sock; - break; - default: - kfree(con->rx_buf); - return -EINVAL; - } - return 0; } @@ -249,6 +265,8 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return NULL; } + mutex_init(&con->wq_alloc); + spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. Instead of locking hot path __find_con() @@ -583,8 +601,7 @@ static void lowcomms_error_report(struct sock *sk) goto out; orig_report = listen_sock.sk_error_report; - if (con->sock == NULL || - kernel_getpeername(con->sock, (struct sockaddr *)&saddr) < 0) { + if (kernel_getpeername(sk->sk_socket, (struct sockaddr *)&saddr) < 0) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d, port %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), @@ -801,6 +818,7 @@ static void close_connection(struct connection *con, bool and_other, con->rx_leftover = 0; con->retries = 0; + clear_bit(CF_APP_LIMITED, &con->flags); clear_bit(CF_CONNECTED, &con->flags); clear_bit(CF_DELAY_CONNECT, &con->flags); clear_bit(CF_RECONNECT, &con->flags); @@ -877,7 +895,6 @@ static int con_realloc_receive_buf(struct connection *con, int newlen) /* Data received from remote end */ static int receive_from_sock(struct connection *con) { - int call_again_soon = 0; struct msghdr msg; struct kvec iov; int ret, buflen; @@ -897,41 +914,40 @@ static int receive_from_sock(struct connection *con) goto out_resched; } - /* calculate new buffer parameter regarding last receive and - * possible leftover bytes - */ - iov.iov_base = con->rx_buf + con->rx_leftover; - iov.iov_len = con->rx_buflen - con->rx_leftover; - - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; - ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, - msg.msg_flags); - if (ret <= 0) - goto out_close; - else if (ret == iov.iov_len) - call_again_soon = 1; - - /* new buflen according readed bytes and leftover from last receive */ - buflen = ret + con->rx_leftover; - ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); - if (ret < 0) - goto out_close; + for (;;) { + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes + */ + iov.iov_base = con->rx_buf + con->rx_leftover; + iov.iov_len = con->rx_buflen - con->rx_leftover; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); + if (ret == -EAGAIN) + break; + else if (ret <= 0) + goto out_close; - /* calculate leftover bytes from process and put it into begin of - * the receive buffer, so next receive we have the full message - * at the start address of the receive buffer. 
-	 */
-	con->rx_leftover = buflen - ret;
-	if (con->rx_leftover) {
-		memmove(con->rx_buf, con->rx_buf + ret,
-			con->rx_leftover);
-		call_again_soon = true;
+		/* new buflen according to bytes read and leftover from last receive */
+		buflen = ret + con->rx_leftover;
+		ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen);
+		if (ret < 0)
+			goto out_close;
+
+		/* calculate leftover bytes after processing and move them to
+		 * the beginning of the receive buffer, so the next receive has
+		 * the full message at the start address of the receive buffer.
+		 */
+		con->rx_leftover = buflen - ret;
+		if (con->rx_leftover) {
+			memmove(con->rx_buf, con->rx_buf + ret,
+				con->rx_leftover);
+		}
 	}
 
-	if (call_again_soon)
-		goto out_resched;
-
+	dlm_midcomms_receive_done(con->nodeid);
 	mutex_unlock(&con->sock_mutex);
 	return 0;
 
@@ -946,7 +962,8 @@ out_close:
 		log_print("connection %p got EOF from %d",
 			  con, con->nodeid);
 
-		if (con->eof_condition && con->eof_condition(con)) {
+		if (dlm_proto_ops->eof_condition &&
+		    dlm_proto_ops->eof_condition(con)) {
 			set_bit(CF_EOF, &con->flags);
 			mutex_unlock(&con->sock_mutex);
 		} else {
@@ -1134,242 +1151,6 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port)
 	return result;
 }
 
-/* Initiate an SCTP association.
-   This is a special case of send_to_sock() in that we don't yet have a
-   peeled-off socket for this association, so we use the listening socket
-   and add the primary IP address of the remote node.
- */
-static void sctp_connect_to_sock(struct connection *con)
-{
-	struct sockaddr_storage daddr;
-	int result;
-	int addr_len;
-	struct socket *sock;
-	unsigned int mark;
-
-	mutex_lock(&con->sock_mutex);
-
-	/* Some odd races can cause double-connects, ignore them */
-	if (con->retries++ > MAX_CONNECT_RETRIES)
-		goto out;
-
-	if (con->sock) {
-		log_print("node %d already connected.", con->nodeid);
-		goto out;
-	}
-
-	memset(&daddr, 0, sizeof(daddr));
-	result = nodeid_to_addr(con->nodeid, &daddr, NULL, true, &mark);
-	if (result < 0) {
-		log_print("no address for nodeid %d", con->nodeid);
-		goto out;
-	}
-
-	/* Create a socket to communicate with */
-	result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family,
-				  SOCK_STREAM, IPPROTO_SCTP, &sock);
-	if (result < 0)
-		goto socket_err;
-
-	sock_set_mark(sock->sk, mark);
-
-	add_sock(sock, con);
-
-	/* Bind to all addresses. */
-	if (sctp_bind_addrs(con->sock, 0))
-		goto bind_err;
-
-	make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len);
-
-	log_print_ratelimited("connecting to %d", con->nodeid);
-
-	/* Turn off Nagle's algorithm */
-	sctp_sock_set_nodelay(sock->sk);
-
-	/*
-	 * Make sock->ops->connect() function return in specified time,
-	 * since O_NONBLOCK argument in connect() function does not work here,
-	 * then, we should restore the default value of this attribute.
-	 */
-	sock_set_sndtimeo(sock->sk, 5);
-	result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len,
-				    0);
-	sock_set_sndtimeo(sock->sk, 0);
-
-	if (result == -EINPROGRESS)
-		result = 0;
-	if (result == 0) {
-		if (!test_and_set_bit(CF_CONNECTED, &con->flags))
-			log_print("successful connected to node %d", con->nodeid);
-		goto out;
-	}
-
-bind_err:
-	con->sock = NULL;
-	sock_release(sock);
-
-socket_err:
-	/*
-	 * Some errors are fatal and this list might need adjusting. For other
-	 * errors we try again until the max number of retries is reached.
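The new receive loop above (drain until -EAGAIN, keep any partial tail at the buffer start) is a common pattern worth seeing outside kernel context. A userspace model with plain sockets; process_buffer() is a hypothetical stand-in for dlm_process_incoming_buffer(), returning bytes consumed or -1:

#include <errno.h>
#include <string.h>
#include <sys/socket.h>

/* stand-in for dlm_process_incoming_buffer(): consumes whole messages only */
int process_buffer(const char *buf, int len);

int receive_all(int fd, char *buf, int buflen)
{
	int leftover = 0;

	for (;;) {
		int n = recv(fd, buf + leftover, buflen - leftover,
			     MSG_DONTWAIT);
		if (n < 0 && errno == EAGAIN)
			return 0;	/* drained; wait for the next POLLIN */
		if (n <= 0)
			return -1;	/* error or EOF */

		int total = leftover + n;
		int used = process_buffer(buf, total);
		if (used < 0)
			return -1;

		leftover = total - used;
		if (leftover)		/* partial message: slide to the front */
			memmove(buf, buf + used, leftover);
	}
}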
- */ - if (result != -EHOSTUNREACH && - result != -ENETUNREACH && - result != -ENETDOWN && - result != -EINVAL && - result != -EPROTONOSUPPORT) { - log_print("connect %d try %d error %d", con->nodeid, - con->retries, result); - mutex_unlock(&con->sock_mutex); - msleep(1000); - lowcomms_connect_sock(con); - return; - } - -out: - mutex_unlock(&con->sock_mutex); -} - -/* Connect a new socket to its peer */ -static void tcp_connect_to_sock(struct connection *con) -{ - struct sockaddr_storage saddr, src_addr; - unsigned int mark; - int addr_len; - struct socket *sock = NULL; - int result; - - mutex_lock(&con->sock_mutex); - if (con->retries++ > MAX_CONNECT_RETRIES) - goto out; - - /* Some odd races can cause double-connects, ignore them */ - if (con->sock) - goto out; - - /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_TCP, &sock); - if (result < 0) - goto out_err; - - memset(&saddr, 0, sizeof(saddr)); - result = nodeid_to_addr(con->nodeid, &saddr, NULL, false, &mark); - if (result < 0) { - log_print("no address for nodeid %d", con->nodeid); - goto out_err; - } - - sock_set_mark(sock->sk, mark); - - add_sock(sock, con); - - /* Bind to our cluster-known address connecting to avoid - routing problems */ - memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); - make_sockaddr(&src_addr, 0, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, - addr_len); - if (result < 0) { - log_print("could not bind for connect: %d", result); - /* This *may* not indicate a critical error */ - } - - make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); - - log_print_ratelimited("connecting to %d", con->nodeid); - - /* Turn off Nagle's algorithm */ - tcp_sock_set_nodelay(sock->sk); - - result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, - O_NONBLOCK); - if (result == -EINPROGRESS) - result = 0; - if (result == 0) - goto out; - -out_err: - if (con->sock) { - sock_release(con->sock); - con->sock = NULL; - } else if (sock) { - sock_release(sock); - } - /* - * Some errors are fatal and this list might need adjusting. For other - * errors we try again until the max number of retries is reached. - */ - if (result != -EHOSTUNREACH && - result != -ENETUNREACH && - result != -ENETDOWN && - result != -EINVAL && - result != -EPROTONOSUPPORT) { - log_print("connect %d try %d error %d", con->nodeid, - con->retries, result); - mutex_unlock(&con->sock_mutex); - msleep(1000); - lowcomms_connect_sock(con); - return; - } -out: - mutex_unlock(&con->sock_mutex); - return; -} - -/* On error caller must run dlm_close_sock() for the - * listen connection socket. 
- */ -static int tcp_create_listen_sock(struct listen_connection *con, - struct sockaddr_storage *saddr) -{ - struct socket *sock = NULL; - int result = 0; - int addr_len; - - if (dlm_local_addr[0]->ss_family == AF_INET) - addr_len = sizeof(struct sockaddr_in); - else - addr_len = sizeof(struct sockaddr_in6); - - /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_TCP, &sock); - if (result < 0) { - log_print("Can't create listening comms socket"); - goto create_out; - } - - sock_set_mark(sock->sk, dlm_config.ci_mark); - - /* Turn off Nagle's algorithm */ - tcp_sock_set_nodelay(sock->sk); - - sock_set_reuseaddr(sock->sk); - - add_listen_sock(sock, con); - - /* Bind to our port */ - make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); - if (result < 0) { - log_print("Can't bind to port %d", dlm_config.ci_tcp_port); - goto create_out; - } - sock_set_keepalive(sock->sk); - - result = sock->ops->listen(sock, 5); - if (result < 0) { - log_print("Can't listen on port %d", dlm_config.ci_tcp_port); - goto create_out; - } - - return 0; - -create_out: - return result; -} - /* Get local addresses */ static void init_local(void) { @@ -1396,63 +1177,6 @@ static void deinit_local(void) kfree(dlm_local_addr[i]); } -/* Initialise SCTP socket and bind to all interfaces - * On error caller must run dlm_close_sock() for the - * listen connection socket. - */ -static int sctp_listen_for_all(struct listen_connection *con) -{ - struct socket *sock = NULL; - int result = -EINVAL; - - log_print("Using SCTP for communications"); - - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_SCTP, &sock); - if (result < 0) { - log_print("Can't create comms socket, check SCTP is loaded"); - goto out; - } - - sock_set_rcvbuf(sock->sk, NEEDED_RMEM); - sock_set_mark(sock->sk, dlm_config.ci_mark); - sctp_sock_set_nodelay(sock->sk); - - add_listen_sock(sock, con); - - /* Bind to all addresses. */ - result = sctp_bind_addrs(con->sock, dlm_config.ci_tcp_port); - if (result < 0) - goto out; - - result = sock->ops->listen(sock, 5); - if (result < 0) { - log_print("Can't set socket listening"); - goto out; - } - - return 0; - -out: - return result; -} - -static int tcp_listen_for_all(void) -{ - /* We don't support multi-homed hosts */ - if (dlm_local_count > 1) { - log_print("TCP protocol can't handle multi-homed hosts, " - "try SCTP"); - return -EINVAL; - } - - log_print("Using TCP for communications"); - - return tcp_create_listen_sock(&listen_con, dlm_local_addr[0]); -} - - - static struct writequeue_entry *new_writequeue_entry(struct connection *con, gfp_t allocation) { @@ -1528,19 +1252,37 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, { struct writequeue_entry *e; struct dlm_msg *msg; + bool sleepable; msg = kzalloc(sizeof(*msg), allocation); if (!msg) return NULL; + /* this mutex is being used as a wait to avoid multiple "fast" + * new writequeue page list entry allocs in new_wq_entry in + * normal operation which is sleepable context. Without it + * we could end in multiple writequeue entries with one + * dlm message because multiple callers were waiting at + * the writequeue_lock in new_wq_entry(). 
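The wq_alloc mutex introduced in this function guards against a subtle waste: if two sleepable writers both find the current writequeue page full, both allocate a fresh entry and one dlm message ends up split across two entries. A condensed sketch of the guarded path, reduced from the hunk below (gfpflags_normal_context() is the real helper used to detect a sleepable allocation):

	bool sleepable = gfpflags_normal_context(allocation);

	if (sleepable)
		mutex_lock(&con->wq_alloc);	/* serialize page allocation */

	/*
	 * Only one sleepable caller at a time may discover that the current
	 * writequeue page is full and append a fresh one; later callers then
	 * find space on that page instead of allocating their own entry.
	 */
	e = new_wq_entry(con, len, allocation, ppc, cb, mh);

	if (sleepable)
		mutex_unlock(&con->wq_alloc);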
+ */ + sleepable = gfpflags_normal_context(allocation); + if (sleepable) + mutex_lock(&con->wq_alloc); + kref_init(&msg->ref); e = new_wq_entry(con, len, allocation, ppc, cb, mh); if (!e) { + if (sleepable) + mutex_unlock(&con->wq_alloc); + kfree(msg); return NULL; } + if (sleepable) + mutex_unlock(&con->wq_alloc); + msg->ppc = *ppc; msg->len = len; msg->entry = e; @@ -1646,10 +1388,9 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) /* Send a message */ static void send_to_sock(struct connection *con) { - int ret = 0; const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; - int len, offset; + int len, offset, ret; int count = 0; mutex_lock(&con->sock_mutex); @@ -1658,7 +1399,8 @@ static void send_to_sock(struct connection *con) spin_lock(&con->writequeue_lock); for (;;) { - if (list_empty(&con->writequeue)) + e = con_next_wq(con); + if (!e) break; e = list_first_entry(&con->writequeue, struct writequeue_entry, list); @@ -1667,25 +1409,22 @@ static void send_to_sock(struct connection *con) BUG_ON(len == 0 && e->users == 0); spin_unlock(&con->writequeue_lock); - ret = 0; - if (len) { - ret = kernel_sendpage(con->sock, e->page, offset, len, - msg_flags); - if (ret == -EAGAIN || ret == 0) { - if (ret == -EAGAIN && - test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && - !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { - /* Notify TCP that we're limited by the - * application window size. - */ - set_bit(SOCK_NOSPACE, &con->sock->flags); - con->sock->sk->sk_write_pending++; - } - cond_resched(); - goto out; - } else if (ret < 0) - goto out; - } + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) { + if (ret == -EAGAIN && + test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. 
+ */ + set_bit(SOCK_NOSPACE, &con->sock->flags); + con->sock->sk->sk_write_pending++; + } + cond_resched(); + goto out; + } else if (ret < 0) + goto out; /* Don't starve people filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { @@ -1770,12 +1509,9 @@ int dlm_lowcomms_close(int nodeid) static void process_recv_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, rwork); - int err; clear_bit(CF_READ_PENDING, &con->flags); - do { - err = receive_from_sock(con); - } while (!err); + receive_from_sock(con); } static void process_listen_recv_socket(struct work_struct *work) @@ -1783,6 +1519,74 @@ static void process_listen_recv_socket(struct work_struct *work) accept_from_sock(&listen_con); } +static void dlm_connect(struct connection *con) +{ + struct sockaddr_storage addr; + int result, addr_len; + struct socket *sock; + unsigned int mark; + + /* Some odd races can cause double-connects, ignore them */ + if (con->retries++ > MAX_CONNECT_RETRIES) + return; + + if (con->sock) { + log_print("node %d already connected.", con->nodeid); + return; + } + + memset(&addr, 0, sizeof(addr)); + result = nodeid_to_addr(con->nodeid, &addr, NULL, + dlm_proto_ops->try_new_addr, &mark); + if (result < 0) { + log_print("no address for nodeid %d", con->nodeid); + return; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, dlm_proto_ops->proto, &sock); + if (result < 0) + goto socket_err; + + sock_set_mark(sock->sk, mark); + dlm_proto_ops->sockopts(sock); + + add_sock(sock, con); + + result = dlm_proto_ops->bind(sock); + if (result < 0) + goto add_sock_err; + + log_print_ratelimited("connecting to %d", con->nodeid); + make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); + result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr, + addr_len); + if (result < 0) + goto add_sock_err; + + return; + +add_sock_err: + dlm_close_sock(&con->sock); + +socket_err: + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. 
+ */ + if (result != -EHOSTUNREACH && + result != -ENETUNREACH && + result != -ENETDOWN && + result != -EINVAL && + result != -EPROTONOSUPPORT) { + log_print("connect %d try %d error %d", con->nodeid, + con->retries, result); + msleep(1000); + lowcomms_connect_sock(con); + } +} + /* Send workqueue function */ static void process_send_sockets(struct work_struct *work) { @@ -1797,11 +1601,15 @@ static void process_send_sockets(struct work_struct *work) dlm_midcomms_unack_msg_resend(con->nodeid); } - if (con->sock == NULL) { /* not mutex protected so check it inside too */ + if (con->sock == NULL) { if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) msleep(1000); - con->connect_action(con); + + mutex_lock(&con->sock_mutex); + dlm_connect(con); + mutex_unlock(&con->sock_mutex); } + if (!list_empty(&con->writequeue)) send_to_sock(con); } @@ -1840,8 +1648,8 @@ static int work_start(void) static void shutdown_conn(struct connection *con) { - if (con->shutdown_action) - con->shutdown_action(con); + if (dlm_proto_ops->shutdown_action) + dlm_proto_ops->shutdown_action(con); } void dlm_lowcomms_shutdown(void) @@ -1948,8 +1756,198 @@ void dlm_lowcomms_stop(void) srcu_read_unlock(&connections_srcu, idx); work_stop(); deinit_local(); + + dlm_proto_ops = NULL; } +static int dlm_listen_for_all(void) +{ + struct socket *sock; + int result; + + log_print("Using %s for communications", + dlm_proto_ops->name); + + result = dlm_proto_ops->listen_validate(); + if (result < 0) + return result; + + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, dlm_proto_ops->proto, &sock); + if (result < 0) { + log_print("Can't create comms socket, check SCTP is loaded"); + goto out; + } + + sock_set_mark(sock->sk, dlm_config.ci_mark); + dlm_proto_ops->listen_sockopts(sock); + + result = dlm_proto_ops->listen_bind(sock); + if (result < 0) + goto out; + + save_listen_callbacks(sock); + add_listen_sock(sock, &listen_con); + + INIT_WORK(&listen_con.rwork, process_listen_recv_socket); + result = sock->ops->listen(sock, 5); + if (result < 0) { + dlm_close_sock(&listen_con.sock); + goto out; + } + + return 0; + +out: + sock_release(sock); + return result; +} + +static int dlm_tcp_bind(struct socket *sock) +{ + struct sockaddr_storage src_addr; + int result, addr_len; + + /* Bind to our cluster-known address connecting to avoid + * routing problems. 
+ */ + memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + make_sockaddr(&src_addr, 0, &addr_len); + + result = sock->ops->bind(sock, (struct sockaddr *)&src_addr, + addr_len); + if (result < 0) { + /* This *may* not indicate a critical error */ + log_print("could not bind for connect: %d", result); + } + + return 0; +} + +static int dlm_tcp_connect(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len) +{ + int ret; + + ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); + switch (ret) { + case -EINPROGRESS: + fallthrough; + case 0: + return 0; + } + + return ret; +} + +static int dlm_tcp_listen_validate(void) +{ + /* We don't support multi-homed hosts */ + if (dlm_local_count > 1) { + log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); + return -EINVAL; + } + + return 0; +} + +static void dlm_tcp_sockopts(struct socket *sock) +{ + /* Turn off Nagle's algorithm */ + tcp_sock_set_nodelay(sock->sk); +} + +static void dlm_tcp_listen_sockopts(struct socket *sock) +{ + dlm_tcp_sockopts(sock); + sock_set_reuseaddr(sock->sk); +} + +static int dlm_tcp_listen_bind(struct socket *sock) +{ + int addr_len; + + /* Bind to our port */ + make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); + return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0], + addr_len); +} + +static const struct dlm_proto_ops dlm_tcp_ops = { + .name = "TCP", + .proto = IPPROTO_TCP, + .connect = dlm_tcp_connect, + .sockopts = dlm_tcp_sockopts, + .bind = dlm_tcp_bind, + .listen_validate = dlm_tcp_listen_validate, + .listen_sockopts = dlm_tcp_listen_sockopts, + .listen_bind = dlm_tcp_listen_bind, + .shutdown_action = dlm_tcp_shutdown, + .eof_condition = tcp_eof_condition, +}; + +static int dlm_sctp_bind(struct socket *sock) +{ + return sctp_bind_addrs(sock, 0); +} + +static int dlm_sctp_connect(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len) +{ + int ret; + + /* + * Make sock->ops->connect() function return in specified time, + * since O_NONBLOCK argument in connect() function does not work here, + * then, we should restore the default value of this attribute. 
+ */ + sock_set_sndtimeo(sock->sk, 5); + ret = sock->ops->connect(sock, addr, addr_len, 0); + sock_set_sndtimeo(sock->sk, 0); + if (ret < 0) + return ret; + + if (!test_and_set_bit(CF_CONNECTED, &con->flags)) + log_print("successful connected to node %d", con->nodeid); + + return 0; +} + +static int dlm_sctp_listen_validate(void) +{ + if (!IS_ENABLED(CONFIG_IP_SCTP)) { + log_print("SCTP is not enabled by this kernel"); + return -EOPNOTSUPP; + } + + request_module("sctp"); + return 0; +} + +static int dlm_sctp_bind_listen(struct socket *sock) +{ + return sctp_bind_addrs(sock, dlm_config.ci_tcp_port); +} + +static void dlm_sctp_sockopts(struct socket *sock) +{ + /* Turn off Nagle's algorithm */ + sctp_sock_set_nodelay(sock->sk); + sock_set_rcvbuf(sock->sk, NEEDED_RMEM); +} + +static const struct dlm_proto_ops dlm_sctp_ops = { + .name = "SCTP", + .proto = IPPROTO_SCTP, + .try_new_addr = true, + .connect = dlm_sctp_connect, + .sockopts = dlm_sctp_sockopts, + .bind = dlm_sctp_bind, + .listen_validate = dlm_sctp_listen_validate, + .listen_sockopts = dlm_sctp_sockopts, + .listen_bind = dlm_sctp_bind_listen, +}; + int dlm_lowcomms_start(void) { int error = -EINVAL; @@ -1976,23 +1974,27 @@ int dlm_lowcomms_start(void) /* Start listening */ switch (dlm_config.ci_protocol) { case DLM_PROTO_TCP: - error = tcp_listen_for_all(); + dlm_proto_ops = &dlm_tcp_ops; break; case DLM_PROTO_SCTP: - error = sctp_listen_for_all(&listen_con); + dlm_proto_ops = &dlm_sctp_ops; break; default: log_print("Invalid protocol identifier %d set", dlm_config.ci_protocol); error = -EINVAL; - break; + goto fail_proto_ops; } + + error = dlm_listen_for_all(); if (error) - goto fail_unlisten; + goto fail_listen; return 0; -fail_unlisten: +fail_listen: + dlm_proto_ops = NULL; +fail_proto_ops: dlm_allow_conn = 0; dlm_close_sock(&listen_con.sock); work_stop(); diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index aaae7115c00d..4ccae07cf005 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -46,6 +46,7 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg); int dlm_lowcomms_connect_node(int nodeid); int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); +void dlm_midcomms_receive_done(int nodeid); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/member.c b/fs/dlm/member.c index d9e1e4170eb1..731d489aa323 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -443,8 +443,10 @@ static int ping_members(struct dlm_ls *ls) list_for_each_entry(memb, &ls->ls_nodes, list) { error = dlm_recovery_stopped(ls); - if (error) + if (error) { + error = -EINTR; break; + } error = dlm_rcom_status(ls, memb->nodeid, 0); if (error) break; diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index e3de268898ed..7ae39ec8d9b0 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -109,12 +109,6 @@ * compatibility. There exists better ways to make a better handling. * However this should be changed in the next major version bump of dlm. * - * Ack handling: - * - * Currently we send an ack message for every dlm message. However we - * can ack multiple dlm messages with one ack by just delaying the ack - * message. Will reduce some traffic but makes the drop detection slower. - * * Tail Size checking: * * There exists a message tail payload in e.g. 
DLM_MSG however we don't
@@ -169,6 +163,7 @@ struct midcomms_node {
 #define DLM_NODE_FLAG_CLOSE	1
 #define DLM_NODE_FLAG_STOP_TX	2
 #define DLM_NODE_FLAG_STOP_RX	3
+#define DLM_NODE_ULP_DELIVERED 4
 	unsigned long flags;
 	wait_queue_head_t shutdown_wait;
@@ -480,11 +475,12 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
 {
 	if (seq == node->seq_next) {
 		node->seq_next++;
-		/* send ack before fin */
-		dlm_send_ack(node->nodeid, node->seq_next);
 
 		switch (p->header.h_cmd) {
 		case DLM_FIN:
+			/* send ack before fin */
+			dlm_send_ack(node->nodeid, node->seq_next);
+
 			spin_lock(&node->state_lock);
 			pr_debug("receive fin msg from node %d with state %s\n",
 				 node->nodeid, dlm_state_str(node->state));
@@ -534,6 +530,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p,
 		default:
 			WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags));
 			dlm_receive_buffer(p, node->nodeid);
+			set_bit(DLM_NODE_ULP_DELIVERED, &node->flags);
 			break;
 		}
 	} else {
@@ -933,6 +930,49 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len)
 	return ret;
 }
 
+void dlm_midcomms_receive_done(int nodeid)
+{
+	struct midcomms_node *node;
+	int idx;
+
+	idx = srcu_read_lock(&nodes_srcu);
+	node = nodeid2node(nodeid, 0);
+	if (!node) {
+		srcu_read_unlock(&nodes_srcu, idx);
+		return;
+	}
+
+	/* old protocol, we do nothing */
+	switch (node->version) {
+	case DLM_VERSION_3_2:
+		break;
+	default:
+		srcu_read_unlock(&nodes_srcu, idx);
+		return;
+	}
+
+	/* do nothing if we didn't deliver a stateful message to the ulp */
+	if (!test_and_clear_bit(DLM_NODE_ULP_DELIVERED,
+				&node->flags)) {
+		srcu_read_unlock(&nodes_srcu, idx);
+		return;
+	}
+
+	spin_lock(&node->state_lock);
+	/* we only ack if state is ESTABLISHED */
+	switch (node->state) {
+	case DLM_ESTABLISHED:
+		spin_unlock(&node->state_lock);
+		dlm_send_ack(node->nodeid, node->seq_next);
+		break;
+	default:
+		spin_unlock(&node->state_lock);
+		/* do nothing, FIN has its own ack */
+		break;
+	}
+	srcu_read_unlock(&nodes_srcu, idx);
+}
+
 void dlm_midcomms_unack_msg_resend(int nodeid)
 {
 	struct midcomms_node *node;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 5651933f54a4..6cba86470278 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -89,22 +89,15 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type,
 	return 0;
 }
 
-static void _send_rcom(struct dlm_ls *ls, struct dlm_rcom *rc)
+static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc)
 {
 	dlm_rcom_out(rc);
-}
-
-static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh,
-		      struct dlm_rcom *rc)
-{
-	_send_rcom(ls, rc);
 	dlm_midcomms_commit_mhandle(mh);
 }
 
-static void send_rcom_stateless(struct dlm_ls *ls, struct dlm_msg *msg,
-				struct dlm_rcom *rc)
+static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc)
 {
-	_send_rcom(ls, rc);
+	dlm_rcom_out(rc);
 	dlm_lowcomms_commit_msg(msg);
 	dlm_lowcomms_put_msg(msg);
 }
@@ -204,7 +197,7 @@ retry:
 	allow_sync_reply(ls, &rc->rc_id);
 	memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
 
-	send_rcom_stateless(ls, msg, rc);
+	send_rcom_stateless(msg, rc);
 
 	error = dlm_wait_function(ls, &rcom_response);
 	disallow_sync_reply(ls);
@@ -287,7 +280,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in)
 	spin_unlock(&ls->ls_recover_lock);
 
 do_send:
-	send_rcom_stateless(ls, msg, rc);
+	send_rcom_stateless(msg, rc);
 }
 
 static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
@@ -327,7 +320,7 @@ retry:
 	allow_sync_reply(ls, &rc->rc_id);
 	memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE);
 
-
send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -356,7 +349,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); - send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); } int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) @@ -373,7 +366,7 @@ int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) memcpy(rc->rc_buf, r->res_name, r->res_length); rc->rc_id = (unsigned long) r->res_id; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); out: return error; } @@ -404,7 +397,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); } static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -461,7 +454,7 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) pack_rcom_lock(r, lkb, rl); rc->rc_id = (unsigned long) r; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); out: return error; } @@ -487,7 +480,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); } /* If the lockspace doesn't exist then still send a status message diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 85e245392715..97d052cea5a9 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -125,8 +125,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_recover_waiters_pre(ls); error = dlm_recovery_stopped(ls); - if (error) + if (error) { + error = -EINTR; goto fail; + } if (neg || dlm_no_directory(ls)) { /* diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 906af0c1998c..14b747026742 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,6 +3,7 @@ config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select FS_IOMAP select LIBCRC32C help EROFS (Enhanced Read-Only File System) is a lightweight diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 3787a5fb0a42..9db829715652 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -2,35 +2,13 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" #include <linux/prefetch.h> - +#include <linux/dax.h> #include <trace/events/erofs.h> -static void erofs_readendio(struct bio *bio) -{ - struct bio_vec *bvec; - blk_status_t err = bio->bi_status; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - - /* page is already locked */ - DBG_BUGON(PageUptodate(page)); - - if (err) - SetPageError(page); - else - SetPageUptodate(page); - - unlock_page(page); - /* page could be reclaimed now */ - } - bio_put(bio); -} - struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) { struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; @@ -59,13 +37,6 @@ static int erofs_map_blocks_flatmode(struct inode *inode, nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); lastblk = nblocks - tailendpacking; - if (offset >= inode->i_size) { - /* leave out-of-bound access unmapped */ - map->m_flags = 0; - map->m_plen = 0; - goto out; - } - /* there is no hole in flatmode */ map->m_flags = EROFS_MAP_MAPPED; @@ -100,217 +71,273 @@ static int erofs_map_blocks_flatmode(struct inode *inode, goto err_out; } -out: map->m_llen = map->m_plen; - err_out: trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); return err; } -static inline struct bio *erofs_read_raw_page(struct bio *bio, - struct address_space *mapping, - struct page *page, - erofs_off_t *last_block, - unsigned int nblocks, - unsigned int *eblks, - bool ra) +static int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags) { - struct inode *const inode = mapping->host; - struct super_block *const sb = inode->i_sb; - erofs_off_t current_block = (erofs_off_t)page->index; - int err; - - DBG_BUGON(!nblocks); - - if (PageUptodate(page)) { - err = 0; - goto has_updated; - } + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_inode_chunk_index *idx; + struct page *page; + u64 chunknr; + unsigned int unit; + erofs_off_t pos; + int err = 0; - /* note that for readpage case, bio also equals to NULL */ - if (bio && - (*last_block + 1 != current_block || !*eblks)) { -submit_bio_retry: - submit_bio(bio); - bio = NULL; + if (map->m_la >= inode->i_size) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; } - if (!bio) { - struct erofs_map_blocks map = { - .m_la = blknr_to_addr(current_block), - }; - erofs_blk_t blknr; - unsigned int blkoff; - - err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW); - if (err) - goto err_out; - - /* zero out the holed page */ - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - - /* imply err = 0, see erofs_map_blocks */ - goto has_updated; - } - - /* for RAW access mode, m_plen must be equal to m_llen */ - DBG_BUGON(map.m_plen != map.m_llen); - - blknr = erofs_blknr(map.m_pa); - blkoff = erofs_blkoff(map.m_pa); - - /* deal with inline page */ - if (map.m_flags & EROFS_MAP_META) { - void *vsrc, *vto; - struct page *ipage; + if (vi->datalayout != EROFS_INODE_CHUNK_BASED) + return erofs_map_blocks_flatmode(inode, map, flags); - DBG_BUGON(map.m_plen > PAGE_SIZE); + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) + unit = sizeof(*idx); /* chunk index */ + else + unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ - ipage = erofs_get_meta_page(inode->i_sb, blknr); + chunknr = map->m_la >> vi->chunkbits; + pos = 
ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + vi->xattr_isize, unit) + unit * chunknr; - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto err_out; - } + page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos)); + if (IS_ERR(page)) + return PTR_ERR(page); - vsrc = kmap_atomic(ipage); - vto = kmap_atomic(page); - memcpy(vto, vsrc + blkoff, map.m_plen); - memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); - kunmap_atomic(vto); - kunmap_atomic(vsrc); - flush_dcache_page(page); + map->m_la = chunknr << vi->chunkbits; + map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, + roundup(inode->i_size - map->m_la, EROFS_BLKSIZ)); - SetPageUptodate(page); - /* TODO: could we unlock the page earlier? */ - unlock_page(ipage); - put_page(ipage); + /* handle block map */ + if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { + __le32 *blkaddr = page_address(page) + erofs_blkoff(pos); - /* imply err = 0, see erofs_map_blocks */ - goto has_updated; + if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { + map->m_flags = 0; + } else { + map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; } + goto out_unlock; + } + /* parse chunk indexes */ + idx = page_address(page) + erofs_blkoff(pos); + switch (le32_to_cpu(idx->blkaddr)) { + case EROFS_NULL_ADDR: + map->m_flags = 0; + break; + default: + /* only one device is supported for now */ + if (idx->device_id) { + erofs_err(sb, "invalid device id %u @ %llu for nid %llu", + le16_to_cpu(idx->device_id), + chunknr, vi->nid); + err = -EFSCORRUPTED; + goto out_unlock; + } + map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; + break; + } +out_unlock: + unlock_page(page); + put_page(page); +out: + map->m_llen = map->m_plen; + return err; +} - /* pa must be block-aligned for raw reading */ - DBG_BUGON(erofs_blkoff(map.m_pa)); +static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + struct erofs_map_blocks map; + + map.m_la = offset; + map.m_llen = length; + + ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (ret < 0) + return ret; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->dax_dev = EROFS_I_SB(inode)->dax_dev; + iomap->offset = map.m_la; + iomap->length = map.m_llen; + iomap->flags = 0; + iomap->private = NULL; + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + if (!iomap->length) + iomap->length = length; + return 0; + } - /* max # of continuous pages */ - if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) - nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); + if (map.m_flags & EROFS_MAP_META) { + struct page *ipage; + + iomap->type = IOMAP_INLINE; + ipage = erofs_get_meta_page(inode->i_sb, + erofs_blknr(map.m_pa)); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + iomap->inline_data = page_address(ipage) + + erofs_blkoff(map.m_pa); + iomap->private = ipage; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = map.m_pa; + } + return 0; +} - *eblks = bio_max_segs(nblocks); - bio = bio_alloc(GFP_NOIO, *eblks); +static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + struct page *ipage = iomap->private; - bio->bi_end_io = erofs_readendio; - bio_set_dev(bio, sb->s_bdev); - bio->bi_iter.bi_sector = (sector_t)blknr << - LOG_SECTORS_PER_BLOCK; - bio->bi_opf = REQ_OP_READ | (ra ? 
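To locate the mapping entry for a chunk-based inode, erofs_map_blocks() derives the chunk number from the logical address and indexes a flat on-disk array that starts right after the inode body, aligned to the entry size. The arithmetic is easy to check in isolation; here is a self-contained sketch with made-up values (the base offset, unit and chunkbits are hypothetical, not taken from a real image):

#include <stdint.h>
#include <stdio.h>

#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
    uint64_t inode_base = 4096 + 64 + 16;  /* hypothetical iloc + inode_isize + xattr_isize */
    unsigned unit = 8;                     /* 8-byte chunk index (4 for a plain block map) */
    unsigned chunkbits = 20;               /* 1 MiB chunks */
    uint64_t la = 5 * (1ULL << 20) + 123;  /* logical offset inside chunk 5 */

    uint64_t chunknr = la >> chunkbits;
    uint64_t pos = ALIGN_UP(inode_base, unit) + (uint64_t)unit * chunknr;

    printf("chunk %llu, index entry at byte %llu\n",
           (unsigned long long)chunknr, (unsigned long long)pos);
    return 0;
}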
REQ_RAHEAD : 0); + if (ipage) { + DBG_BUGON(iomap->type != IOMAP_INLINE); + unlock_page(ipage); + put_page(ipage); + } else { + DBG_BUGON(iomap->type == IOMAP_INLINE); } + return written; +} - err = bio_add_page(bio, page, PAGE_SIZE, 0); - /* out of the extent or bio is full */ - if (err < PAGE_SIZE) - goto submit_bio_retry; - --*eblks; - *last_block = current_block; - return bio; +static const struct iomap_ops erofs_iomap_ops = { + .iomap_begin = erofs_iomap_begin, + .iomap_end = erofs_iomap_end, +}; -err_out: - /* for sync reading, set page error immediately */ - if (!ra) { - SetPageError(page); - ClearPageUptodate(page); +int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { +#ifdef CONFIG_EROFS_FS_ZIP + return iomap_fiemap(inode, fieinfo, start, len, + &z_erofs_iomap_report_ops); +#else + return -EOPNOTSUPP; +#endif } -has_updated: - unlock_page(page); - - /* if updated manually, continuous pages has a gap */ - if (bio) - submit_bio(bio); - return err ? ERR_PTR(err) : NULL; + return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops); } /* * since we don't have write or truncate flows, no inode * locking needs to be held at the moment. */ -static int erofs_raw_access_readpage(struct file *file, struct page *page) +static int erofs_readpage(struct file *file, struct page *page) { - erofs_off_t last_block; - unsigned int eblks; - struct bio *bio; - - trace_erofs_readpage(page, true); + return iomap_readpage(page, &erofs_iomap_ops); +} - bio = erofs_read_raw_page(NULL, page->mapping, - page, &last_block, 1, &eblks, false); +static void erofs_readahead(struct readahead_control *rac) +{ + return iomap_readahead(rac, &erofs_iomap_ops); +} - if (IS_ERR(bio)) - return PTR_ERR(bio); +static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +{ + return iomap_bmap(mapping, block, &erofs_iomap_ops); +} - if (bio) - submit_bio(bio); +static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + loff_t align = iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; + else + blksize_mask = (1 << inode->i_blkbits) - 1; + + if (align & blksize_mask) + return -EINVAL; return 0; } -static void erofs_raw_access_readahead(struct readahead_control *rac) +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - erofs_off_t last_block; - unsigned int eblks; - struct bio *bio = NULL; - struct page *page; - - trace_erofs_readpages(rac->mapping->host, readahead_index(rac), - readahead_count(rac), true); - - while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - - bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block, - readahead_count(rac), &eblks, true); - - /* all the page errors are ignored when readahead */ - if (IS_ERR(bio)) { - pr_err("%s, readahead error at page %lu of nid %llu\n", - __func__, page->index, - EROFS_I(rac->mapping->host)->nid); - - bio = NULL; - } - - put_page(page); + /* no need to take the (shared) inode lock since it's a read-only filesystem */ + if (!iov_iter_count(to)) + return 0; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return dax_iomap_rw(iocb, to, &erofs_iomap_ops); +#endif + if (iocb->ki_flags & IOCB_DIRECT) { + int err = erofs_prepare_dio(iocb, to); + +
if (!err) + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0); + if (err < 0) + return err; } + return filemap_read(iocb, to, 0); +} + +/* for uncompressed (aligned) files and raw access for other files */ +const struct address_space_operations erofs_raw_access_aops = { + .readpage = erofs_readpage, + .readahead = erofs_readahead, + .bmap = erofs_bmap, + .direct_IO = noop_direct_IO, +}; - if (bio) - submit_bio(bio); +#ifdef CONFIG_FS_DAX +static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops); } -static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { - struct inode *inode = mapping->host; - struct erofs_map_blocks map = { - .m_la = blknr_to_addr(block), - }; + return erofs_dax_huge_fault(vmf, PE_SIZE_PTE); +} - if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) { - erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE; +static const struct vm_operations_struct erofs_dax_vm_ops = { + .fault = erofs_dax_fault, + .huge_fault = erofs_dax_huge_fault, +}; - if (block >> LOG_SECTORS_PER_BLOCK >= blks) - return 0; - } +static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_readonly_mmap(file, vma); - if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW)) - return erofs_blknr(map.m_pa); + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + vma->vm_ops = &erofs_dax_vm_ops; + vma->vm_flags |= VM_HUGEPAGE; return 0; } - -/* for uncompressed (aligned) files and raw access for other files */ -const struct address_space_operations erofs_raw_access_aops = { - .readpage = erofs_raw_access_readpage, - .readahead = erofs_raw_access_readahead, - .bmap = erofs_bmap, +#else +#define erofs_file_mmap generic_file_readonly_mmap +#endif + +const struct file_operations erofs_file_fops = { + .llseek = generic_file_llseek, + .read_iter = erofs_file_read_iter, + .mmap = erofs_file_mmap, + .splice_read = generic_file_splice_read, }; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 0f8da74570b4..b0b23f41abc3 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -4,6 +4,7 @@ * * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #ifndef __EROFS_FS_H #define __EROFS_FS_H @@ -19,10 +20,12 @@ #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 +#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ - EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER) + EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -64,13 +67,16 @@ struct erofs_super_block { * inode, [xattrs], last_inline_data, ... | ... | no-holed data * 3 - inode compression D: * inode, [xattrs], map_header, extents ... | ... - * 4~7 - reserved + * 4 - inode chunk-based E: + * inode, [xattrs], chunk indexes ... | ... 
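erofs_file_read_iter() only allows O_DIRECT when the file position, byte count and user buffer are all aligned to the logical block size; OR-ing the three values lets a single mask test reject any misalignment. A standalone sketch of that check (function and parameter names are hypothetical):

#include <stdint.h>
#include <stdio.h>

/* returns 0 if a direct read is acceptably aligned, -1 otherwise;
 * OR-ing pos, length and buffer address lets one mask test catch all three */
static int dio_aligned(uint64_t pos, uint64_t len, uint64_t bufaddr,
                       unsigned int lbs /* logical block size, power of two */)
{
    uint64_t mask = lbs - 1;
    return ((pos | len | bufaddr) & mask) ? -1 : 0;
}

int main(void)
{
    printf("%d\n", dio_aligned(4096, 8192, 0x1000, 512)); /* 0: aligned */
    printf("%d\n", dio_aligned(4100, 8192, 0x1000, 512)); /* -1: pos misaligned */
    return 0;
}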
+ * 5~7 - reserved */ enum { EROFS_INODE_FLAT_PLAIN = 0, EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, EROFS_INODE_FLAT_INLINE = 2, EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_CHUNK_BASED = 4, EROFS_INODE_DATALAYOUT_MAX }; @@ -90,6 +96,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_I_ALL \ ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) +/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ +#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F +/* with chunk indexes or just a 4-byte blkaddr array */ +#define EROFS_CHUNK_FORMAT_INDEXES 0x0020 + +#define EROFS_CHUNK_FORMAT_ALL \ + (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) + +struct erofs_inode_chunk_info { + __le16 format; /* chunk blkbits, etc. */ + __le16 reserved; +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ @@ -107,6 +126,9 @@ struct erofs_inode_compact { /* for device files, used to indicate old/new device # */ __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; } i_u; __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; @@ -135,6 +157,9 @@ struct erofs_inode_extended { /* for device files, used to indicate old/new device # */ __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; } i_u; /* only used for 32-bit stat compatibility */ @@ -204,6 +229,19 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) e->e_name_len + le16_to_cpu(e->e_value_size)); } +/* represent a zeroed chunk (hole) */ +#define EROFS_NULL_ADDR -1 + +/* 4-byte block address array */ +#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) + +/* 8-byte inode chunk indexes */ +struct erofs_inode_chunk_index { + __le16 advise; /* always 0, don't care for now */ + __le16 device_id; /* back-end storage id, always 0 for now */ + __le32 blkaddr; /* start block address of this inode chunk */ +}; + /* maximum supported size of a physical compression cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) @@ -338,9 +376,14 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + /* keep in sync between 2 index structures for better extendibility */ + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != + sizeof(struct z_erofs_vle_decompressed_index)); BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index aa8a0d770ba3..31ac3a73b390 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
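Decoding the new on-disk chunk format is a two-field affair: the low 5 bits give the chunk size as a block-size shift, and bit 5 selects 8-byte chunk indexes over the plain 4-byte block map. A tiny sketch of the decode; the mask values mirror the header above (renamed locally), the sample format word is hypothetical:

#include <stdio.h>

#define CHUNK_FORMAT_BLKBITS_MASK 0x001F
#define CHUNK_FORMAT_INDEXES      0x0020
#define LOG_BLOCK_SIZE            12     /* 4 KiB blocks */

int main(void)
{
    unsigned short format = 0x0028;  /* hypothetical: indexes + 8 extra shift bits */
    unsigned chunkbits = LOG_BLOCK_SIZE + (format & CHUNK_FORMAT_BLKBITS_MASK);

    /* chunksize = blocksize << blkbits, i.e. 1 MiB here */
    printf("chunksize = %u bytes, %s\n", 1u << chunkbits,
           (format & CHUNK_FORMAT_INDEXES) ? "chunk indexes" : "block map");
    return 0;
}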
* https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" @@ -122,8 +123,11 @@ static struct page *erofs_read_inode(struct inode *inode, /* total blocks for compressed files */ if (erofs_inode_is_data_compressed(vi->datalayout)) nblks = le32_to_cpu(die->i_u.compressed_blocks); - + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + /* fill chunked inode summary info */ + vi->chunkformat = le16_to_cpu(die->i_u.c.format); kfree(copied); + copied = NULL; break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); @@ -160,6 +164,8 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_size = le32_to_cpu(dic->i_size); if (erofs_inode_is_data_compressed(vi->datalayout)) nblks = le32_to_cpu(dic->i_u.compressed_blocks); + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + vi->chunkformat = le16_to_cpu(dic->i_u.c.format); break; default: erofs_err(inode->i_sb, @@ -169,11 +175,26 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } + if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_ALL)) { + erofs_err(inode->i_sb, + "unsupported chunk format %x of nid %llu", + vi->chunkformat, vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + vi->chunkbits = LOG_BLOCK_SIZE + + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); + } inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; inode->i_atime.tv_sec = inode->i_ctime.tv_sec; inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + inode->i_flags &= ~S_DAX; + if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) && + vi->datalayout == EROFS_INODE_FLAT_PLAIN) + inode->i_flags |= S_DAX; if (!nblks) /* measure inode.i_blocks as generic filesystems */ inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; @@ -247,7 +268,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir) switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; - inode->i_fop = &generic_ro_fops; + if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_fop = &generic_ro_fops; + else + inode->i_fop = &erofs_file_fops; break; case S_IFDIR: inode->i_op = &erofs_dir_iops; @@ -358,6 +382,7 @@ const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, + .fiemap = erofs_fiemap, }; const struct inode_operations erofs_symlink_iops = { diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 543c2ff97d30..9524e155b38f 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #ifndef __EROFS_INTERNAL_H #define __EROFS_INTERNAL_H @@ -15,6 +16,7 @@ #include <linux/magic.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/iomap.h> #include "erofs_fs.h" /* redefine pr_fmt "erofs: " */ @@ -83,6 +85,7 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct dax_device *dax_dev; u32 blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -115,6 +118,8 @@ struct erofs_sb_info { /* Mount flags set via mount options or defaults */ #define EROFS_MOUNT_XATTR_USER 0x00000010 #define EROFS_MOUNT_POSIX_ACL 0x00000020 +#define EROFS_MOUNT_DAX_ALWAYS 0x00000040 +#define EROFS_MOUNT_DAX_NEVER 0x00000080 #define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option) @@ -257,6 +262,10 @@ struct erofs_inode { union { erofs_blk_t raw_blkaddr; + struct { + unsigned short chunkformat; + unsigned char chunkbits; + }; #ifdef CONFIG_EROFS_FS_ZIP struct { unsigned short z_advise; @@ -353,8 +362,15 @@ struct erofs_map_blocks { /* Flags used by erofs_map_blocks_flatmode() */ #define EROFS_GET_BLOCKS_RAW 0x0001 +/* + * Used to get the exact decompressed length, e.g. fiemap (consider lookback + * approach instead if possible since it's more metadata lightweight.) + */ +#define EROFS_GET_BLOCKS_FIEMAP 0x0002 /* zmap.c */ +extern const struct iomap_ops z_erofs_iomap_report_ops; + #ifdef CONFIG_EROFS_FS_ZIP int z_erofs_fill_inode(struct inode *inode); int z_erofs_map_blocks_iter(struct inode *inode, @@ -371,7 +387,10 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, #endif /* !CONFIG_EROFS_FS_ZIP */ /* data.c */ +extern const struct file_operations erofs_file_fops; struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); /* inode.c */ static inline unsigned long erofs_inode_hash(erofs_nid_t nid) @@ -441,8 +460,7 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct address_space *mapping, - struct page *page); +int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index a8271ce5e13f..8629e616028c 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -245,4 +245,5 @@ const struct inode_operations erofs_dir_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, + .fiemap = erofs_fiemap, }; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 8fc6c04b54f4..a8d49e8fc83a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -11,6 +11,7 @@ #include <linux/crc32c.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> +#include <linux/dax.h> #include "xattr.h" #define CREATE_TRACE_POINTS @@ -355,6 +356,8 @@ enum { Opt_user_xattr, Opt_acl, Opt_cache_strategy, + Opt_dax, + Opt_dax_enum, Opt_err }; @@ -365,14 +368,47 @@ static const struct constant_table erofs_param_cache_strategy[] = { {} }; +static const struct constant_table erofs_dax_param_enums[] = { + {"always", EROFS_MOUNT_DAX_ALWAYS}, + {"never", EROFS_MOUNT_DAX_NEVER}, + {} +}; + static const struct fs_parameter_spec 
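The DAX mount modes introduced here ride on the usual set_opt()/clear_opt() bitmask convention from internal.h, with dax=always and dax=never kept mutually exclusive by clearing one whenever the other is set (the actual erofs_fc_set_dax_mode() appears just below). A compact userspace rendering of the same idea, with hypothetical flag values:

#include <stdio.h>

#define MOUNT_DAX_ALWAYS 0x40
#define MOUNT_DAX_NEVER  0x80

struct fs_ctx { unsigned int mount_opt; };

/* dax=always and dax=never are mutually exclusive: setting one clears the other */
static void set_dax_mode(struct fs_ctx *ctx, unsigned int mode)
{
    if (mode == MOUNT_DAX_ALWAYS) {
        ctx->mount_opt |= MOUNT_DAX_ALWAYS;
        ctx->mount_opt &= ~MOUNT_DAX_NEVER;
    } else {
        ctx->mount_opt |= MOUNT_DAX_NEVER;
        ctx->mount_opt &= ~MOUNT_DAX_ALWAYS;
    }
}

int main(void)
{
    struct fs_ctx ctx = { 0 };
    set_dax_mode(&ctx, MOUNT_DAX_ALWAYS);
    set_dax_mode(&ctx, MOUNT_DAX_NEVER);        /* overrides the earlier "always" */
    printf("mount_opt = %#x\n", ctx.mount_opt); /* 0x80 */
    return 0;
}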
erofs_fs_parameters[] = { fsparam_flag_no("user_xattr", Opt_user_xattr), fsparam_flag_no("acl", Opt_acl), fsparam_enum("cache_strategy", Opt_cache_strategy, erofs_param_cache_strategy), + fsparam_flag("dax", Opt_dax), + fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), {} }; +static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) +{ +#ifdef CONFIG_FS_DAX + struct erofs_fs_context *ctx = fc->fs_private; + + switch (mode) { + case EROFS_MOUNT_DAX_ALWAYS: + warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + set_opt(ctx, DAX_ALWAYS); + clear_opt(ctx, DAX_NEVER); + return true; + case EROFS_MOUNT_DAX_NEVER: + set_opt(ctx, DAX_NEVER); + clear_opt(ctx, DAX_ALWAYS); + return true; + default: + DBG_BUGON(1); + return false; + } +#else + errorfc(fc, "dax options not supported"); + return false; +#endif +} + static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { @@ -412,6 +448,14 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "compression not supported, cache_strategy ignored"); #endif break; + case Opt_dax: + if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS)) + return -EINVAL; + break; + case Opt_dax_enum: + if (!erofs_fc_set_dax_mode(fc, result.uint_32)) + return -EINVAL; + break; default: return -ENOPARAM; } @@ -430,7 +474,7 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask) DBG_BUGON(mapping->a_ops != &managed_cache_aops); if (PagePrivate(page)) - ret = erofs_try_to_free_cached_page(mapping, page); + ret = erofs_try_to_free_cached_page(page); return ret; } @@ -496,10 +540,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; sb->s_fs_info = sbi; + sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); err = erofs_read_superblock(sb); if (err) return err; + if (test_opt(ctx, DAX_ALWAYS) && + !bdev_dax_supported(sb->s_bdev, EROFS_BLKSIZ)) { + errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); + clear_opt(ctx, DAX_ALWAYS); + } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; @@ -609,6 +659,7 @@ static void erofs_kill_sb(struct super_block *sb) sbi = EROFS_SB(sb); if (!sbi) return; + fs_put_dax(sbi->dax_dev); kfree(sbi); sb->s_fs_info = NULL; } @@ -711,8 +762,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) static int erofs_show_options(struct seq_file *seq, struct dentry *root) { - struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); - struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx; + struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); + struct erofs_fs_context *ctx = &sbi->ctx; #ifdef CONFIG_EROFS_FS_XATTR if (test_opt(ctx, XATTR_USER)) @@ -734,6 +785,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND) seq_puts(seq, ",cache_strategy=readaround"); #endif + if (test_opt(ctx, DAX_ALWAYS)) + seq_puts(seq, ",dax=always"); + if (test_opt(ctx, DAX_NEVER)) + seq_puts(seq, ",dax=never"); return 0; } diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 8dd54b420a1d..778f2c52295d 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -673,12 +673,15 @@ ssize_t erofs_listxattr(struct dentry *dentry, } #ifdef CONFIG_EROFS_FS_POSIX_ACL -struct posix_acl *erofs_get_acl(struct inode *inode, int type) +struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) { struct posix_acl *acl; int prefix, rc; char *value = NULL; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 366dcb400525..94090c74b3f7 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -80,7 +80,7 @@ static inline int erofs_getxattr(struct inode *inode, int index, #endif /* !CONFIG_EROFS_FS_XATTR */ #ifdef CONFIG_EROFS_FS_POSIX_ACL -struct posix_acl *erofs_get_acl(struct inode *inode, int type); +struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu); #else #define erofs_get_acl (NULL) #endif diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index cb4d0889eca9..11c7a1aaebad 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -309,7 +309,6 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); - struct address_space *const mapping = MNGD_MAPPING(sbi); int i; /* @@ -326,7 +325,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, if (!trylock_page(page)) return -EBUSY; - if (page->mapping != mapping) + if (!erofs_page_is_managed(sbi, page)) continue; /* barrier is implied in the following 'unlock_page' */ @@ -337,8 +336,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct address_space *mapping, - struct page *page) +int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); int ret = 0; /* 0 - busy */ diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index f68aea4baed7..9fb98d85a3ce 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -212,9 +212,34 @@ static unsigned int decode_compactedbits(unsigned int lobits, return lo; } +static int get_compacted_la_distance(unsigned int lclusterbits, + unsigned int encodebits, + unsigned int vcnt, u8 *in, int i) +{ + const unsigned int lomask = (1 << lclusterbits) - 1; + unsigned int lo, d1 
= 0; + u8 type; + + DBG_BUGON(i >= vcnt); + + do { + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + + if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + return d1; + ++d1; + } while (++i < vcnt); + + /* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */ + if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT)) + d1 += lo - 1; + return d1; +} + static int unpack_compacted_index(struct z_erofs_maprecorder *m, unsigned int amortizedshift, - unsigned int eofs) + unsigned int eofs, bool lookahead) { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; @@ -243,6 +268,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; + + /* figure out lookahead_distance: delta[1] if needed */ + if (lookahead) + m->delta[1] = get_compacted_la_distance(lclusterbits, + encodebits, vcnt, in, i); if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { if (!big_pcluster) { DBG_BUGON(1); @@ -313,7 +343,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, } static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn) + unsigned long lcn, bool lookahead) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); @@ -364,11 +394,12 @@ out: err = z_erofs_reload_indexes(m, erofs_blknr(pos)); if (err) return err; - return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); + return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos), + lookahead); } static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned int lcn) + unsigned int lcn, bool lookahead) { const unsigned int datamode = EROFS_I(m->inode)->datalayout; @@ -376,7 +407,7 @@ static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, return legacy_load_cluster_from_disk(m, lcn); if (datamode == EROFS_INODE_FLAT_COMPRESSION) - return compacted_load_cluster_from_disk(m, lcn); + return compacted_load_cluster_from_disk(m, lcn, lookahead); return -EINVAL; } @@ -399,7 +430,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, /* load extent head logical cluster if needed */ lcn -= lookback_distance; - err = z_erofs_load_cluster_from_disk(m, lcn); + err = z_erofs_load_cluster_from_disk(m, lcn, false); if (err) return err; @@ -450,7 +481,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, if (m->compressedlcs) goto out; - err = z_erofs_load_cluster_from_disk(m, lcn); + err = z_erofs_load_cluster_from_disk(m, lcn, false); if (err) return err; @@ -498,6 +529,48 @@ err_bonus_cblkcnt: return -EFSCORRUPTED; } +static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) +{ + struct inode *inode = m->inode; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_map_blocks *map = m->map; + unsigned int lclusterbits = vi->z_logical_clusterbits; + u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; + int err; + + do { + /* handle the last EOF pcluster (no next HEAD lcluster) */ + if ((lcn << lclusterbits) >= inode->i_size) { + map->m_llen = inode->i_size - map->m_la; + return 0; + } + + err = z_erofs_load_cluster_from_disk(m, lcn, true); + if (err) + return err; + + if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + DBG_BUGON(!m->delta[1] && + m->clusterofs != 1 << lclusterbits); + } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) { + /* go on until the next HEAD 
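get_compacted_la_distance() computes the lookahead delta[1] for fiemap by scanning forward through the packed lcluster records: each consecutive NONHEAD record adds one to the distance, and a trailing non-CBLKCNT low value supplies the remainder. The control flow is easier to see without the bit unpacking; a simplified sketch over a plain type array (all data hypothetical):

#include <stdio.h>

enum { HEAD, NONHEAD };

/* distance, in lclusters, from slot i to the next HEAD record */
static int la_distance(const int *types, int n, int i)
{
    int d1 = 0;
    for (; i < n; i++) {
        if (types[i] != NONHEAD)
            return d1;
        d1++;
    }
    return d1; /* ran off the block: the caller then consults the last lo value */
}

int main(void)
{
    int types[] = { HEAD, NONHEAD, NONHEAD, HEAD };
    printf("distance from slot 1: %d\n", la_distance(types, 4, 1)); /* 2 */
    return 0;
}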
lcluster */ + if (lcn != headlcn) + break; + m->delta[1] = 1; + } else { + erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + lcn += m->delta[1]; + } while (m->delta[1]); + + map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la; + return 0; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) @@ -531,7 +604,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, initial_lcn); + err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false); if (err) goto unmap_out; @@ -581,6 +654,12 @@ int z_erofs_map_blocks_iter(struct inode *inode, err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto out; + + if (flags & EROFS_GET_BLOCKS_FIEMAP) { + err = z_erofs_get_extent_decompressedlen(&m); + if (!err) + map->m_flags |= EROFS_MAP_FULL_MAPPED; + } unmap_out: if (m.kaddr) kunmap_atomic(m.kaddr); @@ -596,3 +675,41 @@ out: DBG_BUGON(err < 0 && err != -ENOMEM); return err; } + +static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + struct erofs_map_blocks map = { .m_la = offset }; + + ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP); + if (map.mpage) + put_page(map.mpage); + if (ret < 0) + return ret; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = map.m_la; + iomap->length = map.m_llen; + if (map.m_flags & EROFS_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->addr = map.m_pa; + } else { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + /* + * No strict rule on how to describe extents for post EOF, yet + * we need to do it like below. Otherwise, iomap itself will get + * into an endless loop on post EOF. + */ + if (iomap->offset >= inode->i_size) + iomap->length = length + map.m_la - offset; + } + iomap->flags = 0; + return 0; +} + +const struct iomap_ops z_erofs_iomap_report_ops = { + .iomap_begin = z_erofs_iomap_begin_report, +}; diff --git a/fs/eventfd.c b/fs/eventfd.c index e265b6dd4f34..3627dd7d25db 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -25,8 +25,6 @@ #include <linux/idr.h> #include <linux/uio.h> -DEFINE_PER_CPU(int, eventfd_wake_count); - static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { @@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) * Deadlock or stack overflow issues can happen if we recurse here * through waitqueue wakeup handlers. If the caller uses potentially * nested waitqueues with custom wakeup handlers, then it should - * check eventfd_signal_count() before calling this function. If - * it returns true, the eventfd_signal() call should be deferred to a + * check eventfd_signal_allowed() before calling this function. If + * it returns false, the eventfd_signal() call should be deferred to a + safe context. 
*/ - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) + if (WARN_ON_ONCE(current->in_eventfd_signal)) return 0; spin_lock_irqsave(&ctx->wqh.lock, flags); - this_cpu_inc(eventfd_wake_count); + current->in_eventfd_signal = 1; if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); - this_cpu_dec(eventfd_wake_count); + current->in_eventfd_signal = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; diff --git a/fs/exec.c b/fs/exec.c index 38f63451b928..3b78b22addfb 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2070,10 +2070,8 @@ SYSCALL_DEFINE5(execveat, const char __user *const __user *, envp, int, flags) { - int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; - return do_execveat(fd, - getname_flags(filename, lookup_flags, NULL), + getname_uflags(filename, flags), argv, envp, flags); } @@ -2091,10 +2089,8 @@ COMPAT_SYSCALL_DEFINE5(execveat, int, fd, const compat_uptr_t __user *, envp, int, flags) { - int lookup_flags = (flags & AT_EMPTY_PATH) ? LOOKUP_EMPTY : 0; - return compat_do_execveat(fd, - getname_flags(filename, lookup_flags, NULL), + getname_uflags(filename, flags), argv, envp, flags); } #endif diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 54eec9185627..1248ff4ef562 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config EXT2_FS tristate "Second extended fs support" + select FS_IOMAP help Ext2 is a standard Linux file system for hard disks. diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index b9a9db98e94b..bf298967c5b8 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -141,13 +141,16 @@ fail: * inode->i_mutex: don't care */ struct posix_acl * -ext2_get_acl(struct inode *inode, int type) +ext2_get_acl(struct inode *inode, int type, bool rcu) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 917db5f6630a..925ab6287d35 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); +extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu); extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index e512630cb63e..3be9dd6412b7 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -667,9 +667,6 @@ struct ext2_inode_info { struct rw_semaphore xattr_sem; #endif rwlock_t i_meta_lock; -#ifdef CONFIG_FS_DAX - struct rw_semaphore dax_sem; -#endif /* * truncate_mutex is for serialising ext2_truncate() against @@ -685,14 +682,6 @@ struct ext2_inode_info { #endif }; -#ifdef CONFIG_FS_DAX -#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem) -#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem) -#else -#define dax_sem_down_write(ext2_inode) -#define dax_sem_up_write(ext2_inode) -#endif - /* * Inode dynamic state flags */ diff --git a/fs/ext2/file.c b/fs/ext2/file.c index f98466acc672..eb97aa3d700e 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -81,7 +81,7 @@ out_unlock: * * mmap_lock (MM) * sb_start_pagefault (vfs, freeze) - * 
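The eventfd change swaps a per-CPU recursion counter for a per-task flag: current->in_eventfd_signal is set around the wakeup, so a nested eventfd_signal() issued from a wakeup handler is detected on the right task even across preemption or migration. A userspace analogue using a thread-local flag (all names hypothetical):

#include <stdio.h>

static _Thread_local int in_signal;

static int signal_event(void)
{
    if (in_signal)   /* nested call from a "wakeup handler": refuse */
        return 0;
    in_signal = 1;
    /* ... would wake waiters here, possibly re-entering signal_event() ... */
    in_signal = 0;
    return 1;
}

int main(void)
{
    printf("outer call delivered: %d\n", signal_event());
    in_signal = 1; /* simulate being inside a wakeup handler */
    printf("nested call delivered: %d\n", signal_event());
    return 0;
}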
ext2_inode_info->dax_sem + * address_space->invalidate_lock * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX) * ext2_inode_info->truncate_mutex * @@ -91,7 +91,6 @@ out_unlock: static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); vm_fault_t ret; bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); @@ -100,11 +99,11 @@ static vm_fault_t ext2_dax_fault(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); } - down_read(&ei->dax_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, NULL, &ext2_iomap_ops); - up_read(&ei->dax_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index dadb121beb22..333fa62661d5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -799,7 +799,6 @@ int ext2_get_block(struct inode *inode, sector_t iblock, } -#ifdef CONFIG_FS_DAX static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { @@ -852,16 +851,18 @@ const struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; -#else -/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ -const struct iomap_ops ext2_iomap_ops; -#endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, len, - ext2_get_block); + int ret; + + inode_lock(inode); + len = min_t(u64, len, i_size_read(inode)); + ret = iomap_fiemap(inode, fieinfo, start, len, &ext2_iomap_ops); + inode_unlock(inode); + + return ret; } static int ext2_writepage(struct page *page, struct writeback_control *wbc) @@ -1177,7 +1178,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de ext2_free_data(inode, p, q); } -/* dax_sem must be held when calling this function */ +/* mapping->invalidate_lock must be held when calling this function */ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) { __le32 *i_data = EXT2_I(inode)->i_data; @@ -1194,7 +1195,7 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset) iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb); #ifdef CONFIG_FS_DAX - WARN_ON(!rwsem_is_locked(&ei->dax_sem)); + WARN_ON(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)); #endif n = ext2_block_to_path(inode, iblock, offsets, NULL); @@ -1276,9 +1277,9 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset) if (ext2_inode_is_fast_symlink(inode)) return; - dax_sem_down_write(EXT2_I(inode)); + filemap_invalidate_lock(inode->i_mapping); __ext2_truncate_blocks(inode, offset); - dax_sem_up_write(EXT2_I(inode)); + filemap_invalidate_unlock(inode->i_mapping); } static int ext2_setsize(struct inode *inode, loff_t newsize) @@ -1308,10 +1309,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) if (error) return error; - dax_sem_down_write(EXT2_I(inode)); + filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, newsize); __ext2_truncate_blocks(inode, newsize); - dax_sem_up_write(EXT2_I(inode)); + filemap_invalidate_unlock(inode->i_mapping); inode->i_mtime = inode->i_ctime = current_time(inode); if (inode_needs_sync(inode)) { diff --git a/fs/ext2/super.c 
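ext2's private dax_sem is replaced by the generic mapping->invalidate_lock with classic rwsem usage: page faults take it shared, truncate and hole punch take it exclusive, so a fault can never instantiate a page inside a range that is being removed. The same shape in portable C with a pthread rwlock (illustrative only, not the kernel primitives):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t invalidate_lock = PTHREAD_RWLOCK_INITIALIZER;

static void fault_path(void)
{
    pthread_rwlock_rdlock(&invalidate_lock); /* shared: many faults in parallel */
    puts("fault: mapping blocks is safe here");
    pthread_rwlock_unlock(&invalidate_lock);
}

static void truncate_path(void)
{
    pthread_rwlock_wrlock(&invalidate_lock); /* exclusive: no concurrent faults */
    puts("truncate: removing blocks with faults excluded");
    pthread_rwlock_unlock(&invalidate_lock);
}

int main(void)
{
    fault_path();
    truncate_path();
    return 0;
}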
b/fs/ext2/super.c index 21e09fbaa46f..987bcf32ed46 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -206,9 +206,6 @@ static void init_once(void *foo) init_rwsem(&ei->xattr_sem); #endif mutex_init(&ei->truncate_mutex); -#ifdef CONFIG_FS_DAX - init_rwsem(&ei->dax_sem); -#endif inode_init_once(&ei->vfs_inode); } diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 49e7af6cc93f..7d89142e1421 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -10,7 +10,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ - xattr_user.o fast_commit.o + xattr_user.o fast_commit.o orphan.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index c5eaffccecc3..0613dfcbfd4a 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -142,13 +142,16 @@ fail: * inode->i_mutex: don't care */ struct posix_acl * -ext4_get_acl(struct inode *inode, int type) +ext4_get_acl(struct inode *inode, int type, bool rcu) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 84b8942a57f2..3219669732bf 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -55,7 +55,7 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -struct posix_acl *ext4_get_acl(struct inode *inode, int type); +struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9dc6e74b265c..a0fb0c4bdc7c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -652,8 +652,14 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * possible we just missed a transaction commit that did so */ smp_mb(); - if (sbi->s_mb_free_pending == 0) + if (sbi->s_mb_free_pending == 0) { + if (test_opt(sb, DISCARD)) { + atomic_inc(&sbi->s_retry_alloc_pending); + flush_work(&sbi->s_discard_work); + atomic_dec(&sbi->s_retry_alloc_pending); + } return ext4_has_free_clusters(sbi, 1, 0); + } /* * it's possible we've just missed a transaction commit here, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c51e243450d..90ff5acaf11f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1034,7 +1034,14 @@ struct ext4_inode_info { */ struct rw_semaphore xattr_sem; - struct list_head i_orphan; /* unlinked but open inodes */ + /* + * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise + * i_orphan is used. + */ + union { + struct list_head i_orphan; /* unlinked but open inodes */ + unsigned int i_orphan_idx; /* Index in orphan file */ + }; /* Fast commit related info */ @@ -1086,15 +1093,6 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; - /* - * i_mmap_sem is for serializing page faults with truncate / punch hole - * operations. We have to make sure that new page cannot be faulted in - * a section of the inode that is being punched. 
We cannot easily use - * i_data_sem for this since we need protection for the whole punch - * operation and i_data_sem ranks below transaction start so we have - * to occasionally drop it. - */ - struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -1428,7 +1426,8 @@ struct ext4_super_block { __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ - __le32 s_reserved[95]; /* Padding to the end of the block */ + __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ + __le32 s_reserved[94]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1447,6 +1446,54 @@ struct ext4_super_block { #define EXT4_ENC_UTF8_12_1 1 +/* Types of ext4 journal triggers */ +enum ext4_journal_trigger_type { + EXT4_JTR_ORPHAN_FILE, + EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ +}; + +#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE + +struct ext4_journal_trigger { + struct jbd2_buffer_trigger_type tr_triggers; + struct super_block *sb; +}; + +static inline struct ext4_journal_trigger *EXT4_TRIGGER( + struct jbd2_buffer_trigger_type *trigger) +{ + return container_of(trigger, struct ext4_journal_trigger, tr_triggers); +} + +#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 + +/* Structure at the tail of orphan block */ +struct ext4_orphan_block_tail { + __le32 ob_magic; + __le32 ob_checksum; +}; + +static inline int ext4_inodes_per_orphan_block(struct super_block *sb) +{ + return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / + sizeof(u32); +} + +struct ext4_orphan_block { + atomic_t ob_free_entries; /* Number of free orphan entries in block */ + struct buffer_head *ob_bh; /* Buffer for orphan block */ +}; + +/* + * Info about orphan file. + */ +struct ext4_orphan_info { + int of_blocks; /* Number of orphan blocks in a file */ + __u32 of_csum_seed; /* Checksum seed for orphan file */ + struct ext4_orphan_block *of_binfo; /* Array with info about orphan + * file blocks */ +}; + /* * fourth extended-fs super-block data in memory */ @@ -1501,9 +1548,11 @@ struct ext4_sb_info { /* Journaling */ struct journal_s *s_journal; - struct list_head s_orphan; - struct mutex s_orphan_lock; unsigned long s_ext4_flags; /* Ext4 superblock flags */ + struct mutex s_orphan_lock; /* Protects on disk list changes */ + struct list_head s_orphan; /* List of orphaned inodes in on disk + list */ + struct ext4_orphan_info s_orphan_info; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; @@ -1536,6 +1585,9 @@ struct ext4_sb_info { unsigned int s_mb_free_pending; struct list_head s_freed_data_list; /* List of blocks to be freed after commit completed */ + struct list_head s_discard_list; + struct work_struct s_discard_work; + atomic_t s_retry_alloc_pending; struct rb_root s_mb_avg_fragment_size_root; rwlock_t s_mb_rb_lock; struct list_head *s_mb_largest_free_orders; @@ -1625,6 +1677,9 @@ struct ext4_sb_info { struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; + /* Journal triggers for checksum computation */ + struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; + /* Ratelimit ext4 messages. 
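The orphan-file layout introduced above is simple arithmetic: each block stores 32-bit inode numbers and ends with an 8-byte tail carrying the 0x0b10ca04 magic and a checksum, so a block holds (blocksize - 8) / 4 entries. A quick sketch of that capacity calculation (the 4 KiB block size is just a sample value):

#include <stdint.h>
#include <stdio.h>

struct orphan_block_tail {
    uint32_t ob_magic;    /* 0x0b10ca04 in the format above */
    uint32_t ob_checksum;
};

static unsigned inodes_per_orphan_block(unsigned blocksize)
{
    return (blocksize - sizeof(struct orphan_block_tail)) / sizeof(uint32_t);
}

int main(void)
{
    printf("4KiB block: %u orphan entries\n", inodes_per_orphan_block(4096)); /* 1022 */
    return 0;
}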
*/ struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; @@ -1835,6 +1890,7 @@ enum { EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1936,6 +1992,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode) */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 +#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 @@ -1956,6 +2013,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 +#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be + non-empty */ #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -2039,6 +2098,7 @@ EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) +EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) @@ -2053,6 +2113,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) +EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) @@ -2086,7 +2147,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) -#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ + EXT4_FEATURE_COMPAT_ORPHAN_FILE) #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ @@ -2111,7 +2173,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ EXT4_FEATURE_RO_COMPAT_PROJECT |\ - EXT4_FEATURE_RO_COMPAT_VERITY) + EXT4_FEATURE_RO_COMPAT_VERITY |\ + EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -2147,6 +2210,8 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); } +extern int ext4_feature_set_ok(struct super_block *sb, int readonly); + /* * Superblock flags */ @@ -2159,7 +2224,6 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); } - /* * Default values for user and/or group using reserved blocks */ @@ -2920,13 +2984,14 @@ int ext4_get_block(struct inode *inode, sector_t iblock, int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, + struct inode *inode, struct buffer_head 
*head, unsigned from, unsigned to, int *partial, - int (*fn)(handle_t *handle, + int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)); -int do_journal_get_write_access(handle_t *handle, +int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -2972,7 +3037,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); -extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); @@ -3006,8 +3070,6 @@ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode); extern int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh); -extern int ext4_orphan_add(handle_t *, struct inode *); -extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); extern int ext4_search_dir(struct buffer_head *bh, @@ -3476,6 +3538,7 @@ static inline bool ext4_is_quota_journalled(struct super_block *sb) return (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); } +int ext4_enable_quotas(struct super_block *sb); #endif /* @@ -3737,6 +3800,19 @@ extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations ext4_verityops; +/* orphan.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_release_orphan_info(struct super_block *sb); +extern int ext4_init_orphan_info(struct super_block *sb); +extern int ext4_orphan_file_empty(struct super_block *sb); +extern void ext4_orphan_file_block_trigger( + struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size); + /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 44e59881a1f0..26435f3a3094 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -173,10 +173,11 @@ struct partial_cluster { #define EXT_MAX_EXTENT(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ - : 0) + : NULL) #define EXT_MAX_INDEX(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? 
\ - ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0) + ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ + : NULL) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b60f0152ea57..6def7339056d 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -218,9 +218,11 @@ static void ext4_check_bdev_write_error(struct super_block *sb) } int __ext4_journal_get_write_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh) + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type) { - int err = 0; + int err; might_sleep(); @@ -229,11 +231,18 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); - if (err) + if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + return err; + } } - return err; + if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + return 0; + BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); + jbd2_journal_set_triggers(bh, + &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); + return 0; } /* @@ -301,17 +310,27 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, } int __ext4_journal_get_create_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh) + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type) { - int err = 0; + int err; - if (ext4_handle_valid(handle)) { - err = jbd2_journal_get_create_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); + if (!ext4_handle_valid(handle)) + return 0; + + err = jbd2_journal_get_create_access(handle, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, bh, handle, + err); + return err; } - return err; + if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + return 0; + BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); + jbd2_journal_set_triggers(bh, + &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); + return 0; } int __ext4_handle_dirty_metadata(const char *where, unsigned int line, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0d2fa423b7ad..0e4fa644df01 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -231,26 +231,32 @@ int ext4_expand_extra_isize(struct inode *inode, * Wrapper functions with which ext4 calls into JBD. 
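The journal-trigger plumbing added to __ext4_journal_get_write_access() and __ext4_journal_get_create_access() is a small dispatch table: triggers are indexed by an enum whose last entry (EXT4_JTR_NONE) sizes the table, and a trigger is attached to the buffer only when one is requested and metadata checksums are enabled. The table-indexing pattern, reduced to standalone C (the handler and its behavior are hypothetical; the kernel attaches the trigger to the buffer rather than calling it directly):

#include <stdio.h>

enum trigger_type { JTR_ORPHAN_FILE, JTR_NONE /* must stay last: it sizes the table */ };

typedef void (*trigger_fn)(void *buf, unsigned long size);

static void orphan_csum_trigger(void *buf, unsigned long size)
{
    (void)buf;
    printf("recompute orphan block checksum over %lu bytes\n", size);
}

static const trigger_fn triggers[JTR_NONE] = {
    [JTR_ORPHAN_FILE] = orphan_csum_trigger,
};

static void get_write_access(enum trigger_type t, int csum_enabled,
                             void *buf, unsigned long size)
{
    if (t == JTR_NONE || !csum_enabled)
        return;          /* nothing to hook up */
    triggers[t](buf, size); /* stand-in for jbd2_journal_set_triggers() */
}

int main(void)
{
    char block[4096];
    get_write_access(JTR_ORPHAN_FILE, 1, block, sizeof(block));
    get_write_access(JTR_NONE, 1, block, sizeof(block)); /* silently skipped */
    return 0;
}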
*/ int __ext4_journal_get_write_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type); int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type); int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh); -#define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) +#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \ + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \ + (bh), (trigger_type)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ (bh), (block_nr)) -#define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) +#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \ + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \ + (bh), (trigger_type)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 92ad64b89d9b..c0de30f25185 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -139,7 +139,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, if (path->p_bh) { /* path points to block */ BUFFER_TRACE(path->p_bh, "get_write_access"); - return ext4_journal_get_write_access(handle, path->p_bh); + return ext4_journal_get_write_access(handle, inode->i_sb, + path->p_bh, EXT4_JTR_NONE); } /* path points to leaf/index in inode body */ /* we use in-core data, no need to protect them */ @@ -1082,7 +1083,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } lock_buffer(bh); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) goto cleanup; @@ -1160,7 +1162,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } lock_buffer(bh); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) goto cleanup; @@ -1286,7 +1289,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, return -ENOMEM; lock_buffer(bh); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto out; @@ -3569,7 +3573,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) - goto out; + goto fallback; split_map.m_len = allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < @@ -3583,7 +3587,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex2); if (err) - goto out; + goto fallback; } split_map.m_len += split_map.m_lblk - ee_block; @@ 
-3592,6 +3596,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } } +fallback: err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (err > 0) @@ -4474,6 +4479,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; unsigned int max_blocks; loff_t new_size = 0; @@ -4560,17 +4566,17 @@ static long ext4_zero_range(struct file *file, loff_t offset, * Prevent page faults from reinstantiating pages we have * released from page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); goto out_mutex; } ret = ext4_update_disksize_before_punch(inode, offset, len); if (ret) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); goto out_mutex; } /* Now release the pages and zero block aligned part of pages */ @@ -4579,7 +4585,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) goto out_mutex; } @@ -5221,6 +5227,7 @@ out: static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; @@ -5274,7 +5281,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * Prevent page faults from reinstantiating pages we have released from * page cache. */ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -5289,15 +5296,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) * Write tail of the last page before removed range since it will get * removed from the page cache below. */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + ret = filemap_write_and_wait_range(mapping, ioffset, offset); if (ret) goto out_mmap; /* * Write data that will be shifted to preserve them when discarding * page cache below. We are also protected from pages becoming dirty - * by i_mmap_sem. + * by i_rwsem and invalidate_lock. */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + ret = filemap_write_and_wait_range(mapping, offset + len, LLONG_MAX); if (ret) goto out_mmap; @@ -5350,7 +5357,7 @@ out_stop: ext4_journal_stop(handle); ext4_fc_stop_ineligible(sb); out_mmap: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; @@ -5367,6 +5374,7 @@ out_mutex: static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; @@ -5425,7 +5433,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) * Prevent page faults from reinstantiating pages we have released from * page cache. 
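 *
 * Editorial note: the down_write(&EXT4_I(inode)->i_mmap_sem) calls in
 * these hunks are uniformly replaced by the generic invalidate_lock
 * kept in struct address_space. A sketch of the common shape, assuming
 * mapping == inode->i_mapping as in the surrounding code:
 *
 *	filemap_invalidate_lock(mapping);
 *	ret = ext4_break_layouts(inode);
 *	if (ret)
 *		goto out_mmap;
 *	...
 * out_mmap:
 *	filemap_invalidate_unlock(mapping);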
*/ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -5526,7 +5534,7 @@ out_stop: ext4_journal_stop(handle); ext4_fc_stop_ineligible(sb); out_mmap: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index e8195229c252..8e610a381862 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -775,28 +775,27 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, } /* Same as above, but adds dentry tlv. */ -static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag, - int parent_ino, int ino, int dlen, - const unsigned char *dname, - u32 *crc) +static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, + struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; + int dlen = fc_dentry->fcd_name.len; u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, crc); if (!dst) return false; - fcd.fc_parent_ino = cpu_to_le32(parent_ino); - fcd.fc_ino = cpu_to_le32(ino); - tl.fc_tag = cpu_to_le16(tag); + fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); + fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); + tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); dst += sizeof(tl); ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); dst += sizeof(fcd); - ext4_fc_memcpy(sb, dst, dname, dlen, crc); + ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); dst += dlen; return true; @@ -992,11 +991,7 @@ __releases(&sbi->s_fc_lock) &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv( - sb, fc_dentry->fcd_op, - fc_dentry->fcd_parent, fc_dentry->fcd_ino, - fc_dentry->fcd_name.len, - fc_dentry->fcd_name.name, crc)) { + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } @@ -1035,11 +1030,7 @@ __releases(&sbi->s_fc_lock) if (ret) goto lock_and_exit; - if (!ext4_fc_add_dentry_tlv( - sb, fc_dentry->fcd_op, - fc_dentry->fcd_parent, fc_dentry->fcd_ino, - fc_dentry->fcd_name.len, - fc_dentry->fcd_name.name, crc)) { + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 816dedcbd541..ac0e11bbb445 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + struct address_space *mapping = vmf->vma->vm_file->f_mapping; pfn_t pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops); if (write) { @@ -731,10 +732,10 @@ retry: /* Handling synchronous page fault? 
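 *
 * Editorial note: the fault side takes the same invalidate_lock in
 * shared mode, replacing down_read(&ei->i_mmap_sem); roughly:
 *
 *	filemap_invalidate_lock_shared(mapping);
 *	result = dax_iomap_fault(vmf, pe_size, &pfn, &error,
 *				 &ext4_iomap_ops);
 *	...
 *	filemap_invalidate_unlock_shared(mapping);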
*/ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, pe_size, pfn); - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); } return result; @@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = { #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = ext4_filemap_fault, + .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; @@ -822,7 +823,8 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (IS_ERR(handle)) goto out; BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto out_journal; lock_buffer(sbi->s_sbh); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e89fc0f770b0..f73e5eb43eae 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -300,7 +300,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } BUFFER_TRACE(bitmap_bh, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, bitmap_bh); + fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (fatal) goto error_return; @@ -308,7 +309,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) gdp = ext4_get_group_desc(sb, block_group, &bh2); if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, bh2); + fatal = ext4_journal_get_write_access(handle, sb, bh2, + EXT4_JTR_NONE); } ext4_lock_group(sb, block_group); cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); @@ -1085,7 +1087,8 @@ repeat_in_this_group: } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, + EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; @@ -1127,7 +1130,8 @@ got: } BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); + err = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; @@ -1144,7 +1148,8 @@ got: goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); - err = ext4_journal_get_write_access(handle, block_bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, + EXT4_JTR_NONE); if (err) { brelse(block_bitmap_bh); ext4_std_error(sb, err); @@ -1583,8 +1588,8 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, num = sbi->s_itb_per_group - used_blks; BUFFER_TRACE(group_desc_bh, "get_write_access"); - ret = ext4_journal_get_write_access(handle, - group_desc_bh); + ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); if (ret) goto err_out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a7bc6ad656a9..89efa78ed4b2 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -354,7 +354,8 @@ static int ext4_alloc_branch(handle_t *handle, } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, ar->inode->i_sb, + bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto failed; @@ -429,7 +430,8 @@ static int ext4_splice_branch(handle_t *handle, */ if (where->bh) { BUFFER_TRACE(where->bh, 
"get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); + err = ext4_journal_get_write_access(handle, ar->inode->i_sb, + where->bh, EXT4_JTR_NONE); if (err) goto err_out; } @@ -728,7 +730,8 @@ static int ext4_ind_truncate_ensure_credits(handle_t *handle, return ret; if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ret = ext4_journal_get_write_access(handle, bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (unlikely(ret)) return ret; } @@ -916,7 +919,8 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, this_bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, + this_bh, EXT4_JTR_NONE); /* Important: if we can't update the indirect pointers * to the blocks, we can't free them. */ if (err) @@ -1079,7 +1083,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, */ BUFFER_TRACE(parent_bh, "get_write_access"); if (!ext4_journal_get_write_access(handle, - parent_bh)){ + inode->i_sb, parent_bh, + EXT4_JTR_NONE)) { *p = 0; BUFFER_TRACE(parent_bh, "call ext4_handle_dirty_metadata"); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 70cb64db33f7..82bf4ff6be28 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -264,7 +264,8 @@ static int ext4_create_inline_data(handle_t *handle, return error; BUFFER_TRACE(is.iloc.bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); if (error) goto out; @@ -350,7 +351,8 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); if (error) goto out; @@ -427,7 +429,8 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); if (error) goto out; @@ -593,7 +596,7 @@ retry: ret = __block_write_begin(page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), + ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), from, to, NULL, do_journal_get_write_access); } @@ -682,7 +685,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, goto convert; } - ret = ext4_journal_get_write_access(handle, iloc.bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, + EXT4_JTR_NONE); if (ret) goto out; @@ -750,6 +754,12 @@ int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); + /* + * ei->i_inline_off may have changed since ext4_write_begin() + * called ext4_try_to_write_inline_data() + */ + (void) ext4_find_inline_data_nolock(inode); + kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, pos, len); kunmap_atomic(kaddr); @@ -923,7 +933,8 @@ retry_journal: if (ret < 0) goto out_release_page; } - ret = ext4_journal_get_write_access(handle, iloc.bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, + EXT4_JTR_NONE); if (ret) goto 
out_release_page; @@ -1028,7 +1039,8 @@ static int ext4_add_dirent_to_inline(handle_t *handle, return err; BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh, + EXT4_JTR_NONE); if (err) return err; ext4_insert_dentry(dir, inode, de, inline_size, fname); @@ -1223,7 +1235,8 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, } lock_buffer(data_bh); - error = ext4_journal_get_create_access(handle, data_bh); + error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh, + EXT4_JTR_NONE); if (error) { unlock_buffer(data_bh); error = -EIO; @@ -1707,7 +1720,8 @@ int ext4_delete_inline_entry(handle_t *handle, } BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (err) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d8de607849df..d18852d6029c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -139,7 +139,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, static void ext4_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -869,7 +868,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; @@ -954,12 +954,12 @@ out_brelse: return err; } -int ext4_walk_page_buffers(handle_t *handle, +int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, - int (*fn)(handle_t *handle, + int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; @@ -978,7 +978,7 @@ int ext4_walk_page_buffers(handle_t *handle, *partial = 1; continue; } - err = (*fn)(handle, bh); + err = (*fn)(handle, inode, bh); if (!ret) ret = err; } @@ -1009,7 +1009,7 @@ int ext4_walk_page_buffers(handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. 
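 *
 * Editorial note: ext4_walk_page_buffers() and every callback it takes
 * gain a struct inode argument in this series, purely so callbacks
 * such as the one below can reach inode->i_sb for the new
 * journal-access convention; the callback signature becomes:
 *
 *	int (*fn)(handle_t *handle, struct inode *inode,
 *		  struct buffer_head *bh);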
*/ -int do_journal_get_write_access(handle_t *handle, +int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int dirty = buffer_dirty(bh); @@ -1028,7 +1028,8 @@ int do_journal_get_write_access(handle_t *handle, if (dirty) clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); - ret = ext4_journal_get_write_access(handle, bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (!ret && dirty) ret = ext4_handle_dirty_metadata(handle, NULL, bh); return ret; @@ -1208,8 +1209,8 @@ retry_journal: ret = __block_write_begin(page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), - from, to, NULL, + ret = ext4_walk_page_buffers(handle, inode, + page_buffers(page), from, to, NULL, do_journal_get_write_access); } @@ -1253,7 +1254,8 @@ retry_journal: } /* For write_end() in data=journal mode */ -static int write_end_fn(handle_t *handle, struct buffer_head *bh) +static int write_end_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) @@ -1352,6 +1354,7 @@ errout: * to call ext4_handle_dirty_metadata() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, struct page *page, unsigned from, unsigned to) { @@ -1370,7 +1373,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, size = min(to, block_end) - start; zero_user(page, start, size); - write_end_fn(handle, bh); + write_end_fn(handle, inode, bh); } clear_buffer_new(bh); } @@ -1412,13 +1415,13 @@ static int ext4_journalled_write_end(struct file *file, copied = ret; } else if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; - ext4_journalled_zero_new_buffers(handle, page, from, to); + ext4_journalled_zero_new_buffers(handle, inode, page, from, to); } else { if (unlikely(copied < len)) - ext4_journalled_zero_new_buffers(handle, page, + ext4_journalled_zero_new_buffers(handle, inode, page, from + copied, to); - ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - from + copied, &partial, + ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), + from, from + copied, &partial, write_end_fn); if (!partial) SetPageUptodate(page); @@ -1619,7 +1622,8 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } @@ -1851,13 +1855,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int bget_one(handle_t *handle, struct buffer_head *bh) +static int bget_one(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { get_bh(bh); return 0; } -static int bput_one(handle_t *handle, struct buffer_head *bh) +static int bput_one(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { put_bh(bh); return 0; @@ -1888,7 +1894,7 @@ static int __ext4_journalled_writepage(struct page *page, BUG(); goto out; } - ext4_walk_page_buffers(handle, page_bufs, 0, len, + ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, bget_one); } /* @@ -1920,11 +1926,11 @@ static int __ext4_journalled_writepage(struct page *page, if (inline_data) { ret = ext4_mark_inode_dirty(handle, inode); } else { - ret = ext4_walk_page_buffers(handle, 
page_bufs, 0, len, NULL, - do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, do_journal_get_write_access); - err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); + err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, write_end_fn); } if (ret == 0) ret = err; @@ -1941,7 +1947,7 @@ out: unlock_page(page); out_no_pagelock: if (!inline_data && page_bufs) - ext4_walk_page_buffers(NULL, page_bufs, 0, len, + ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, bput_one); brelse(inode_bh); return ret; @@ -2031,7 +2037,7 @@ static int ext4_writepage(struct page *page, * for the extremely common case, this is an optimization that * skips a useless round trip through ext4_bio_write_page(). */ - if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); if ((current->flags & PF_MEMALLOC) || @@ -3794,7 +3800,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) goto unlock; } @@ -3950,20 +3957,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } -static void ext4_wait_dax_page(struct ext4_inode_info *ei) +static void ext4_wait_dax_page(struct inode *inode) { - up_write(&ei->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); schedule(); - down_write(&ei->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { - struct ext4_inode_info *ei = EXT4_I(inode); struct page *page; int error; - if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem))) + if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; do { @@ -3974,7 +3980,7 @@ int ext4_break_layouts(struct inode *inode) error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(ei)); + ext4_wait_dax_page(inode)); } while (error == 0); return error; @@ -4005,9 +4011,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); if (ext4_has_inline_data(inode)) { - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_convert_inline_data(inode); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) return ret; } @@ -4058,7 +4064,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) * Prevent page faults from reinstantiating pages we have released from * page cache. 
*/ - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) @@ -4131,7 +4137,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) out_stop: ext4_journal_stop(handle); out_dio: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; @@ -4330,101 +4336,93 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; - if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) - goto simulate_eio; - if (!buffer_uptodate(bh)) { - lock_buffer(bh); + if (ext4_buffer_uptodate(bh)) + goto has_buffer; - if (ext4_buffer_uptodate(bh)) { - /* someone brought it uptodate while we waited */ - unlock_buffer(bh); - goto has_buffer; - } - - /* - * If we have all information of the inode in memory and this - * is the only valid inode in the block, we need not read the - * block. - */ - if (in_mem) { - struct buffer_head *bitmap_bh; - int i, start; + lock_buffer(bh); + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + int i, start; - start = inode_offset & ~(inodes_per_block - 1); + start = inode_offset & ~(inodes_per_block - 1); - /* Is the inode bitmap in cache? */ - bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); - if (unlikely(!bitmap_bh)) - goto make_io; + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); + if (unlikely(!bitmap_bh)) + goto make_io; - /* - * If the inode bitmap isn't in cache then the - * optimisation may end up performing two reads instead - * of one, so skip it. - */ - if (!buffer_uptodate(bitmap_bh)) { - brelse(bitmap_bh); - goto make_io; - } - for (i = start; i < start + inodes_per_block; i++) { - if (i == inode_offset) - continue; - if (ext4_test_bit(i, bitmap_bh->b_data)) - break; - } + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); - if (i == start + inodes_per_block) { - /* all other inodes are free, so skip I/O */ - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - unlock_buffer(bh); - goto has_buffer; - } + goto make_io; + } + for (i = start; i < start + inodes_per_block; i++) { + if (i == inode_offset) + continue; + if (ext4_test_bit(i, bitmap_bh->b_data)) + break; } + brelse(bitmap_bh); + if (i == start + inodes_per_block) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } make_io: - /* - * If we need to do any I/O, try to pre-readahead extra - * blocks from the inode table. 
- */ - blk_start_plug(&plug); - if (EXT4_SB(sb)->s_inode_readahead_blks) { - ext4_fsblk_t b, end, table; - unsigned num; - __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; - - table = ext4_inode_table(sb, gdp); - /* s_inode_readahead_blks is always a power of 2 */ - b = block & ~((ext4_fsblk_t) ra_blks - 1); - if (table > b) - b = table; - end = b + ra_blks; - num = EXT4_INODES_PER_GROUP(sb); - if (ext4_has_group_desc_csum(sb)) - num -= ext4_itable_unused_count(sb, gdp); - table += num / inodes_per_block; - if (end > table) - end = table; - while (b <= end) - ext4_sb_breadahead_unmovable(sb, b++); - } + /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + blk_start_plug(&plug); + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; + + table = ext4_inode_table(sb, gdp); + /* s_inode_readahead_blks is always a power of 2 */ + b = block & ~((ext4_fsblk_t) ra_blks - 1); + if (table > b) + b = table; + end = b + ra_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (ext4_has_group_desc_csum(sb)) + num -= ext4_itable_unused_count(sb, gdp); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + ext4_sb_breadahead_unmovable(sb, b++); + } - /* - * There are other valid inodes in the buffer, this inode - * has in-inode xattrs, or we don't have this inode in memory. - * Read the block from disk. - */ - trace_ext4_load_inode(sb, ino); - ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); - blk_finish_plug(&plug); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - simulate_eio: - if (ret_block) - *ret_block = block; - brelse(bh); - return -EIO; - } + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. + */ + trace_ext4_load_inode(sb, ino); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); + blk_finish_plug(&plug); + wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); + if (!buffer_uptodate(bh)) { + if (ret_block) + *ret_block = block; + brelse(bh); + return -EIO; } has_buffer: iloc->bh = bh; @@ -4603,6 +4601,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; @@ -4613,9 +4612,13 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && - (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || + ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum))) || (ino < EXT4_ROOT_INO) || - (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { + (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, @@ -4928,8 +4931,14 @@ static int ext4_inode_blocks_set(handle_t *handle, ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } + + /* + * This should never happen since sb->s_maxbytes should not have + * allowed this, sb->s_maxbytes was set according to the huge_file + * feature in ext4_fill_super(). 
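+ *
+ * Editorial note: accordingly the error code changes from -EFBIG to
+ * -EFSCORRUPTED, and (per the ext4_do_update_inode() hunks below) the
+ * caller now defers reporting it until i_raw_lock is dropped:
+ *
+ *	if (err) {
+ *		EXT4_ERROR_INODE(inode, "corrupted inode contents");
+ *		goto out_brelse;
+ *	}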
+ */ if (!ext4_has_feature_huge_file(sb)) - return -EFBIG; + return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* @@ -5032,16 +5041,14 @@ static int ext4_do_update_inode(handle_t *handle, spin_lock(&ei->i_raw_lock); - /* For fields not tracked in the in-memory inode, - * initialise them to zero for new inodes. */ + /* + * For fields not tracked in the in-memory inode, initialise them + * to zero for new inodes. + */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); err = ext4_inode_blocks_set(handle, raw_inode, ei); - if (err) { - spin_unlock(&ei->i_raw_lock); - goto out_brelse; - } raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); @@ -5050,10 +5057,11 @@ static int ext4_do_update_inode(handle_t *handle, if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); -/* - * Fix up interoperability with old kernels. Otherwise, old inodes get - * re-used with the upper 16 bits of the uid/gid intact - */ + /* + * Fix up interoperability with old kernels. Otherwise, + * old inodes get re-used with the upper 16 bits of the + * uid/gid intact. + */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; @@ -5122,8 +5130,9 @@ static int ext4_do_update_inode(handle_t *handle, } } - BUG_ON(!ext4_has_feature_project(inode->i_sb) && - i_projid != EXT4_DEF_PROJID); + if (i_projid != EXT4_DEF_PROJID && + !ext4_has_feature_project(inode->i_sb)) + err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) @@ -5131,6 +5140,11 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); + if (err) { + EXT4_ERROR_INODE(inode, "corrupted inode contents"); + goto out_brelse; + } + if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); @@ -5138,13 +5152,15 @@ static int ext4_do_update_inode(handle_t *handle, BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) - goto out_brelse; + goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + err = ext4_journal_get_write_access(handle, sb, + EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); if (err) - goto out_brelse; + goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); @@ -5154,9 +5170,10 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); +out_error: + ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); - ext4_std_error(inode->i_sb, err); return err; } @@ -5426,11 +5443,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, inode_dio_wait(inode); } - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); rc = ext4_break_layouts(inode); if (rc) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); goto err_out; } @@ -5506,7 +5523,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, error = rc; } out_mmap_sem: - up_write(&EXT4_I(inode)->i_mmap_sem); + 
filemap_invalidate_unlock(inode->i_mapping); } if (!error) { @@ -5743,7 +5760,8 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, + iloc->bh, EXT4_JTR_NONE); if (err) { brelse(iloc->bh); iloc->bh = NULL; @@ -5866,7 +5884,8 @@ int ext4_expand_extra_isize(struct inode *inode, ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, iloc->bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, + EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; @@ -5983,10 +6002,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) * data (and journalled aops don't know how to handle these cases). */ if (val) { - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err < 0) { - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return err; } } @@ -6019,7 +6038,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) percpu_up_write(&sbi->s_writepages_rwsem); if (val) - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); /* Finally we can mark the inode as dirty. */ @@ -6037,7 +6056,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } -static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { return !buffer_mapped(bh); } @@ -6063,7 +6083,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(mapping); err = ext4_convert_inline_data(inode); if (err) @@ -6110,7 +6130,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) * inode to the transaction's list to writeprotect pages on commit. 
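 *
 * Editorial note: since generic filemap_fault() now takes the
 * invalidate_lock itself, the ext4_filemap_fault() wrapper is removed
 * below and the table in the file.c hunk above shrinks to:
 *
 *	static const struct vm_operations_struct ext4_file_vm_ops = {
 *		.fault		= filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= ext4_page_mkwrite,
 *	};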
*/ if (page_has_buffers(page)) { - if (!ext4_walk_page_buffers(NULL, page_buffers(page), + if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ @@ -6156,11 +6176,13 @@ retry_alloc: err = __block_write_begin(page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_walk_page_buffers(handle, page_buffers(page), - 0, len, NULL, do_journal_get_write_access)) + if (ext4_walk_page_buffers(handle, inode, + page_buffers(page), 0, len, NULL, + do_journal_get_write_access)) goto out_error; - if (ext4_walk_page_buffers(handle, page_buffers(page), - 0, len, NULL, write_end_fn)) + if (ext4_walk_page_buffers(handle, inode, + page_buffers(page), 0, len, NULL, + write_end_fn)) goto out_error; if (ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len)) @@ -6176,7 +6198,7 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(err); out: - up_read(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(inode->i_sb); return ret; out_error: @@ -6184,15 +6206,3 @@ out_error: ext4_journal_stop(handle); goto out; } - -vm_fault_t ext4_filemap_fault(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - vm_fault_t ret; - - down_read(&EXT4_I(inode)->i_mmap_sem); - ret = filemap_fault(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - - return ret; -} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 6eed6170aded..606dee9e08a3 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb, goto journal_err_out; } - down_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = filemap_write_and_wait(inode->i_mapping); if (err) goto err_out; @@ -256,7 +256,7 @@ err_out1: ext4_double_up_write_data_sem(inode, inode_bl); err_out: - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); journal_err_out: unlock_two_nondirectories(inode, inode_bl); iput(inode_bl); @@ -1154,7 +1154,9 @@ resizefs_out: err = PTR_ERR(handle); goto pwsalt_err_exit; } - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, + sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto pwsalt_err_journal; lock_buffer(sbi->s_sbh); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 089c958aa2c3..72bfac2d6dce 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -408,6 +408,10 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr); +static int ext4_try_to_trim_range(struct super_block *sb, + struct ext4_buddy *e4b, ext4_grpblk_t start, + ext4_grpblk_t max, ext4_grpblk_t minblocks); + /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block @@ -2474,6 +2478,12 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, * This could return negative error code if something goes wrong * during ext4_mb_init_group(). This should not be called with * ext4_lock_group() held. + * + * Note: because we are conditionally operating with the group lock in + * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this + * function using __acquire and __release. This means we need to be + * super careful before messing with the error path handling via "goto + * out"! 
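+ *
+ * Editorial note: the resulting pattern pairs each conditional lock
+ * operation with the opposite sparse annotation so that every path
+ * looks balanced to the checker, e.g.:
+ *
+ *	if (should_lock) {
+ *		ext4_lock_group(sb, group);
+ *		__release(ext4_group_lock_ptr(sb, group));
+ *	}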
*/ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, int cr) @@ -2487,8 +2497,10 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); - if (should_lock) + if (should_lock) { ext4_lock_group(sb, group); + __release(ext4_group_lock_ptr(sb, group)); + } free = grp->bb_free; if (free == 0) goto out; @@ -2496,8 +2508,10 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; - if (should_lock) + if (should_lock) { + __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); + } /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { @@ -2524,12 +2538,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, return ret; } - if (should_lock) + if (should_lock) { ext4_lock_group(sb, group); + __release(ext4_group_lock_ptr(sb, group)); + } ret = ext4_mb_good_group(ac, group, cr); out: - if (should_lock) + if (should_lock) { + __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); + } return ret; } @@ -2965,6 +2983,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) +__acquires(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = PDE_DATA(file_inode(seq->file)); unsigned long position; @@ -3037,6 +3056,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +__releases(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = PDE_DATA(file_inode(seq->file)); @@ -3308,6 +3328,57 @@ static int ext4_groupinfo_create_slab(size_t size) return 0; } +static void ext4_discard_work(struct work_struct *work) +{ + struct ext4_sb_info *sbi = container_of(work, + struct ext4_sb_info, s_discard_work); + struct super_block *sb = sbi->s_sb; + struct ext4_free_data *fd, *nfd; + struct ext4_buddy e4b; + struct list_head discard_list; + ext4_group_t grp, load_grp; + int err = 0; + + INIT_LIST_HEAD(&discard_list); + spin_lock(&sbi->s_md_lock); + list_splice_init(&sbi->s_discard_list, &discard_list); + spin_unlock(&sbi->s_md_lock); + + load_grp = UINT_MAX; + list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { + /* + * If filesystem is umounting or no memory or suffering + * from no space, give up the discard + */ + if ((sb->s_flags & SB_ACTIVE) && !err && + !atomic_read(&sbi->s_retry_alloc_pending)) { + grp = fd->efd_group; + if (grp != load_grp) { + if (load_grp != UINT_MAX) + ext4_mb_unload_buddy(&e4b); + + err = ext4_mb_load_buddy(sb, grp, &e4b); + if (err) { + kmem_cache_free(ext4_free_data_cachep, fd); + load_grp = UINT_MAX; + continue; + } else { + load_grp = grp; + } + } + + ext4_lock_group(sb, grp); + ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, + fd->efd_start_cluster + fd->efd_count - 1, 1); + ext4_unlock_group(sb, grp); + } + kmem_cache_free(ext4_free_data_cachep, fd); + } + + if (load_grp != UINT_MAX) + ext4_mb_unload_buddy(&e4b); +} + int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3376,6 +3447,9 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list); + INIT_LIST_HEAD(&sbi->s_discard_list); + INIT_WORK(&sbi->s_discard_work, 
ext4_discard_work); + atomic_set(&sbi->s_retry_alloc_pending, 0); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; @@ -3474,6 +3548,14 @@ int ext4_mb_release(struct super_block *sb) struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; + if (test_opt(sb, DISCARD)) { + /* + * wait the discard work to drain all of ext4_free_data + */ + flush_work(&sbi->s_discard_work); + WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); + } + if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); @@ -3596,7 +3678,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb, put_page(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->efd_group); - kmem_cache_free(ext4_free_data_cachep, entry); ext4_mb_unload_buddy(&e4b); mb_debug(sb, "freed %d blocks in %d structures\n", count, @@ -3611,10 +3692,9 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; - struct bio *discard_bio = NULL; struct list_head freed_data_list; struct list_head *cut_pos = NULL; - int err; + bool wake; INIT_LIST_HEAD(&freed_data_list); @@ -3629,30 +3709,20 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) cut_pos); spin_unlock(&sbi->s_md_lock); - if (test_opt(sb, DISCARD)) { - list_for_each_entry(entry, &freed_data_list, efd_list) { - err = ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, - &discard_bio); - if (err && err != -EOPNOTSUPP) { - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%d failed" - " with %d", entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, err); - } else if (err == -EOPNOTSUPP) - break; - } + list_for_each_entry(entry, &freed_data_list, efd_list) + ext4_free_data_in_buddy(sb, entry); - if (discard_bio) { - submit_bio_wait(discard_bio); - bio_put(discard_bio); - } + if (test_opt(sb, DISCARD)) { + spin_lock(&sbi->s_md_lock); + wake = list_empty(&sbi->s_discard_list); + list_splice_tail(&freed_data_list, &sbi->s_discard_list); + spin_unlock(&sbi->s_md_lock); + if (wake) + queue_work(system_unbound_wq, &sbi->s_discard_work); + } else { + list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) + kmem_cache_free(ext4_free_data_cachep, entry); } - - list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) - ext4_free_data_in_buddy(sb, entry); } int __init ext4_init_mballoc(void) @@ -3726,7 +3796,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (err) goto out_err; @@ -3739,7 +3810,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_free_group_clusters(sb, gdp)); BUFFER_TRACE(gdp_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdp_bh); + err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); if (err) goto out_err; @@ -5916,7 +5987,8 @@ do_more: } BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (err) goto error_return; @@ -5926,7 +5998,7 @@ do_more: * using it */ BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); + err = ext4_journal_get_write_access(handle, sb, 
gd_bh, EXT4_JTR_NONE); if (err) goto error_return; #ifdef AGGRESSIVE_CHECK @@ -6107,7 +6179,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, } BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (err) goto error_return; @@ -6117,7 +6190,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * using it */ BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); + err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); if (err) goto error_return; @@ -6183,19 +6256,19 @@ error_return: * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM - * @group: alloc. group we are working with * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To assure that no * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static int ext4_trim_extent(struct super_block *sb, int start, int count, - ext4_group_t group, struct ext4_buddy *e4b) +static int ext4_trim_extent(struct super_block *sb, + int start, int count, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; + ext4_group_t group = e4b->bd_group; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -6218,51 +6291,21 @@ __acquires(bitlock) return ret; } -/** - * ext4_trim_all_free -- function to trim all free space in alloc. group - * @sb: super block for file system - * @group: group to be trimmed - * @start: first group block to examine - * @max: last group block to examine - * @minblocks: minimum extent block count - * - * ext4_trim_all_free walks through group's buddy bitmap searching for free - * extents. When the free block is found, ext4_trim_extent is called to TRIM - * the extent. - * - * - * ext4_trim_all_free walks through group's block bitmap searching for free - * extents. When the free extent is found, mark it as used in group buddy - * bitmap. Then issue a TRIM command on this extent and free the extent in - * the group buddy bitmap. This is done until whole group is scanned. - */ -static ext4_grpblk_t -ext4_trim_all_free(struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks) +static int ext4_try_to_trim_range(struct super_block *sb, + struct ext4_buddy *e4b, ext4_grpblk_t start, + ext4_grpblk_t max, ext4_grpblk_t minblocks) +__acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) +__releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { + ext4_grpblk_t next, count, free_count; void *bitmap; - ext4_grpblk_t next, count = 0, free_count = 0; - struct ext4_buddy e4b; int ret = 0; - trace_ext4_trim_all_free(sb, group, start, max); - - ret = ext4_mb_load_buddy(sb, group, &e4b); - if (ret) { - ext4_warning(sb, "Error %d loading buddy information for %u", - ret, group); - return ret; - } - bitmap = e4b.bd_bitmap; - - ext4_lock_group(sb, group); - if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && - minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) - goto out; - - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; + bitmap = e4b->bd_bitmap; + start = (e4b->bd_info->bb_first_free > start) ? 
+ e4b->bd_info->bb_first_free : start; + count = 0; + free_count = 0; while (start <= max) { start = mb_find_next_zero_bit(bitmap, max + 1, start); @@ -6271,8 +6314,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) break; ret = 0; @@ -6287,25 +6329,64 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, } if (need_resched()) { - ext4_unlock_group(sb, group); + ext4_unlock_group(sb, e4b->bd_group); cond_resched(); - ext4_lock_group(sb, group); + ext4_lock_group(sb, e4b->bd_group); } - if ((e4b.bd_info->bb_free - free_count) < minblocks) + if ((e4b->bd_info->bb_free - free_count) < minblocks) break; } - if (!ret) { - ret = count; - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + return count; +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: group to be trimmed + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. + */ +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) +{ + struct ext4_buddy e4b; + int ret; + + trace_ext4_trim_all_free(sb, group, start, max); + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_warning(sb, "Error %d loading buddy information for %u", + ret, group); + return ret; + } + + ext4_lock_group(sb, group); + + if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || + minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) { + ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); + if (ret >= 0) + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } else { + ret = 0; } -out: + ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", - count, group); + ret, group); return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f3bbcd4efb56..da7698341d7d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -70,7 +70,8 @@ static struct buffer_head *ext4_append(handle_t *handle, inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) { brelse(bh); ext4_std_error(inode->i_sb, err); @@ -1927,12 +1928,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, } BUFFER_TRACE(*bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, *bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, *bh, + EXT4_JTR_NONE); if (err) goto journal_error; BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, frame->bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh, + EXT4_JTR_NONE); if (err) goto journal_error; @@ -2109,7 +2112,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, return err; } BUFFER_TRACE(bh, "get_write_access"); - err = 
ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (err) { ext4_std_error(dir->i_sb, err); return err; @@ -2167,7 +2171,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); BUFFER_TRACE(bh, "get_write_access"); - retval = ext4_journal_get_write_access(handle, bh); + retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (retval) { ext4_std_error(dir->i_sb, retval); brelse(bh); @@ -2419,7 +2424,7 @@ again: } BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto journal_error; @@ -2476,7 +2481,8 @@ again: node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, frame->bh); + err = ext4_journal_get_write_access(handle, sb, frame->bh, + EXT4_JTR_NONE); if (err) goto journal_error; if (!add_level) { @@ -2486,8 +2492,9 @@ again: icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ - err = ext4_journal_get_write_access(handle, - (frame - 1)->bh); + err = ext4_journal_get_write_access(handle, sb, + (frame - 1)->bh, + EXT4_JTR_NONE); if (err) goto journal_error; @@ -2636,7 +2643,8 @@ static int ext4_delete_entry(handle_t *handle, csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (unlikely(err)) goto out; @@ -3046,186 +3054,6 @@ bool ext4_empty_dir(struct inode *inode) return true; } -/* - * ext4_orphan_add() links an unlinked or truncated inode into a list of - * such inodes, starting at the superblock, in case we crash before the - * file is closed/deleted, or in case the inode truncate spans multiple - * transactions and the last transaction is not recovered after a crash. - * - * At filesystem recovery time, we walk this list deleting unlinked - * inodes and truncating linked inodes in ext4_orphan_cleanup(). - * - * Orphan list manipulation functions must be called under i_mutex unless - * we are just creating the inode or deleting it. - */ -int ext4_orphan_add(handle_t *handle, struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_iloc iloc; - int err = 0, rc; - bool dirty = false; - - if (!sbi->s_journal || is_bad_inode(inode)) - return 0; - - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !inode_is_locked(inode)); - /* - * Exit early if inode already is on orphan list. This is a big speedup - * since we don't have to contend on the global s_orphan_lock. - */ - if (!list_empty(&EXT4_I(inode)->i_orphan)) - return 0; - - /* - * Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. 
Note that we either - * hold i_mutex, or the inode can not be referenced from outside, - * so i_nlink should not be bumped due to race - */ - ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) - goto out; - - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out; - - mutex_lock(&sbi->s_orphan_lock); - /* - * Due to previous errors inode may be already a part of on-disk - * orphan list. If so skip on-disk list modification. - */ - if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > - (le32_to_cpu(sbi->s_es->s_inodes_count))) { - /* Insert this inode at the head of the on-disk orphan list */ - NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); - lock_buffer(sbi->s_sbh); - sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - ext4_superblock_csum_set(sb); - unlock_buffer(sbi->s_sbh); - dirty = true; - } - list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); - mutex_unlock(&sbi->s_orphan_lock); - - if (dirty) { - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - rc = ext4_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - if (err) { - /* - * We have to remove inode from in-memory list if - * addition to on disk orphan list failed. Stray orphan - * list entries can cause panics at unmount time. - */ - mutex_lock(&sbi->s_orphan_lock); - list_del_init(&EXT4_I(inode)->i_orphan); - mutex_unlock(&sbi->s_orphan_lock); - } - } else - brelse(iloc.bh); - - jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); - jbd_debug(4, "orphan inode %lu will point to %d\n", - inode->i_ino, NEXT_ORPHAN(inode)); -out: - ext4_std_error(sb, err); - return err; -} - -/* - * ext4_orphan_del() removes an unlinked or truncated inode from the list - * of such inodes stored on disk, because it is finally being cleaned up. - */ -int ext4_orphan_del(handle_t *handle, struct inode *inode) -{ - struct list_head *prev; - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u32 ino_next; - struct ext4_iloc iloc; - int err = 0; - - if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) - return 0; - - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !inode_is_locked(inode)); - /* Do this quick check before taking global s_orphan_lock. */ - if (list_empty(&ei->i_orphan)) - return 0; - - if (handle) { - /* Grab inode buffer early before taking global s_orphan_lock */ - err = ext4_reserve_inode_write(handle, inode, &iloc); - } - - mutex_lock(&sbi->s_orphan_lock); - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); - - prev = ei->i_orphan.prev; - list_del_init(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on - * disk, but we still need to remove the inode from the linked - * list in memory. 
*/ - if (!handle || err) { - mutex_unlock(&sbi->s_orphan_lock); - goto out_err; - } - - ino_next = NEXT_ORPHAN(inode); - if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %u\n", ino_next); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) { - mutex_unlock(&sbi->s_orphan_lock); - goto out_brelse; - } - lock_buffer(sbi->s_sbh); - sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - ext4_superblock_csum_set(inode->i_sb); - unlock_buffer(sbi->s_sbh); - mutex_unlock(&sbi->s_orphan_lock); - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - } else { - struct ext4_iloc iloc2; - struct inode *i_prev = - &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - - jbd_debug(4, "orphan inode %lu will point to %u\n", - i_prev->i_ino, ino_next); - err = ext4_reserve_inode_write(handle, i_prev, &iloc2); - if (err) { - mutex_unlock(&sbi->s_orphan_lock); - goto out_brelse; - } - NEXT_ORPHAN(i_prev) = ino_next; - err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); - mutex_unlock(&sbi->s_orphan_lock); - } - if (err) - goto out_brelse; - NEXT_ORPHAN(inode) = 0; - err = ext4_mark_iloc_dirty(handle, inode, &iloc); -out_err: - ext4_std_error(inode->i_sb, err); - return err; - -out_brelse: - brelse(iloc.bh); - goto out_err; -} - static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; @@ -3675,7 +3503,8 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) return -EFSCORRUPTED; BUFFER_TRACE(ent->dir_bh, "get_write_access"); - return ext4_journal_get_write_access(handle, ent->dir_bh); + return ext4_journal_get_write_access(handle, ent->dir->i_sb, + ent->dir_bh, EXT4_JTR_NONE); } static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, @@ -3710,7 +3539,8 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); - retval = ext4_journal_get_write_access(handle, ent->bh); + retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh, + EXT4_JTR_NONE); if (retval) return retval; ent->de->inode = cpu_to_le32(ino); diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c new file mode 100644 index 000000000000..53adc8f570a3 --- /dev/null +++ b/fs/ext4/orphan.c @@ -0,0 +1,652 @@ +/* + * Ext4 orphan inode handling + */ +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> + +#include "ext4.h" +#include "ext4_jbd2.h" + +static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) +{ + int i, j, start; + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; + int ret = 0; + bool found = false; + __le32 *bdata; + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); + int looped = 0; + + /* + * Find block with free orphan entry. Use CPU number for a naive hash + * for a search start in the orphan file + */ + start = raw_smp_processor_id()*13 % oi->of_blocks; + i = start; + do { + if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) + >= 0) { + found = true; + break; + } + if (++i >= oi->of_blocks) + i = 0; + } while (i != start); + + if (!found) { + /* + * For now we don't grow or shrink orphan file. We just use + * whatever was allocated at mke2fs time. The additional + * credits we would have to reserve for each orphan inode + * operation just don't seem worth it. 
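+ *
+ * Slot claiming within a block is likewise kept cheap. In rough
+ * outline (a simplified sketch, not the exact loop that follows),
+ * claiming an entry is a lock-free linear probe over the block's
+ * __le32 slots:
+ *
+ *	while (cmpxchg(&bdata[j], 0, cpu_to_le32(inode->i_ino)) != 0)
+ *		j = (j + 1) % inodes_per_ob;
+ *
+ * so losing a race against another CPU simply advances to the next
+ * candidate slot.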
+ */ + return -ENOSPC; + } + + ret = ext4_journal_get_write_access(handle, inode->i_sb, + oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); + if (ret) { + atomic_inc(&oi->of_binfo[i].ob_free_entries); + return ret; + } + + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + /* Find empty slot in a block */ + j = 0; + do { + if (looped) { + /* + * Did we walk through the block several times without + * finding a free entry? It is theoretically possible + * if entries get constantly allocated and freed or + * if the block is corrupted. Avoid indefinite looping + * and bail. We'll use the orphan list instead. + */ + if (looped > 3) { + atomic_inc(&oi->of_binfo[i].ob_free_entries); + return -ENOSPC; + } + cond_resched(); + } + while (bdata[j]) { + if (++j >= inodes_per_ob) { + j = 0; + looped++; + } + } + } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != + (__le32)0); + + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + + return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); +} + +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_mutex unless + * we are just creating the inode or deleting it. + */ +int ext4_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_iloc iloc; + int err = 0, rc; + bool dirty = false; + + if (!sbi->s_journal || is_bad_inode(inode)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !inode_is_locked(inode)); + /* + * Inode orphaned in orphan file or in orphan list? + */ + if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || + !list_empty(&EXT4_I(inode)->i_orphan)) + return 0; + + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race + */ + ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + if (sbi->s_orphan_info.of_blocks) { + err = ext4_orphan_file_add(handle, inode); + /* + * Fall back to the normal orphan list if the orphan + * file is out of space + */ + if (err != -ENOSPC) + return err; + } + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto out; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out; + + mutex_lock(&sbi->s_orphan_lock); + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification.
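+ *
+ * The on-disk list itself is a simple LIFO chain: the superblock's
+ * s_last_orphan points at the most recently orphaned inode and each
+ * inode's NEXT_ORPHAN() field points at the next one, so insertion
+ * at the head (done below under s_orphan_lock) boils down to:
+ *
+ *	NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan);
+ *	sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);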
+ */ + if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > + (le32_to_cpu(sbi->s_es->s_inodes_count))) { + /* Insert this inode at the head of the on-disk orphan list */ + NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + lock_buffer(sbi->s_sbh); + sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + dirty = true; + } + list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); + mutex_unlock(&sbi->s_orphan_lock); + + if (dirty) { + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + if (err) { + /* + * We have to remove inode from in-memory list if + * addition to on disk orphan list failed. Stray orphan + * list entries can cause panics at unmount time. + */ + mutex_lock(&sbi->s_orphan_lock); + list_del_init(&EXT4_I(inode)->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + } + } else + brelse(iloc.bh); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out: + ext4_std_error(sb, err); + return err; +} + +static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) +{ + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; + __le32 *bdata; + int blk, off; + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); + int ret = 0; + + if (!handle) + goto out; + blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; + off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; + if (WARN_ON_ONCE(blk >= oi->of_blocks)) + goto out; + + ret = ext4_journal_get_write_access(handle, inode->i_sb, + oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); + if (ret) + goto out; + + bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); + bdata[off] = 0; + atomic_inc(&oi->of_binfo[blk].ob_free_entries); + ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); +out: + ext4_clear_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); + + return ret; +} + +/* + * ext4_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext4_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 ino_next; + struct ext4_iloc iloc; + int err = 0; + + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !inode_is_locked(inode)); + if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) + return ext4_orphan_file_del(handle, inode); + + /* Do this quick check before taking global s_orphan_lock. */ + if (list_empty(&ei->i_orphan)) + return 0; + + if (handle) { + /* Grab inode buffer early before taking global s_orphan_lock */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + } + + mutex_lock(&sbi->s_orphan_lock); + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + prev = ei->i_orphan.prev; + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. 
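+ *
+ * Given a valid handle, unlinking from the on-disk chain means
+ * repointing the predecessor at our successor, i.e. (sketched from
+ * the code below) either
+ *
+ *	sbi->s_es->s_last_orphan = cpu_to_le32(NEXT_ORPHAN(inode));
+ *
+ * when this inode heads the list, or
+ *
+ *	NEXT_ORPHAN(i_prev) = NEXT_ORPHAN(inode);
+ *
+ * when another orphan inode precedes it.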
*/ + if (!handle || err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_err; + } + + ino_next = NEXT_ORPHAN(inode); + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %u\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, + sbi->s_sbh, EXT4_JTR_NONE); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + lock_buffer(sbi->s_sbh); + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + ext4_superblock_csum_set(inode->i_sb); + unlock_buffer(sbi->s_sbh); + mutex_unlock(&sbi->s_orphan_lock); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + } else { + struct ext4_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %u\n", + i_prev->i_ino, ino_next); + err = ext4_reserve_inode_write(handle, i_prev, &iloc2); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + NEXT_ORPHAN(i_prev) = ino_next; + err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + mutex_unlock(&sbi->s_orphan_lock); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +out_err: + ext4_std_error(inode->i_sb, err); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +#ifdef CONFIG_QUOTA +static int ext4_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, + rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], + lockdep_is_held(&sb->s_umount)), + EXT4_SB(sb)->s_jquota_fmt, type); +} +#endif + +static void ext4_process_orphan(struct inode *inode, + int *nr_truncates, int *nr_orphans) +{ + struct super_block *sb = inode->i_sb; + int ret; + + dquot_initialize(inode); + if (inode->i_nlink) { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + inode_lock(inode); + truncate_inode_pages(inode->i_mapping, inode->i_size); + ret = ext4_truncate(inode); + if (ret) { + /* + * We need to clean up the in-core orphan list + * manually if ext4_truncate() failed to get a + * transaction handle. + */ + ext4_orphan_del(NULL, inode); + ext4_std_error(inode->i_sb, ret); + } + inode_unlock(inode); + (*nr_truncates)++; + } else { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + (*nr_orphans)++; + } + iput(inode); /* The delete magic happens here! */ +} + +/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. 
The worst that + can happen in this case is that we get a "bit already cleared" message from + ext4_free_inode(). The only reason we would point at a wrong inode is if + e2fsck was run on this filesystem, and it must have already done the orphan + inode cleanup for us, so we can safely abort without any further action. + */ +void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; + struct inode *inode; + int i, j; +#ifdef CONFIG_QUOTA + int quota_update = 0; +#endif + __le32 *bdata; + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + + if (!es->s_last_orphan && !oi->of_blocks) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); + return; + } + + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { + ext4_msg(sb, KERN_INFO, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + } + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & SB_RDONLY) { + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~SB_RDONLY; + } +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old style */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (EXT4_SB(sb)->s_qf_names[i]) { + int ret = ext4_quota_on_mount(sb, i); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: type %d: error %d", i, ret); + } + } +#endif + + while (es->s_last_orphan) { + /* + * We may have encountered an error during cleanup; if + * so, skip the rest. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + es->s_last_orphan = 0; + break; + } + + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); + } + + for (i = 0; i < oi->of_blocks; i++) { + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + for (j = 0; j < inodes_per_ob; j++) { + if (!bdata[j]) + continue; + inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); + if (IS_ERR(inode)) + continue; + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); + } + } + +#define PLURAL(x) (x), ((x) == 1) ?
"" : "s" + + if (nr_orphans) + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } + } +#endif + sb->s_flags = s_flags; /* Restore SB_RDONLY status */ +} + +void ext4_release_orphan_info(struct super_block *sb) +{ + int i; + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + + if (!oi->of_blocks) + return; + for (i = 0; i < oi->of_blocks; i++) + brelse(oi->of_binfo[i].ob_bh); + kfree(oi->of_binfo); +} + +static struct ext4_orphan_block_tail *ext4_orphan_block_tail( + struct super_block *sb, + struct buffer_head *bh) +{ + return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - + sizeof(struct ext4_orphan_block_tail)); +} + +static int ext4_orphan_file_block_csum_verify(struct super_block *sb, + struct buffer_head *bh) +{ + __u32 calculated; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct ext4_orphan_block_tail *ot; + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); + + if (!ext4_has_metadata_csum(sb)) + return 1; + + ot = ext4_orphan_block_tail(sb, bh); + calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, + (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); + calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + inodes_per_ob * sizeof(__u32)); + return le32_to_cpu(ot->ob_checksum) == calculated; +} + +/* This gets called only when checksumming is enabled */ +void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct super_block *sb = EXT4_TRIGGER(triggers)->sb; + __u32 csum; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct ext4_orphan_block_tail *ot; + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); + + csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, + (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); + csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, + inodes_per_ob * sizeof(__u32)); + ot = ext4_orphan_block_tail(sb, bh); + ot->ob_checksum = cpu_to_le32(csum); +} + +int ext4_init_orphan_info(struct super_block *sb) +{ + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct inode *inode; + int i, j; + int ret; + int free; + __le32 *bdata; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_block_tail *ot; + ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); + + if (!ext4_has_feature_orphan_file(sb)) + return 0; + + inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); + if (IS_ERR(inode)) { + ext4_msg(sb, KERN_ERR, "get orphan inode failed"); + return PTR_ERR(inode); + } + oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; + oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; + oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), + GFP_KERNEL); + if (!oi->of_binfo) { + ret = -ENOMEM; + goto out_put; + } + for (i = 0; i < oi->of_blocks; i++) { + oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); + if (IS_ERR(oi->of_binfo[i].ob_bh)) { + ret = PTR_ERR(oi->of_binfo[i].ob_bh); + goto out_free; + } + if (!oi->of_binfo[i].ob_bh) { + ret = -EIO; + goto out_free; + } + ot = ext4_orphan_block_tail(sb, 
oi->of_binfo[i].ob_bh); + if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { + ext4_error(sb, "orphan file block %d: bad magic", i); + ret = -EIO; + goto out_free; + } + if (!ext4_orphan_file_block_csum_verify(sb, + oi->of_binfo[i].ob_bh)) { + ext4_error(sb, "orphan file block %d: bad checksum", i); + ret = -EIO; + goto out_free; + } + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + free = 0; + for (j = 0; j < inodes_per_ob; j++) + if (bdata[j] == 0) + free++; + atomic_set(&oi->of_binfo[i].ob_free_entries, free); + } + iput(inode); + return 0; +out_free: + for (i--; i >= 0; i--) + brelse(oi->of_binfo[i].ob_bh); + kfree(oi->of_binfo); +out_put: + iput(inode); + return ret; +} + +int ext4_orphan_file_empty(struct super_block *sb) +{ + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + int i; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + + if (!ext4_has_feature_orphan_file(sb)) + return 1; + for (i = 0; i < oi->of_blocks; i++) + if (atomic_read(&oi->of_binfo[i].ob_free_entries) != + inodes_per_ob) + return 0; + return 1; +} diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 7a9f1adef679..b63cb88ccdae 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -409,7 +409,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, if (unlikely(!bh)) return ERR_PTR(-ENOMEM); BUFFER_TRACE(bh, "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, bh))) { + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); + if (err) { brelse(bh); bh = ERR_PTR(err); } else { @@ -474,7 +475,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, return -ENOMEM; BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, sb, bh, + EXT4_JTR_NONE); if (err) { brelse(bh); return err; @@ -569,7 +571,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, } BUFFER_TRACE(gdb, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb); + err = ext4_journal_get_write_access(handle, sb, gdb, + EXT4_JTR_NONE); if (err) { brelse(gdb); goto out; @@ -837,17 +840,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(dind, "get_write_access"); - err = ext4_journal_get_write_access(handle, dind); + err = ext4_journal_get_write_access(handle, sb, dind, EXT4_JTR_NONE); if (unlikely(err)) { ext4_std_error(sb, err); goto errout; @@ -956,7 +960,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, n_group_desc[gdb_num] = gdb_bh; BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (err) { kvfree(n_group_desc); brelse(gdb_bh); @@ -1042,7 +1046,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (i = 0; i < reserved_gdb; i++) { BUFFER_TRACE(primary[i], "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, primary[i]))) + if ((err = ext4_journal_get_write_access(handle, sb, primary[i], + 
EXT4_JTR_NONE))) goto exit_bh; } @@ -1149,10 +1154,9 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, backup_block, backup_block - ext4_group_first_block_no(sb, group)); BUFFER_TRACE(bh, "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, bh))) { - brelse(bh); + if ((err = ext4_journal_get_write_access(handle, sb, bh, + EXT4_JTR_NONE))) break; - } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) @@ -1232,7 +1236,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, + EXT4_JTR_NONE); if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) err = reserve_backup_gdb(handle, resize_inode, group); @@ -1509,7 +1514,8 @@ static int ext4_flex_group_add(struct super_block *sb, } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto exit_journal; @@ -1722,7 +1728,8 @@ static int ext4_group_extend_no_check(struct super_block *sb, } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); if (err) { ext4_warning(sb, "error %d on journal write access", err); goto errout; @@ -1884,7 +1891,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) return PTR_ERR(handle); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto errout; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dfa09a277b56..136940af00b8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -80,7 +80,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); -static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); @@ -90,12 +89,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, /* * Lock ordering * - * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and - * i_mmap_rwsem (inode->i_mmap_rwsem)! 
- * * page fault path: - * mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> - * page lock -> i_data_sem (rw) + * mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start + * -> page lock -> i_data_sem (rw) * * buffered write path: * sb_start_write -> i_mutex -> mmap_lock @@ -103,8 +99,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb, * i_data_sem (rw) * * truncate: - * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock - * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start -> + * sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) -> + * page lock + * sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start -> * i_data_sem (rw) * * direct IO: @@ -1175,6 +1172,7 @@ static void ext4_put_super(struct super_block *sb) flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); + ext4_release_orphan_info(sb); /* * Unregister sysfs before destroying jbd2 journal. @@ -1200,6 +1198,7 @@ static void ext4_put_super(struct super_block *sb) if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!sb_rdonly(sb)) @@ -1360,7 +1359,6 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); - init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); ext4_fc_init_inode(&ei->vfs_inode); } @@ -1585,14 +1583,12 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); -static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); -static int ext4_enable_quotas(struct super_block *sb); static struct dquot **ext4_get_dquots(struct inode *inode) { @@ -2687,8 +2683,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - if (sbi->s_journal) + if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); + if (ext4_has_feature_orphan_file(sb)) + ext4_set_feature_orphan_present(sb); + } err = ext4_commit_super(sb); done: @@ -2970,169 +2969,6 @@ static int ext4_check_descriptors(struct super_block *sb, return 1; } -/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at - * the superblock) which were deleted from all directories, but held open by - * a process at the time of a crash. We walk the list and try to delete these - * inodes at recovery time (only with a read-write filesystem). - * - * In order to keep the orphan inode chain consistent during traversal (in - * case of crash during recovery), we link each inode into the superblock - * orphan list_head and handle it the same way as an inode deletion during - * normal operation (which journals the operations for us). - * - * We only do an iget() and an iput() on each inode, which is very safe if we - * accidentally point at an in-use or already deleted inode. 
The worst that - * can happen in this case is that we get a "bit already cleared" message from - * ext4_free_inode(). The only reason we would point at a wrong inode is if - * e2fsck was run on this filesystem, and it must have already done the orphan - * inode cleanup for us, so we can safely abort without any further action. - */ -static void ext4_orphan_cleanup(struct super_block *sb, - struct ext4_super_block *es) -{ - unsigned int s_flags = sb->s_flags; - int ret, nr_orphans = 0, nr_truncates = 0; -#ifdef CONFIG_QUOTA - int quota_update = 0; - int i; -#endif - if (!es->s_last_orphan) { - jbd_debug(4, "no orphan inodes to clean up\n"); - return; - } - - if (bdev_read_only(sb->s_bdev)) { - ext4_msg(sb, KERN_ERR, "write access " - "unavailable, skipping orphan cleanup"); - return; - } - - /* Check if feature set would not allow a r/w mount */ - if (!ext4_feature_set_ok(sb, 0)) { - ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " - "unknown ROCOMPAT features"); - return; - } - - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { - ext4_msg(sb, KERN_INFO, "Errors on filesystem, " - "clearing orphan list.\n"); - es->s_last_orphan = 0; - } - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); - return; - } - - if (s_flags & SB_RDONLY) { - ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~SB_RDONLY; - } -#ifdef CONFIG_QUOTA - /* - * Turn on quotas which were not enabled for read-only mounts if - * filesystem has quota feature, so that they are updated correctly. - */ - if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { - int ret = ext4_enable_quotas(sb); - - if (!ret) - quota_update = 1; - else - ext4_msg(sb, KERN_ERR, - "Cannot turn on quotas: error %d", ret); - } - - /* Turn on journaled quotas used for old sytle */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (EXT4_SB(sb)->s_qf_names[i]) { - int ret = ext4_quota_on_mount(sb, i); - - if (!ret) - quota_update = 1; - else - ext4_msg(sb, KERN_ERR, - "Cannot turn on journaled " - "quota: type %d: error %d", i, ret); - } - } -#endif - - while (es->s_last_orphan) { - struct inode *inode; - - /* - * We may have encountered an error during cleanup; if - * so, skip the rest. - */ - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); - es->s_last_orphan = 0; - break; - } - - inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); - if (IS_ERR(inode)) { - es->s_last_orphan = 0; - break; - } - - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - dquot_initialize(inode); - if (inode->i_nlink) { - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", - __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %lld bytes\n", - inode->i_ino, inode->i_size); - inode_lock(inode); - truncate_inode_pages(inode->i_mapping, inode->i_size); - ret = ext4_truncate(inode); - if (ret) { - /* - * We need to clean up the in-core orphan list - * manually if ext4_truncate() failed to get a - * transaction handle. 
- */ - ext4_orphan_del(NULL, inode); - ext4_std_error(inode->i_sb, ret); - } - inode_unlock(inode); - nr_truncates++; - } else { - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", - __func__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %lu\n", - inode->i_ino); - nr_orphans++; - } - iput(inode); /* The delete magic happens here! */ - } - -#define PLURAL(x) (x), ((x) == 1) ? "" : "s" - - if (nr_orphans) - ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", - PLURAL(nr_orphans)); - if (nr_truncates) - ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", - PLURAL(nr_truncates)); -#ifdef CONFIG_QUOTA - /* Turn off quotas if they were enabled for orphan cleanup */ - if (quota_update) { - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); - } - } -#endif - sb->s_flags = s_flags; /* Restore SB_RDONLY status */ -} - /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk @@ -3312,7 +3148,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. */ -static int ext4_feature_set_ok(struct super_block *sb, int readonly) +int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, @@ -4014,6 +3850,20 @@ static const char *ext4_quota_mode(struct super_block *sb) #endif } +static void ext4_setup_csum_trigger(struct super_block *sb, + enum ext4_journal_trigger_type type, + void (*trigger)( + struct jbd2_buffer_trigger_type *type, + struct buffer_head *bh, + void *mapped_data, + size_t size)) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + sbi->s_journal_triggers[type].sb = sb; + sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); @@ -4112,6 +3962,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) silent = 1; goto cantfind_ext4; } + ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, + ext4_orphan_file_block_trigger); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); @@ -4776,6 +4628,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || + ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) @@ -5032,6 +4885,14 @@ no_journal: err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } + /* + * Update the checksum after updating free space/inode + * counters. Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail. + */ + ext4_superblock_csum_set(sb); if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb), GFP_KERNEL); @@ -5066,12 +4927,15 @@ no_journal: if (err) goto failed_mount7; + err = ext4_init_orphan_info(sb); + if (err) + goto failed_mount8; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. 
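 * (any failure from this point unwinds through the new failed_mount9
 * label, which releases the orphan info that ext4_init_orphan_info()
 * set up just above)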
*/ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) - goto failed_mount8; + goto failed_mount9; } #endif /* CONFIG_QUOTA */ @@ -5090,7 +4954,7 @@ no_journal: ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount8; + goto failed_mount9; } if (EXT4_SB(sb)->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -5136,6 +5000,8 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +failed_mount9: + ext4_release_orphan_info(sb); failed_mount8: ext4_unregister_sysfs(sb); kobject_put(&sbi->s_kobj); @@ -5646,8 +5512,15 @@ static int ext4_mark_recovery_complete(struct super_block *sb, if (err < 0) goto out; - if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { + if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || + ext4_has_feature_orphan_present(sb))) { + if (!ext4_orphan_file_empty(sb)) { + ext4_error(sb, "Orphan file not empty on read-only fs."); + err = -EFSCORRUPTED; + goto out; + } ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: @@ -5790,6 +5663,8 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); + if (ext4_orphan_file_empty(sb)) + ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); @@ -5812,6 +5687,8 @@ static int ext4_unfreeze(struct super_block *sb) if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ ext4_set_feature_journal_needs_recovery(sb); + if (ext4_has_feature_orphan_file(sb)) + ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); @@ -6015,7 +5892,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * around from a previously readonly bdev mount, * require a full umount/remount for now. */ - if (es->s_last_orphan) { + if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " @@ -6312,16 +6189,6 @@ static int ext4_write_info(struct super_block *sb, int type) return ret; } -/* - * Turn on quotas during mount time - we need to find - * the quota file and such... - */ -static int ext4_quota_on_mount(struct super_block *sb, int type) -{ - return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), - EXT4_SB(sb)->s_jquota_fmt, type); -} - static void lockdep_set_quota_inode(struct inode *inode, int subclass) { struct ext4_inode_info *ei = EXT4_I(inode); @@ -6451,7 +6318,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, } /* Enable usage tracking for all quota types. 
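 * (no longer static: the orphan cleanup code that moved to
 * fs/ext4/orphan.c calls ext4_enable_quotas() directly when
 * recovering a filesystem that was mounted read-only)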
*/ -static int ext4_enable_quotas(struct super_block *sb) +int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { @@ -6609,7 +6476,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index dd05af983092..69109746e6e2 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -52,10 +52,20 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, return paddr; } +static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + const struct inode_operations ext4_encrypted_symlink_inode_operations = { .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, - .getattr = ext4_getattr, + .getattr = ext4_encrypted_symlink_getattr, .listxattr = ext4_listxattr, }; diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index bcbe3668c1d4..ce84aa2786c7 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -11,14 +11,16 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + struct address_space *mapping = inode->i_mapping; + /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ - down_write(&EXT4_I(inode)->i_mmap_sem); - truncate_inode_pages(inode->i_mapping, inode->i_size); + filemap_invalidate_lock(mapping); + truncate_inode_pages(mapping, inode->i_size); ext4_truncate(inode); - up_write(&EXT4_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); } /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6dd5c05c444a..1e0fc1ed845b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -791,7 +791,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { + if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE) == 0) { lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); @@ -1169,7 +1170,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, continue; } if (err > 0) { - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, + parent->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_warning_inode(ea_inode, "Re-get write access err=%d", @@ -1230,7 +1232,8 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, int error = 0; BUFFER_TRACE(bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (error) goto out; @@ -1371,7 +1374,8 @@ retry: "ext4_getblk() return bh = NULL"); return -EFSCORRUPTED; } - ret = ext4_journal_get_write_access(handle, bh); + ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh, + EXT4_JTR_NONE); if (ret) goto out; @@ -1855,7 +1859,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (s->base) { BUFFER_TRACE(bs->bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, bs->bh); + error = 
ext4_journal_get_write_access(handle, sb, bs->bh, + EXT4_JTR_NONE); if (error) goto cleanup; lock_buffer(bs->bh); @@ -1987,8 +1992,9 @@ inserted: if (error) goto cleanup; BUFFER_TRACE(new_bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, - new_bh); + error = ext4_journal_get_write_access( + handle, sb, new_bh, + EXT4_JTR_NONE); if (error) goto cleanup_dquot; lock_buffer(new_bh); @@ -2092,7 +2098,8 @@ getblk_failed: } lock_buffer(new_bh); - error = ext4_journal_get_create_access(handle, new_bh); + error = ext4_journal_get_create_access(handle, sb, + new_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(new_bh); error = -EIO; @@ -2848,7 +2855,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, goto cleanup; } - error = ext4_journal_get_write_access(handle, iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, + iloc.bh, EXT4_JTR_NONE); if (error) { EXT4_ERROR_INODE(inode, "write access (error %d)", error); diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 239ad9453b99..16e826e01f09 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -196,8 +196,11 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, return acl; } -struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) { + if (rcu) + return ERR_PTR(-ECHILD); + return __f2fs_get_acl(inode, type, NULL); } diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 986fd1bc780b..a26e33cab4ff 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -33,7 +33,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL -extern struct posix_acl *f2fs_get_acl(struct inode *, int); +extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); extern int f2fs_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d2cf48c5a2e4..eb222b35edef 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3187,12 +3187,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3852,7 +3852,7 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, int ret = 0; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); set_inode_flag(inode, FI_ALIGNED_WRITE); @@ -3894,7 +3894,7 @@ done: clear_inode_flag(inode, FI_DO_DEFRAG); clear_inode_flag(inode, FI_ALIGNED_WRITE); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ee8eb33e2c25..906b2c4b50e7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -754,7 +754,6 @@ struct f2fs_inode_info { /* avoid racing between foreground op and gc */ struct rw_semaphore i_gc_rwsem[2]; - struct rw_semaphore i_mmap_sem; struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ int i_extra_isize; /* size of extra space located 
in i_addr */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6afd4562335f..1ff333755721 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -38,10 +38,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vmf->vma->vm_file); vm_fault_t ret; - down_read(&F2FS_I(inode)->i_mmap_sem); ret = filemap_fault(vmf); - up_read(&F2FS_I(inode)->i_mmap_sem); - if (!ret) f2fs_update_iostat(F2FS_I_SB(inode), APP_MAPPED_READ_IO, F2FS_BLKSIZE); @@ -101,7 +98,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); file_update_time(vmf->vma->vm_file); - down_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || @@ -159,7 +156,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) trace_f2fs_vm_page_mkwrite(page, DATA); out_sem: - up_read(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); err: @@ -940,7 +937,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); truncate_setsize(inode, attr->ia_size); @@ -950,7 +947,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, * do not trim all blocks after i_size if target size is * larger than i_size. */ - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (err) return err; @@ -1095,7 +1092,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_end = (loff_t)pg_end << PAGE_SHIFT; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -1104,7 +1101,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -1339,7 +1336,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1347,7 +1344,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); return ret; } @@ -1378,13 +1375,13 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); new_size = i_size_read(inode) - len; ret = f2fs_truncate_blocks(inode, new_size, true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (!ret) f2fs_i_size_write(inode, new_size); 
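	/*
	 * The recurring conversion pattern in these f2fs hunks, sketched
	 * with no assumptions beyond what the hunks themselves show:
	 *
	 *	down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
	 *	filemap_invalidate_lock(inode->i_mapping);
	 *	...truncate, punch, or exchange blocks...
	 *	filemap_invalidate_unlock(inode->i_mapping);
	 *	up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
	 *
	 * i.e. the private i_mmap_sem is replaced one-for-one by the
	 * mapping's generic invalidate_lock at the same nesting depth.
	 */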
return ret; @@ -1484,7 +1481,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, pgoff_t end; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache_range(inode, (loff_t)index << PAGE_SHIFT, @@ -1496,7 +1493,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); goto out; } @@ -1508,7 +1505,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); f2fs_balance_fs(sbi, dn.node_changed); @@ -1543,6 +1540,7 @@ out: static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1565,14 +1563,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = f2fs_truncate_blocks(inode, i_size_read(inode), true); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (ret) return ret; /* write out all dirty pages from offset */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX); if (ret) return ret; @@ -1583,7 +1581,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) /* avoid gc operation during block exchange */ down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1599,14 +1597,14 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) idx + delta, nr, false); f2fs_unlock_op(sbi); } - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); /* write out all moved pages, if possible */ - down_write(&F2FS_I(inode)->i_mmap_sem); - filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + filemap_invalidate_lock(mapping); + filemap_write_and_wait_range(mapping, offset, LLONG_MAX); truncate_pagecache(inode, offset); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); if (!ret) f2fs_i_size_write(inode, new_size); @@ -3440,7 +3438,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) goto out; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3476,7 +3474,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); out: inode_unlock(inode); @@ -3593,7 +3591,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) } down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + 
filemap_invalidate_lock(inode->i_mapping); last_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); @@ -3629,7 +3627,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) } up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); @@ -3748,7 +3746,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) goto err; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(mapping); ret = filemap_write_and_wait_range(mapping, range.start, to_end ? LLONG_MAX : end_addr - 1); @@ -3835,7 +3833,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) ret = f2fs_secure_erase(prev_bdev, inode, prev_index, prev_block, len, range.flags); out: - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); err: inode_unlock(inode); @@ -4313,9 +4311,9 @@ write: /* if we couldn't write data, we should deallocate blocks. */ if (preallocated && i_size_read(inode) < target_size) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - down_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); f2fs_truncate(inode); - up_write(&F2FS_I(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e149c8c66a71..9c528e583c9d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1323,9 +1323,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, return target; } +static int f2fs_encrypted_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + f2fs_getattr(mnt_userns, path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .get_link = f2fs_encrypted_get_link, - .getattr = f2fs_getattr, + .getattr = f2fs_encrypted_symlink_getattr, .setattr = f2fs_setattr, .listxattr = f2fs_listxattr, }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8fecd3050ccd..ce2ab1b85c11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1289,7 +1289,6 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->i_gc_rwsem[READ]); init_rwsem(&fi->i_gc_rwsem[WRITE]); - init_rwsem(&fi->i_mmap_sem); init_rwsem(&fi->i_xattr_sem); /* Will be used by directory only */ diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6642246206bd..daad532a4e2b 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -378,7 +378,7 @@ out: ret = kstrtol(name, 10, &data); if (ret) return ret; - if (data >= IOPRIO_BE_NR || data < 0) + if (data >= IOPRIO_NR_LEVELS || data < 0) return -EINVAL; cprc->ckpt_thread_ioprio = IOPRIO_PRIO_VALUE(class, data); diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 860e884e56e8..978ac6751aeb 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -5,6 +5,7 @@ #include <linux/blkdev.h> #include <linux/sched/signal.h> +#include <linux/backing-dev-defs.h> #include "fat.h" struct fatent_operations { diff --git a/fs/fcntl.c b/fs/fcntl.c index f946bec8f1f1..68added37c15 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -150,7 +150,8 @@ void f_delown(struct file *filp) pid_t f_getown(struct file *filp) { pid_t pid = 0; - 
read_lock(&filp->f_owner.lock); + + read_lock_irq(&filp->f_owner.lock); rcu_read_lock(); if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) { pid = pid_vnr(filp->f_owner.pid); @@ -158,7 +159,7 @@ pid_t f_getown(struct file *filp) pid = -pid; } rcu_read_unlock(); - read_unlock(&filp->f_owner.lock); + read_unlock_irq(&filp->f_owner.lock); return pid; } @@ -208,7 +209,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) struct f_owner_ex owner = {}; int ret = 0; - read_lock(&filp->f_owner.lock); + read_lock_irq(&filp->f_owner.lock); rcu_read_lock(); if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) owner.pid = pid_vnr(filp->f_owner.pid); @@ -231,7 +232,7 @@ static int f_getown_ex(struct file *filp, unsigned long arg) ret = -EINVAL; break; } - read_unlock(&filp->f_owner.lock); + read_unlock_irq(&filp->f_owner.lock); if (!ret) { ret = copy_to_user(owner_p, &owner, sizeof(owner)); @@ -249,10 +250,10 @@ static int f_getowner_uids(struct file *filp, unsigned long arg) uid_t src[2]; int err; - read_lock(&filp->f_owner.lock); + read_lock_irq(&filp->f_owner.lock); src[0] = from_kuid(user_ns, filp->f_owner.uid); src[1] = from_kuid(user_ns, filp->f_owner.euid); - read_unlock(&filp->f_owner.lock); + read_unlock_irq(&filp->f_owner.lock); err = put_user(src[0], &dst[0]); err |= put_user(src[1], &dst[1]); @@ -1003,13 +1004,14 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) { while (fa) { struct fown_struct *fown; + unsigned long flags; if (fa->magic != FASYNC_MAGIC) { printk(KERN_ERR "kill_fasync: bad magic number in " "fasync_struct!\n"); return; } - read_lock(&fa->fa_lock); + read_lock_irqsave(&fa->fa_lock, flags); if (fa->fa_file) { fown = &fa->fa_file->f_owner; /* Don't send SIGURG to processes which have not set a @@ -1018,7 +1020,7 @@ static void kill_fasync_rcu(struct fasync_struct *fa, int sig, int band) if (!(sig == SIGURG && fown->signum == 0)) send_sigio(fown, fa->fa_fd, band); } - read_unlock(&fa->fa_lock); + read_unlock_irqrestore(&fa->fa_lock, flags); fa = rcu_dereference(fa->fa_next); } } diff --git a/fs/file.c b/fs/file.c index 86dc9956af32..d8afa8266859 100644 --- a/fs/file.c +++ b/fs/file.c @@ -596,18 +596,32 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); +/** + * pick_file - return file associated with fd + * @files: file struct to retrieve file from + * @fd: file descriptor to retrieve file for + * + * If this function returns an EINVAL error pointer, the fd was beyond the + * current maximum number of file descriptors for that fdtable. + * + * Returns: The file associated with @fd, on error returns an error pointer. 
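/*
 * [Illustrative aside, not part of the patch] The fcntl.c hunks above
 * switch every reader of f_owner.lock and fa_lock to IRQ-disabling lock
 * variants: kill_fasync() can run in interrupt context and takes the
 * same rwlocks, so a reader interrupted while holding one of them can
 * deadlock once a writer is queued in between.  Sketch of the safe
 * reader shape (hypothetical helper; plain _irq suffices in syscall
 * paths, _irqsave where the caller's IRQ state is unknown):
 */
#include <linux/fs.h>
#include <linux/pid.h>
#include <linux/rcupdate.h>

static pid_t owner_pid_sketch(struct file *filp)
{
	unsigned long flags;
	pid_t pid = 0;

	read_lock_irqsave(&filp->f_owner.lock, flags);
	rcu_read_lock();
	if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
		pid = pid_vnr(filp->f_owner.pid);
	rcu_read_unlock();
	read_unlock_irqrestore(&filp->f_owner.lock, flags);
	return pid;
}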
+ */ static struct file *pick_file(struct files_struct *files, unsigned fd) { - struct file *file = NULL; + struct file *file; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); - if (fd >= fdt->max_fds) + if (fd >= fdt->max_fds) { + file = ERR_PTR(-EINVAL); goto out_unlock; + } file = fdt->fd[fd]; - if (!file) + if (!file) { + file = ERR_PTR(-EBADF); goto out_unlock; + } rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); @@ -622,7 +636,7 @@ int close_fd(unsigned fd) struct file *file; file = pick_file(files, fd); - if (!file) + if (IS_ERR(file)) return -EBADF; return filp_close(file, files); @@ -663,11 +677,16 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, struct file *file; file = pick_file(cur_fds, fd++); - if (!file) + if (!IS_ERR(file)) { + /* found a valid file to close */ + filp_close(file, cur_fds); + cond_resched(); continue; + } - filp_close(file, cur_fds); - cond_resched(); + /* beyond the last fd in that table */ + if (PTR_ERR(file) == -EINVAL) + return; } } @@ -682,7 +701,6 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, */ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { - unsigned int cur_max; struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; @@ -692,26 +710,26 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) if (fd > max_fd) return -EINVAL; - rcu_read_lock(); - cur_max = files_fdtable(cur_fds)->max_fds; - rcu_read_unlock(); - - /* cap to last valid index into fdtable */ - cur_max--; - if (flags & CLOSE_RANGE_UNSHARE) { int ret; unsigned int max_unshare_fds = NR_OPEN_MAX; /* - * If the requested range is greater than the current maximum, - * we're closing everything so only copy all file descriptors - * beneath the lowest file descriptor. - * If the caller requested all fds to be made cloexec copy all - * of the file descriptors since they still want to use them. + * If the caller requested all fds to be made cloexec we always + * copy all of the file descriptors since they still want to + * use them. */ - if (!(flags & CLOSE_RANGE_CLOEXEC) && (max_fd >= cur_max)) - max_unshare_fds = fd; + if (!(flags & CLOSE_RANGE_CLOEXEC)) { + /* + * If the requested range is greater than the current + * maximum, we're closing everything so only copy all + * file descriptors beneath the lowest file descriptor. + */ + rcu_read_lock(); + if (max_fd >= last_fd(files_fdtable(cur_fds))) + max_unshare_fds = fd; + rcu_read_unlock(); + } ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); if (ret) @@ -725,8 +743,6 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) swap(cur_fds, fds); } - max_fd = min(max_fd, cur_max); - if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4c3370548982..eb57dade6076 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2730,23 +2730,6 @@ int write_inode_now(struct inode *inode, int sync) EXPORT_SYMBOL(write_inode_now); /** - * sync_inode - write an inode and its pages to disk. - * @inode: the inode to sync - * @wbc: controls the writeback mode - * - * sync_inode() will write an inode and its pages to disk. It will also - * correctly update the inode on its superblock's dirty inode lists and will - * update inode->i_state. - * - * The caller must have a ref on the inode. 
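/*
 * [Illustrative aside, not part of the patch] pick_file() above adopts
 * the kernel's error-pointer idiom: a single pointer return carries
 * either a valid struct file or a small negative errno encoded with
 * ERR_PTR().  -EINVAL means the fd lies beyond the fdtable, which lets
 * __range_close() stop iterating early; -EBADF means the slot exists
 * but is empty.  Sketch of a caller (hypothetical helper name):
 */
#include <linux/err.h>
#include <linux/fdtable.h>

static int close_one_sketch(struct files_struct *files, unsigned int fd)
{
	struct file *file = pick_file(files, fd);

	if (IS_ERR(file))
		return -EBADF;	/* close_fd() maps both errors to -EBADF */
	return filp_close(file, files);
}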
- */ -int sync_inode(struct inode *inode, struct writeback_control *wbc) -{ - return writeback_single_inode(inode, wbc); -} -EXPORT_SYMBOL(sync_inode); - -/** * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. @@ -2762,6 +2745,6 @@ int sync_inode_metadata(struct inode *inode, int wait) .nr_to_write = 0, /* metadata-only */ }; - return sync_inode(inode, &wbc); + return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(sync_inode_metadata); diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 427efa73b9bd..b313a978ae0a 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -14,6 +14,7 @@ config FSCACHE config FSCACHE_STATS bool "Gather statistical information on local caching" depends on FSCACHE && PROC_FS + select NETFS_STATS help This option causes statistical information to be gathered on local caching and exported through file: @@ -28,23 +29,6 @@ config FSCACHE_STATS See Documentation/filesystems/caching/fscache.rst for more information. -config FSCACHE_HISTOGRAM - bool "Gather latency information on local caching" - depends on FSCACHE && PROC_FS - help - This option causes latency information to be gathered on local - caching and exported through file: - - /proc/fs/fscache/histogram - - The generation of this histogram adds a certain amount of overhead to - execution as there are a number of points at which data is gathered, - and on a multi-CPU system these may be on cachelines that keep - bouncing between CPUs. On the other hand, the histogram may be - useful for debugging purposes. Saying 'N' here is recommended. - - See Documentation/filesystems/caching/fscache.rst for more information. - config FSCACHE_DEBUG bool "Debug FS-Cache" depends on FSCACHE @@ -54,10 +38,3 @@ config FSCACHE_DEBUG enabled by setting bits in /sys/modules/fscache/parameter/debug. See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_OBJECT_LIST - bool "Maintain global object list for debugging purposes" - depends on FSCACHE && PROC_FS - help - Maintain a global list of active fscache objects that can be - retrieved through /proc/fs/fscache/objects for debugging purposes diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 3b2ffa93ac18..03a871d689bb 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -16,7 +16,5 @@ fscache-y := \ fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o -fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o -fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index fcc136361415..bd4f44c1cce0 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -116,7 +116,7 @@ struct fscache_cache *fscache_select_cache_for_object( cache = NULL; spin_unlock(&cookie->lock); - _leave(" = %p [parent]", cache); + _leave(" = %s [parent]", cache ? 
cache->tag->name : "NULL"); return cache; } @@ -152,14 +152,14 @@ struct fscache_cache *fscache_select_cache_for_object( if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) return NULL; - _leave(" = %p [specific]", tag->cache); + _leave(" = %s [specific]", tag->name); return tag->cache; no_preference: /* netfs has no preference - just select first cache */ cache = list_entry(fscache_cache_list.next, struct fscache_cache, link); - _leave(" = %p [first]", cache); + _leave(" = %s [first]", cache->tag->name); return cache; } @@ -261,7 +261,6 @@ int fscache_add_cache(struct fscache_cache *cache, spin_lock(&cache->object_list_lock); list_add_tail(&ifsdef->cache_link, &cache->object_list); spin_unlock(&cache->object_list_lock); - fscache_objlist_add(ifsdef); /* add the cache's netfs definition index object to the top level index * cookie as a known backing object */ @@ -270,7 +269,7 @@ int fscache_add_cache(struct fscache_cache *cache, hlist_add_head(&ifsdef->cookie_link, &fscache_fsdef_index.backing_objects); - atomic_inc(&fscache_fsdef_index.usage); + refcount_inc(&fscache_fsdef_index.ref); /* done */ spin_unlock(&fscache_fsdef_index.lock); @@ -335,7 +334,7 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, struct fscache_object, cache_link); list_move_tail(&object->cache_link, dying_objects); - _debug("withdraw %p", object->cookie); + _debug("withdraw %x", object->cookie->debug_id); /* This must be done under object_list_lock to prevent * a race with fscache_drop_object(). diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 751bc5b1cddf..cd42be646ed3 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -19,6 +19,8 @@ static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); #define fscache_cookie_hash_shift 15 static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift]; +static LIST_HEAD(fscache_cookies); +static DEFINE_RWLOCK(fscache_cookies_lock); static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, loff_t object_size); @@ -29,21 +31,29 @@ static int fscache_attach_object(struct fscache_cookie *cookie, static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) { - struct hlist_node *object; + struct fscache_object *object; + struct hlist_node *o; const u8 *k; unsigned loop; - pr_err("%c-cookie c=%p [p=%p fl=%lx nc=%u na=%u]\n", - prefix, cookie, cookie->parent, cookie->flags, + pr_err("%c-cookie c=%08x [p=%08x fl=%lx nc=%u na=%u]\n", + prefix, + cookie->debug_id, + cookie->parent ? cookie->parent->debug_id : 0, + cookie->flags, atomic_read(&cookie->n_children), atomic_read(&cookie->n_active)); - pr_err("%c-cookie d=%p n=%p\n", - prefix, cookie->def, cookie->netfs_data); - - object = READ_ONCE(cookie->backing_objects.first); - if (object) - pr_err("%c-cookie o=%p\n", - prefix, hlist_entry(object, struct fscache_object, cookie_link)); + pr_err("%c-cookie d=%p{%s} n=%p\n", + prefix, + cookie->def, + cookie->def ? cookie->def->name : "?", + cookie->netfs_data); + + o = READ_ONCE(cookie->backing_objects.first); + if (o) { + object = hlist_entry(o, struct fscache_object, cookie_link); + pr_err("%c-cookie o=%u\n", prefix, object->debug_id); + } pr_err("%c-key=[%u] '", prefix, cookie->key_len); k = (cookie->key_len <= sizeof(cookie->inline_key)) ? 
@@ -57,6 +67,9 @@ void fscache_free_cookie(struct fscache_cookie *cookie) { if (cookie) { BUG_ON(!hlist_empty(&cookie->backing_objects)); + write_lock(&fscache_cookies_lock); + list_del(&cookie->proc_link); + write_unlock(&fscache_cookies_lock); if (cookie->aux_len > sizeof(cookie->inline_aux)) kfree(cookie->aux); if (cookie->key_len > sizeof(cookie->inline_key)) @@ -74,10 +87,8 @@ void fscache_free_cookie(struct fscache_cookie *cookie) static int fscache_set_key(struct fscache_cookie *cookie, const void *index_key, size_t index_key_len) { - unsigned long long h; u32 *buf; int bufs; - int i; bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf)); @@ -91,17 +102,7 @@ static int fscache_set_key(struct fscache_cookie *cookie, } memcpy(buf, index_key, index_key_len); - - /* Calculate a hash and combine this with the length in the first word - * or first half word - */ - h = (unsigned long)cookie->parent; - h += index_key_len + cookie->type; - - for (i = 0; i < bufs; i++) - h += buf[i]; - - cookie->key_hash = h ^ (h >> 32); + cookie->key_hash = fscache_hash(0, buf, bufs); return 0; } @@ -129,6 +130,8 @@ static long fscache_compare_cookie(const struct fscache_cookie *a, return memcmp(ka, kb, a->key_len); } +static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1); + /* * Allocate a cookie. */ @@ -161,8 +164,9 @@ struct fscache_cookie *fscache_alloc_cookie( goto nomem; } - atomic_set(&cookie->usage, 1); + refcount_set(&cookie->ref, 1); atomic_set(&cookie->n_children, 0); + cookie->debug_id = atomic_inc_return(&fscache_cookie_debug_id); /* We keep the active count elevated until relinquishment to prevent an * attempt to wake up every time the object operations queue quiesces. @@ -181,6 +185,10 @@ struct fscache_cookie *fscache_alloc_cookie( /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + + write_lock(&fscache_cookies_lock); + list_add_tail(&cookie->proc_link, &fscache_cookies); + write_unlock(&fscache_cookies_lock); return cookie; nomem: @@ -217,8 +225,8 @@ struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) collision: if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) { - trace_fscache_cookie(cursor, fscache_cookie_collision, - atomic_read(&cursor->usage)); + trace_fscache_cookie(cursor->debug_id, refcount_read(&cursor->ref), + fscache_cookie_collision); pr_err("Duplicate cookie detected\n"); fscache_print_cookie(cursor, 'O'); fscache_print_cookie(candidate, 'N'); @@ -297,7 +305,8 @@ struct fscache_cookie *__fscache_acquire_cookie( cookie = fscache_hash_cookie(candidate); if (!cookie) { - trace_fscache_cookie(candidate, fscache_cookie_discard, 1); + trace_fscache_cookie(candidate->debug_id, 1, + fscache_cookie_discard); goto out; } @@ -355,7 +364,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie, bool (*can_enable)(void *data), void *data) { - _enter("%p", cookie); + _enter("%x", cookie->debug_id); trace_fscache_enable(cookie); @@ -452,10 +461,8 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, /* we may be required to wait for lookup to complete at this point */ if (!fscache_defer_lookup) { - _debug("non-deferred lookup %p", &cookie->flags); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, TASK_UNINTERRUPTIBLE); - _debug("complete"); if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) goto unavailable; } @@ -480,7 +487,7 @@ static int fscache_alloc_object(struct fscache_cache *cache, struct 
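/*
 * [Illustrative aside, not part of the patch] The cookie hunks above
 * convert the plain atomic_t usage count into a refcount_t, which
 * saturates instead of wrapping on overflow and WARNs on
 * increment-from-zero, so refcounting bugs surface as warnings rather
 * than silent use-after-free.  The __refcount_inc() and
 * __refcount_dec_and_test() variants additionally report the old
 * count, which the code above feeds to its tracepoints.  Generic
 * sketch with a made-up struct:
 */
#include <linux/refcount.h>
#include <linux/slab.h>

struct thing {
	refcount_t ref;
};

static void thing_get(struct thing *t)
{
	refcount_inc(&t->ref);			/* WARNs if ref was zero */
}

static void thing_put(struct thing *t)
{
	if (refcount_dec_and_test(&t->ref))	/* true only for the last put */
		kfree(t);
}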
fscache_object *object; int ret; - _enter("%p,%p{%s}", cache, cookie, cookie->def->name); + _enter("%s,%x{%s}", cache->tag->name, cookie->debug_id, cookie->def->name); spin_lock(&cookie->lock); hlist_for_each_entry(object, &cookie->backing_objects, @@ -600,8 +607,6 @@ static int fscache_attach_object(struct fscache_cookie *cookie, /* Attach to the cookie. The object already has a ref on it. */ hlist_add_head(&object->cookie_link, &cookie->backing_objects); - - fscache_objlist_add(object); ret = 0; cant_attach_object: @@ -658,7 +663,7 @@ EXPORT_SYMBOL(__fscache_invalidate); */ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) { - _enter("%p", cookie); + _enter("%x", cookie->debug_id); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, TASK_UNINTERRUPTIBLE); @@ -713,7 +718,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, struct fscache_object *object; bool awaken = false; - _enter("%p,%u", cookie, invalidate); + _enter("%x,%u", cookie->debug_id, invalidate); trace_fscache_disable(cookie); @@ -803,8 +808,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, return; } - _enter("%p{%s,%p,%d},%d", - cookie, cookie->def->name, cookie->netfs_data, + _enter("%x{%s,%d},%d", + cookie->debug_id, cookie->def->name, atomic_read(&cookie->n_active), retire); trace_fscache_relinquish(cookie, retire); @@ -821,13 +826,12 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, BUG_ON(!radix_tree_empty(&cookie->stores)); if (cookie->parent) { - ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); + ASSERTCMP(refcount_read(&cookie->parent->ref), >, 0); ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); atomic_dec(&cookie->parent->n_children); } /* Dispose of the netfs's link to the cookie */ - ASSERTCMP(atomic_read(&cookie->usage), >, 0); fscache_cookie_put(cookie, fscache_cookie_put_relinquish); _leave(""); @@ -857,17 +861,17 @@ void fscache_cookie_put(struct fscache_cookie *cookie, enum fscache_cookie_trace where) { struct fscache_cookie *parent; - int usage; + int ref; - _enter("%p", cookie); + _enter("%x", cookie->debug_id); do { - usage = atomic_dec_return(&cookie->usage); - trace_fscache_cookie(cookie, where, usage); + unsigned int cookie_debug_id = cookie->debug_id; + bool zero = __refcount_dec_and_test(&cookie->ref, &ref); - if (usage > 0) + trace_fscache_cookie(cookie_debug_id, ref - 1, where); + if (!zero) return; - BUG_ON(usage < 0); parent = cookie->parent; fscache_unhash_cookie(cookie); @@ -881,6 +885,19 @@ void fscache_cookie_put(struct fscache_cookie *cookie, } /* + * Get a reference to a cookie. 
+ */ +struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + int ref; + + __refcount_inc(&cookie->ref, &ref); + trace_fscache_cookie(cookie->debug_id, ref + 1, where); + return cookie; +} + +/* * check the consistency between the netfs inode and the backing cache * * NOTE: it only serves no-index type @@ -958,3 +975,97 @@ inconsistent: return -ESTALE; } EXPORT_SYMBOL(__fscache_check_consistency); + +/* + * Generate a list of extant cookies in /proc/fs/fscache/cookies + */ +static int fscache_cookies_seq_show(struct seq_file *m, void *v) +{ + struct fscache_cookie *cookie; + unsigned int keylen = 0, auxlen = 0; + char _type[3], *type; + u8 *p; + + if (v == &fscache_cookies) { + seq_puts(m, + "COOKIE PARENT USAGE CHILD ACT TY FL DEF NETFS_DATA\n" + "======== ======== ===== ===== === == === ================ ==========\n" + ); + return 0; + } + + cookie = list_entry(v, struct fscache_cookie, proc_link); + + switch (cookie->type) { + case 0: + type = "IX"; + break; + case 1: + type = "DT"; + break; + default: + snprintf(_type, sizeof(_type), "%02u", + cookie->type); + type = _type; + break; + } + + seq_printf(m, + "%08x %08x %5u %5u %3u %s %03lx %-16s %px", + cookie->debug_id, + cookie->parent ? cookie->parent->debug_id : 0, + refcount_read(&cookie->ref), + atomic_read(&cookie->n_children), + atomic_read(&cookie->n_active), + type, + cookie->flags, + cookie->def->name, + cookie->netfs_data); + + keylen = cookie->key_len; + auxlen = cookie->aux_len; + + if (keylen > 0 || auxlen > 0) { + seq_puts(m, " "); + p = keylen <= sizeof(cookie->inline_key) ? + cookie->inline_key : cookie->key; + for (; keylen > 0; keylen--) + seq_printf(m, "%02x", *p++); + if (auxlen > 0) { + seq_puts(m, ", "); + p = auxlen <= sizeof(cookie->inline_aux) ? + cookie->inline_aux : cookie->aux; + for (; auxlen > 0; auxlen--) + seq_printf(m, "%02x", *p++); + } + } + + seq_puts(m, "\n"); + return 0; +} + +static void *fscache_cookies_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(fscache_cookies_lock) +{ + read_lock(&fscache_cookies_lock); + return seq_list_start_head(&fscache_cookies, *_pos); +} + +static void *fscache_cookies_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &fscache_cookies, _pos); +} + +static void fscache_cookies_seq_stop(struct seq_file *m, void *v) + __releases(rcu) +{ + read_unlock(&fscache_cookies_lock); +} + + +const struct seq_operations fscache_cookies_seq_ops = { + .start = fscache_cookies_seq_start, + .next = fscache_cookies_seq_next, + .stop = fscache_cookies_seq_stop, + .show = fscache_cookies_seq_show, +}; diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index 09ed8795ad86..0402673c680e 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c @@ -45,7 +45,8 @@ static struct fscache_cookie_def fscache_fsdef_index_def = { }; struct fscache_cookie fscache_fsdef_index = { - .usage = ATOMIC_INIT(1), + .debug_id = 1, + .ref = REFCOUNT_INIT(1), .n_active = ATOMIC_INIT(1), .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), .backing_objects = HLIST_HEAD_INIT, diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c deleted file mode 100644 index 4e5beeaaf454..000000000000 --- a/fs/fscache/histogram.c +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* FS-Cache latency histogram - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. 
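/*
 * [Illustrative aside, not part of the patch] The new
 * /proc/fs/fscache/cookies file above is a standard seq_file walk over
 * a kernel list.  seq_list_start_head() yields the list head itself as
 * the first element, which is how ->show() gets to print the banner
 * row.  Minimal sketch of the same pattern with made-up
 * demo_list/demo_lock:
 */
#include <linux/seq_file.h>
#include <linux/list.h>
#include <linux/spinlock.h>

static LIST_HEAD(demo_list);
static DEFINE_RWLOCK(demo_lock);

static void *demo_start(struct seq_file *m, loff_t *pos)
	__acquires(demo_lock)
{
	read_lock(&demo_lock);
	return seq_list_start_head(&demo_list, *pos);
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	return seq_list_next(v, &demo_list, pos);
}

static void demo_stop(struct seq_file *m, void *v)
	__releases(demo_lock)
{
	read_unlock(&demo_lock);
}

static int demo_show(struct seq_file *m, void *v)
{
	if (v == &demo_list) {		/* the list head: emit the banner */
		seq_puts(m, "ENTRY\n");
		return 0;
	}
	/* a real ->show() would list_entry(v, ...) and print the record */
	return 0;
}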
- * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL THREAD -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include "internal.h" - -atomic_t fscache_obj_instantiate_histogram[HZ]; -atomic_t fscache_objs_histogram[HZ]; -atomic_t fscache_ops_histogram[HZ]; -atomic_t fscache_retrieval_delay_histogram[HZ]; -atomic_t fscache_retrieval_histogram[HZ]; - -/* - * display the time-taken histogram - */ -static int fscache_histogram_show(struct seq_file *m, void *v) -{ - unsigned long index; - unsigned n[5], t; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n"); - return 0; - case 2: - seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n"); - return 0; - default: - index = (unsigned long) v - 3; - n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]); - n[1] = atomic_read(&fscache_ops_histogram[index]); - n[2] = atomic_read(&fscache_objs_histogram[index]); - n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]); - n[4] = atomic_read(&fscache_retrieval_histogram[index]); - if (!(n[0] | n[1] | n[2] | n[3] | n[4])) - return 0; - - t = (index * 1000) / HZ; - - seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n", - index, t, n[0], n[1], n[2], n[3], n[4]); - return 0; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos) -{ - if ((unsigned long long)*_pos >= HZ + 2) - return NULL; - if (*_pos == 0) - *_pos = 1; - return (void *)(unsigned long) *_pos; -} - -/* - * move to the next line - */ -static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return (unsigned long long)*pos > HZ + 2 ? 
- NULL : (void *)(unsigned long) *pos; -} - -/* - * clean up after reading - */ -static void fscache_histogram_stop(struct seq_file *m, void *v) -{ -} - -const struct seq_operations fscache_histogram_ops = { - .start = fscache_histogram_start, - .stop = fscache_histogram_stop, - .next = fscache_histogram_next, - .show = fscache_histogram_show, -}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index c483863b740a..c3e4804b8fcb 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -45,6 +45,7 @@ extern struct fscache_cache *fscache_select_cache_for_object( * cookie.c */ extern struct kmem_cache *fscache_cookie_jar; +extern const struct seq_operations fscache_cookies_seq_ops; extern void fscache_free_cookie(struct fscache_cookie *); extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, @@ -53,9 +54,18 @@ extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, const void *, size_t, void *, loff_t); extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *); +extern struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *, + enum fscache_cookie_trace); extern void fscache_cookie_put(struct fscache_cookie *, enum fscache_cookie_trace); +static inline void fscache_cookie_see(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), + where); +} + /* * fsdef.c */ @@ -63,30 +73,6 @@ extern struct fscache_cookie fscache_fsdef_index; extern struct fscache_cookie_def fscache_fsdef_netfs_def; /* - * histogram.c - */ -#ifdef CONFIG_FSCACHE_HISTOGRAM -extern atomic_t fscache_obj_instantiate_histogram[HZ]; -extern atomic_t fscache_objs_histogram[HZ]; -extern atomic_t fscache_ops_histogram[HZ]; -extern atomic_t fscache_retrieval_delay_histogram[HZ]; -extern atomic_t fscache_retrieval_histogram[HZ]; - -static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) -{ - unsigned long jif = jiffies - start_jif; - if (jif >= HZ) - jif = HZ - 1; - atomic_inc(&histogram[jif]); -} - -extern const struct seq_operations fscache_histogram_ops; - -#else -#define fscache_hist(hist, start_jif) do {} while (0) -#endif - -/* * main.c */ extern unsigned fscache_defer_lookup; @@ -97,6 +83,8 @@ extern struct workqueue_struct *fscache_object_wq; extern struct workqueue_struct *fscache_op_wq; DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); +extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); + static inline bool fscache_object_congested(void) { return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); @@ -108,19 +96,6 @@ static inline bool fscache_object_congested(void) extern void fscache_enqueue_object(struct fscache_object *); /* - * object-list.c - */ -#ifdef CONFIG_FSCACHE_OBJECT_LIST -extern const struct proc_ops fscache_objlist_proc_ops; - -extern void fscache_objlist_add(struct fscache_object *); -extern void fscache_objlist_remove(struct fscache_object *); -#else -#define fscache_objlist_add(object) do {} while(0) -#define fscache_objlist_remove(object) do {} while(0) -#endif - -/* * operation.c */ extern int fscache_submit_exclusive_op(struct fscache_object *, @@ -320,14 +295,6 @@ static inline void fscache_raise_event(struct fscache_object *object, fscache_enqueue_object(object); } -static inline void fscache_cookie_get(struct fscache_cookie *cookie, - enum fscache_cookie_trace where) -{ - int usage = atomic_inc_return(&cookie->usage); - - trace_fscache_cookie(cookie, 
where, usage); -} - /* * get an extra reference to a netfs retrieval context */ diff --git a/fs/fscache/main.c b/fs/fscache/main.c index c1e6cc9091aa..4207f98e405f 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -94,6 +94,45 @@ static struct ctl_table fscache_sysctls_root[] = { #endif /* + * Mixing scores (in bits) for (7,20): + * Input delta: 1-bit 2-bit + * 1 round: 330.3 9201.6 + * 2 rounds: 1246.4 25475.4 + * 3 rounds: 1907.1 31295.1 + * 4 rounds: 2042.3 31718.6 + * Perfect: 2048 31744 + * (32*64) (32*31/2 * 64) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol32(x, 7),\ + x += y, y = rol32(y,20),\ + y *= 9 ) + +static inline unsigned int fold_hash(unsigned long x, unsigned long y) +{ + /* Use arch-optimized multiply if one exists */ + return __hash_32(y ^ __hash_32(x)); +} + +/* + * Generate a hash. This is derived from full_name_hash(), but we want to be + * sure it is arch independent and that it doesn't change as bits of the + * computed hash value might appear on disk. The caller also guarantees that + * the hashed data will be a series of aligned 32-bit words. + */ +unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) +{ + unsigned int a, x = 0, y = salt; + + for (; n; n--) { + a = *data++; + HASH_MIX(x, y, a); + } + return fold_hash(x, y); +} + +/* * initialise the fs caching module */ static int __init fscache_init(void) diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index cce92216fa28..d6bdb7b5e723 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -37,7 +37,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) if (!cookie) goto already_registered; if (cookie != candidate) { - trace_fscache_cookie(candidate, fscache_cookie_discard, 1); + trace_fscache_cookie(candidate->debug_id, 1, fscache_cookie_discard); fscache_free_cookie(candidate); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c deleted file mode 100644 index e106a1a1600d..000000000000 --- a/fs/fscache/object-list.c +++ /dev/null @@ -1,414 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Global fscache object list maintainer and viewer - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. 
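/*
 * [Illustrative aside, not part of the patch] fscache_hash() above must
 * produce the same value on every architecture because the result may
 * end up on disk, which is why it cannot simply use full_name_hash().
 * A standalone C restatement, with rol32() and the GOLDEN_RATIO_32
 * multiplier copied from the kernel's <linux/bitops.h> and
 * <linux/hash.h>, intended to mirror the routine above:
 */
#include <stdint.h>

static inline uint32_t rol32_sketch(uint32_t w, unsigned int s)
{
	return (w << s) | (w >> (32 - s));
}

static inline uint32_t hash32_sketch(uint32_t v)
{
	return v * 0x61C88647;			/* GOLDEN_RATIO_32 */
}

static uint32_t fscache_hash_sketch(uint32_t salt, const uint32_t *data,
				    unsigned int n)
{
	uint32_t x = 0, y = salt;

	while (n--) {
		x ^= *data++;			/* one HASH_MIX round */
		y ^= x; x = rol32_sketch(x, 7);
		x += y; y = rol32_sketch(y, 20);
		y *= 9;
	}
	return hash32_sketch(y ^ hash32_sketch(x));	/* fold_hash() */
}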
- * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL COOKIE -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/key.h> -#include <keys/user-type.h> -#include "internal.h" - -static struct rb_root fscache_object_list; -static DEFINE_RWLOCK(fscache_object_list_lock); - -struct fscache_objlist_data { - unsigned long config; /* display configuration */ -#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */ -#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */ -#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */ -#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */ -#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */ -#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */ -#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */ -#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */ -#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */ -#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ -#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ -#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ -#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ -#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ -}; - -/* - * Add an object to the object list - * - we use the address of the fscache_object structure as the key into the - * tree - */ -void fscache_objlist_add(struct fscache_object *obj) -{ - struct fscache_object *xobj; - struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; - - ASSERT(RB_EMPTY_NODE(&obj->objlist_link)); - - write_lock(&fscache_object_list_lock); - - while (*p) { - parent = *p; - xobj = rb_entry(parent, struct fscache_object, objlist_link); - - if (obj < xobj) - p = &(*p)->rb_left; - else if (obj > xobj) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&obj->objlist_link, parent, p); - rb_insert_color(&obj->objlist_link, &fscache_object_list); - - write_unlock(&fscache_object_list_lock); -} - -/* - * Remove an object from the object list. 
- */ -void fscache_objlist_remove(struct fscache_object *obj) -{ - if (RB_EMPTY_NODE(&obj->objlist_link)) - return; - - write_lock(&fscache_object_list_lock); - - BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); - rb_erase(&obj->objlist_link, &fscache_object_list); - - write_unlock(&fscache_object_list_lock); -} - -/* - * find the object in the tree on or after the specified index - */ -static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) -{ - struct fscache_object *pobj, *obj = NULL, *minobj = NULL; - struct rb_node *p; - unsigned long pos; - - if (*_pos >= (unsigned long) ERR_PTR(-ENOENT)) - return NULL; - pos = *_pos; - - /* banners (can't represent line 0 by pos 0 as that would involve - * returning a NULL pointer) */ - if (pos == 0) - return (struct fscache_object *)(long)++(*_pos); - if (pos < 3) - return (struct fscache_object *)pos; - - pobj = (struct fscache_object *)pos; - p = fscache_object_list.rb_node; - while (p) { - obj = rb_entry(p, struct fscache_object, objlist_link); - if (pobj < obj) { - if (!minobj || minobj > obj) - minobj = obj; - p = p->rb_left; - } else if (pobj > obj) { - p = p->rb_right; - } else { - minobj = obj; - break; - } - obj = NULL; - } - - if (!minobj) - *_pos = (unsigned long) ERR_PTR(-ENOENT); - else if (minobj != obj) - *_pos = (unsigned long) minobj; - return minobj; -} - -/* - * set up the iterator to start reading from the first line - */ -static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos) - __acquires(&fscache_object_list_lock) -{ - read_lock(&fscache_object_list_lock); - return fscache_objlist_lookup(_pos); -} - -/* - * move to the next line - */ -static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos) -{ - (*_pos)++; - return fscache_objlist_lookup(_pos); -} - -/* - * clean up after reading - */ -static void fscache_objlist_stop(struct seq_file *m, void *v) - __releases(&fscache_object_list_lock) -{ - read_unlock(&fscache_object_list_lock); -} - -/* - * display an object - */ -static int fscache_objlist_show(struct seq_file *m, void *v) -{ - struct fscache_objlist_data *data = m->private; - struct fscache_object *obj = v; - struct fscache_cookie *cookie; - unsigned long config = data->config; - char _type[3], *type; - u8 *p; - - if ((unsigned long) v == 1) { - seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" - " EM EV FL S" - " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); - if (config & (FSCACHE_OBJLIST_CONFIG_KEY | - FSCACHE_OBJLIST_CONFIG_AUX)) - seq_puts(m, " "); - if (config & FSCACHE_OBJLIST_CONFIG_KEY) - seq_puts(m, "OBJECT_KEY"); - if ((config & (FSCACHE_OBJLIST_CONFIG_KEY | - FSCACHE_OBJLIST_CONFIG_AUX)) == - (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) - seq_puts(m, ", "); - if (config & FSCACHE_OBJLIST_CONFIG_AUX) - seq_puts(m, "AUX_DATA"); - seq_puts(m, "\n"); - return 0; - } - - if ((unsigned long) v == 2) { - seq_puts(m, "======== ======== ==== ===== === === === == =====" - " == == == =" - " | ================ == == ================"); - if (config & (FSCACHE_OBJLIST_CONFIG_KEY | - FSCACHE_OBJLIST_CONFIG_AUX)) - seq_puts(m, " ================"); - seq_puts(m, "\n"); - return 0; - } - - /* filter out any unwanted objects */ -#define FILTER(criterion, _yes, _no) \ - do { \ - unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \ - unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \ - if (criterion) { \ - if (!(config & yes)) \ - return 0; \ - } else { \ - if (!(config & no)) \ - return 0; \ - } \ - } while(0) - - cookie = obj->cookie; - if (~config) { - 
FILTER(cookie->def, - COOKIE, NOCOOKIE); - FILTER(fscache_object_is_active(obj) || - obj->n_ops != 0 || - obj->n_obj_ops != 0 || - obj->flags || - !list_empty(&obj->dependents), - BUSY, IDLE); - FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags), - PENDWR, NOPENDWR); - FILTER(atomic_read(&obj->n_reads), - READS, NOREADS); - FILTER(obj->events & obj->event_mask, - EVENTS, NOEVENTS); - FILTER(work_busy(&obj->work), WORK, NOWORK); - } - - seq_printf(m, - "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ", - obj->debug_id, - obj->parent ? obj->parent->debug_id : -1, - obj->state->short_name, - obj->n_children, - obj->n_ops, - obj->n_obj_ops, - obj->n_in_progress, - obj->n_exclusive, - atomic_read(&obj->n_reads), - obj->event_mask, - obj->events, - obj->flags, - work_busy(&obj->work)); - - if (fscache_use_cookie(obj)) { - uint16_t keylen = 0, auxlen = 0; - - switch (cookie->type) { - case 0: - type = "IX"; - break; - case 1: - type = "DT"; - break; - default: - snprintf(_type, sizeof(_type), "%02u", - cookie->type); - type = _type; - break; - } - - seq_printf(m, "%-16s %s %2lx %16p", - cookie->def->name, - type, - cookie->flags, - cookie->netfs_data); - - if (config & FSCACHE_OBJLIST_CONFIG_KEY) - keylen = cookie->key_len; - - if (config & FSCACHE_OBJLIST_CONFIG_AUX) - auxlen = cookie->aux_len; - - if (keylen > 0 || auxlen > 0) { - seq_puts(m, " "); - p = keylen <= sizeof(cookie->inline_key) ? - cookie->inline_key : cookie->key; - for (; keylen > 0; keylen--) - seq_printf(m, "%02x", *p++); - if (auxlen > 0) { - if (config & FSCACHE_OBJLIST_CONFIG_KEY) - seq_puts(m, ", "); - p = auxlen <= sizeof(cookie->inline_aux) ? - cookie->inline_aux : cookie->aux; - for (; auxlen > 0; auxlen--) - seq_printf(m, "%02x", *p++); - } - } - - seq_puts(m, "\n"); - fscache_unuse_cookie(obj); - } else { - seq_puts(m, "<no_netfs>\n"); - } - return 0; -} - -static const struct seq_operations fscache_objlist_ops = { - .start = fscache_objlist_start, - .stop = fscache_objlist_stop, - .next = fscache_objlist_next, - .show = fscache_objlist_show, -}; - -/* - * get the configuration for filtering the list - */ -static void fscache_objlist_config(struct fscache_objlist_data *data) -{ -#ifdef CONFIG_KEYS - const struct user_key_payload *confkey; - unsigned long config; - struct key *key; - const char *buf; - int len; - - key = request_key(&key_type_user, "fscache:objlist", NULL); - if (IS_ERR(key)) - goto no_config; - - config = 0; - rcu_read_lock(); - - confkey = user_key_payload_rcu(key); - if (!confkey) { - /* key was revoked */ - rcu_read_unlock(); - key_put(key); - goto no_config; - } - - buf = confkey->data; - - for (len = confkey->datalen - 1; len >= 0; len--) { - switch (buf[len]) { - case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break; - case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break; - case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break; - case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break; - case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break; - case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break; - case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break; - case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break; - case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break; - case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break; - case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break; - case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break; - } - } - - rcu_read_unlock(); - key_put(key); - - if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | 
FSCACHE_OBJLIST_CONFIG_NOCOOKIE))) - config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE; - if (!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE))) - config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE; - if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR))) - config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR; - if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS))) - config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS; - if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS))) - config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS; - if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK))) - config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK; - - data->config = config; - return; - -no_config: -#endif - data->config = ULONG_MAX; -} - -/* - * open "/proc/fs/fscache/objects" to provide a list of active objects - * - can be configured by a user-defined key added to the caller's keyrings - */ -static int fscache_objlist_open(struct inode *inode, struct file *file) -{ - struct fscache_objlist_data *data; - - data = __seq_open_private(file, &fscache_objlist_ops, sizeof(*data)); - if (!data) - return -ENOMEM; - - /* get the configuration key */ - fscache_objlist_config(data); - - return 0; -} - -/* - * clean up on close - */ -static int fscache_objlist_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = file->private_data; - - kfree(m->private); - m->private = NULL; - return seq_release(inode, file); -} - -const struct proc_ops fscache_objlist_proc_ops = { - .proc_open = fscache_objlist_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = fscache_objlist_release, -}; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index cb2146e02cd5..f346a78f4bd6 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -277,13 +277,10 @@ static void fscache_object_work_func(struct work_struct *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); - unsigned long start; _enter("{OBJ%x}", object->debug_id); - start = jiffies; fscache_object_sm_dispatcher(object); - fscache_hist(fscache_objs_histogram, start); fscache_put_object(object, fscache_obj_put_work); } @@ -436,7 +433,6 @@ static const struct fscache_state *fscache_parent_ready(struct fscache_object *o spin_lock(&parent->lock); parent->n_ops++; parent->n_obj_ops++; - object->lookup_jif = jiffies; spin_unlock(&parent->lock); _leave(""); @@ -522,7 +518,6 @@ void fscache_object_lookup_negative(struct fscache_object *object) set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - _debug("wake up lookup %p", &cookie->flags); clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); } @@ -596,7 +591,6 @@ static const struct fscache_state *fscache_object_available(struct fscache_objec object->cache->ops->lookup_complete(object); fscache_stat_d(&fscache_n_cop_lookup_complete); - fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); fscache_stat(&fscache_n_object_avail); _leave(""); @@ -799,8 +793,6 @@ static void fscache_put_object(struct fscache_object *object, */ void fscache_object_destroy(struct fscache_object *object) { - fscache_objlist_remove(object); - /* We can get rid of the cookie now */ 
fscache_cookie_put(object->cookie, fscache_cookie_put_object); object->cookie = NULL; diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 4a5651d4904e..433877107700 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -616,7 +616,6 @@ void fscache_op_work_func(struct work_struct *work) { struct fscache_operation *op = container_of(work, struct fscache_operation, work); - unsigned long start; _enter("{OBJ%x OP%x,%d}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); @@ -624,9 +623,7 @@ void fscache_op_work_func(struct work_struct *work) trace_fscache_op(op->object->cookie, op, fscache_op_work); ASSERT(op->processor != NULL); - start = jiffies; op->processor(op); - fscache_hist(fscache_ops_histogram, start); fscache_put_operation(op); _leave(""); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 991b0a871744..27df94ef0e0b 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -289,7 +289,6 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, atomic_read(&op->n_pages), ==, 0); - fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) fscache_put_context(op->cookie, op->context); @@ -324,7 +323,6 @@ struct fscache_retrieval *fscache_alloc_retrieval( op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; - op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); /* Pin the netfs read context in case we need to do the actual netfs @@ -340,8 +338,6 @@ struct fscache_retrieval *fscache_alloc_retrieval( */ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) { - unsigned long jif; - _enter(""); if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { @@ -351,7 +347,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) fscache_stat(&fscache_n_retrievals_wait); - jif = jiffies; if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, TASK_INTERRUPTIBLE) != 0) { fscache_stat(&fscache_n_retrievals_intr); @@ -362,7 +357,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); smp_rmb(); - fscache_hist(fscache_retrieval_delay_histogram, jif); _leave(" = 0 [dly]"); return 0; } diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 90a7bc22f7e1..061df8f61ffc 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -21,18 +21,16 @@ int __init fscache_proc_init(void) if (!proc_mkdir("fs/fscache", NULL)) goto error_dir; + if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL, + &fscache_cookies_seq_ops)) + goto error_cookies; + #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, fscache_stats_show)) goto error_stats; #endif -#ifdef CONFIG_FSCACHE_HISTOGRAM - if (!proc_create_seq("fs/fscache/histogram", S_IFREG | 0444, NULL, - &fscache_histogram_ops)) - goto error_histogram; -#endif - #ifdef CONFIG_FSCACHE_OBJECT_LIST if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, &fscache_objlist_proc_ops)) @@ -45,14 +43,12 @@ int __init fscache_proc_init(void) #ifdef CONFIG_FSCACHE_OBJECT_LIST error_objects: #endif -#ifdef CONFIG_FSCACHE_HISTOGRAM - remove_proc_entry("fs/fscache/histogram", NULL); -error_histogram: -#endif #ifdef CONFIG_FSCACHE_STATS remove_proc_entry("fs/fscache/stats", NULL); error_stats: #endif + remove_proc_entry("fs/fscache/cookies", NULL); +error_cookies: remove_proc_entry("fs/fscache", NULL); error_dir: _leave(" = -ENOMEM"); @@ -67,11 +63,9 @@ void 
fscache_proc_cleanup(void) #ifdef CONFIG_FSCACHE_OBJECT_LIST remove_proc_entry("fs/fscache/objects", NULL); #endif -#ifdef CONFIG_FSCACHE_HISTOGRAM - remove_proc_entry("fs/fscache/histogram", NULL); -#endif #ifdef CONFIG_FSCACHE_STATS remove_proc_entry("fs/fscache/stats", NULL); #endif + remove_proc_entry("fs/fscache/cookies", NULL); remove_proc_entry("fs/fscache", NULL); } diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 52b165319be1..337cb29a8dd5 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -11,7 +11,7 @@ #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> -struct posix_acl *fuse_get_acl(struct inode *inode, int type) +struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) { struct fuse_conn *fc = get_fuse_conn(inode); int size; @@ -19,6 +19,9 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) void *value = NULL; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + if (fuse_is_bad(inode)) return ERR_PTR(-EIO); diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index e55723744f58..281d79f8b3d3 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -444,12 +444,12 @@ static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, /* * Can't do inline reclaim in fault path. We call * dax_layout_busy_page() before we free a range. And - * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it. - * In fault path we enter with fi->i_mmap_sem held and can't drop - * it. Also in fault path we hold fi->i_mmap_sem shared and not - * exclusive, so that creates further issues with fuse_wait_dax_page(). - * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory - * range to become free and retry. + * fuse_wait_dax_page() drops mapping->invalidate_lock and reacquires it. + * In fault path we enter with mapping->invalidate_lock held and can't + * drop it. Also in fault path we hold mapping->invalidate_lock shared + * and not exclusive, so that creates further issues with + * fuse_wait_dax_page(). Hence return -EAGAIN and fuse_dax_fault() + * will wait for a memory range to become free and retry. */ if (flags & IOMAP_FAULT) { alloc_dmap = alloc_dax_mapping(fcd); @@ -513,7 +513,7 @@ static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, down_write(&fi->dax->sem); node = interval_tree_iter_first(&fi->dax->tree, idx, idx); - /* We are holding either inode lock or i_mmap_sem, and that should + /* We are holding either inode lock or invalidate_lock, and that should * ensure that dmap can't be truncated. We are holding a reference * on dmap and that should make sure it can't be reclaimed. So dmap * should still be there in tree despite the fact we dropped and @@ -660,14 +660,12 @@ static const struct iomap_ops fuse_iomap_ops = { static void fuse_wait_dax_page(struct inode *inode) { - struct fuse_inode *fi = get_fuse_inode(inode); - - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); schedule(); - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); } -/* Should be called with fi->i_mmap_sem lock held exclusively */ +/* Should be called with mapping->invalidate_lock held exclusively */ static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, loff_t start, loff_t end) { @@ -813,18 +811,18 @@ retry: * we do not want any read/write/mmap to make progress and try * to populate page cache or access memory we are trying to free. 
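/*
 * [Illustrative aside, not part of the patch] The fuse_get_acl() hunk
 * above (and the matching gfs2 change later in this diff) implements
 * the new ->get_acl(inode, type, rcu) contract: during RCU path walk
 * the filesystem must not sleep, so any implementation that would block
 * bails out with -ECHILD, and the VFS retries the lookup in ref-walk
 * mode.  Sketch with a hypothetical, possibly-sleeping fetch helper:
 */
#include <linux/err.h>
#include <linux/posix_acl.h>

static struct posix_acl *fetch_acl_slowly(struct inode *inode, int type);

static struct posix_acl *get_acl_sketch(struct inode *inode, int type, bool rcu)
{
	if (rcu)
		return ERR_PTR(-ECHILD);	/* no blocking in RCU walk */

	return fetch_acl_slowly(inode, type);	/* may issue I/O and sleep */
}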
*/ - down_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { error = 0; retry = true; - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); goto retry; } if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); - up_read(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); if (write) sb_end_pagefault(sb); @@ -960,7 +958,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, int ret; struct interval_tree_node *node; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); /* Lookup a dmap and corresponding file offset to reclaim. */ down_read(&fi->dax->sem); @@ -1021,7 +1019,7 @@ inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, out_write_dmap_sem: up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return dmap; } @@ -1050,10 +1048,10 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * had a reference or some other temporary failure, * Try again. We want to give up inline reclaim only * if there is no range assigned to this node. Otherwise - * if a deadlock is possible if we sleep with fi->i_mmap_sem - * held and worker to free memory can't make progress due - * to unavailability of fi->i_mmap_sem lock. So sleep - * only if fi->dax->nr=0 + * a deadlock is possible if we sleep with + * mapping->invalidate_lock held and the worker freeing memory + * can't make progress due to unavailability of + * mapping->invalidate_lock. So sleep only if fi->dax->nr=0 */ if (retry) continue; @@ -1061,8 +1059,8 @@ alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) * There are no mappings which can be reclaimed. Wait for one. * We are not holding fi->dax->sem. So it is possible * that range gets added now. But as we are not holding - * fi->i_mmap_sem, worker should still be able to free up - * a range and wake us up. + * mapping->invalidate_lock, the worker should still be able + * to free up a range and wake us up. */ if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { if (wait_event_killable_exclusive(fcd->range_waitq, @@ -1108,7 +1106,7 @@ static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, /* * Free a range of memory. * Locking: - * 1. Take fi->i_mmap_sem to block dax faults. + * 1. Take mapping->invalidate_lock to block dax faults. * 2. Take fi->dax->sem to protect interval tree and also to make sure * read/write can not reuse a dmap which we might be freeing. */ @@ -1122,7 +1120,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); if (ret) { pr_debug("virtio_fs: fuse_dax_break_layouts() failed. 
err=%d\n", @@ -1134,7 +1132,7 @@ static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); up_write(&fi->dax->sem); out_mmap_sem: - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return ret; } @@ -1235,8 +1233,6 @@ void fuse_dax_conn_free(struct fuse_conn *fc) static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) { long nr_pages, nr_ranges; - void *kaddr; - pfn_t pfn; struct fuse_dax_mapping *range; int ret, id; size_t dax_size = -1; @@ -1248,8 +1244,8 @@ static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); id = dax_read_lock(); - nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr, - &pfn); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), NULL, + NULL); dax_read_unlock(id); if (nr_pages < 0) { pr_debug("dax_direct_access() returned %ld\n", nr_pages); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index eade6f965b2e..d9b977c0f38d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1556,6 +1556,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); + struct address_space *mapping = inode->i_mapping; FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; @@ -1580,11 +1581,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } if (FUSE_IS_DAX(inode) && is_truncate) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(mapping); fault_blocked = true; err = fuse_dax_break_layouts(inode, 0, 0); if (err) { - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } } @@ -1694,13 +1695,13 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); - invalidate_inode_pages2(inode->i_mapping); + invalidate_inode_pages2(mapping); } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); out: if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return 0; @@ -1711,7 +1712,7 @@ error: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (fault_blocked) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(mapping); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 97f860cfc195..621a662c19fb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -243,7 +243,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) } if (dax_truncate) { - down_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out; @@ -255,7 +255,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) out: if (dax_truncate) - up_write(&get_fuse_inode(inode)->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if (is_wb_truncate | dax_truncate) { fuse_release_nowrite(inode); @@ -2920,7 +2920,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (lock_inode) { inode_lock(inode); if (block_faults) { - down_write(&fi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); err = fuse_dax_break_layouts(inode, 0, 0); if (err) goto out; @@ -2976,7 +2976,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (block_faults) - up_write(&fi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); if 
(lock_inode) inode_unlock(inode); @@ -3045,7 +3045,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, * modifications. Yet this does give less guarantees than if the * copying was performed with write(2). * - * To fix this a i_mmap_sem style lock could be used to prevent new + * To fix this a mapping->invalidate_lock could be used to prevent new * faults while the copy is ongoing. */ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 07829ce78695..3d18556a01ad 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -149,13 +149,6 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; - /** - * Can't take inode lock in fault path (leads to circular dependency). - * Introduce another semaphore which can be taken in fault path and - * then other filesystem paths can take this to block faults. - */ - struct rw_semaphore i_mmap_sem; - #ifdef CONFIG_FUSE_DAX /* * Dax specific inode data @@ -1216,7 +1209,7 @@ extern const struct xattr_handler *fuse_acl_xattr_handlers[]; extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; -struct posix_acl *fuse_get_acl(struct inode *inode, int type); +struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b9beb39a4a18..e07e429f32e1 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -85,7 +85,6 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); - init_rwsem(&fi->i_mmap_sem); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); if (!fi->forget) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 9165d70ead07..734d1f05d823 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -57,13 +57,16 @@ static struct posix_acl *__gfs2_get_acl(struct inode *inode, int type) return acl; } -struct posix_acl *gfs2_get_acl(struct inode *inode, int type) +struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; bool need_unlock = false; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { int ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index eccc6a43326c..cd180ca7c959 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -11,7 +11,7 @@ #define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) -extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); +extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 81d8f064126e..005e920f5d4a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -574,10 +574,9 @@ void adjust_fs_space(struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - struct buffer_head *m_bh, *l_bh; + struct buffer_head *m_bh; u64 
fs_total, new_free; if (gfs2_trans_begin(sdp, 2 * RES_STATFS, 0) != 0) @@ -600,11 +599,7 @@ void adjust_fs_space(struct inode *inode) (unsigned long long)new_free); gfs2_statfs_change(sdp, new_free, new_free, 0); - if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) - goto out2; - update_statfs(sdp, m_bh, l_bh); - brelse(l_bh); -out2: + update_statfs(sdp, m_bh); brelse(m_bh); out: sdp->sd_rindex_uptodate = 0; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ed8b67b21718..5414c2c33580 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1002,7 +1002,7 @@ static void gfs2_write_unlock(struct inode *inode) } static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, - unsigned len, struct iomap *iomap) + unsigned len) { unsigned int blockmask = i_blocksize(inode) - 1; struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1013,8 +1013,7 @@ static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, } static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, - unsigned copied, struct page *page, - struct iomap *iomap) + unsigned copied, struct page *page) { struct gfs2_trans *tr = current->journal_info; struct gfs2_inode *ip = GFS2_I(inode); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 84ec053d43b4..c559827cb6f9 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -1237,9 +1237,6 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (__mandatory_lock(&ip->i_inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; - if (cmd == F_CANCELLK) { /* Hack: */ cmd = F_SETLK; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1f3902ecdded..e0eaa9cf9fb6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1494,12 +1494,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh) list_del_init(&gh->gh_list); clear_bit(HIF_HOLDER, &gh->gh_iflags); - if (find_first_holder(gl) == NULL) { - if (list_empty(&gl->gl_holders) && - !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags)) - fast_path = 1; - } + if (list_empty(&gl->gl_holders) && + !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + fast_path = 1; + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) gfs2_glock_add_to_lru(gl); @@ -2077,8 +2076,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'H'; if (test_bit(HIF_WAIT, &iflags)) *p++ = 'W'; - if (test_bit(HIF_FIRST, &iflags)) - *p++ = 'F'; *p = 0; return buf; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 54d3fbeb3002..79c621c7863d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -33,16 +33,18 @@ extern struct workqueue_struct *gfs2_control_wq; static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) { - fs_err(gl->gl_name.ln_sbd, + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + fs_err(sdp, "AIL buffer %p: blocknr %llu state 0x%08lx mapping %p page " "state 0x%lx\n", bh, (unsigned long long)bh->b_blocknr, bh->b_state, bh->b_page->mapping, bh->b_page->flags); - fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n", + fs_err(sdp, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); - gfs2_lm(gl->gl_name.ln_sbd, "AIL error\n"); - gfs2_withdraw(gl->gl_name.ln_sbd); + gfs2_lm(sdp, "AIL error\n"); + gfs2_withdraw_delayed(sdp); } /** @@ -610,16 +612,13 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl) j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); - if (error) - 
gfs2_consist(sdp); - if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) - gfs2_consist(sdp); - - /* Initialize some head of the log stuff */ - if (!gfs2_withdrawn(sdp)) { - sdp->sd_log_sequence = head.lh_sequence + 1; - gfs2_log_pointers_init(sdp, head.lh_blkno); - } + if (gfs2_assert_withdraw_delayed(sdp, !error)) + return error; + if (gfs2_assert_withdraw_delayed(sdp, head.lh_flags & + GFS2_LOG_HEAD_UNMOUNT)) + return -EIO; + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); } return 0; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e6f820f146cb..0fe49770166e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -253,7 +253,6 @@ struct gfs2_lkstats { enum { /* States */ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ - HIF_FIRST = 7, HIF_WAIT = 10, }; @@ -768,6 +767,7 @@ struct gfs2_sbd { struct gfs2_glock *sd_jinode_gl; struct gfs2_holder sd_sc_gh; + struct buffer_head *sd_sc_bh; struct gfs2_holder sd_qc_gh; struct completion sd_journal_ready; diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index dac040162ecc..50578f881e6d 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -299,6 +299,11 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); + /* don't want to call dlm if we've unmounted the lock protocol */ + if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { + gfs2_glock_free(gl); + return; + } /* don't want to skip dlm_unlock writing the lvb when lock has one */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 42c15cfc0821..f0ee3ff6f9a8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -594,7 +594,7 @@ void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, { unsigned int blks = tr->tr_reserved; unsigned int revokes = tr->tr_revokes; - unsigned int revoke_blks = 0; + unsigned int revoke_blks; *extra_revokes = 0; if (revokes) { diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 8ee05d25dfa6..ca0bb3a73912 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -761,6 +761,32 @@ static void buf_lo_before_scan(struct gfs2_jdesc *jd, jd->jd_replayed_blocks = 0; } +#define obsolete_rgrp_replay \ +"Replaying 0x%llx from jid=%d/0x%llx but we already have a bh!\n" +#define obsolete_rgrp_replay2 \ +"busy:%d, pinned:%d rg_gen:0x%llx, j_gen:0x%llx\n" + +static void obsolete_rgrp(struct gfs2_jdesc *jd, struct buffer_head *bh_log, + u64 blkno) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_rgrp *jrgd = (struct gfs2_rgrp *)bh_log->b_data; + + rgd = gfs2_blk2rgrpd(sdp, blkno, false); + if (rgd && rgd->rd_addr == blkno && + rgd->rd_bits && rgd->rd_bits->bi_bh) { + fs_info(sdp, obsolete_rgrp_replay, (unsigned long long)blkno, + jd->jd_jid, bh_log->b_blocknr); + fs_info(sdp, obsolete_rgrp_replay2, + buffer_busy(rgd->rd_bits->bi_bh) ? 
1 : 0, + buffer_pinned(rgd->rd_bits->bi_bh), + rgd->rd_igeneration, + be64_to_cpu(jrgd->rg_igeneration)); + gfs2_dump_glock(NULL, rgd->rd_gl, true); + } +} + static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, struct gfs2_log_descriptor *ld, __be64 *ptr, int pass) @@ -799,21 +825,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh_ip->b_data; - if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG)) { - struct gfs2_rgrpd *rgd; - - rgd = gfs2_blk2rgrpd(sdp, blkno, false); - if (rgd && rgd->rd_addr == blkno && - rgd->rd_bits && rgd->rd_bits->bi_bh) { - fs_info(sdp, "Replaying 0x%llx but we " - "already have a bh!\n", - (unsigned long long)blkno); - fs_info(sdp, "busy:%d, pinned:%d\n", - buffer_busy(rgd->rd_bits->bi_bh) ? 1 : 0, - buffer_pinned(rgd->rd_bits->bi_bh)); - gfs2_dump_glock(NULL, rgd->rd_gl, true); - } - } + if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG)) + obsolete_rgrp(jd, bh_log, blkno); + mark_buffer_dirty(bh_ip); } brelse(bh_log); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 7c9619997355..72d30a682ece 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -258,8 +258,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head *bh, *bhs[2]; int num = 0; - if (unlikely(gfs2_withdrawn(sdp)) && - (!sdp->sd_jdesc || gl != sdp->sd_jinode_gl)) { + if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) { *bhp = NULL; return -EIO; } @@ -317,7 +316,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) { - if (unlikely(gfs2_withdrawn(sdp))) + if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) return -EIO; wait_on_buffer(bh); @@ -328,7 +327,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) gfs2_io_error_bh_wd(sdp, bh); return -EIO; } - if (unlikely(gfs2_withdrawn(sdp))) + if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) return -EIO; return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 5f4504dd0875..7f8410d8fdc1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -614,6 +614,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; } + d_mark_dontcache(jd->jd_inode); spin_lock(&sdp->sd_jindex_spin); jd->jd_jid = sdp->sd_journals++; jip = GFS2_I(jd->jd_inode); @@ -677,6 +678,7 @@ static int init_statfs(struct gfs2_sbd *sdp) error = PTR_ERR(lsi->si_sc_inode); fs_err(sdp, "can't find local \"sc\" file#%u: %d\n", jd->jd_jid, error); + kfree(lsi); goto free_local; } lsi->si_jid = jd->jd_jid; @@ -695,8 +697,16 @@ static int init_statfs(struct gfs2_sbd *sdp) fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); goto free_local; } + /* read in the local statfs buffer - other nodes don't change it. 
*/ + error = gfs2_meta_inode_buffer(ip, &sdp->sd_sc_bh); + if (error) { + fs_err(sdp, "Cannot read in local statfs: %d\n", error); + goto unlock_sd_gh; + } return 0; +unlock_sd_gh: + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); free_local: free_local_statfs_inodes(sdp); iput(pn); @@ -710,6 +720,7 @@ out: static void uninit_statfs(struct gfs2_sbd *sdp) { if (!sdp->sd_args.ar_spectator) { + brelse(sdp->sd_sc_bh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); free_local_statfs_inodes(sdp); } @@ -1088,6 +1099,34 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp) kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); } +static int init_threads(struct gfs2_sbd *sdp) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + return 0; + +fail: + kthread_stop(sdp->sd_logd_process); + sdp->sd_logd_process = NULL; + return error; +} + /** * gfs2_fill_super - Read in superblock * @sb: The VFS superblock @@ -1216,6 +1255,14 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) goto fail_per_node; } + if (!sb_rdonly(sb)) { + error = init_threads(sdp); + if (error) { + gfs2_withdraw_delayed(sdp); + goto fail_per_node; + } + } + error = gfs2_freeze_lock(sdp, &freeze_gh, 0); if (error) goto fail_per_node; @@ -1225,6 +1272,12 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) gfs2_freeze_unlock(&freeze_gh); if (error) { + if (sdp->sd_quotad_process) + kthread_stop(sdp->sd_quotad_process); + sdp->sd_quotad_process = NULL; + if (sdp->sd_logd_process) + kthread_stop(sdp->sd_logd_process); + sdp->sd_logd_process = NULL; fs_err(sdp, "can't make FS RW: %d\n", error); goto fail_per_node; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 4d4ceb0b6903..6e00d15ef0a8 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -119,34 +119,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) return 0; } -static int init_threads(struct gfs2_sbd *sdp) -{ - struct task_struct *p; - int error = 0; - - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); - return error; - } - sdp->sd_logd_process = p; - - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); - goto fail; - } - sdp->sd_quotad_process = p; - return 0; - -fail: - kthread_stop(sdp->sd_logd_process); - sdp->sd_logd_process = NULL; - return error; -} - /** * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one * @sdp: the filesystem @@ -161,26 +133,17 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) struct gfs2_log_header_host head; int error; - error = init_threads(sdp); - if (error) { - gfs2_withdraw_delayed(sdp); - return error; - } - j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); - if (gfs2_withdrawn(sdp)) { - error = -EIO; - goto fail; - } + if (gfs2_withdrawn(sdp)) + return -EIO; error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); if (error || gfs2_withdrawn(sdp)) - goto fail; + return error; if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { gfs2_consist(sdp); - error = -EIO; - goto fail; + return -EIO; } /* Initialize some head of the log stuff */ @@ -188,20 +151,8 @@ int 
gfs2_make_fs_rw(struct gfs2_sbd *sdp) gfs2_log_pointers_init(sdp, head.lh_blkno); error = gfs2_quota_init(sdp); - if (error || gfs2_withdrawn(sdp)) - goto fail; - - set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - - return 0; - -fail: - if (sdp->sd_quotad_process) - kthread_stop(sdp->sd_quotad_process); - sdp->sd_quotad_process = NULL; - if (sdp->sd_logd_process) - kthread_stop(sdp->sd_logd_process); - sdp->sd_logd_process = NULL; + if (!error && !gfs2_withdrawn(sdp)) + set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); return error; } @@ -227,9 +178,8 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - struct buffer_head *m_bh, *l_bh; + struct buffer_head *m_bh; struct gfs2_holder gh; int error; @@ -248,21 +198,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp) sizeof(struct gfs2_dinode)); spin_unlock(&sdp->sd_statfs_spin); } else { - error = gfs2_meta_inode_buffer(l_ip, &l_bh); - if (error) - goto out_m_bh; - spin_lock(&sdp->sd_statfs_spin); gfs2_statfs_change_in(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); - gfs2_statfs_change_in(l_sc, l_bh->b_data + + gfs2_statfs_change_in(l_sc, sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode)); spin_unlock(&sdp->sd_statfs_spin); - brelse(l_bh); } -out_m_bh: brelse(m_bh); out: gfs2_glock_dq_uninit(&gh); @@ -275,22 +219,17 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct buffer_head *l_bh; s64 x, y; int need_sync = 0; - int error; - - error = gfs2_meta_inode_buffer(l_ip, &l_bh); - if (error) - return; - gfs2_trans_add_meta(l_ip->i_gl, l_bh); + gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh); spin_lock(&sdp->sd_statfs_spin); l_sc->sc_total += total; l_sc->sc_free += free; l_sc->sc_dinodes += dinodes; - gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); + gfs2_statfs_change_out(l_sc, sdp->sd_sc_bh->b_data + + sizeof(struct gfs2_dinode)); if (sdp->sd_args.ar_statfs_percent) { x = 100 * l_sc->sc_free; y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent; @@ -299,20 +238,18 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, } spin_unlock(&sdp->sd_statfs_spin); - brelse(l_bh); if (need_sync) gfs2_wake_up_statfs(sdp); } -void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, - struct buffer_head *l_bh) +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - gfs2_trans_add_meta(l_ip->i_gl, l_bh); + gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh); gfs2_trans_add_meta(m_ip->i_gl, m_bh); spin_lock(&sdp->sd_statfs_spin); @@ -320,7 +257,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, m_sc->sc_free += l_sc->sc_free; m_sc->sc_dinodes += l_sc->sc_dinodes; memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); - memset(l_bh->b_data + sizeof(struct gfs2_dinode), + memset(sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode), 0, sizeof(struct gfs2_statfs_change)); gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); 
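
The statfs hunks above all share one idea: the node-local statfs block is written only by this node (the diff's own comment notes "other nodes don't change it"), so gfs2 can read it once at mount into sdp->sd_sc_bh and apply every later change to that cached buffer instead of calling gfs2_meta_inode_buffer() per update. Below is a minimal userspace sketch of that delta-cache pattern, deliberately detached from gfs2; the names statfs_cache and statfs_change are invented for illustration.

	#include <pthread.h>
	#include <stdint.h>

	/* Illustrative sketch only, not gfs2 code: per-node statfs deltas
	 * accumulate in a cached copy under a lock; the backing block is
	 * rewritten only when the deltas are folded into the master file. */
	struct statfs_cache {
		pthread_mutex_t lock;
		int64_t total;		/* delta of total blocks */
		int64_t free;		/* delta of free blocks */
		int64_t dinodes;	/* delta of allocated dinodes */
	};

	static struct statfs_cache sc = { .lock = PTHREAD_MUTEX_INITIALIZER };

	static void statfs_change(struct statfs_cache *c, int64_t total,
				  int64_t free, int64_t dinodes)
	{
		pthread_mutex_lock(&c->lock);
		c->total   += total;
		c->free    += free;
		c->dinodes += dinodes;
		pthread_mutex_unlock(&c->lock);
	}

gfs2_statfs_change() sits on every allocation and deallocation path, which is why dropping the per-call metadata read there is worth pinning one buffer head for the lifetime of the mount.
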
spin_unlock(&sdp->sd_statfs_spin); @@ -330,11 +267,10 @@ int gfs2_statfs_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct gfs2_holder gh; - struct buffer_head *m_bh, *l_bh; + struct buffer_head *m_bh; int error; error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, @@ -355,21 +291,15 @@ int gfs2_statfs_sync(struct super_block *sb, int type) } spin_unlock(&sdp->sd_statfs_spin); - error = gfs2_meta_inode_buffer(l_ip, &l_bh); - if (error) - goto out_bh; - error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); if (error) - goto out_bh2; + goto out_bh; - update_statfs(sdp, m_bh, l_bh); + update_statfs(sdp, m_bh); sdp->sd_statfs_force_sync = 0; gfs2_trans_end(sdp); -out_bh2: - brelse(l_bh); out_bh: brelse(m_bh); out_unlock: @@ -675,6 +605,7 @@ restart: gfs2_glock_dq_uninit(&sdp->sd_journal_gh); if (gfs2_holder_initialized(&sdp->sd_jinode_gh)) gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + brelse(sdp->sd_sc_bh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); free_local_statfs_inodes(sdp); @@ -1016,7 +947,7 @@ static int gfs2_drop_inode(struct inode *inode) gfs2_glock_hold(gl); if (!gfs2_queue_delete_work(gl, 0)) gfs2_glock_queue_put(gl); - return false; + return 0; } return generic_drop_inode(inode); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index ec4affb33ed5..58d13fd77aed 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -43,8 +43,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf); extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf); -extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, - struct buffer_head *l_bh); +extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); extern void gfs2_freeze_func(struct work_struct *work); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index f4325b44956d..cf345a86ef67 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -278,6 +278,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) goto skip_recovery; } sdp->sd_jdesc->jd_inode = inode; + d_mark_dontcache(inode); /* * Now wait until recovery is complete. 
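
Both gfs2 hunks that add d_mark_dontcache() lean on the same VFS behaviour: the helper flags the inode (I_DONTCACHE) and its dentry aliases (DCACHE_DONTCACHE) so they are evicted as soon as the last reference is dropped instead of lingering in the caches, which matters here because the journal inode re-read during withdraw must not survive as stale cached state. A hedged sketch of the call pattern follows; my_fs_iget() is a hypothetical lookup helper, while d_mark_dontcache() is the real VFS export.

	#include <linux/dcache.h>
	#include <linux/fs.h>

	struct inode *my_fs_iget(struct super_block *sb, u64 ino); /* hypothetical */

	/* Sketch: look up an inode that must not outlive its last reference
	 * and mark it (plus its dentries) for eviction on final put. */
	static struct inode *grab_transient_inode(struct super_block *sb, u64 ino)
	{
		struct inode *inode = my_fs_iget(sb, ino);

		if (!IS_ERR(inode))
			d_mark_dontcache(inode);	/* I_DONTCACHE + DCACHE_DONTCACHE */
		return inode;
	}
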
@@ -295,7 +296,7 @@ skip_recovery: fs_warn(sdp, "Journal recovery complete for jid %d.\n", sdp->sd_lockstruct.ls_jid); else - fs_warn(sdp, "Journal recovery skipped for %d until next " + fs_warn(sdp, "Journal recovery skipped for jid %d until next " "mount.\n", sdp->sd_lockstruct.ls_jid); fs_warn(sdp, "Glock dequeues delayed: %lu\n", sdp->sd_glock_dqs_held); sdp->sd_glock_dqs_held = 0; diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 69e1a0ae5a4d..78ec190f4155 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -218,6 +218,11 @@ static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp) !test_bit(SDF_WITHDRAWN, &sdp->sd_flags); } +static inline bool gfs2_withdraw_in_prog(struct gfs2_sbd *sdp) +{ + return test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags); +} + #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) diff --git a/fs/hpfs/Kconfig b/fs/hpfs/Kconfig index 2b36dc6f0a10..ec975f466877 100644 --- a/fs/hpfs/Kconfig +++ b/fs/hpfs/Kconfig @@ -2,6 +2,7 @@ config HPFS_FS tristate "OS/2 HPFS file system support" depends on BLOCK + select FS_IOMAP help OS/2 is IBM's operating system for PC's, the same as Warp, and HPFS is the file system used for organizing files on OS/2 hard disk diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index c3a49aacf20a..fb37f57130aa 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include <linux/mpage.h> +#include <linux/iomap.h> #include <linux/fiemap.h> #define BLOCKS(size) (((size) + 511) >> 9) @@ -116,6 +117,47 @@ static int hpfs_get_block(struct inode *inode, sector_t iblock, struct buffer_he return r; } +static int hpfs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct super_block *sb = inode->i_sb; + unsigned int blkbits = inode->i_blkbits; + unsigned int n_secs; + secno s; + + if (WARN_ON_ONCE(flags & (IOMAP_WRITE | IOMAP_ZERO))) + return -EINVAL; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = offset; + + hpfs_lock(sb); + s = hpfs_bmap(inode, offset >> blkbits, &n_secs); + if (s) { + n_secs = hpfs_search_hotfix_map_for_range(sb, s, + min_t(loff_t, n_secs, length)); + if (unlikely(!n_secs)) { + s = hpfs_search_hotfix_map(sb, s); + n_secs = 1; + } + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_MERGED; + iomap->addr = (u64)s << blkbits; + iomap->length = (u64)n_secs << blkbits; + } else { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = 1 << blkbits; + } + + hpfs_unlock(sb); + return 0; +} + +static const struct iomap_ops hpfs_iomap_ops = { + .iomap_begin = hpfs_iomap_begin, +}; + static int hpfs_readpage(struct file *file, struct page *page) { return mpage_readpage(page, hpfs_get_block); @@ -192,7 +234,14 @@ static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) static int hpfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, start, len, hpfs_get_block); + int ret; + + inode_lock(inode); + len = min_t(u64, len, i_size_read(inode)); + ret = iomap_fiemap(inode, fieinfo, start, len, &hpfs_iomap_ops); + inode_unlock(inode); + + return ret; } const struct address_space_operations hpfs_aops = { diff --git a/fs/inode.c b/fs/inode.c index c93500d84264..84c528cd1955 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -190,6 +190,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data 
= NULL; mapping->writeback_index = 0; + __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock", + &sb->s_type->invalidate_lock_key); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ diff --git a/fs/internal.h b/fs/internal.h index 82e8eb32ff3d..68a2ae029a27 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -48,8 +48,8 @@ static inline int emergency_thaw_bdev(struct super_block *sb) /* * buffer.c */ -extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap); +int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c @@ -71,11 +71,15 @@ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); -long do_rmdir(int dfd, struct filename *name); -long do_unlinkat(int dfd, struct filename *name); +int do_rmdir(int dfd, struct filename *name); +int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct user_namespace *mnt_userns, struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); +int do_mkdirat(int dfd, struct filename *name, umode_t mode); +int do_symlinkat(struct filename *from, int newdfd, struct filename *to); +int do_linkat(int olddfd, struct filename *old, int newdfd, + struct filename *new, int flags); /* * namespace.c diff --git a/fs/io-wq.c b/fs/io-wq.c index 12fc19353bb0..cd9bd095fb1b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -51,6 +51,10 @@ struct io_worker { struct completion ref_done; + unsigned long create_state; + struct callback_head create_work; + int create_index; + struct rcu_head rcu; }; @@ -129,7 +133,7 @@ struct io_cb_cancel_data { bool cancel_all; }; -static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); +static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first); static void io_wqe_dec_running(struct io_worker *worker); static bool io_worker_get(struct io_worker *worker) @@ -174,7 +178,7 @@ static void io_worker_exit(struct io_worker *worker) complete(&worker->ref_done); wait_for_completion(&worker->ref_done); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); @@ -184,7 +188,7 @@ static void io_worker_exit(struct io_worker *worker) worker->flags = 0; current->flags &= ~PF_IO_WORKER; preempt_enable(); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); kfree_rcu(worker, rcu); io_worker_ref_put(wqe->wq); @@ -248,18 +252,21 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) rcu_read_unlock(); if (!ret) { - bool do_create = false; + bool do_create = false, first = false; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); + if (!acct->nr_workers) + first = true; acct->nr_workers++; do_create = true; } - raw_spin_unlock_irq(&wqe->lock); - if (do_create) - create_io_worker(wqe->wq, wqe, acct->index); + raw_spin_unlock(&wqe->lock); + if (do_create) { + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); + create_io_worker(wqe->wq, wqe, 
acct->index, first); + } } } @@ -270,50 +277,63 @@ static void io_wqe_inc_running(struct io_worker *worker) atomic_inc(&acct->nr_running); } -struct create_worker_data { - struct callback_head work; - struct io_wqe *wqe; - int index; -}; - static void create_worker_cb(struct callback_head *cb) { - struct create_worker_data *cwd; + struct io_worker *worker; struct io_wq *wq; struct io_wqe *wqe; struct io_wqe_acct *acct; + bool do_create = false, first = false; - cwd = container_of(cb, struct create_worker_data, work); - wqe = cwd->wqe; + worker = container_of(cb, struct io_worker, create_work); + wqe = worker->wqe; wq = wqe->wq; - acct = &wqe->acct[cwd->index]; - raw_spin_lock_irq(&wqe->lock); - if (acct->nr_workers < acct->max_workers) + acct = &wqe->acct[worker->create_index]; + raw_spin_lock(&wqe->lock); + if (acct->nr_workers < acct->max_workers) { + if (!acct->nr_workers) + first = true; acct->nr_workers++; - raw_spin_unlock_irq(&wqe->lock); - create_io_worker(wq, cwd->wqe, cwd->index); - kfree(cwd); + do_create = true; + } + raw_spin_unlock(&wqe->lock); + if (do_create) { + create_io_worker(wq, wqe, worker->create_index, first); + } else { + atomic_dec(&acct->nr_running); + io_worker_ref_put(wq); + } + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); } -static void io_queue_worker_create(struct io_wqe *wqe, struct io_wqe_acct *acct) +static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker, + struct io_wqe_acct *acct) { - struct create_worker_data *cwd; struct io_wq *wq = wqe->wq; /* raced with exit, just ignore create call */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) goto fail; + if (!io_worker_get(worker)) + goto fail; + /* + * create_state manages ownership of create_work/index. We should + * only need one entry per worker, as the worker going to sleep + * will trigger the condition, and waking will clear it once it + * runs the task_work. + */ + if (test_bit(0, &worker->create_state) || + test_and_set_bit_lock(0, &worker->create_state)) + goto fail_release; - cwd = kmalloc(sizeof(*cwd), GFP_ATOMIC); - if (cwd) { - init_task_work(&cwd->work, create_worker_cb); - cwd->wqe = wqe; - cwd->index = acct->index; - if (!task_work_add(wq->task, &cwd->work, TWA_SIGNAL)) - return; - - kfree(cwd); - } + init_task_work(&worker->create_work, create_worker_cb); + worker->create_index = acct->index; + if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) + return; + clear_bit_unlock(0, &worker->create_state); +fail_release: + io_worker_release(worker); fail: atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -331,7 +351,7 @@ static void io_wqe_dec_running(struct io_worker *worker) if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) { atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); - io_queue_worker_create(wqe, acct); + io_queue_worker_create(wqe, worker, acct); } } @@ -404,7 +424,28 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) spin_unlock(&wq->hash->wait.lock); } -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) +/* + * We can always run the work if the worker is currently the same type as + * the work (eg both are bound, or both are unbound). If they are not the + * same, only allow it if incrementing the worker count would be allowed. 
+ */ +static bool io_worker_can_run_work(struct io_worker *worker, + struct io_wq_work *work) +{ + struct io_wqe_acct *acct; + + if (!(worker->flags & IO_WORKER_F_BOUND) != + !(work->flags & IO_WQ_WORK_UNBOUND)) + return true; + + /* not the same type, check if we'd go over the limit */ + acct = io_work_get_acct(worker->wqe, work); + return acct->nr_workers < acct->max_workers; +} + +static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, + struct io_worker *worker, + bool *stalled) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; @@ -416,6 +457,9 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) work = container_of(node, struct io_wq_work, list); + if (!io_worker_can_run_work(worker, work)) + break; + /* not hashed, can run anytime */ if (!io_wq_is_hashed(work)) { wq_list_del(&wqe->work_list, node, prev); @@ -442,6 +486,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) raw_spin_unlock(&wqe->lock); io_wait_on_hash(wqe, stall_hash); raw_spin_lock(&wqe->lock); + *stalled = true; } return NULL; @@ -465,9 +510,9 @@ static void io_assign_current_work(struct io_worker *worker, cond_resched(); } - spin_lock_irq(&worker->lock); + spin_lock(&worker->lock); worker->cur_work = work; - spin_unlock_irq(&worker->lock); + spin_unlock(&worker->lock); } static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); @@ -481,6 +526,7 @@ static void io_worker_handle_work(struct io_worker *worker) do { struct io_wq_work *work; + bool stalled; get_next: /* * If we got some work, mark us as busy. If we didn't, but @@ -489,13 +535,14 @@ get_next: * can't make progress, any work completion or insertion will * clear the stalled flag. */ - work = io_get_next_work(wqe); + stalled = false; + work = io_get_next_work(wqe, worker, &stalled); if (work) __io_worker_busy(wqe, worker, work); - else if (!wq_list_empty(&wqe->work_list)) + else if (stalled) wqe->flags |= IO_WQE_FLAG_STALLED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (!work) break; io_assign_current_work(worker, work); @@ -527,16 +574,16 @@ get_next: clear_bit(hash, &wq->hash->map); if (wq_has_sleeper(&wq->hash->wait)) wake_up(&wq->hash->wait); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); } } while (work); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); } while (1); } @@ -557,13 +604,13 @@ static int io_wqe_worker(void *data) set_current_state(TASK_INTERRUPTIBLE); loop: - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); if (io_wqe_run_queue(wqe)) { io_worker_handle_work(worker); goto loop; } __io_worker_idle(wqe, worker); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); if (io_flush_signals()) continue; ret = schedule_timeout(WORKER_IDLE_TIMEOUT); @@ -582,7 +629,7 @@ loop: } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); io_worker_handle_work(worker); } @@ -624,12 +671,12 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - raw_spin_lock_irq(&worker->wqe->lock); + raw_spin_lock(&worker->wqe->lock); io_wqe_dec_running(worker); - raw_spin_unlock_irq(&worker->wqe->lock); + raw_spin_unlock(&worker->wqe->lock); } -static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static void create_io_worker(struct io_wq *wq, struct 
io_wqe *wqe, int index, bool first) { struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; @@ -652,9 +699,9 @@ static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) kfree(worker); fail: atomic_dec(&acct->nr_running); - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); acct->nr_workers--; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); io_worker_ref_put(wq); return; } @@ -664,15 +711,15 @@ fail: set_cpus_allowed_ptr(tsk, wqe->cpu_mask); tsk->flags |= PF_NO_SETAFFINITY; - raw_spin_lock_irq(&wqe->lock); + raw_spin_lock(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); list_add_tail_rcu(&worker->all_list, &wqe->all_list); worker->flags |= IO_WORKER_F_FREE; if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; - if ((acct->nr_workers == 1) && (worker->flags & IO_WORKER_F_BOUND)) + if (first && (worker->flags & IO_WORKER_F_BOUND)) worker->flags |= IO_WORKER_F_FIXED; - raw_spin_unlock_irq(&wqe->lock); + raw_spin_unlock(&wqe->lock); wake_up_new_task(tsk); } @@ -747,8 +794,7 @@ append: static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); - int work_flags; - unsigned long flags; + bool do_wake; /* * If io-wq is exiting for this task, or if the request has explicitly @@ -760,14 +806,14 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) return; } - work_flags = work->flags; - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); io_wqe_insert_work(wqe, work); wqe->flags &= ~IO_WQE_FLAG_STALLED; - raw_spin_unlock_irqrestore(&wqe->lock, flags); + do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running); + raw_spin_unlock(&wqe->lock); - if ((work_flags & IO_WQ_WORK_CONCURRENT) || - !atomic_read(&acct->nr_running)) + if (do_wake) io_wqe_wake_worker(wqe, acct); } @@ -793,19 +839,18 @@ void io_wq_hash_work(struct io_wq_work *work, void *val) static bool io_wq_worker_cancel(struct io_worker *worker, void *data) { struct io_cb_cancel_data *match = data; - unsigned long flags; /* * Hold the lock to avoid ->cur_work going out of scope, caller * may dereference the passed in work. 
*/ - spin_lock_irqsave(&worker->lock, flags); + spin_lock(&worker->lock); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { set_notify_signal(worker->task); match->nr_running++; } - spin_unlock_irqrestore(&worker->lock, flags); + spin_unlock(&worker->lock); return match->nr_running && !match->cancel_all; } @@ -833,16 +878,15 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe, { struct io_wq_work_node *node, *prev; struct io_wq_work *work; - unsigned long flags; retry: - raw_spin_lock_irqsave(&wqe->lock, flags); + raw_spin_lock(&wqe->lock); wq_list_for_each(node, prev, &wqe->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; io_wqe_remove_pending(wqe, work, prev); - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); io_run_cancel(work, wqe); match->nr_pending++; if (!match->cancel_all) @@ -851,7 +895,7 @@ retry: /* not safe to continue after unlock */ goto retry; } - raw_spin_unlock_irqrestore(&wqe->lock, flags); + raw_spin_unlock(&wqe->lock); } static void io_wqe_cancel_running_work(struct io_wqe *wqe, @@ -992,12 +1036,12 @@ err_wq: static bool io_task_work_match(struct callback_head *cb, void *data) { - struct create_worker_data *cwd; + struct io_worker *worker; if (cb->func != create_worker_cb) return false; - cwd = container_of(cb, struct create_worker_data, work); - return cwd->wqe->wq == data; + worker = container_of(cb, struct io_worker, create_work); + return worker->wqe->wq == data; } void io_wq_exit_start(struct io_wq *wq) @@ -1014,12 +1058,13 @@ static void io_wq_exit_workers(struct io_wq *wq) return; while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { - struct create_worker_data *cwd; + struct io_worker *worker; - cwd = container_of(cb, struct create_worker_data, work); - atomic_dec(&cwd->wqe->acct[cwd->index].nr_running); + worker = container_of(cb, struct io_worker, create_work); + atomic_dec(&worker->wqe->acct[worker->create_index].nr_running); io_worker_ref_put(wq); - kfree(cwd); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); } rcu_read_lock(); @@ -1131,6 +1176,35 @@ int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask) return 0; } +/* + * Set max number of unbounded workers, returns old value. If new_count is 0, + * then just return the old value. 
+ */ +int io_wq_max_workers(struct io_wq *wq, int *new_count) +{ + int i, node, prev = 0; + + for (i = 0; i < 2; i++) { + if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) + new_count[i] = task_rlimit(current, RLIMIT_NPROC); + } + + rcu_read_lock(); + for_each_node(node) { + struct io_wqe_acct *acct; + + for (i = 0; i < 2; i++) { + acct = &wq->wqes[node]->acct[i]; + prev = max_t(int, acct->max_workers, prev); + if (new_count[i]) + acct->max_workers = new_count[i]; + new_count[i] = prev; + } + } + rcu_read_unlock(); + return 0; +} + static __init int io_wq_init(void) { int ret; diff --git a/fs/io-wq.h b/fs/io-wq.h index 3999ee58ff26..bf5c4c533760 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -44,6 +44,7 @@ static inline void wq_list_add_after(struct io_wq_work_node *node, static inline void wq_list_add_tail(struct io_wq_work_node *node, struct io_wq_work_list *list) { + node->next = NULL; if (!list->first) { list->last = node; WRITE_ONCE(list->first, node); @@ -51,7 +52,6 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, list->last->next = node; list->last = node; } - node->next = NULL; } static inline void wq_list_cut(struct io_wq_work_list *list, @@ -128,6 +128,7 @@ void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); int io_wq_cpu_affinity(struct io_wq *wq, cpumask_var_t mask); +int io_wq_max_workers(struct io_wq *wq, int *new_count); static inline bool io_wq_is_hashed(struct io_wq_work *work) { diff --git a/fs/io_uring.c b/fs/io_uring.c index bf548af0426c..6f35b1285865 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -78,6 +78,7 @@ #include <linux/task_work.h> #include <linux/pagemap.h> #include <linux/io_uring.h> +#include <linux/tracehook.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -91,17 +92,12 @@ #define IORING_MAX_CQ_ENTRIES (2 * IORING_MAX_ENTRIES) #define IORING_SQPOLL_CAP_ENTRIES_VALUE 8 -/* - * Shift of 9 is 512 entries, or exactly one page on 64-bit archs - */ -#define IORING_FILE_TABLE_SHIFT 9 -#define IORING_MAX_FILES_TABLE (1U << IORING_FILE_TABLE_SHIFT) -#define IORING_FILE_TABLE_MASK (IORING_MAX_FILES_TABLE - 1) -#define IORING_MAX_FIXED_FILES (64 * IORING_MAX_FILES_TABLE) +/* only define max */ +#define IORING_MAX_FIXED_FILES (1U << 15) #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) -#define IO_RSRC_TAG_TABLE_SHIFT 9 +#define IO_RSRC_TAG_TABLE_SHIFT (PAGE_SHIFT - 3) #define IO_RSRC_TAG_TABLE_MAX (1U << IO_RSRC_TAG_TABLE_SHIFT) #define IO_RSRC_TAG_TABLE_MASK (IO_RSRC_TAG_TABLE_MAX - 1) @@ -234,8 +230,7 @@ struct io_rsrc_put { }; struct io_file_table { - /* two level table */ - struct io_fixed_file **files; + struct io_fixed_file *files; }; struct io_rsrc_node { @@ -300,18 +295,10 @@ struct io_sq_data { struct completion exited; }; -#define IO_IOPOLL_BATCH 8 #define IO_COMPL_BATCH 32 #define IO_REQ_CACHE_SIZE 32 #define IO_REQ_ALLOC_BATCH 8 -struct io_comp_state { - struct io_kiocb *reqs[IO_COMPL_BATCH]; - unsigned int nr; - /* inline/task_work completion list, under ->uring_lock */ - struct list_head free_list; -}; - struct io_submit_link { struct io_kiocb *head; struct io_kiocb *last; @@ -332,14 +319,11 @@ struct io_submit_state { /* * Batch completion logic */ - struct io_comp_state comp; + struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; + unsigned int compl_nr; + /* inline/task_work completion list, under ->uring_lock */ + struct list_head free_list; - /* - * File reference cache - */ - struct file *file; 
- unsigned int fd; - unsigned int file_refs; unsigned int ios_left; }; @@ -391,6 +375,7 @@ struct io_ring_ctx { struct io_submit_state submit_state; struct list_head timeout_list; + struct list_head ltimeout_list; struct list_head cq_overflow_list; struct xarray io_buffers; struct xarray personalities; @@ -425,6 +410,8 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; + spinlock_t timeout_lock; + /* * ->iopoll_list is protected by the ctx->uring_lock for * io_uring instances that don't use IORING_SETUP_SQPOLL. @@ -486,8 +473,8 @@ struct io_uring_task { spinlock_t task_lock; struct io_wq_work_list task_list; - unsigned long task_state; struct callback_head task_work; + bool task_running; }; /* @@ -522,6 +509,7 @@ struct io_timeout_data { struct hrtimer timer; struct timespec64 ts; enum hrtimer_mode mode; + u32 flags; }; struct io_accept { @@ -529,6 +517,7 @@ struct io_accept { struct sockaddr __user *addr; int __user *addr_len; int flags; + u32 file_slot; unsigned long nofile; }; @@ -552,6 +541,8 @@ struct io_timeout { struct list_head list; /* head of the link, used by linked timeouts only */ struct io_kiocb *head; + /* for linked completions */ + struct io_kiocb *prev; }; struct io_timeout_rem { @@ -561,6 +552,7 @@ struct io_timeout_rem { /* timeout update */ struct timespec64 ts; u32 flags; + bool ltimeout; }; struct io_rw { @@ -592,6 +584,7 @@ struct io_sr_msg { struct io_open { struct file *file; int dfd; + u32 file_slot; struct filename *filename; struct open_how how; unsigned long nofile; @@ -674,9 +667,31 @@ struct io_unlink { struct filename *filename; }; +struct io_mkdir { + struct file *file; + int dfd; + umode_t mode; + struct filename *filename; +}; + +struct io_symlink { + struct file *file; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; +}; + +struct io_hardlink { + struct file *file; + int old_dfd; + int new_dfd; + struct filename *oldpath; + struct filename *newpath; + int flags; +}; + struct io_completion { struct file *file; - struct list_head list; u32 cflags; }; @@ -718,14 +733,15 @@ enum { REQ_F_NEED_CLEANUP_BIT, REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, - REQ_F_LTIMEOUT_ACTIVE_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, + REQ_F_REFCOUNT_BIT, + REQ_F_ARM_LTIMEOUT_BIT, /* keep async read/write and isreg together and in order */ - REQ_F_ASYNC_READ_BIT, - REQ_F_ASYNC_WRITE_BIT, + REQ_F_NOWAIT_READ_BIT, + REQ_F_NOWAIT_WRITE_BIT, REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ @@ -762,8 +778,6 @@ enum { REQ_F_POLLED = BIT(REQ_F_POLLED_BIT), /* buffer already selected */ REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), - /* linked timeout is active, i.e. 
prepared by link's head */ - REQ_F_LTIMEOUT_ACTIVE = BIT(REQ_F_LTIMEOUT_ACTIVE_BIT), /* completion is deferred through io_comp_state */ REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ @@ -771,13 +785,17 @@ enum { /* don't attempt request reissue, see io_rw_reissue() */ REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT), /* supports async reads */ - REQ_F_ASYNC_READ = BIT(REQ_F_ASYNC_READ_BIT), + REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), /* supports async writes */ - REQ_F_ASYNC_WRITE = BIT(REQ_F_ASYNC_WRITE_BIT), + REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ REQ_F_CREDS = BIT(REQ_F_CREDS_BIT), + /* skip refcounting if not set */ + REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + /* there is a linked timeout that has to be armed */ + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), }; struct async_poll { @@ -785,7 +803,7 @@ struct async_poll { struct io_poll_iocb *double_poll; }; -typedef void (*io_req_tw_func_t)(struct io_kiocb *req); +typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked); struct io_task_work { union { @@ -831,6 +849,9 @@ struct io_kiocb { struct io_shutdown shutdown; struct io_rename rename; struct io_unlink unlink; + struct io_mkdir mkdir; + struct io_symlink symlink; + struct io_hardlink hardlink; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -1042,39 +1063,43 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_RENAMEAT] = {}, [IORING_OP_UNLINKAT] = {}, + [IORING_OP_MKDIRAT] = {}, + [IORING_OP_SYMLINKAT] = {}, + [IORING_OP_LINKAT] = {}, }; +/* requests with any of those set should undergo io_disarm_next() */ +#define IO_DISARM_MASK (REQ_F_ARM_LTIMEOUT | REQ_F_LINK_TIMEOUT | REQ_F_FAIL) + static bool io_disarm_next(struct io_kiocb *req); static void io_uring_del_tctx_node(unsigned long index); static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, bool cancel_all); static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, long res, unsigned int cflags); static void io_put_req(struct io_kiocb *req); -static void io_put_req_deferred(struct io_kiocb *req, int nr); +static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); -static void io_put_task(struct task_struct *task, int nr); -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req); static void io_queue_linked_timeout(struct io_kiocb *req); static int __io_register_rsrc_update(struct io_ring_ctx *ctx, unsigned type, struct io_uring_rsrc_update2 *up, unsigned nr_args); static void io_clean_op(struct io_kiocb *req); -static struct file *io_file_get(struct io_submit_state *state, +static struct file *io_file_get(struct io_ring_ctx *ctx, struct io_kiocb *req, int fd, bool fixed); static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); static void io_submit_flush_completions(struct io_ring_ctx *ctx); -static bool io_poll_remove_waitqs(struct io_kiocb *req); static int io_req_prep_async(struct io_kiocb *req); -static void io_fallback_req_func(struct work_struct *unused); +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int 
issue_flags, u32 slot_index); +static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static struct kmem_cache *req_cachep; @@ -1093,9 +1118,65 @@ struct sock *io_uring_get_socket(struct file *file) } EXPORT_SYMBOL(io_uring_get_socket); +static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) +{ + if (!*locked) { + mutex_lock(&ctx->uring_lock); + *locked = true; + } +} + #define io_for_each_link(pos, head) \ for (pos = (head); pos; pos = pos->link) +/* + * Shamelessly stolen from the mm implementation of page reference checking, + * see commit f958d7b528b1 for details. + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + return atomic_inc_not_zero(&req->refs); +} + +static inline bool req_ref_put_and_test(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_REFCOUNT))) + return true; + + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->refs); +} + +static inline void req_ref_put(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_put_and_test(req)); +} + +static inline void req_ref_get(struct io_kiocb *req) +{ + WARN_ON_ONCE(!(req->flags & REQ_F_REFCOUNT)); + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + atomic_inc(&req->refs); +} + +static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) +{ + if (!(req->flags & REQ_F_REFCOUNT)) { + req->flags |= REQ_F_REFCOUNT; + atomic_set(&req->refs, nr); + } +} + +static inline void io_req_set_refcount(struct io_kiocb *req) +{ + __io_req_set_refcount(req, 1); +} + static inline void io_req_set_rsrc_node(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1140,6 +1221,12 @@ static inline void req_set_fail(struct io_kiocb *req) req->flags |= REQ_F_FAIL; } +static inline void req_fail_link_node(struct io_kiocb *req, int res) +{ + req_set_fail(req); + req->result = res; +} + static void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1152,6 +1239,27 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req) return !req->timeout.off; } +static void io_fallback_req_func(struct work_struct *work) +{ + struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, + fallback_work.work); + struct llist_node *node = llist_del_all(&ctx->fallback_llist); + struct io_kiocb *req, *tmp; + bool locked = false; + + percpu_ref_get(&ctx->refs); + llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) + req->io_task_work.func(req, &locked); + + if (locked) { + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); + mutex_unlock(&ctx->uring_lock); + } + percpu_ref_put(&ctx->refs); + +} + static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; @@ -1197,15 +1305,17 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); + spin_lock_init(&ctx->timeout_lock); INIT_LIST_HEAD(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); + INIT_LIST_HEAD(&ctx->ltimeout_list); spin_lock_init(&ctx->rsrc_ref_lock); INIT_LIST_HEAD(&ctx->rsrc_ref_list); INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); 
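
The reworked req_ref_*() helpers above make request refcounting opt-in: the atomic on req->refs is only armed once REQ_F_REFCOUNT is set, so a request that never gains a second owner completes without any atomic operations, and req_ref_put_and_test() short-circuits to "last reference" for it. A compact userspace model of that opt-in scheme, illustrative rather than the io_uring code; all names below are local to the sketch.

	#include <stdatomic.h>
	#include <stdbool.h>

	#define REQ_F_REFCOUNT 0x1u

	struct req {
		unsigned int flags;
		atomic_uint refs;
	};

	/* Arm the counter only when a second owner can actually appear. */
	static void req_set_refcount(struct req *r, unsigned int n)
	{
		if (!(r->flags & REQ_F_REFCOUNT)) {
			r->flags |= REQ_F_REFCOUNT;
			atomic_store(&r->refs, n);
		}
	}

	/* Returns true when the caller dropped the last (or only) reference. */
	static bool req_put(struct req *r)
	{
		if (!(r->flags & REQ_F_REFCOUNT))
			return true;	/* unshared fast path: no atomics */
		return atomic_fetch_sub(&r->refs, 1) == 1;
	}
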
INIT_LIST_HEAD(&ctx->tctx_list); - INIT_LIST_HEAD(&ctx->submit_state.comp.free_list); + INIT_LIST_HEAD(&ctx->submit_state.free_list); INIT_LIST_HEAD(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); return ctx; @@ -1235,6 +1345,20 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } +#define FFS_ASYNC_READ 0x1UL +#define FFS_ASYNC_WRITE 0x2UL +#ifdef CONFIG_64BIT +#define FFS_ISREG 0x4UL +#else +#define FFS_ISREG 0x0UL +#endif +#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) + +static inline bool io_req_ffs_set(struct io_kiocb *req) +{ + return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); +} + static void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { @@ -1243,6 +1367,32 @@ static void io_req_track_inflight(struct io_kiocb *req) } } +static inline void io_unprep_linked_timeout(struct io_kiocb *req) +{ + req->flags &= ~REQ_F_LINK_TIMEOUT; +} + +static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) +{ + if (WARN_ON_ONCE(!req->link)) + return NULL; + + req->flags &= ~REQ_F_ARM_LTIMEOUT; + req->flags |= REQ_F_LINK_TIMEOUT; + + /* linked timeouts should have two refs once prep'ed */ + io_req_set_refcount(req); + __io_req_set_refcount(req->link, 2); + return req->link; +} + +static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) +{ + if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) + return NULL; + return __io_prep_linked_timeout(req); +} + static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1282,22 +1432,25 @@ static void io_prep_async_link(struct io_kiocb *req) if (req->flags & REQ_F_LINK_TIMEOUT) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_for_each_link(cur, req) io_prep_async_work(cur); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } else { io_for_each_link(cur, req) io_prep_async_work(cur); } } -static void io_queue_async_work(struct io_kiocb *req) +static void io_queue_async_work(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; + /* must not take the lock, NULL it as a precaution */ + locked = NULL; + BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -1323,6 +1476,7 @@ static void io_queue_async_work(struct io_kiocb *req) static void io_kill_timeout(struct io_kiocb *req, int status) __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) { struct io_timeout_data *io = req->async_data; @@ -1331,7 +1485,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); io_cqring_fill_event(req->ctx, req->user_data, status, 0); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); } } @@ -1350,9 +1504,11 @@ static void io_queue_deferred(struct io_ring_ctx *ctx) } static void io_flush_timeouts(struct io_ring_ctx *ctx) + __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + spin_lock_irq(&ctx->timeout_lock); while (!list_empty(&ctx->timeout_list)) { u32 events_needed, events_got; struct io_kiocb *req = list_first_entry(&ctx->timeout_list, @@ -1377,6 +1533,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) io_kill_timeout(req, 0); } ctx->cq_last_tm_flush = seq; + 
spin_unlock_irq(&ctx->timeout_lock); } static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) @@ -1433,13 +1590,22 @@ static inline bool io_should_trigger_evfd(struct io_ring_ctx *ctx) return !ctx->eventfd_async || io_wq_current_is_worker(); } +/* + * This should only get called when at least one event has been posted. + * Some applications rely on the eventfd notification count only changing + * IFF a new CQE has been added to the CQ ring. There's no depedency on + * 1:1 relationship between how many times this function is called (and + * hence the eventfd count) and number of CQEs posted to the CQ ring. + */ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); - - if (waitqueue_active(&ctx->cq_wait)) - wake_up(&ctx->cq_wait); + /* + * wake_up_all() may seem excessive, but io_wake_function() and + * io_should_wake() handle the termination of the loop and only + * wake as many waiters as we need to. + */ + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) @@ -1452,12 +1618,9 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { - /* see waitqueue_active() comment */ - smp_mb(); - if (ctx->flags & IORING_SETUP_SQPOLL) { - if (waitqueue_active(&ctx->cq_wait)) - wake_up(&ctx->cq_wait); + if (wq_has_sleeper(&ctx->cq_wait)) + wake_up_all(&ctx->cq_wait); } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); @@ -1470,14 +1633,13 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) /* Returns true if there are no backlogged entries after the flush */ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { - unsigned long flags; bool all_flushed, posted; if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; posted = false; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); while (!list_empty(&ctx->cq_overflow_list)) { struct io_uring_cqe *cqe = io_get_cqe(ctx); struct io_overflow_cqe *ocqe; @@ -1499,18 +1661,19 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) all_flushed = list_empty(&ctx->cq_overflow_list); if (all_flushed) { clear_bit(0, &ctx->check_cq_overflow); - ctx->rings->sq_flags &= ~IORING_SQ_CQ_OVERFLOW; + WRITE_ONCE(ctx->rings->sq_flags, + ctx->rings->sq_flags & ~IORING_SQ_CQ_OVERFLOW); } if (posted) io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); return all_flushed; } -static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) +static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx) { bool ret = true; @@ -1518,7 +1681,7 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) /* iopoll syncs against uring_lock, not completion_lock */ if (ctx->flags & IORING_SETUP_IOPOLL) mutex_lock(&ctx->uring_lock); - ret = __io_cqring_overflow_flush(ctx, force); + ret = __io_cqring_overflow_flush(ctx, false); if (ctx->flags & IORING_SETUP_IOPOLL) mutex_unlock(&ctx->uring_lock); } @@ -1526,39 +1689,37 @@ static bool io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) return ret; } -/* - * Shamelessly stolen from the mm implementation of page reference checking, - * see commit f958d7b528b1 for details. 
- */ -#define req_ref_zero_or_close_to_overflow(req) \ - ((unsigned int) atomic_read(&(req->refs)) + 127u <= 127u) - -static inline bool req_ref_inc_not_zero(struct io_kiocb *req) +/* must to be called somewhat shortly after putting a request */ +static inline void io_put_task(struct task_struct *task, int nr) { - return atomic_inc_not_zero(&req->refs); -} + struct io_uring_task *tctx = task->io_uring; -static inline bool req_ref_sub_and_test(struct io_kiocb *req, int refs) -{ - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_sub_and_test(refs, &req->refs); + if (likely(task == current)) { + tctx->cached_refs += nr; + } else { + percpu_counter_sub(&tctx->inflight, nr); + if (unlikely(atomic_read(&tctx->in_idle))) + wake_up(&tctx->wait); + put_task_struct_many(task, nr); + } } -static inline bool req_ref_put_and_test(struct io_kiocb *req) +static void io_task_refs_refill(struct io_uring_task *tctx) { - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - return atomic_dec_and_test(&req->refs); -} + unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; -static inline void req_ref_put(struct io_kiocb *req) -{ - WARN_ON_ONCE(req_ref_put_and_test(req)); + percpu_counter_add(&tctx->inflight, refill); + refcount_add(refill, &current->usage); + tctx->cached_refs += refill; } -static inline void req_ref_get(struct io_kiocb *req) +static inline void io_get_task_refs(int nr) { - WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); - atomic_inc(&req->refs); + struct io_uring_task *tctx = current->io_uring; + + tctx->cached_refs -= nr; + if (unlikely(tctx->cached_refs < 0)) + io_task_refs_refill(tctx); } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, @@ -1578,7 +1739,9 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, } if (list_empty(&ctx->cq_overflow_list)) { set_bit(0, &ctx->check_cq_overflow); - ctx->rings->sq_flags |= IORING_SQ_CQ_OVERFLOW; + WRITE_ONCE(ctx->rings->sq_flags, + ctx->rings->sq_flags | IORING_SQ_CQ_OVERFLOW); + } ocqe->cqe.user_data = user_data; ocqe->cqe.res = res; @@ -1620,9 +1783,8 @@ static void io_req_complete_post(struct io_kiocb *req, long res, unsigned int cflags) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); __io_cqring_fill_event(ctx, req->user_data, res, cflags); /* * If we're the last reference to this request, add to our locked @@ -1630,7 +1792,7 @@ static void io_req_complete_post(struct io_kiocb *req, long res, */ if (req_ref_put_and_test(req)) { if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) + if (req->flags & IO_DISARM_MASK) io_disarm_next(req); if (req->link) { io_req_task_queue(req->link); @@ -1639,14 +1801,14 @@ static void io_req_complete_post(struct io_kiocb *req, long res, } io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->compl.list, &ctx->locked_free_list); + list_add(&req->inflight_entry, &ctx->locked_free_list); ctx->locked_free_nr++; } else { if (!percpu_ref_tryget(&ctx->refs)) req = NULL; } io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (req) { io_cqring_ev_posted(ctx); @@ -1686,24 +1848,35 @@ static inline void io_req_complete(struct io_kiocb *req, long res) static void io_req_complete_failed(struct io_kiocb *req, long res) { req_set_fail(req); - io_put_req(req); io_req_complete_post(req, res, 0); } +/* + * Don't 
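[Aside] io_get_task_refs()/io_task_refs_refill() above replace a per-request percpu_counter operation with a task-local cache refilled in batches of IO_TCTX_REFS_CACHE_NR. A simplified userspace sketch of the same batching idea, with invented names:

#include <stdatomic.h>

#define REFS_CACHE_NR 128       /* mirrors IO_TCTX_REFS_CACHE_NR */

struct task_refs {
        atomic_long inflight;   /* shared counter, expensive to touch */
        long cached;            /* task-local cache, cheap to touch */
};

static void task_refs_get(struct task_refs *tr, int nr)
{
        tr->cached -= nr;
        if (tr->cached < 0) {
                /* slow path: refill the cache in one shared-counter op */
                long refill = -tr->cached + REFS_CACHE_NR;

                atomic_fetch_add(&tr->inflight, refill);
                tr->cached += refill;
        }
}
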
initialise the fields below on every allocation, but do that in + * advance and keep them valid across allocations. + */ +static void io_preinit_req(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + req->ctx = ctx; + req->link = NULL; + req->async_data = NULL; + /* not necessary, but safer to zero */ + req->result = 0; +} + static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, - struct io_comp_state *cs) + struct io_submit_state *state) { - spin_lock_irq(&ctx->completion_lock); - list_splice_init(&ctx->locked_free_list, &cs->free_list); + spin_lock(&ctx->completion_lock); + list_splice_init(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } /* Returns true IFF there are requests in the cache */ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - struct io_comp_state *cs = &state->comp; int nr; /* @@ -1712,14 +1885,14 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * side cache. */ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) - io_flush_cached_locked_reqs(ctx, cs); + io_flush_cached_locked_reqs(ctx, state); nr = state->free_reqs; - while (!list_empty(&cs->free_list)) { - struct io_kiocb *req = list_first_entry(&cs->free_list, - struct io_kiocb, compl.list); + while (!list_empty(&state->free_list)) { + struct io_kiocb *req = list_first_entry(&state->free_list, + struct io_kiocb, inflight_entry); - list_del(&req->compl.list); + list_del(&req->inflight_entry); state->reqs[nr++] = req; if (nr == ARRAY_SIZE(state->reqs)) break; @@ -1729,48 +1902,41 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) return nr != 0; } +/* + * A request might get retired back into the request caches even before opcode + * handlers and io_issue_sqe() are done with it, e.g. inline completion path. + * Because of that, io_alloc_req() should be called only under ->uring_lock + * and with extra caution to not get a request that is still worked on. + */ static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; + gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + int ret, i; BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); - if (!state->free_reqs) { - gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; - int ret, i; - - if (io_flush_cached_reqs(ctx)) - goto got_req; - - ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, - state->reqs); + if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) + goto got_req; - /* - * Bulk alloc is all-or-nothing. If we fail to get a batch, - * retry single alloc to be on the safe side. - */ - if (unlikely(ret <= 0)) { - state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!state->reqs[0]) - return NULL; - ret = 1; - } + ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, + state->reqs); - /* - * Don't initialise the fields below on every allocation, but - * do that in advance and keep valid on free. - */ - for (i = 0; i < ret; i++) { - struct io_kiocb *req = state->reqs[i]; - - req->ctx = ctx; - req->link = NULL; - req->async_data = NULL; - /* not necessary, but safer to zero */ - req->result = 0; - } - state->free_reqs = ret; + /* + * Bulk alloc is all-or-nothing. If we fail to get a batch, + * retry single alloc to be on the safe side. 
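[Aside] io_alloc_req() above treats kmem_cache_alloc_bulk() as an opportunistic fast path and degrades to a single allocation instead of failing the request. A rough userspace analogue of that fallback shape, illustrative only:

#include <stdlib.h>

#define REQ_BATCH 8
#define REQ_SIZE  64

struct alloc_state {
        void *reqs[REQ_BATCH];
        int free_reqs;
};

static void *alloc_req(struct alloc_state *s)
{
        if (!s->free_reqs) {
                int n;

                /* opportunistic batch refill... */
                for (n = 0; n < REQ_BATCH; n++) {
                        s->reqs[n] = malloc(REQ_SIZE);
                        if (!s->reqs[n])
                                break;
                }
                /* ...but only a failed single allocation fails the caller */
                if (!n)
                        return NULL;
                s->free_reqs = n;
        }
        return s->reqs[--s->free_reqs];
}
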
+ */ + if (unlikely(ret <= 0)) { + state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!state->reqs[0]) + return NULL; + ret = 1; } + + for (i = 0; i < ret; i++) + io_preinit_req(state->reqs[i], ctx); + state->free_reqs = ret; got_req: state->free_reqs--; return state->reqs[state->free_reqs]; @@ -1798,17 +1964,6 @@ static void io_dismantle_req(struct io_kiocb *req) } } -/* must to be called somewhat shortly after putting a request */ -static inline void io_put_task(struct task_struct *task, int nr) -{ - struct io_uring_task *tctx = task->io_uring; - - percpu_counter_sub(&tctx->inflight, nr); - if (unlikely(atomic_read(&tctx->in_idle))) - wake_up(&tctx->wait); - put_task_struct_many(task, nr); -} - static void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; @@ -1816,7 +1971,11 @@ static void __io_free_req(struct io_kiocb *req) io_dismantle_req(req); io_put_task(req->task, 1); - kmem_cache_free(req_cachep, req); + spin_lock(&ctx->completion_lock); + list_add(&req->inflight_entry, &ctx->locked_free_list); + ctx->locked_free_nr++; + spin_unlock(&ctx->completion_lock); + percpu_ref_put(&ctx->refs); } @@ -1830,22 +1989,20 @@ static inline void io_remove_next_linked(struct io_kiocb *req) static bool io_kill_linked_timeout(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) + __must_hold(&req->ctx->timeout_lock) { struct io_kiocb *link = req->link; - /* - * Can happen if a linked timeout fired and link had been like - * req -> link t-out -> link t-out [-> ...] - */ - if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { struct io_timeout_data *io = link->async_data; io_remove_next_linked(req); link->timeout.head = NULL; if (hrtimer_try_to_cancel(&io->timer) != -1) { + list_del(&link->timeout.list); io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - io_put_req_deferred(link, 1); + io_put_req_deferred(link); return true; } } @@ -1859,12 +2016,17 @@ static void io_fail_links(struct io_kiocb *req) req->link = NULL; while (link) { + long res = -ECANCELED; + + if (link->flags & REQ_F_FAIL) + res = link->result; + nxt = link->link; link->link = NULL; trace_io_uring_fail_link(req, link); - io_cqring_fill_event(link->ctx, link->user_data, -ECANCELED, 0); - io_put_req_deferred(link, 2); + io_cqring_fill_event(link->ctx, link->user_data, res, 0); + io_put_req_deferred(link); link = nxt; } } @@ -1874,8 +2036,24 @@ static bool io_disarm_next(struct io_kiocb *req) { bool posted = false; - if (likely(req->flags & REQ_F_LINK_TIMEOUT)) + if (req->flags & REQ_F_ARM_LTIMEOUT) { + struct io_kiocb *link = req->link; + + req->flags &= ~REQ_F_ARM_LTIMEOUT; + if (link && link->opcode == IORING_OP_LINK_TIMEOUT) { + io_remove_next_linked(req); + io_cqring_fill_event(link->ctx, link->user_data, + -ECANCELED, 0); + io_put_req_deferred(link); + posted = true; + } + } else if (req->flags & REQ_F_LINK_TIMEOUT) { + struct io_ring_ctx *ctx = req->ctx; + + spin_lock_irq(&ctx->timeout_lock); posted = io_kill_linked_timeout(req); + spin_unlock_irq(&ctx->timeout_lock); + } if (unlikely((req->flags & REQ_F_FAIL) && !(req->flags & REQ_F_HARDLINK))) { posted |= (req->link != NULL); @@ -1894,16 +2072,15 @@ static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) * dependencies to the next request. In case of failure, fail the rest * of the chain. 
*/ - if (req->flags & (REQ_F_LINK_TIMEOUT | REQ_F_FAIL)) { + if (req->flags & IO_DISARM_MASK) { struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; bool posted; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); posted = io_disarm_next(req); if (posted) io_commit_cqring(req->ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); } @@ -1919,20 +2096,22 @@ static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) return __io_req_find_next(req); } -static void ctx_flush_and_put(struct io_ring_ctx *ctx) +static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; - if (ctx->submit_state.comp.nr) { - mutex_lock(&ctx->uring_lock); - io_submit_flush_completions(ctx); + if (*locked) { + if (ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); + *locked = false; } percpu_ref_put(&ctx->refs); } static void tctx_task_work(struct callback_head *cb) { + bool locked = false; struct io_ring_ctx *ctx = NULL; struct io_uring_task *tctx = container_of(cb, struct io_uring_task, task_work); @@ -1943,37 +2122,32 @@ static void tctx_task_work(struct callback_head *cb) spin_lock_irq(&tctx->task_lock); node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); + if (!node) + tctx->task_running = false; spin_unlock_irq(&tctx->task_lock); + if (!node) + break; - while (node) { + do { struct io_wq_work_node *next = node->next; struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); if (req->ctx != ctx) { - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); ctx = req->ctx; + /* if not contended, grab and improve batching */ + locked = mutex_trylock(&ctx->uring_lock); percpu_ref_get(&ctx->refs); } - req->io_task_work.func(req); + req->io_task_work.func(req, &locked); node = next; - } - if (wq_list_empty(&tctx->task_list)) { - spin_lock_irq(&tctx->task_lock); - clear_bit(0, &tctx->task_state); - if (wq_list_empty(&tctx->task_list)) { - spin_unlock_irq(&tctx->task_lock); - break; - } - spin_unlock_irq(&tctx->task_lock); - /* another tctx_task_work() is enqueued, yield */ - if (test_and_set_bit(0, &tctx->task_state)) - break; - } + } while (node); + cond_resched(); } - ctx_flush_and_put(ctx); + ctx_flush_and_put(ctx, &locked); } static void io_req_task_work_add(struct io_kiocb *req) @@ -1983,19 +2157,20 @@ static void io_req_task_work_add(struct io_kiocb *req) enum task_work_notify_mode notify; struct io_wq_work_node *node; unsigned long flags; + bool running; WARN_ON_ONCE(!tctx); spin_lock_irqsave(&tctx->task_lock, flags); wq_list_add_tail(&req->io_task_work.node, &tctx->task_list); + running = tctx->task_running; + if (!running) + tctx->task_running = true; spin_unlock_irqrestore(&tctx->task_lock, flags); /* task_work already pending, we're done */ - if (test_bit(0, &tctx->task_state) || - test_and_set_bit(0, &tctx->task_state)) + if (running) return; - if (unlikely(tsk->flags & PF_EXITING)) - goto fail; /* * SQPOLL kernel thread doesn't need notification, just a wakeup. 
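[Aside] tctx_task_work() above only batches completions under ->uring_lock when mutex_trylock() succeeds, so an uncontended lock buys batching without ever blocking the task-work loop. A generic sketch of that opportunistic-lock pattern with pthreads; process_one() is a hypothetical callback:

#include <pthread.h>

extern void process_one(void *item, int batched);

static void drain(pthread_mutex_t *lock, void **items, int nr)
{
        /* if not contended, grab the lock and improve batching */
        int locked = (pthread_mutex_trylock(lock) == 0);
        int i;

        for (i = 0; i < nr; i++)
                process_one(items[i], locked);
        if (locked)
                pthread_mutex_unlock(lock);
}
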
For @@ -2008,9 +2183,9 @@ static void io_req_task_work_add(struct io_kiocb *req) wake_up_process(tsk); return; } -fail: - clear_bit(0, &tctx->task_state); + spin_lock_irqsave(&tctx->task_lock, flags); + tctx->task_running = false; node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); spin_unlock_irqrestore(&tctx->task_lock, flags); @@ -2024,27 +2199,25 @@ fail: } } -static void io_req_task_cancel(struct io_kiocb *req) +static void io_req_task_cancel(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx is guaranteed to stay alive while we hold uring_lock */ - mutex_lock(&ctx->uring_lock); + /* not needed for normal modes, but SQPOLL depends on it */ + io_tw_lock(ctx, locked); io_req_complete_failed(req, req->result); - mutex_unlock(&ctx->uring_lock); } -static void io_req_task_submit(struct io_kiocb *req) +static void io_req_task_submit(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; - /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ - mutex_lock(&ctx->uring_lock); - if (!(req->task->flags & PF_EXITING) && !req->task->in_execve) + io_tw_lock(ctx, locked); + /* req->task == current here, checking PF_EXITING is safe */ + if (likely(!(req->task->flags & PF_EXITING))) __io_queue_sqe(req); else io_req_complete_failed(req, -EFAULT); - mutex_unlock(&ctx->uring_lock); } static void io_req_task_queue_fail(struct io_kiocb *req, int ret) @@ -2080,6 +2253,11 @@ static void io_free_req(struct io_kiocb *req) __io_free_req(req); } +static void io_free_req_work(struct io_kiocb *req, bool *locked) +{ + io_free_req(req); +} + struct req_batch { struct task_struct *task; int task_refs; @@ -2096,10 +2274,10 @@ static inline void io_init_req_batch(struct req_batch *rb) static void io_req_free_batch_finish(struct io_ring_ctx *ctx, struct req_batch *rb) { - if (rb->task) - io_put_task(rb->task, rb->task_refs); if (rb->ctx_refs) percpu_ref_put_many(&ctx->refs, rb->ctx_refs); + if (rb->task) + io_put_task(rb->task, rb->task_refs); } static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, @@ -2120,37 +2298,37 @@ static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, if (state->free_reqs != ARRAY_SIZE(state->reqs)) state->reqs[state->free_reqs++] = req; else - list_add(&req->compl.list, &state->comp.free_list); + list_add(&req->inflight_entry, &state->free_list); } static void io_submit_flush_completions(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { - struct io_comp_state *cs = &ctx->submit_state.comp; - int i, nr = cs->nr; + struct io_submit_state *state = &ctx->submit_state; + int i, nr = state->compl_nr; struct req_batch rb; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < nr; i++) { - struct io_kiocb *req = cs->reqs[i]; + struct io_kiocb *req = state->compl_reqs[i]; __io_cqring_fill_event(ctx, req->user_data, req->result, req->compl.cflags); } io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_init_req_batch(&rb); for (i = 0; i < nr; i++) { - struct io_kiocb *req = cs->reqs[i]; + struct io_kiocb *req = state->compl_reqs[i]; - /* submission and completion refs */ - if (req_ref_sub_and_test(req, 2)) + if (req_ref_put_and_test(req)) io_req_free_batch(&rb, req, &ctx->submit_state); } io_req_free_batch_finish(ctx, &rb); - cs->nr = 0; + state->compl_nr = 0; } /* @@ -2174,16 +2352,12 @@ static inline void io_put_req(struct io_kiocb *req) io_free_req(req); } -static 
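[Aside] io_req_task_work_add() above replaces the old task_state bit dance with a task_running flag read and set under task_lock, so only the enqueue that flips the list from idle to running pays for a notification. The shape of that edge-triggered wakeup, sketched with a pthread mutex and invented types:

#include <pthread.h>
#include <stdbool.h>

struct node { struct node *next; };

struct work_queue {
        pthread_mutex_t lock;
        struct node *head;
        bool running;
};

/* returns true only for the enqueue that must wake the worker */
static bool queue_work(struct work_queue *q, struct node *n)
{
        bool was_running;

        pthread_mutex_lock(&q->lock);
        n->next = q->head;
        q->head = n;
        was_running = q->running;
        q->running = true;
        pthread_mutex_unlock(&q->lock);

        return !was_running;
}
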
void io_free_req_deferred(struct io_kiocb *req) +static inline void io_put_req_deferred(struct io_kiocb *req) { - req->io_task_work.func = io_free_req; - io_req_task_work_add(req); -} - -static inline void io_put_req_deferred(struct io_kiocb *req, int refs) -{ - if (req_ref_sub_and_test(req, refs)) - io_free_req_deferred(req); + if (req_ref_put_and_test(req)) { + req->io_task_work.func = io_free_req_work; + io_req_task_work_add(req); + } } static unsigned io_cqring_events(struct io_ring_ctx *ctx) @@ -2216,15 +2390,17 @@ static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) { struct io_buffer *kbuf; + if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) + return 0; kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; return io_put_kbuf(req, kbuf); } static inline bool io_run_task_work(void) { - if (current->task_works) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { __set_current_state(TASK_RUNNING); - task_work_run(); + tracehook_notify_signal(); return true; } @@ -2235,7 +2411,7 @@ static inline bool io_run_task_work(void) * Find and free completed poll iocbs */ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, - struct list_head *done, bool resubmit) + struct list_head *done) { struct req_batch rb; struct io_kiocb *req; @@ -2245,23 +2421,18 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, io_init_req_batch(&rb); while (!list_empty(done)) { - int cflags = 0; - req = list_first_entry(done, struct io_kiocb, inflight_entry); list_del(&req->inflight_entry); - if (READ_ONCE(req->result) == -EAGAIN && resubmit && + if (READ_ONCE(req->result) == -EAGAIN && !(req->flags & REQ_F_DONT_REISSUE)) { req->iopoll_completed = 0; - req_ref_get(req); io_req_task_queue_reissue(req); continue; } - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - - __io_cqring_fill_event(ctx, req->user_data, req->result, cflags); + __io_cqring_fill_event(ctx, req->user_data, req->result, + io_put_rw_kbuf(req)); (*nr_events)++; if (req_ref_put_and_test(req)) @@ -2274,12 +2445,11 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, } static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min, bool resubmit) + long min) { struct io_kiocb *req, *tmp; LIST_HEAD(done); bool spin; - int ret; /* * Only spin for completions if we don't have multiple devices hanging @@ -2287,9 +2457,9 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, */ spin = !ctx->poll_multi_queue && *nr_events < min; - ret = 0; list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { struct kiocb *kiocb = &req->rw.kiocb; + int ret; /* * Move completed and retryable entries to our local lists. 
@@ -2304,22 +2474,20 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, break; ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); - if (ret < 0) - break; + if (unlikely(ret < 0)) + return ret; + else if (ret) + spin = false; /* iopoll may have completed current req */ if (READ_ONCE(req->iopoll_completed)) list_move_tail(&req->inflight_entry, &done); - - if (ret && spin) - spin = false; - ret = 0; } if (!list_empty(&done)) - io_iopoll_complete(ctx, nr_events, &done, resubmit); + io_iopoll_complete(ctx, nr_events, &done); - return ret; + return 0; } /* @@ -2335,7 +2503,7 @@ static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) while (!list_empty(&ctx->iopoll_list)) { unsigned int nr_events = 0; - io_do_iopoll(ctx, &nr_events, 0, false); + io_do_iopoll(ctx, &nr_events, 0); /* let it sleep and repeat later if can't complete a request */ if (nr_events == 0) @@ -2397,7 +2565,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, &nr_events, min, true); + ret = io_do_iopoll(ctx, &nr_events, min); } while (!ret && nr_events < min && !need_resched()); out: mutex_unlock(&ctx->uring_lock); @@ -2466,42 +2634,57 @@ static bool io_rw_should_reissue(struct io_kiocb *req) } #endif -static void io_fallback_req_func(struct work_struct *work) +static bool __io_complete_rw_common(struct io_kiocb *req, long res) { - struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, - fallback_work.work); - struct llist_node *node = llist_del_all(&ctx->fallback_llist); - struct io_kiocb *req, *tmp; - - llist_for_each_entry_safe(req, tmp, node, io_task_work.fallback_node) - req->io_task_work.func(req); -} - -static void __io_complete_rw(struct io_kiocb *req, long res, long res2, - unsigned int issue_flags) -{ - int cflags = 0; - if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); if (res != req->result) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; - return; + return true; } req_set_fail(req); + req->result = res; + } + return false; +} + +static void io_req_task_complete(struct io_kiocb *req, bool *locked) +{ + unsigned int cflags = io_put_rw_kbuf(req); + long res = req->result; + + if (*locked) { + struct io_ring_ctx *ctx = req->ctx; + struct io_submit_state *state = &ctx->submit_state; + + io_req_complete_state(req, res, cflags); + state->compl_reqs[state->compl_nr++] = req; + if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) + io_submit_flush_completions(ctx); + } else { + io_req_complete_post(req, res, cflags); } - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, issue_flags, res, cflags); +} + +static void __io_complete_rw(struct io_kiocb *req, long res, long res2, + unsigned int issue_flags) +{ + if (__io_complete_rw_common(req, res)) + return; + __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); - __io_complete_rw(req, res, res2, 0); + if (__io_complete_rw_common(req, res)) + return; + req->result = res; + req->io_task_work.func = io_req_task_complete; + io_req_task_work_add(req); } static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) @@ -2587,40 +2770,6 @@ static void io_iopoll_req_issued(struct io_kiocb *req) } } -static inline void io_state_file_put(struct io_submit_state *state) -{ - if 
(state->file_refs) { - fput_many(state->file, state->file_refs); - state->file_refs = 0; - } -} - -/* - * Get as many references to a file as we have IOs left in this submission, - * assuming most submissions are for one file, or at least that each file - * has more than one submission. - */ -static struct file *__io_file_get(struct io_submit_state *state, int fd) -{ - if (!state) - return fget(fd); - - if (state->file_refs) { - if (state->fd == fd) { - state->file_refs--; - return state->file; - } - io_state_file_put(state); - } - state->file = fget_many(fd, state->ios_left); - if (unlikely(!state->file)) - return NULL; - - state->fd = fd; - state->file_refs = state->ios_left - 1; - return state->file; -} - static bool io_bdev_nowait(struct block_device *bdev) { return !bdev || blk_queue_nowait(bdev_get_queue(bdev)); @@ -2631,7 +2780,7 @@ static bool io_bdev_nowait(struct block_device *bdev) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static bool __io_file_supports_async(struct file *file, int rw) +static bool __io_file_supports_nowait(struct file *file, int rw) { umode_t mode = file_inode(file)->i_mode; @@ -2664,14 +2813,14 @@ static bool __io_file_supports_async(struct file *file, int rw) return file->f_op->write_iter != NULL; } -static bool io_file_supports_async(struct io_kiocb *req, int rw) +static bool io_file_supports_nowait(struct io_kiocb *req, int rw) { - if (rw == READ && (req->flags & REQ_F_ASYNC_READ)) + if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) return true; - else if (rw == WRITE && (req->flags & REQ_F_ASYNC_WRITE)) + else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) return true; - return __io_file_supports_async(req->file, rw); + return __io_file_supports_nowait(req->file, rw); } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -2682,7 +2831,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned ioprio; int ret; - if (!(req->flags & REQ_F_ISREG) && S_ISREG(file_inode(file)->i_mode)) + if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) req->flags |= REQ_F_ISREG; kiocb->ki_pos = READ_ONCE(sqe->off); @@ -2715,7 +2864,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) !kiocb->ki_filp->f_op->iopoll) return -EOPNOTSUPP; - kiocb->ki_flags |= IOCB_HIPRI; + kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; kiocb->ki_complete = io_complete_rw_iopoll; req->iopoll_completed = 0; } else { @@ -2782,15 +2931,11 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (check_reissue && (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) { - req_ref_get(req); io_req_task_queue_reissue(req); } else { - int cflags = 0; - req_set_fail(req); - if (req->flags & REQ_F_BUFFER_SELECTED) - cflags = io_put_rw_kbuf(req); - __io_req_complete(req, issue_flags, ret, cflags); + __io_req_complete(req, issue_flags, ret, + io_put_rw_kbuf(req)); } } } @@ -3208,9 +3353,6 @@ static int io_async_buf_func(struct wait_queue_entry *wait, unsigned mode, req->rw.kiocb.ki_flags &= ~IOCB_WAITQ; list_del_init(&wait->entry); - - /* submit ref gets dropped, acquire a new one */ - req_ref_get(req); io_req_task_queue(req); return 1; } @@ -3295,7 +3437,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req, READ)) { + if (force_nonblock && 
!io_file_supports_nowait(req, READ)) { ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); return ret ?: -EAGAIN; } @@ -3400,7 +3542,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) kiocb->ki_flags |= IOCB_NOWAIT; /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_async(req, WRITE)) + if (force_nonblock && !io_file_supports_nowait(req, WRITE)) goto copy_iov; /* file path doesn't support NOWAIT for non-direct_IO */ @@ -3475,7 +3617,7 @@ static int io_renameat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3526,7 +3668,8 @@ static int io_unlinkat_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) return -EBADF; @@ -3566,14 +3709,157 @@ static int io_unlinkat(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_mkdirat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_mkdir *mkd = &req->mkdir; + const char __user *fname; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index || + sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + mkd->dfd = READ_ONCE(sqe->fd); + mkd->mode = READ_ONCE(sqe->len); + + fname = u64_to_user_ptr(READ_ONCE(sqe->addr)); + mkd->filename = getname(fname); + if (IS_ERR(mkd->filename)) + return PTR_ERR(mkd->filename); + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_mkdirat(struct io_kiocb *req, int issue_flags) +{ + struct io_mkdir *mkd = &req->mkdir; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_mkdirat(mkd->dfd, mkd->filename, mkd->mode); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail(req); + io_req_complete(req, ret); + return 0; +} + +static int io_symlinkat_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_symlink *sl = &req->symlink; + const char __user *oldpath, *newpath; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->len || sqe->rw_flags || sqe->buf_index || + sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + sl->new_dfd = READ_ONCE(sqe->fd); + oldpath = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newpath = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + + sl->oldpath = getname(oldpath); + if (IS_ERR(sl->oldpath)) + return PTR_ERR(sl->oldpath); + + sl->newpath = getname(newpath); + if (IS_ERR(sl->newpath)) { + putname(sl->oldpath); + return PTR_ERR(sl->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_symlinkat(struct io_kiocb *req, int issue_flags) +{ + struct io_symlink *sl = &req->symlink; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_symlinkat(sl->oldpath, sl->new_dfd, sl->newpath); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail(req); + io_req_complete(req, ret); + return 0; +} + +static int io_linkat_prep(struct io_kiocb *req, + 
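[Aside] io_read()/io_write() above punt to async context whenever nonblocking issue is forced but the file cannot honor it. Userspace can express the same try-nonblocking-first shape with preadv2(2) and RWF_NOWAIT; a hedged sketch:

#define _GNU_SOURCE
#include <errno.h>
#include <sys/uio.h>

/* On return, *punt set means "retry from a context that may block". */
static ssize_t try_read_nowait(int fd, const struct iovec *iov, int cnt,
                               off_t off, int *punt)
{
        ssize_t ret = preadv2(fd, iov, cnt, off, RWF_NOWAIT);

        *punt = (ret < 0 && (errno == EAGAIN || errno == EOPNOTSUPP));
        return ret;
}
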
const struct io_uring_sqe *sqe) +{ + struct io_hardlink *lnk = &req->hardlink; + const char __user *oldf, *newf; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + return -EINVAL; + if (unlikely(req->flags & REQ_F_FIXED_FILE)) + return -EBADF; + + lnk->old_dfd = READ_ONCE(sqe->fd); + lnk->new_dfd = READ_ONCE(sqe->len); + oldf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + newf = u64_to_user_ptr(READ_ONCE(sqe->addr2)); + lnk->flags = READ_ONCE(sqe->hardlink_flags); + + lnk->oldpath = getname(oldf); + if (IS_ERR(lnk->oldpath)) + return PTR_ERR(lnk->oldpath); + + lnk->newpath = getname(newf); + if (IS_ERR(lnk->newpath)) { + putname(lnk->oldpath); + return PTR_ERR(lnk->newpath); + } + + req->flags |= REQ_F_NEED_CLEANUP; + return 0; +} + +static int io_linkat(struct io_kiocb *req, int issue_flags) +{ + struct io_hardlink *lnk = &req->hardlink; + int ret; + + if (issue_flags & IO_URING_F_NONBLOCK) + return -EAGAIN; + + ret = do_linkat(lnk->old_dfd, lnk->oldpath, lnk->new_dfd, + lnk->newpath, lnk->flags); + + req->flags &= ~REQ_F_NEED_CLEANUP; + if (ret < 0) + req_set_fail(req); + io_req_complete(req, ret); + return 0; +} + static int io_shutdown_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_NET) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || - sqe->buf_index) + if (unlikely(sqe->ioprio || sqe->off || sqe->addr || sqe->rw_flags || + sqe->buf_index || sqe->splice_fd_in)) return -EINVAL; req->shutdown.how = READ_ONCE(sqe->len); @@ -3622,7 +3908,7 @@ static int __io_splice_prep(struct io_kiocb *req, if (unlikely(sp->flags & ~valid_flags)) return -EINVAL; - sp->file_in = io_file_get(NULL, req, READ_ONCE(sqe->splice_fd_in), + sp->file_in = io_file_get(req->ctx, req, READ_ONCE(sqe->splice_fd_in), (sp->flags & SPLICE_F_FD_IN_FIXED)); if (!sp->file_in) return -EBADF; @@ -3721,7 +4007,8 @@ static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return -EINVAL; req->sync.flags = READ_ONCE(sqe->fsync_flags); @@ -3754,7 +4041,8 @@ static int io_fsync(struct io_kiocb *req, unsigned int issue_flags) static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -3785,6 +4073,8 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe const char __user *fname; int ret; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; if (unlikely(sqe->ioprio || sqe->buf_index)) return -EINVAL; if (unlikely(req->flags & REQ_F_FIXED_FILE)) @@ -3802,6 +4092,11 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe req->open.filename = NULL; return ret; } + + req->open.file_slot = READ_ONCE(sqe->file_index); + if (req->open.file_slot && (req->open.how.flags & O_CLOEXEC)) + return -EINVAL; + req->open.nofile = rlimit(RLIMIT_NOFILE); req->flags |= REQ_F_NEED_CLEANUP; return 0; @@ -3809,12 +4104,9 @@ static int __io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe 
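[Aside] The new IORING_OP_MKDIRAT/SYMLINKAT/LINKAT handlers above always return -EAGAIN for nonblocking issue and defer to do_mkdirat() and friends from the io-wq worker. A speculative userspace exercise of the mkdirat opcode, assuming a liburing new enough (2.1-era) to carry the helper and a kernel with this series:

#include <fcntl.h>
#include <liburing.h>
#include <stdio.h>

int main(void)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;

        if (io_uring_queue_init(4, &ring, 0) < 0)
                return 1;

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_mkdirat(sqe, AT_FDCWD, "example-dir", 0755);
        io_uring_submit(&ring);

        io_uring_wait_cqe(&ring, &cqe);
        printf("mkdirat: %d\n", cqe->res); /* 0 on success, -errno on failure */
        io_uring_cqe_seen(&ring, cqe);
        io_uring_queue_exit(&ring);
        return 0;
}
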
static int io_openat_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - u64 flags, mode; + u64 mode = READ_ONCE(sqe->len); + u64 flags = READ_ONCE(sqe->open_flags); - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; - mode = READ_ONCE(sqe->len); - flags = READ_ONCE(sqe->open_flags); req->open.how = build_open_how(flags, mode); return __io_openat_prep(req, sqe); } @@ -3825,8 +4117,6 @@ static int io_openat2_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) size_t len; int ret; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) - return -EINVAL; how = u64_to_user_ptr(READ_ONCE(sqe->addr2)); len = READ_ONCE(sqe->len); if (len < OPEN_HOW_SIZE_VER0) @@ -3844,8 +4134,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) { struct open_flags op; struct file *file; - bool nonblock_set; - bool resolve_nonblock; + bool resolve_nonblock, nonblock_set; + bool fixed = !!req->open.file_slot; int ret; ret = build_open_flags(&req->open.how, &op); @@ -3864,9 +4154,11 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) op.open_flag |= O_NONBLOCK; } - ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); - if (ret < 0) - goto err; + if (!fixed) { + ret = __get_unused_fd_flags(req->open.how.flags, req->open.nofile); + if (ret < 0) + goto err; + } file = do_filp_open(req->open.dfd, req->open.filename, &op); if (IS_ERR(file)) { @@ -3875,7 +4167,8 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) * marginal gain for something that is now known to be a slower * path. So just put it, and we'll get a new one when we retry. */ - put_unused_fd(ret); + if (!fixed) + put_unused_fd(ret); ret = PTR_ERR(file); /* only retry if RESOLVE_CACHED wasn't already set by application */ @@ -3888,7 +4181,12 @@ static int io_openat2(struct io_kiocb *req, unsigned int issue_flags) if ((issue_flags & IO_URING_F_NONBLOCK) && !nonblock_set) file->f_flags &= ~O_NONBLOCK; fsnotify_open(file); - fd_install(ret, file); + + if (!fixed) + fd_install(ret, file); + else + ret = io_install_fixed_file(req, file, issue_flags, + req->open.file_slot - 1); err: putname(req->open.filename); req->flags &= ~REQ_F_NEED_CLEANUP; @@ -3909,7 +4207,8 @@ static int io_remove_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off) + if (sqe->ioprio || sqe->rw_flags || sqe->addr || sqe->len || sqe->off || + sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -3980,7 +4279,7 @@ static int io_provide_buffers_prep(struct io_kiocb *req, struct io_provide_buf *p = &req->pbuf; u64 tmp; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; tmp = READ_ONCE(sqe->fd); @@ -4067,7 +4366,7 @@ static int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_EPOLL) - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4113,7 +4412,7 @@ static int io_epoll_ctl(struct io_kiocb *req, unsigned int issue_flags) static int io_madvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { #if defined(CONFIG_ADVISE_SYSCALLS) && defined(CONFIG_MMU) - if (sqe->ioprio || sqe->buf_index || sqe->off) + if (sqe->ioprio || sqe->buf_index || sqe->off || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & 
IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4148,7 +4447,7 @@ static int io_madvise(struct io_kiocb *req, unsigned int issue_flags) static int io_fadvise_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - if (sqe->ioprio || sqe->buf_index || sqe->addr) + if (sqe->ioprio || sqe->buf_index || sqe->addr || sqe->splice_fd_in) return -EINVAL; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4186,7 +4485,7 @@ static int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4222,7 +4521,7 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index) + sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; @@ -4283,7 +4582,8 @@ static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index || + sqe->splice_fd_in)) return -EINVAL; req->sync.off = READ_ONCE(sqe->off); @@ -4717,6 +5017,15 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) accept->addr_len = u64_to_user_ptr(READ_ONCE(sqe->addr2)); accept->flags = READ_ONCE(sqe->accept_flags); accept->nofile = rlimit(RLIMIT_NOFILE); + + accept->file_slot = READ_ONCE(sqe->file_index); + if (accept->file_slot && ((req->open.how.flags & O_CLOEXEC) || + (accept->flags & SOCK_CLOEXEC))) + return -EINVAL; + if (accept->flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) + return -EINVAL; + if (SOCK_NONBLOCK != O_NONBLOCK && (accept->flags & SOCK_NONBLOCK)) + accept->flags = (accept->flags & ~SOCK_NONBLOCK) | O_NONBLOCK; return 0; } @@ -4725,20 +5034,35 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) struct io_accept *accept = &req->accept; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; unsigned int file_flags = force_nonblock ? 
O_NONBLOCK : 0; - int ret; + bool fixed = !!accept->file_slot; + struct file *file; + int ret, fd; if (req->file->f_flags & O_NONBLOCK) req->flags |= REQ_F_NOWAIT; - ret = __sys_accept4_file(req->file, file_flags, accept->addr, - accept->addr_len, accept->flags, - accept->nofile); - if (ret == -EAGAIN && force_nonblock) - return -EAGAIN; - if (ret < 0) { + if (!fixed) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) + return fd; + } + file = do_accept(req->file, file_flags, accept->addr, accept->addr_len, + accept->flags); + if (IS_ERR(file)) { + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; if (ret == -ERESTARTSYS) ret = -EINTR; req_set_fail(req); + } else if (!fixed) { + fd_install(fd, file); + ret = fd; + } else { + ret = io_install_fixed_file(req, file, issue_flags, + accept->file_slot - 1); } __io_req_complete(req, issue_flags, ret, 0); return 0; @@ -4758,7 +5082,8 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) + if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags || + sqe->splice_fd_in) return -EINVAL; conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); @@ -4871,6 +5196,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) { struct io_ring_ctx *ctx = req->ctx; + /* req->task == current here, checking PF_EXITING is safe */ if (unlikely(req->task->flags & PF_EXITING)) WRITE_ONCE(poll->canceled, true); @@ -4880,7 +5206,7 @@ static bool io_poll_rewait(struct io_kiocb *req, struct io_poll_iocb *poll) req->result = vfs_poll(req->file, &pt) & poll->events; } - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (!req->result && !READ_ONCE(poll->canceled)) { add_wait_queue(poll->head, &poll->wait); return true; @@ -4914,12 +5240,12 @@ static void io_poll_remove_double(struct io_kiocb *req) if (poll && poll->head) { struct wait_queue_head *head = poll->head; - spin_lock(&head->lock); + spin_lock_irq(&head->lock); list_del_init(&poll->wait.entry); if (poll->wait.private) req_ref_put(req); poll->head = NULL; - spin_unlock(&head->lock); + spin_unlock_irq(&head->lock); } } @@ -4949,13 +5275,13 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static void io_poll_task_func(struct io_kiocb *req) +static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *nxt; if (io_poll_rewait(req, &req->poll)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); } else { bool done; @@ -4967,13 +5293,13 @@ static void io_poll_task_func(struct io_kiocb *req) req->result = 0; add_wait_queue(req->poll.head, &req->poll.wait); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); if (done) { nxt = io_put_req_find_next(req); if (nxt) - io_req_task_submit(nxt); + io_req_task_submit(nxt, locked); } } } @@ -4984,6 +5310,7 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, struct io_kiocb *req = wait->private; struct io_poll_iocb *poll = io_poll_get_single(req); __poll_t mask = key_to_poll(key); + unsigned long flags; /* for instances that support it check for an event match first: */ if (mask && !(mask & poll->events)) @@ -4996,13 +5323,13 @@ static int 
io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, if (poll->head) { bool done; - spin_lock(&poll->head->lock); + spin_lock_irqsave(&poll->head->lock, flags); done = list_empty(&poll->wait.entry); if (!done) list_del_init(&poll->wait.entry); /* make sure double remove sees this as being gone */ wait->private = NULL; - spin_unlock(&poll->head->lock); + spin_unlock_irqrestore(&poll->head->lock, flags); if (!done) { /* use wait func handler, so it matches the rq type */ poll->wait.func(&poll->wait, mode, sync, key); @@ -5039,8 +5366,13 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, if (unlikely(pt->nr_entries)) { struct io_poll_iocb *poll_one = poll; + /* double add on the same waitqueue head, ignore */ + if (poll_one->head == head) + return; /* already have a 2nd entry, fail a third attempt */ if (*poll_ptr) { + if ((*poll_ptr)->head == head) + return; pt->error = -EINVAL; return; } @@ -5050,9 +5382,6 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, */ if (!(poll_one->events & EPOLLONESHOT)) poll_one->events |= EPOLLONESHOT; - /* double add on the same waitqueue head, ignore */ - if (poll_one->head == head) - return; poll = kmalloc(sizeof(*poll), GFP_ATOMIC); if (!poll) { pt->error = -ENOMEM; @@ -5082,7 +5411,7 @@ static void io_async_queue_proc(struct file *file, struct wait_queue_head *head, __io_queue_proc(&apoll->poll, pt, head, &apoll->double_poll); } -static void io_async_task_func(struct io_kiocb *req) +static void io_async_task_func(struct io_kiocb *req, bool *locked) { struct async_poll *apoll = req->apoll; struct io_ring_ctx *ctx = req->ctx; @@ -5090,16 +5419,16 @@ static void io_async_task_func(struct io_kiocb *req) trace_io_uring_task_run(req->ctx, req, req->opcode, req->user_data); if (io_poll_rewait(req, &apoll->poll)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); return; } hash_del(&req->hash_node); io_poll_remove_double(req); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) - io_req_task_submit(req); + io_req_task_submit(req, locked); else io_req_complete_failed(req, -ECANCELED); } @@ -5148,11 +5477,11 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, if (unlikely(!ipt->nr_entries) && !ipt->error) ipt->error = -EINVAL; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (ipt->error || (mask && (poll->events & EPOLLONESHOT))) io_poll_remove_double(req); if (likely(poll->head)) { - spin_lock(&poll->head->lock); + spin_lock_irq(&poll->head->lock); if (unlikely(list_empty(&poll->wait.entry))) { if (ipt->error) cancel = true; @@ -5165,7 +5494,7 @@ static __poll_t __io_arm_poll_handler(struct io_kiocb *req, WRITE_ONCE(poll->canceled, true); else if (!poll->done) /* actually waiting for an event */ io_poll_req_insert(req); - spin_unlock(&poll->head->lock); + spin_unlock_irq(&poll->head->lock); } return mask; @@ -5207,7 +5536,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) } /* if we can't nonblock try, then no point in arming a poll handler */ - if (!io_file_supports_async(req, rw)) + if (!io_file_supports_nowait(req, rw)) return IO_APOLL_ABORTED; apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); @@ -5217,16 +5546,14 @@ static int io_arm_poll_handler(struct io_kiocb *req) req->apoll = apoll; req->flags |= REQ_F_POLLED; ipt.pt._qproc = io_async_queue_proc; + io_req_set_refcount(req); ret = __io_arm_poll_handler(req, &apoll->poll, &ipt, mask, 
io_async_wake); - if (ret || ipt.error) { - spin_unlock_irq(&ctx->completion_lock); - if (ret) - return IO_APOLL_READY; - return IO_APOLL_ABORTED; - } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); + if (ret || ipt.error) + return ret ? IO_APOLL_READY : IO_APOLL_ABORTED; + trace_io_uring_poll_arm(ctx, req, req->opcode, req->user_data, mask, apoll->poll.events); return IO_APOLL_OK; @@ -5240,19 +5567,19 @@ static bool __io_poll_remove_one(struct io_kiocb *req, if (!poll->head) return false; - spin_lock(&poll->head->lock); + spin_lock_irq(&poll->head->lock); if (do_cancel) WRITE_ONCE(poll->canceled, true); if (!list_empty(&poll->wait.entry)) { list_del_init(&poll->wait.entry); do_complete = true; } - spin_unlock(&poll->head->lock); + spin_unlock_irq(&poll->head->lock); hash_del(&req->hash_node); return do_complete; } -static bool io_poll_remove_waitqs(struct io_kiocb *req) +static bool io_poll_remove_one(struct io_kiocb *req) __must_hold(&req->ctx->completion_lock) { bool do_complete; @@ -5260,26 +5587,12 @@ static bool io_poll_remove_waitqs(struct io_kiocb *req) io_poll_remove_double(req); do_complete = __io_poll_remove_one(req, io_poll_get_single(req), true); - if (req->opcode != IORING_OP_POLL_ADD && do_complete) { - /* non-poll requests have submit ref still */ - req_ref_put(req); - } - return do_complete; -} - -static bool io_poll_remove_one(struct io_kiocb *req) - __must_hold(&req->ctx->completion_lock) -{ - bool do_complete; - - do_complete = io_poll_remove_waitqs(req); if (do_complete) { io_cqring_fill_event(req->ctx, req->user_data, -ECANCELED, 0); io_commit_cqring(req->ctx); req_set_fail(req); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); } - return do_complete; } @@ -5293,7 +5606,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_kiocb *req; int posted = 0, i; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list; @@ -5303,7 +5616,7 @@ static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, posted += io_poll_remove_one(req); } } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (posted) io_cqring_ev_posted(ctx); @@ -5366,7 +5679,7 @@ static int io_poll_update_prep(struct io_kiocb *req, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index) + if (sqe->ioprio || sqe->buf_index || sqe->splice_fd_in) return -EINVAL; flags = READ_ONCE(sqe->len); if (flags & ~(IORING_POLL_UPDATE_EVENTS | IORING_POLL_UPDATE_USER_DATA | @@ -5421,6 +5734,7 @@ static int io_poll_add_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe if (flags & ~IORING_POLL_ADD_MULTI) return -EINVAL; + io_req_set_refcount(req); poll->events = io_poll_parse_events(sqe, flags); return 0; } @@ -5441,7 +5755,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) ipt.error = 0; io_poll_complete(req, mask); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (mask) { io_cqring_ev_posted(ctx); @@ -5458,7 +5772,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) bool completing; int ret; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); preq = io_poll_find(ctx, req->poll_update.old_user_data, true); if (!preq) { ret = -ENOENT; @@ -5485,7 +5799,7 @@ static int io_poll_update(struct io_kiocb *req, unsigned int issue_flags) ret = 
0; err: if (ret < 0) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); req_set_fail(req); io_req_complete(req, ret); return 0; @@ -5498,7 +5812,7 @@ err: } if (req->poll_update.update_user_data) preq->user_data = req->poll_update.new_user_data; - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); /* complete update request, we're done with it */ io_req_complete(req, ret); @@ -5513,6 +5827,12 @@ err: return 0; } +static void io_req_task_timeout(struct io_kiocb *req, bool *locked) +{ + req_set_fail(req); + io_req_complete_post(req, -ETIME, 0); +} + static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -5521,24 +5841,20 @@ static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irqsave(&ctx->timeout_lock, flags); list_del_init(&req->timeout.list); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); + spin_unlock_irqrestore(&ctx->timeout_lock, flags); - io_cqring_fill_event(ctx, req->user_data, -ETIME, 0); - io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - - io_cqring_ev_posted(ctx); - req_set_fail(req); - io_put_req(req); + req->io_task_work.func = io_req_task_timeout; + io_req_task_work_add(req); return HRTIMER_NORESTART; } static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, __u64 user_data) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_timeout_data *io; struct io_kiocb *req; @@ -5561,6 +5877,7 @@ static struct io_kiocb *io_timeout_extract(struct io_ring_ctx *ctx, static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); @@ -5569,13 +5886,54 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data) req_set_fail(req); io_cqring_fill_event(ctx, req->user_data, -ECANCELED, 0); - io_put_req_deferred(req, 1); + io_put_req_deferred(req); + return 0; +} + +static clockid_t io_timeout_get_clock(struct io_timeout_data *data) +{ + switch (data->flags & IORING_TIMEOUT_CLOCK_MASK) { + case IORING_TIMEOUT_BOOTTIME: + return CLOCK_BOOTTIME; + case IORING_TIMEOUT_REALTIME: + return CLOCK_REALTIME; + default: + /* can't happen, vetted at prep time */ + WARN_ON_ONCE(1); + fallthrough; + case 0: + return CLOCK_MONOTONIC; + } +} + +static int io_linked_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, + struct timespec64 *ts, enum hrtimer_mode mode) + __must_hold(&ctx->timeout_lock) +{ + struct io_timeout_data *io; + struct io_kiocb *req; + bool found = false; + + list_for_each_entry(req, &ctx->ltimeout_list, timeout.list) { + found = user_data == req->user_data; + if (found) + break; + } + if (!found) + return -ENOENT; + + io = req->async_data; + if (hrtimer_try_to_cancel(&io->timer) == -1) + return -EALREADY; + hrtimer_init(&io->timer, io_timeout_get_clock(io), mode); + io->timer.function = io_link_timeout_fn; + hrtimer_start(&io->timer, timespec64_to_ktime(*ts), mode); return 0; } static int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, struct timespec64 *ts, enum hrtimer_mode mode) - __must_hold(&ctx->completion_lock) + __must_hold(&ctx->timeout_lock) { struct io_kiocb *req = io_timeout_extract(ctx, user_data); struct io_timeout_data *data; @@ -5586,7 +5944,7 @@ static 
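[Aside] io_timeout_get_clock() above maps the new IORING_TIMEOUT_BOOTTIME and IORING_TIMEOUT_REALTIME flags onto clock ids, with CLOCK_MONOTONIC remaining the default. A sketch of requesting a boottime-based timeout from userspace, assuming liburing and uapi headers from a kernel with this series:

#include <liburing.h>

/* count == 0 makes this a pure timer; its CQE completes with -ETIME */
static void queue_boottime_timeout(struct io_uring *ring, long secs)
{
        struct __kernel_timespec ts = { .tv_sec = secs, .tv_nsec = 0 };
        struct io_uring_sqe *sqe = io_uring_get_sqe(ring);

        io_uring_prep_timeout(sqe, &ts, 0, IORING_TIMEOUT_BOOTTIME);
        io_uring_submit(ring);
}
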
int io_timeout_update(struct io_ring_ctx *ctx, __u64 user_data, req->timeout.off = 0; /* noseq */ data = req->async_data; list_add_tail(&req->timeout.list, &ctx->timeout_list); - hrtimer_init(&data->timer, CLOCK_MONOTONIC, mode); + hrtimer_init(&data->timer, io_timeout_get_clock(data), mode); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(*ts), mode); return 0; @@ -5601,13 +5959,18 @@ static int io_timeout_remove_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len) + if (sqe->ioprio || sqe->buf_index || sqe->len || sqe->splice_fd_in) return -EINVAL; + tr->ltimeout = false; tr->addr = READ_ONCE(sqe->addr); tr->flags = READ_ONCE(sqe->timeout_flags); - if (tr->flags & IORING_TIMEOUT_UPDATE) { - if (tr->flags & ~(IORING_TIMEOUT_UPDATE|IORING_TIMEOUT_ABS)) + if (tr->flags & IORING_TIMEOUT_UPDATE_MASK) { + if (hweight32(tr->flags & IORING_TIMEOUT_CLOCK_MASK) > 1) + return -EINVAL; + if (tr->flags & IORING_LINK_TIMEOUT_UPDATE) + tr->ltimeout = true; + if (tr->flags & ~(IORING_TIMEOUT_UPDATE_MASK|IORING_TIMEOUT_ABS)) return -EINVAL; if (get_timespec64(&tr->ts, u64_to_user_ptr(sqe->addr2))) return -EFAULT; @@ -5634,20 +5997,26 @@ static int io_timeout_remove(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; int ret; - spin_lock_irq(&ctx->completion_lock); - if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) + if (!(req->timeout_rem.flags & IORING_TIMEOUT_UPDATE)) { + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, tr->addr); - else - ret = io_timeout_update(ctx, tr->addr, &tr->ts, - io_translate_timeout_mode(tr->flags)); + spin_unlock_irq(&ctx->timeout_lock); + spin_unlock(&ctx->completion_lock); + } else { + enum hrtimer_mode mode = io_translate_timeout_mode(tr->flags); + + spin_lock_irq(&ctx->timeout_lock); + if (tr->ltimeout) + ret = io_linked_timeout_update(ctx, tr->addr, &tr->ts, mode); + else + ret = io_timeout_update(ctx, tr->addr, &tr->ts, mode); + spin_unlock_irq(&ctx->timeout_lock); + } - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - io_cqring_ev_posted(ctx); if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -5660,14 +6029,19 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (sqe->ioprio || sqe->buf_index || sqe->len != 1) + if (sqe->ioprio || sqe->buf_index || sqe->len != 1 || + sqe->splice_fd_in) return -EINVAL; if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~IORING_TIMEOUT_ABS) + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) + return -EINVAL; + /* more than one clock specified is invalid, obviously */ + if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) return -EINVAL; + INIT_LIST_HEAD(&req->timeout.list); req->timeout.off = off; if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; @@ -5677,14 +6051,24 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, data = req->async_data; data->req = req; + data->flags = flags; if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr))) return -EFAULT; data->mode = io_translate_timeout_mode(flags); - hrtimer_init(&data->timer, 
CLOCK_MONOTONIC, data->mode); - if (is_timeout_link) - io_req_track_inflight(req); + hrtimer_init(&data->timer, io_timeout_get_clock(data), data->mode); + + if (is_timeout_link) { + struct io_submit_link *link = &req->ctx->submit_state.link; + + if (!link->head) + return -EINVAL; + if (link->last->opcode == IORING_OP_LINK_TIMEOUT) + return -EINVAL; + req->timeout.head = link->last; + link->last->flags |= REQ_F_ARM_LTIMEOUT; + } return 0; } @@ -5695,7 +6079,7 @@ static int io_timeout(struct io_kiocb *req, unsigned int issue_flags) struct list_head *entry; u32 tail, off = req->timeout.off; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); /* * sqe->off holds how many events that need to occur for this @@ -5734,7 +6118,7 @@ add: list_add(&req->timeout.list, entry); data->timer.function = io_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->timeout_lock); return 0; } @@ -5777,31 +6161,27 @@ static int io_async_cancel_one(struct io_uring_task *tctx, u64 user_data, return ret; } -static void io_async_find_and_cancel(struct io_ring_ctx *ctx, - struct io_kiocb *req, __u64 sqe_addr, - int success_ret) +static int io_try_cancel_userdata(struct io_kiocb *req, u64 sqe_addr) { - unsigned long flags; + struct io_ring_ctx *ctx = req->ctx; int ret; + WARN_ON_ONCE(!io_wq_current_is_worker() && req->task != current); + ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock_irqsave(&ctx->completion_lock, flags); if (ret != -ENOENT) - goto done; + return ret; + + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); ret = io_timeout_cancel(ctx, sqe_addr); + spin_unlock_irq(&ctx->timeout_lock); if (ret != -ENOENT) - goto done; + goto out; ret = io_poll_cancel(ctx, sqe_addr, false); -done: - if (!ret) - ret = success_ret; - io_cqring_fill_event(ctx, req->user_data, ret, 0); - io_commit_cqring(ctx); - spin_unlock_irqrestore(&ctx->completion_lock, flags); - io_cqring_ev_posted(ctx); - - if (ret < 0) - req_set_fail(req); +out: + spin_unlock(&ctx->completion_lock); + return ret; } static int io_async_cancel_prep(struct io_kiocb *req, @@ -5811,7 +6191,8 @@ static int io_async_cancel_prep(struct io_kiocb *req, return -EINVAL; if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags) + if (sqe->ioprio || sqe->off || sqe->len || sqe->cancel_flags || + sqe->splice_fd_in) return -EINVAL; req->cancel.addr = READ_ONCE(sqe->addr); @@ -5825,18 +6206,9 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) struct io_tctx_node *node; int ret; - /* tasks should wait for their io-wq threads, so safe w/o sync */ - ret = io_async_cancel_one(req->task->io_uring, sqe_addr, ctx); - spin_lock_irq(&ctx->completion_lock); - if (ret != -ENOENT) - goto done; - ret = io_timeout_cancel(ctx, sqe_addr); - if (ret != -ENOENT) - goto done; - ret = io_poll_cancel(ctx, sqe_addr, false); + ret = io_try_cancel_userdata(req, sqe_addr); if (ret != -ENOENT) goto done; - spin_unlock_irq(&ctx->completion_lock); /* slow path, try all io-wq's */ io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); @@ -5849,17 +6221,10 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) break; } io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); - - spin_lock_irq(&ctx->completion_lock); done: - io_cqring_fill_event(ctx, req->user_data, ret, 0); - 
io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); - io_cqring_ev_posted(ctx); - if (ret < 0) req_set_fail(req); - io_put_req(req); + io_req_complete_post(req, ret, 0); return 0; } @@ -5868,7 +6233,7 @@ static int io_rsrc_update_prep(struct io_kiocb *req, { if (unlikely(req->flags & (REQ_F_FIXED_FILE | REQ_F_BUFFER_SELECT))) return -EINVAL; - if (sqe->ioprio || sqe->rw_flags) + if (sqe->ioprio || sqe->rw_flags || sqe->splice_fd_in) return -EINVAL; req->rsrc_update.offset = READ_ONCE(sqe->off); @@ -5976,6 +6341,12 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_renameat_prep(req, sqe); case IORING_OP_UNLINKAT: return io_unlinkat_prep(req, sqe); + case IORING_OP_MKDIRAT: + return io_mkdirat_prep(req, sqe); + case IORING_OP_SYMLINKAT: + return io_symlinkat_prep(req, sqe); + case IORING_OP_LINKAT: + return io_linkat_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6070,11 +6441,11 @@ fail: return true; } - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); kfree(de); - io_queue_async_work(req); + io_queue_async_work(req, NULL); return true; } @@ -6082,7 +6453,7 @@ fail: de->req = req; de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); return true; } @@ -6139,6 +6510,17 @@ static void io_clean_op(struct io_kiocb *req) case IORING_OP_UNLINKAT: putname(req->unlink.filename); break; + case IORING_OP_MKDIRAT: + putname(req->mkdir.filename); + break; + case IORING_OP_SYMLINKAT: + putname(req->symlink.oldpath); + putname(req->symlink.newpath); + break; + case IORING_OP_LINKAT: + putname(req->hardlink.oldpath); + putname(req->hardlink.newpath); + break; } } if ((req->flags & REQ_F_POLLED) && req->apoll) { @@ -6267,6 +6649,15 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) case IORING_OP_UNLINKAT: ret = io_unlinkat(req, issue_flags); break; + case IORING_OP_MKDIRAT: + ret = io_mkdirat(req, issue_flags); + break; + case IORING_OP_SYMLINKAT: + ret = io_symlinkat(req, issue_flags); + break; + case IORING_OP_LINKAT: + ret = io_linkat(req, issue_flags); + break; default: ret = -EINVAL; break; @@ -6283,16 +6674,31 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + + req = io_put_req_find_next(req); + return req ? 
&req->work : NULL; +} + static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct io_kiocb *timeout; int ret = 0; + /* one will be dropped by ->io_free_work() after returning to io-wq */ + if (!(req->flags & REQ_F_REFCOUNT)) + __io_req_set_refcount(req, 2); + else + req_ref_get(req); + timeout = io_prep_linked_timeout(req); if (timeout) io_queue_linked_timeout(timeout); + /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) ret = -ECANCELED; @@ -6311,29 +6717,14 @@ static void io_wq_submit_work(struct io_wq_work *work) } /* avoid locking problems by failing it from a clean context */ - if (ret) { - /* io-wq is going to take one down */ - req_ref_get(req); + if (ret) io_req_task_queue_fail(req, ret); - } } -#define FFS_ASYNC_READ 0x1UL -#define FFS_ASYNC_WRITE 0x2UL -#ifdef CONFIG_64BIT -#define FFS_ISREG 0x4UL -#else -#define FFS_ISREG 0x0UL -#endif -#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) - static inline struct io_fixed_file *io_fixed_file_slot(struct io_file_table *table, - unsigned i) + unsigned i) { - struct io_fixed_file *table_l2; - - table_l2 = table->files[i >> IORING_FILE_TABLE_SHIFT]; - return &table_l2[i & IORING_FILE_TABLE_MASK]; + return &table->files[i]; } static inline struct file *io_file_from_index(struct io_ring_ctx *ctx, @@ -6348,45 +6739,69 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file { unsigned long file_ptr = (unsigned long) file; - if (__io_file_supports_async(file, READ)) + if (__io_file_supports_nowait(file, READ)) file_ptr |= FFS_ASYNC_READ; - if (__io_file_supports_async(file, WRITE)) + if (__io_file_supports_nowait(file, WRITE)) file_ptr |= FFS_ASYNC_WRITE; if (S_ISREG(file_inode(file)->i_mode)) file_ptr |= FFS_ISREG; file_slot->file_ptr = file_ptr; } -static struct file *io_file_get(struct io_submit_state *state, - struct io_kiocb *req, int fd, bool fixed) +static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, + struct io_kiocb *req, int fd) { - struct io_ring_ctx *ctx = req->ctx; struct file *file; + unsigned long file_ptr; - if (fixed) { - unsigned long file_ptr; + if (unlikely((unsigned int)fd >= ctx->nr_user_files)) + return NULL; + fd = array_index_nospec(fd, ctx->nr_user_files); + file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; + file = (struct file *) (file_ptr & FFS_MASK); + file_ptr &= ~FFS_MASK; + /* mask in overlapping REQ_F and FFS bits */ + req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); + io_req_set_rsrc_node(req); + return file; +} - if (unlikely((unsigned int)fd >= ctx->nr_user_files)) - return NULL; - fd = array_index_nospec(fd, ctx->nr_user_files); - file_ptr = io_fixed_file_slot(&ctx->file_table, fd)->file_ptr; - file = (struct file *) (file_ptr & FFS_MASK); - file_ptr &= ~FFS_MASK; - /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_ASYNC_READ_BIT); - io_req_set_rsrc_node(req); - } else { - trace_io_uring_file_get(ctx, fd); - file = __io_file_get(state, fd); +static struct file *io_file_get_normal(struct io_ring_ctx *ctx, + struct io_kiocb *req, int fd) +{ + struct file *file = fget(fd); - /* we don't allow fixed io_uring files */ - if (file && unlikely(file->f_op == &io_uring_fops)) - io_req_track_inflight(req); - } + trace_io_uring_file_get(ctx, fd); + /* we don't allow fixed io_uring files */ + if (file && unlikely(file->f_op == &io_uring_fops)) + io_req_track_inflight(req); return file; } 
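
The io_file_get_fixed() path above is a tagged-pointer scheme: struct file allocations are at least 8-byte aligned, so the three low bits of the cached pointer are free to carry the FFS_ASYNC_READ, FFS_ASYNC_WRITE and FFS_ISREG hints, which the fast path then folds straight into the request flags instead of re-deriving them from the file. A minimal standalone sketch of the same packing trick, assuming 8-byte alignment; slot_set()/slot_file() are illustrative names, not kernel APIs:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define FFS_ASYNC_READ	0x1UL
#define FFS_ASYNC_WRITE	0x2UL
#define FFS_ISREG	0x4UL
#define FFS_MASK	(~(FFS_ASYNC_READ | FFS_ASYNC_WRITE | FFS_ISREG))

struct fixed_slot { unsigned long file_ptr; };

static void slot_set(struct fixed_slot *slot, void *file, unsigned long flags)
{
	/* the pointer must be aligned or the tag bits would corrupt it */
	assert(((uintptr_t)file & ~FFS_MASK) == 0);
	slot->file_ptr = (unsigned long)file | flags;
}

static void *slot_file(const struct fixed_slot *slot, unsigned long *flags)
{
	*flags = slot->file_ptr & ~FFS_MASK;		/* low tag bits */
	return (void *)(slot->file_ptr & FFS_MASK);	/* real pointer */
}

int main(void)
{
	_Alignas(8) int dummy_file;	/* stand-in for struct file */
	struct fixed_slot slot;
	unsigned long flags;
	void *file;

	slot_set(&slot, &dummy_file, FFS_ASYNC_READ | FFS_ISREG);
	file = slot_file(&slot, &flags);
	assert(file == &dummy_file);
	printf("async_read=%d isreg=%d\n",
	       !!(flags & FFS_ASYNC_READ), !!(flags & FFS_ISREG));
	return 0;
}

The single-load decode is why the hunk ORs file_ptr << REQ_F_NOWAIT_READ_BIT into req->flags after masking: the FFS bits are laid out so that, shifted, they land on the matching REQ_F bits, as the "mask in overlapping REQ_F and FFS bits" comment notes.
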
+static inline struct file *io_file_get(struct io_ring_ctx *ctx, + struct io_kiocb *req, int fd, bool fixed) +{ + if (fixed) + return io_file_get_fixed(ctx, req, fd); + else + return io_file_get_normal(ctx, req, fd); +} + +static void io_req_task_link_timeout(struct io_kiocb *req, bool *locked) +{ + struct io_kiocb *prev = req->timeout.prev; + int ret; + + if (prev) { + ret = io_try_cancel_userdata(req, prev->user_data); + io_req_complete_post(req, ret ?: -ETIME, 0); + io_put_req(prev); + } else { + io_req_complete_post(req, -ETIME, 0); + } +} + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) { struct io_timeout_data *data = container_of(timer, @@ -6395,7 +6810,7 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) struct io_ring_ctx *ctx = req->ctx; unsigned long flags; - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock_irqsave(&ctx->timeout_lock, flags); prev = req->timeout.head; req->timeout.head = NULL; @@ -6408,15 +6823,12 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer) if (!req_ref_inc_not_zero(prev)) prev = NULL; } - spin_unlock_irqrestore(&ctx->completion_lock, flags); + list_del(&req->timeout.list); + req->timeout.prev = prev; + spin_unlock_irqrestore(&ctx->timeout_lock, flags); - if (prev) { - io_async_find_and_cancel(ctx, req, prev->user_data, -ETIME); - io_put_req_deferred(prev, 1); - io_put_req_deferred(req, 1); - } else { - io_req_complete_post(req, -ETIME, 0); - } + req->io_task_work.func = io_req_task_link_timeout; + io_req_task_work_add(req); return HRTIMER_NORESTART; } @@ -6424,7 +6836,7 @@ static void io_queue_linked_timeout(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; - spin_lock_irq(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); /* * If the back reference is NULL, then our linked request finished * before we got a chance to setup the timer @@ -6435,29 +6847,17 @@ static void io_queue_linked_timeout(struct io_kiocb *req) data->timer.function = io_link_timeout_fn; hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode); + list_add_tail(&req->timeout.list, &ctx->ltimeout_list); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock_irq(&ctx->timeout_lock); /* drop submission reference */ io_put_req(req); } -static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - struct io_kiocb *nxt = req->link; - - if (!nxt || (req->flags & REQ_F_LINK_TIMEOUT) || - nxt->opcode != IORING_OP_LINK_TIMEOUT) - return NULL; - - nxt->timeout.head = req; - nxt->flags |= REQ_F_LTIMEOUT_ACTIVE; - req->flags |= REQ_F_LINK_TIMEOUT; - return nxt; -} - static void __io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + struct io_kiocb *linked_timeout; int ret; issue_sqe: @@ -6468,50 +6868,60 @@ issue_sqe: * doesn't support non-blocking read/write attempts */ if (likely(!ret)) { - /* drop submission reference */ if (req->flags & REQ_F_COMPLETE_INLINE) { struct io_ring_ctx *ctx = req->ctx; - struct io_comp_state *cs = &ctx->submit_state.comp; + struct io_submit_state *state = &ctx->submit_state; - cs->reqs[cs->nr++] = req; - if (cs->nr == ARRAY_SIZE(cs->reqs)) + state->compl_reqs[state->compl_nr++] = req; + if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) io_submit_flush_completions(ctx); - } else { - io_put_req(req); + return; } + + linked_timeout = io_prep_linked_timeout(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else if (ret == -EAGAIN && 
!(req->flags & REQ_F_NOWAIT)) { + linked_timeout = io_prep_linked_timeout(req); + switch (io_arm_poll_handler(req)) { case IO_APOLL_READY: + if (linked_timeout) + io_unprep_linked_timeout(req); goto issue_sqe; case IO_APOLL_ABORTED: /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. */ - io_queue_async_work(req); + io_queue_async_work(req, NULL); break; } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); } else { io_req_complete_failed(req, ret); } - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) { if (unlikely(req->ctx->drain_active) && io_drain_req(req)) return; - if (likely(!(req->flags & REQ_F_FORCE_ASYNC))) { + if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { __io_queue_sqe(req); + } else if (req->flags & REQ_F_FAIL) { + io_req_complete_failed(req, req->result); } else { int ret = io_req_prep_async(req); if (unlikely(ret)) io_req_complete_failed(req, ret); else - io_queue_async_work(req); + io_queue_async_work(req, NULL); } } @@ -6543,19 +6953,19 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) + __must_hold(&ctx->uring_lock) { struct io_submit_state *state; unsigned int sqe_flags; int personality, ret = 0; + /* req is partially pre-initialised, see io_preinit_req() */ req->opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); req->user_data = READ_ONCE(sqe->user_data); req->file = NULL; req->fixed_rsrc_refs = NULL; - /* one is dropped after submission, the other at completion */ - atomic_set(&req->refs, 2); req->task = current; /* enforce forwards compatibility on users */ @@ -6593,9 +7003,8 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, } if (io_op_defs[req->opcode].needs_file) { - bool fixed = req->flags & REQ_F_FIXED_FILE; - - req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), + (sqe_flags & IOSQE_FIXED_FILE)); if (unlikely(!req->file)) ret = -EBADF; } @@ -6606,6 +7015,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) + __must_hold(&ctx->uring_lock) { struct io_submit_link *link = &ctx->submit_state.link; int ret; @@ -6613,20 +7023,34 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { fail_req: + /* fail even hard links since we don't submit */ if (link->head) { - /* fail even hard links since we don't submit */ - req_set_fail(link->head); - io_req_complete_failed(link->head, -ECANCELED); - link->head = NULL; + /* + * we can judge a link req is failed or cancelled by if + * REQ_F_FAIL is set, but the head is an exception since + * it may be set REQ_F_FAIL because of other req's failure + * so let's leverage req->result to distinguish if a head + * is set REQ_F_FAIL because of its failure or other req's + * failure so that we can set the correct ret code for it. + * init result here to avoid affecting the normal path. 
+ */ + if (!(link->head->flags & REQ_F_FAIL)) + req_fail_link_node(link->head, -ECANCELED); + } else if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { + /* + * the current req is a normal req, we should return + * error and thus break the submission loop. + */ + io_req_complete_failed(req, ret); + return ret; } - io_req_complete_failed(req, ret); - return ret; + req_fail_link_node(req, ret); + } else { + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; } - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - /* don't need @sqe from now on */ trace_io_uring_submit_sqe(ctx, req, req->opcode, req->user_data, req->flags, true, @@ -6642,9 +7066,14 @@ fail_req: if (link->head) { struct io_kiocb *head = link->head; - ret = io_req_prep_async(req); - if (unlikely(ret)) - goto fail_req; + if (!(req->flags & REQ_F_FAIL)) { + ret = io_req_prep_async(req); + if (unlikely(ret)) { + req_fail_link_node(req, ret); + if (!(head->flags & REQ_F_FAIL)) + req_fail_link_node(head, -ECANCELED); + } + } trace_io_uring_link(ctx, req, head); link->last->link = req; link->last = req; @@ -6674,11 +7103,10 @@ static void io_submit_state_end(struct io_submit_state *state, { if (state->link.head) io_queue_sqe(state->link.head); - if (state->comp.nr) + if (state->compl_nr) io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); - io_state_file_put(state); } /* @@ -6738,26 +7166,17 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) } static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) + __must_hold(&ctx->uring_lock) { - struct io_uring_task *tctx; int submitted = 0; /* make sure SQ entry isn't read before tail */ nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); if (!percpu_ref_tryget_many(&ctx->refs, nr)) return -EAGAIN; + io_get_task_refs(nr); - tctx = current->io_uring; - tctx->cached_refs -= nr; - if (unlikely(tctx->cached_refs < 0)) { - unsigned int refill = -tctx->cached_refs + IO_TCTX_REFS_CACHE_NR; - - percpu_counter_add(&tctx->inflight, refill); - refcount_add(refill, &current->usage); - tctx->cached_refs += refill; - } io_submit_state_start(&ctx->submit_state, nr); - while (submitted < nr) { const struct io_uring_sqe *sqe; struct io_kiocb *req; @@ -6770,7 +7189,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - kmem_cache_free(req_cachep, req); + list_add(&req->inflight_entry, &ctx->submit_state.free_list); break; } /* will complete beyond this point, count as submitted */ @@ -6802,16 +7221,18 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd) static inline void io_ring_set_wakeup_flag(struct io_ring_ctx *ctx) { /* Tell userspace we may need a wakeup call */ - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); + WRITE_ONCE(ctx->rings->sq_flags, + ctx->rings->sq_flags | IORING_SQ_NEED_WAKEUP); + spin_unlock(&ctx->completion_lock); } static inline void io_ring_clear_wakeup_flag(struct io_ring_ctx *ctx) { - spin_lock_irq(&ctx->completion_lock); - ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; - spin_unlock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); + WRITE_ONCE(ctx->rings->sq_flags, + ctx->rings->sq_flags & ~IORING_SQ_NEED_WAKEUP); + spin_unlock(&ctx->completion_lock); } static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) @@ -6833,7 +7254,7 @@ static int 
__io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) mutex_lock(&ctx->uring_lock); if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0, true); + io_do_iopoll(ctx, &nr_events, 0); /* * Don't submit if refs are dying, good for io_uring_register(), @@ -6968,21 +7389,21 @@ static int io_sq_thread(void *data) struct io_wait_queue { struct wait_queue_entry wq; struct io_ring_ctx *ctx; - unsigned to_wait; + unsigned cq_tail; unsigned nr_timeouts; }; static inline bool io_should_wake(struct io_wait_queue *iowq) { struct io_ring_ctx *ctx = iowq->ctx; + int dist = ctx->cached_cq_tail - (int) iowq->cq_tail; /* * Wake up if we have enough events, or if a timeout occurred since we * started waiting. For timeouts, we always want to return to userspace, * regardless of event count. */ - return io_cqring_events(ctx) >= iowq->to_wait || - atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; + return dist >= 0 || atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; } static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, @@ -7038,21 +7459,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz, struct __kernel_timespec __user *uts) { - struct io_wait_queue iowq = { - .wq = { - .private = current, - .func = io_wake_function, - .entry = LIST_HEAD_INIT(iowq.wq.entry), - }, - .ctx = ctx, - .to_wait = min_events, - }; + struct io_wait_queue iowq; struct io_rings *rings = ctx->rings; signed long timeout = MAX_SCHEDULE_TIMEOUT; int ret; do { - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx); if (io_cqring_events(ctx) >= min_events) return 0; if (!io_run_task_work()) @@ -7080,11 +7493,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, timeout = timespec64_to_jiffies(&ts); } + init_waitqueue_func_entry(&iowq.wq, io_wake_function); + iowq.wq.private = current; + INIT_LIST_HEAD(&iowq.wq.entry); + iowq.ctx = ctx; iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + iowq.cq_tail = READ_ONCE(ctx->rings->cq.head) + min_events; + trace_io_uring_cqring_wait(ctx, min_events); do { /* if we can't even flush overflow, don't wait for more */ - if (!io_cqring_overflow_flush(ctx, false)) { + if (!io_cqring_overflow_flush(ctx)) { ret = -EBUSY; break; } @@ -7115,14 +7534,14 @@ static void **io_alloc_page_table(size_t size) size_t init_size = size; void **table; - table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL); + table = kcalloc(nr_tables, sizeof(*table), GFP_KERNEL_ACCOUNT); if (!table) return NULL; for (i = 0; i < nr_tables; i++) { unsigned int this_size = min_t(size_t, size, PAGE_SIZE); - table[i] = kzalloc(this_size, GFP_KERNEL); + table[i] = kzalloc(this_size, GFP_KERNEL_ACCOUNT); if (!table[i]) { io_free_page_table(table, init_size); return NULL; @@ -7132,20 +7551,54 @@ static void **io_alloc_page_table(size_t size) return table; } -static inline void io_rsrc_ref_lock(struct io_ring_ctx *ctx) +static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) { - spin_lock_bh(&ctx->rsrc_ref_lock); + percpu_ref_exit(&ref_node->refs); + kfree(ref_node); } -static inline void io_rsrc_ref_unlock(struct io_ring_ctx *ctx) +static void io_rsrc_node_ref_zero(struct percpu_ref *ref) { - spin_unlock_bh(&ctx->rsrc_ref_lock); + struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); + struct io_ring_ctx *ctx = node->rsrc_data->ctx; + unsigned long flags; + bool first_add = false; + + spin_lock_irqsave(&ctx->rsrc_ref_lock, flags); + node->done = true; + + while 
(!list_empty(&ctx->rsrc_ref_list)) { + node = list_first_entry(&ctx->rsrc_ref_list, + struct io_rsrc_node, node); + /* recycle ref nodes in order */ + if (!node->done) + break; + list_del(&node->node); + first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); + } + spin_unlock_irqrestore(&ctx->rsrc_ref_lock, flags); + + if (first_add) + mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); } -static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) +static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) { - percpu_ref_exit(&ref_node->refs); - kfree(ref_node); + struct io_rsrc_node *ref_node; + + ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); + if (!ref_node) + return NULL; + + if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, + 0, GFP_KERNEL)) { + kfree(ref_node); + return NULL; + } + INIT_LIST_HEAD(&ref_node->node); + INIT_LIST_HEAD(&ref_node->rsrc_list); + ref_node->done = false; + return ref_node; } static void io_rsrc_node_switch(struct io_ring_ctx *ctx, @@ -7158,9 +7611,9 @@ static void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_node *rsrc_node = ctx->rsrc_node; rsrc_node->rsrc_data = data_to_kill; - io_rsrc_ref_lock(ctx); + spin_lock_irq(&ctx->rsrc_ref_lock); list_add_tail(&rsrc_node->node, &ctx->rsrc_ref_list); - io_rsrc_ref_unlock(ctx); + spin_unlock_irq(&ctx->rsrc_ref_lock); atomic_inc(&data_to_kill->refs); percpu_ref_kill(&rsrc_node->refs); @@ -7199,17 +7652,19 @@ static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ct /* kill initial ref, already quiesced if zero */ if (atomic_dec_and_test(&data->refs)) break; + mutex_unlock(&ctx->uring_lock); flush_delayed_work(&ctx->rsrc_put_work); ret = wait_for_completion_interruptible(&data->done); - if (!ret) + if (!ret) { + mutex_lock(&ctx->uring_lock); break; + } atomic_inc(&data->refs); /* wait for all works potentially completing data->done */ flush_delayed_work(&ctx->rsrc_put_work); reinit_completion(&data->done); - mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(); mutex_lock(&ctx->uring_lock); } while (ret >= 0); @@ -7277,17 +7732,14 @@ fail: static bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files) { - size_t size = nr_files * sizeof(struct io_fixed_file); - - table->files = (struct io_fixed_file **)io_alloc_page_table(size); + table->files = kvcalloc(nr_files, sizeof(table->files[0]), + GFP_KERNEL_ACCOUNT); return !!table->files; } -static void io_free_file_tables(struct io_file_table *table, unsigned nr_files) +static void io_free_file_tables(struct io_file_table *table) { - size_t size = nr_files * sizeof(struct io_fixed_file); - - io_free_page_table((void **)table->files, size); + kvfree(table->files); table->files = NULL; } @@ -7312,7 +7764,7 @@ static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) fput(file); } #endif - io_free_file_tables(&ctx->file_table, ctx->nr_user_files); + io_free_file_tables(&ctx->file_table); io_rsrc_data_free(ctx->file_data); ctx->file_data = NULL; ctx->nr_user_files = 0; @@ -7628,11 +8080,11 @@ static void __io_rsrc_put_work(struct io_rsrc_node *ref_node) bool lock_ring = ctx->flags & IORING_SETUP_IOPOLL; io_ring_submit_lock(ctx, lock_ring); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); io_cqring_fill_event(ctx, prsrc->tag, 0, 0); ctx->cq_extra++; io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); io_ring_submit_unlock(ctx, lock_ring); } @@ -7664,49 +8116,6 @@ static 
void io_rsrc_put_work(struct work_struct *work) } } -static void io_rsrc_node_ref_zero(struct percpu_ref *ref) -{ - struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); - struct io_ring_ctx *ctx = node->rsrc_data->ctx; - bool first_add = false; - - io_rsrc_ref_lock(ctx); - node->done = true; - - while (!list_empty(&ctx->rsrc_ref_list)) { - node = list_first_entry(&ctx->rsrc_ref_list, - struct io_rsrc_node, node); - /* recycle ref nodes in order */ - if (!node->done) - break; - list_del(&node->node); - first_add |= llist_add(&node->llist, &ctx->rsrc_put_llist); - } - io_rsrc_ref_unlock(ctx); - - if (first_add) - mod_delayed_work(system_wq, &ctx->rsrc_put_work, HZ); -} - -static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) -{ - struct io_rsrc_node *ref_node; - - ref_node = kzalloc(sizeof(*ref_node), GFP_KERNEL); - if (!ref_node) - return NULL; - - if (percpu_ref_init(&ref_node->refs, io_rsrc_node_ref_zero, - 0, GFP_KERNEL)) { - kfree(ref_node); - return NULL; - } - INIT_LIST_HEAD(&ref_node->node); - INIT_LIST_HEAD(&ref_node->rsrc_list); - ref_node->done = false; - return ref_node; -} - static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args, u64 __user *tags) { @@ -7721,6 +8130,8 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; if (nr_args > IORING_MAX_FIXED_FILES) return -EMFILE; + if (nr_args > rlimit(RLIMIT_NOFILE)) + return -EMFILE; ret = io_rsrc_node_switch_start(ctx); if (ret) return ret; @@ -7779,7 +8190,7 @@ out_fput: if (file) fput(file); } - io_free_file_tables(&ctx->file_table, nr_args); + io_free_file_tables(&ctx->file_table); ctx->nr_user_files = 0; out_free: io_rsrc_data_free(ctx->file_data); @@ -7830,6 +8241,46 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } +static int io_install_fixed_file(struct io_kiocb *req, struct file *file, + unsigned int issue_flags, u32 slot_index) +{ + struct io_ring_ctx *ctx = req->ctx; + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_fixed_file *file_slot; + int ret = -EBADF; + + io_ring_submit_lock(ctx, !force_nonblock); + if (file->f_op == &io_uring_fops) + goto err; + ret = -ENXIO; + if (!ctx->file_data) + goto err; + ret = -EINVAL; + if (slot_index >= ctx->nr_user_files) + goto err; + + slot_index = array_index_nospec(slot_index, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); + ret = -EBADF; + if (file_slot->file_ptr) + goto err; + + *io_get_tag_slot(ctx->file_data, slot_index) = 0; + io_fixed_file_set(file_slot, file); + ret = io_sqe_file_register(ctx, file, slot_index); + if (ret) { + file_slot->file_ptr = 0; + goto err; + } + + ret = 0; +err: + io_ring_submit_unlock(ctx, !force_nonblock); + if (ret) + fput(file); + return ret; +} + static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, struct io_rsrc_node *node, void *rsrc) { @@ -7925,14 +8376,6 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx, return done ? done : err; } -static struct io_wq_work *io_free_work(struct io_wq_work *work) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - req = io_put_req_find_next(req); - return req ? 
&req->work : NULL; -} - static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, struct task_struct *task) { @@ -7956,7 +8399,7 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, data.hash = hash; data.task = task; - data.free_work = io_free_work; + data.free_work = io_wq_free_work; data.do_work = io_wq_submit_work; /* Do QD, or 4 * CPUS, whatever is smallest */ @@ -8623,43 +9066,36 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) __io_remove_buffers(ctx, buf, index, -1U); } -static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) +static void io_req_cache_free(struct list_head *list) { struct io_kiocb *req, *nxt; - list_for_each_entry_safe(req, nxt, list, compl.list) { - if (tsk && req->task != tsk) - continue; - list_del(&req->compl.list); + list_for_each_entry_safe(req, nxt, list, inflight_entry) { + list_del(&req->inflight_entry); kmem_cache_free(req_cachep, req); } } static void io_req_caches_free(struct io_ring_ctx *ctx) { - struct io_submit_state *submit_state = &ctx->submit_state; - struct io_comp_state *cs = &ctx->submit_state.comp; + struct io_submit_state *state = &ctx->submit_state; mutex_lock(&ctx->uring_lock); - if (submit_state->free_reqs) { - kmem_cache_free_bulk(req_cachep, submit_state->free_reqs, - submit_state->reqs); - submit_state->free_reqs = 0; + if (state->free_reqs) { + kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); + state->free_reqs = 0; } - io_flush_cached_locked_reqs(ctx, cs); - io_req_cache_free(&cs->free_list, NULL); + io_flush_cached_locked_reqs(ctx, state); + io_req_cache_free(&state->free_list); mutex_unlock(&ctx->uring_lock); } -static bool io_wait_rsrc_data(struct io_rsrc_data *data) +static void io_wait_rsrc_data(struct io_rsrc_data *data) { - if (!data) - return false; - if (!atomic_dec_and_test(&data->refs)) + if (data && !atomic_dec_and_test(&data->refs)) wait_for_completion(&data->done); - return true; } static void io_ring_ctx_free(struct io_ring_ctx *ctx) @@ -8671,10 +9107,14 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->mm_account = NULL; } + /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ + io_wait_rsrc_data(ctx->buf_data); + io_wait_rsrc_data(ctx->file_data); + mutex_lock(&ctx->uring_lock); - if (io_wait_rsrc_data(ctx->buf_data)) + if (ctx->buf_data) __io_sqe_buffers_unregister(ctx); - if (io_wait_rsrc_data(ctx->file_data)) + if (ctx->file_data) __io_sqe_files_unregister(ctx); if (ctx->rings) __io_cqring_overflow_flush(ctx, true); @@ -8700,6 +9140,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) sock_release(ctx->ring_sock); } #endif + WARN_ON_ONCE(!list_empty(&ctx->ltimeout_list)); io_mem_free(ctx->rings); io_mem_free(ctx->sq_sqes); @@ -8799,6 +9240,7 @@ static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; + unsigned long interval = HZ / 20; struct io_tctx_exit exit; struct io_tctx_node *node; int ret; @@ -8823,8 +9265,11 @@ static void io_ring_exit_work(struct work_struct *work) io_sq_thread_unpark(sqd); } - WARN_ON_ONCE(time_after(jiffies, timeout)); - } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); + if (WARN_ON_ONCE(time_after(jiffies, timeout))) { + /* there is little hope left, don't run it too often */ + interval = HZ * 60; + } + } while (!wait_for_completion_timeout(&ctx->ref_comp, interval)); init_completion(&exit.completion); init_task_work(&exit.task_work, 
io_tctx_exit_cb); @@ -8853,8 +9298,8 @@ static void io_ring_exit_work(struct work_struct *work) mutex_lock(&ctx->uring_lock); } mutex_unlock(&ctx->uring_lock); - spin_lock_irq(&ctx->completion_lock); - spin_unlock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); io_ring_ctx_free(ctx); } @@ -8866,16 +9311,18 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, struct io_kiocb *req, *tmp; int canceled = 0; - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); + spin_lock_irq(&ctx->timeout_lock); list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) { if (io_match_task(req, tsk, cancel_all)) { io_kill_timeout(req, -ECANCELED); canceled++; } } + spin_unlock_irq(&ctx->timeout_lock); if (canceled != 0) io_commit_cqring(ctx); - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (canceled != 0) io_cqring_ev_posted(ctx); return canceled != 0; @@ -8931,13 +9378,12 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) bool ret; if (!cancel->all && (req->flags & REQ_F_LINK_TIMEOUT)) { - unsigned long flags; struct io_ring_ctx *ctx = req->ctx; /* protect against races with linked timeouts */ - spin_lock_irqsave(&ctx->completion_lock, flags); + spin_lock(&ctx->completion_lock); ret = io_match_task(req, cancel->task, cancel->all); - spin_unlock_irqrestore(&ctx->completion_lock, flags); + spin_unlock(&ctx->completion_lock); } else { ret = io_match_task(req, cancel->task, cancel->all); } @@ -8950,14 +9396,14 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx, struct io_defer_entry *de; LIST_HEAD(list); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { if (io_match_task(de->req, task, cancel_all)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (list_empty(&list)) return false; @@ -9122,8 +9568,8 @@ static void io_uring_clean_tctx(struct io_uring_task *tctx) * Must be after io_uring_del_task_file() (removes nodes under * uring_lock) to avoid race with io_uring_try_cancel_iowq(). */ - tctx->io_wq = NULL; io_wq_put_and_exit(wq); + tctx->io_wq = NULL; } } @@ -9139,9 +9585,11 @@ static void io_uring_drop_tctx_refs(struct task_struct *task) struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; - tctx->cached_refs = 0; - percpu_counter_sub(&tctx->inflight, refs); - put_task_struct_many(task, refs); + if (refs) { + tctx->cached_refs = 0; + percpu_counter_sub(&tctx->inflight, refs); + put_task_struct_many(task, refs); + } } /* @@ -9162,9 +9610,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) if (tctx->io_wq) io_wq_exit_start(tctx->io_wq); - io_uring_drop_tctx_refs(current); atomic_inc(&tctx->in_idle); do { + io_uring_drop_tctx_refs(current); /* read completions before cancelations */ inflight = tctx_inflight(tctx, !cancel_all); if (!inflight) @@ -9188,6 +9636,7 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } prepare_to_wait(&tctx->wait, &wait, TASK_UNINTERRUPTIBLE); + io_uring_drop_tctx_refs(current); /* * If we've seen completions, retry without waiting. 
This * avoids a race where a completion comes in before we did @@ -9206,9 +9655,9 @@ static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) } } -void __io_uring_cancel(struct files_struct *files) +void __io_uring_cancel(bool cancel_all) { - io_uring_cancel_generic(!files, NULL); + io_uring_cancel_generic(cancel_all, NULL); } static void *io_uring_validate_mmap_request(struct file *file, @@ -9368,11 +9817,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ ret = 0; if (ctx->flags & IORING_SETUP_SQPOLL) { - io_cqring_overflow_flush(ctx, false); + io_cqring_overflow_flush(ctx); - ret = -EOWNERDEAD; - if (unlikely(ctx->sq_data->thread == NULL)) + if (unlikely(ctx->sq_data->thread == NULL)) { + ret = -EOWNERDEAD; goto out; + } if (flags & IORING_ENTER_SQ_WAKEUP) wake_up(&ctx->sq_data->wait); if (flags & IORING_ENTER_SQ_WAIT) { @@ -9503,7 +9953,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) io_uring_show_cred(m, index, cred); } seq_printf(m, "PollList:\n"); - spin_lock_irq(&ctx->completion_lock); + spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list = &ctx->cancel_hash[i]; struct io_kiocb *req; @@ -9512,7 +9962,7 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, req->task->task_works != NULL); } - spin_unlock_irq(&ctx->completion_lock); + spin_unlock(&ctx->completion_lock); if (has_lock) mutex_unlock(&ctx->uring_lock); } @@ -9840,10 +10290,11 @@ static int io_register_personality(struct io_ring_ctx *ctx) ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds, XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL); - if (!ret) - return id; - put_cred(creds); - return ret; + if (ret < 0) { + put_cred(creds); + return ret; + } + return id; } static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, @@ -10044,6 +10495,31 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) return io_wq_cpu_affinity(tctx->io_wq, NULL); } +static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) +{ + struct io_uring_task *tctx = current->io_uring; + __u32 new_count[2]; + int i, ret; + + if (!tctx || !tctx->io_wq) + return -EINVAL; + if (copy_from_user(new_count, arg, sizeof(new_count))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(new_count); i++) + if (new_count[i] > INT_MAX) + return -EINVAL; + + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + return ret; + + if (copy_to_user(arg, new_count, sizeof(new_count))) + return -EFAULT; + + return 0; +} + static bool io_register_op_must_quiesce(int op) { switch (op) { @@ -10061,12 +10537,40 @@ static bool io_register_op_must_quiesce(int op) case IORING_REGISTER_BUFFERS_UPDATE: case IORING_REGISTER_IOWQ_AFF: case IORING_UNREGISTER_IOWQ_AFF: + case IORING_REGISTER_IOWQ_MAX_WORKERS: return false; default: return true; } } +static int io_ctx_quiesce(struct io_ring_ctx *ctx) +{ + long ret; + + percpu_ref_kill(&ctx->refs); + + /* + * Drop uring mutex before waiting for references to exit. If another + * thread is currently inside io_uring_enter() it might need to grab the + * uring_lock to make progress. If we hold it here across the drain + * wait, then we can deadlock. It's safe to drop the mutex here, since + * no new references will come in after we've killed the percpu ref. 
+ */ + mutex_unlock(&ctx->uring_lock); + do { + ret = wait_for_completion_interruptible(&ctx->ref_comp); + if (!ret) + break; + ret = io_run_task_work_sig(); + } while (ret >= 0); + mutex_lock(&ctx->uring_lock); + + if (ret) + io_refs_resurrect(&ctx->refs, &ctx->ref_comp); + return ret; +} + static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, void __user *arg, unsigned nr_args) __releases(ctx->uring_lock) @@ -10091,31 +10595,9 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, } if (io_register_op_must_quiesce(opcode)) { - percpu_ref_kill(&ctx->refs); - - /* - * Drop uring mutex before waiting for references to exit. If - * another thread is currently inside io_uring_enter() it might - * need to grab the uring_lock to make progress. If we hold it - * here across the drain wait, then we can deadlock. It's safe - * to drop the mutex here, since no new references will come in - * after we've killed the percpu ref. - */ - mutex_unlock(&ctx->uring_lock); - do { - ret = wait_for_completion_interruptible(&ctx->ref_comp); - if (!ret) - break; - ret = io_run_task_work_sig(); - if (ret < 0) - break; - } while (1); - mutex_lock(&ctx->uring_lock); - - if (ret) { - io_refs_resurrect(&ctx->refs, &ctx->ref_comp); + ret = io_ctx_quiesce(ctx); + if (ret) return ret; - } } switch (opcode) { @@ -10212,6 +10694,12 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_unregister_iowq_aff(ctx); break; + case IORING_REGISTER_IOWQ_MAX_WORKERS: + ret = -EINVAL; + if (!arg || nr_args != 2) + break; + ret = io_register_iowq_max_workers(ctx, arg); + break; default: ret = -EINVAL; break; @@ -10293,11 +10781,16 @@ static int __init io_uring_init(void) BUILD_BUG_SQE_ELEM(40, __u16, buf_group); BUILD_BUG_SQE_ELEM(42, __u16, personality); BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in); + BUILD_BUG_SQE_ELEM(44, __u32, file_index); BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) > sizeof(struct io_uring_rsrc_update2)); + + /* ->buf_index is u16 */ + BUILD_BUG_ON(IORING_MAX_REG_BUFFERS >= (1u << 16)); + /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); diff --git a/fs/ioctl.c b/fs/ioctl.c index 1e2204fa9963..504e69578112 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -263,209 +263,6 @@ static long ioctl_file_clone_range(struct file *file, args.src_length, args.dest_offset); } -#ifdef CONFIG_BLOCK - -static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) -{ - return (offset >> inode->i_blkbits); -} - -static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) -{ - return (blk << inode->i_blkbits); -} - -/** - * __generic_block_fiemap - FIEMAP for block based inodes (no locking) - * @inode: the inode to map - * @fieinfo: the fiemap info struct that will be passed back to userspace - * @start: where to start mapping in the inode - * @len: how much space to map - * @get_block: the fs's get_block function - * - * This does FIEMAP for block based inodes. Basically it will just loop - * through get_block until we hit the number of extents we want to map, or we - * go past the end of the file and hit a hole. - * - * If it is possible to have data blocks beyond a hole past @inode->i_size, then - * please do not use this function, it will stop at the first unmapped block - * beyond i_size. - * - * If you use this function directly, you need to do your own locking. 
Use - * generic_block_fiemap if you want the locking done for you. - */ -static int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, loff_t start, - loff_t len, get_block_t *get_block) -{ - struct buffer_head map_bh; - sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); - u64 logical = 0, phys = 0, size = 0; - u32 flags = FIEMAP_EXTENT_MERGED; - bool past_eof = false, whole_file = false; - int ret = 0; - - ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); - if (ret) - return ret; - - /* - * Either the i_mutex or other appropriate locking needs to be held - * since we expect isize to not change at all through the duration of - * this call. - */ - if (len >= isize) { - whole_file = true; - len = isize; - } - - /* - * Some filesystems can't deal with being asked to map less than - * blocksize, so make sure our len is at least block length. - */ - if (logical_to_blk(inode, len) == 0) - len = blk_to_logical(inode, 1); - - start_blk = logical_to_blk(inode, start); - last_blk = logical_to_blk(inode, start + len - 1); - - do { - /* - * we set b_size to the total size we want so it will map as - * many contiguous blocks as possible at once - */ - memset(&map_bh, 0, sizeof(struct buffer_head)); - map_bh.b_size = len; - - ret = get_block(inode, start_blk, &map_bh, 0); - if (ret) - break; - - /* HOLE */ - if (!buffer_mapped(&map_bh)) { - start_blk++; - - /* - * We want to handle the case where there is an - * allocated block at the front of the file, and then - * nothing but holes up to the end of the file properly, - * to make sure that extent at the front gets properly - * marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && - blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - /* - * First hole after going past the EOF, this is our - * last extent - */ - if (past_eof && size) { - flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, - flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } - - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - break; - } else { - /* - * We have gone over the length of what we wanted to - * map, and it wasn't the entire file, so add the extent - * we got last time and exit. - * - * This is for the case where say we want to map all the - * way up to the second to the last block in a file, but - * the last block is a hole, making the second to last - * block FIEMAP_EXTENT_LAST. In this case we want to - * see if there is a hole after the second to last block - * so we can mark it properly. If we found data after - * we exceeded the length we were requesting, then we - * are good to go, just add the extent to the fieinfo - * and break - */ - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, - flags); - break; - } - - /* - * if size != 0 then we know we already have an extent - * to add, so add it. 
- */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, - flags); - if (ret) - break; - } - - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = FIEMAP_EXTENT_MERGED; - - start_blk += logical_to_blk(inode, size); - - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; - } - cond_resched(); - if (fatal_signal_pending(current)) { - ret = -EINTR; - break; - } - - } while (1); - - /* If ret is 1 then we just hit the end of the extent array */ - if (ret == 1) - ret = 0; - - return ret; -} - -/** - * generic_block_fiemap - FIEMAP for block based inodes - * @inode: The inode to map - * @fieinfo: The mapping information - * @start: The initial block to map - * @len: The length of the extect to attempt to map - * @get_block: The block mapping function for the fs - * - * Calls __generic_block_fiemap to map the inode, after taking - * the inode's mutex lock. - */ - -int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block) -{ - int ret; - inode_lock(inode); - ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); - inode_unlock(inode); - return ret; -} -EXPORT_SYMBOL(generic_block_fiemap); - -#endif /* CONFIG_BLOCK */ - /* * This provides compatibility with legacy XFS pre-allocation ioctls * which predate the fallocate syscall. @@ -817,6 +614,14 @@ static int fileattr_set_prepare(struct inode *inode, if ((old_ma->fsx_xflags ^ fa->fsx_xflags) & FS_XFLAG_PROJINHERIT) return -EINVAL; + } else { + /* + * Caller is allowed to change the project ID. If it is being + * changed, make sure that the new value is valid. + */ + if (old_ma->fsx_projid != fa->fsx_projid && + !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid))) + return -EINVAL; } /* Check extent size hints. */ diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index eef2722d93a1..4143a3ff89db 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -9,9 +9,9 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events obj-$(CONFIG_FS_IOMAP) += iomap.o iomap-y += trace.o \ - apply.o \ buffered-io.o \ direct-io.o \ fiemap.o \ + iter.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c deleted file mode 100644 index 26ab6563181f..000000000000 --- a/fs/iomap/apply.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. - */ -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> -#include <linux/iomap.h> -#include "trace.h" - -/* - * Execute a iomap write on a segment of the mapping that spans a - * contiguous range of pages that have identical block mapping state. - * - * This avoids the need to map pages individually, do individual allocations - * for each page and most importantly avoid the need for filesystem specific - * locking per page. Instead, all the operations are amortised over the entire - * range of pages. It is assumed that the filesystems will lock whatever - * resources they require in the iomap_begin call, and release them in the - * iomap_end call. 
- */ -loff_t -iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, - const struct iomap_ops *ops, void *data, iomap_actor_t actor) -{ - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; - loff_t written = 0, ret; - u64 end; - - trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_); - - /* - * Need to map a range from start position for length bytes. This can - * span multiple pages - it is only guaranteed to return a range of a - * single type of pages (e.g. all into a hole, all mapped or all - * unwritten). Failure at this point has nothing to undo. - * - * If allocation is required for this range, reserve the space now so - * that the allocation is guaranteed to succeed later on. Once we copy - * the data into the page cache pages, then we cannot fail otherwise we - * expose transient stale data. If the reserve fails, we can safely - * back out at this point as there is nothing to undo. - */ - ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap); - if (ret) - return ret; - if (WARN_ON(iomap.offset > pos)) { - written = -EIO; - goto out; - } - if (WARN_ON(iomap.length == 0)) { - written = -EIO; - goto out; - } - - trace_iomap_apply_dstmap(inode, &iomap); - if (srcmap.type != IOMAP_HOLE) - trace_iomap_apply_srcmap(inode, &srcmap); - - /* - * Cut down the length to the one actually provided by the filesystem, - * as it might not be able to give us the whole size that we requested. - */ - end = iomap.offset + iomap.length; - if (srcmap.type != IOMAP_HOLE) - end = min(end, srcmap.offset + srcmap.length); - if (pos + length > end) - length = end - pos; - - /* - * Now that we have guaranteed that the space allocation will succeed, - * we can do the copy-in page by page without having to worry about - * failures exposing transient data. - * - * To support COW operations, we read in data for partially blocks from - * the srcmap if the file system filled it in. In that case we the - * length needs to be limited to the earlier of the ends of the iomaps. - * If the file system did not provide a srcmap we pass in the normal - * iomap into the actors so that they don't need to have special - * handling for the two cases. - */ - written = actor(inode, pos, length, data, &iomap, - srcmap.type != IOMAP_HOLE ? &srcmap : &iomap); - -out: - /* - * Now the data has been copied, commit the range we've copied. This - * should not fail unless the filesystem has had a fatal error. - */ - if (ops->iomap_end) { - ret = ops->iomap_end(inode, pos, length, - written > 0 ? written : 0, - flags, &iomap); - } - - return written ? written : ret; -} diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 87ccb3438bec..9cc5798423d1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -36,7 +36,7 @@ static inline struct iomap_page *to_iomap_page(struct page *page) { /* * per-block data is stored in the head page. Callers should - * not be dealing with tail pages (and if they are, they can + * not be dealing with tail pages, and if they are, they can * call thp_head() first. 
*/ VM_BUG_ON_PGFLAGS(PageTail(page), page); @@ -98,7 +98,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, unsigned last = (poff + plen - 1) >> block_bits; /* - * If the block size is smaller than the page size we need to check the + * If the block size is smaller than the page size, we need to check the * per-block uptodate status and adjust the offset and length if needed * to avoid reading in already uptodate ranges. */ @@ -126,7 +126,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, } /* - * If the extent spans the block that contains the i_size we need to + * If the extent spans the block that contains the i_size, we need to * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. */ @@ -205,60 +205,67 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; -static void -iomap_read_inline_data(struct inode *inode, struct page *page, - struct iomap *iomap) +static loff_t iomap_read_inline_data(const struct iomap_iter *iter, + struct page *page) { - size_t size = i_size_read(inode); + const struct iomap *iomap = iomap_iter_srcmap(iter); + size_t size = i_size_read(iter->inode) - iomap->offset; + size_t poff = offset_in_page(iomap->offset); void *addr; if (PageUptodate(page)) - return; - - BUG_ON(page_has_private(page)); - BUG_ON(page->index); - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); - - addr = kmap_atomic(page); + return PAGE_SIZE - poff; + + if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) + return -EIO; + if (WARN_ON_ONCE(size > PAGE_SIZE - + offset_in_page(iomap->inline_data))) + return -EIO; + if (WARN_ON_ONCE(size > iomap->length)) + return -EIO; + if (poff > 0) + iomap_page_create(iter->inode, page); + + addr = kmap_local_page(page) + poff; memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - size); - kunmap_atomic(addr); - SetPageUptodate(page); + memset(addr + size, 0, PAGE_SIZE - poff - size); + kunmap_local(addr); + iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); + return PAGE_SIZE - poff; } -static inline bool iomap_block_needs_zeroing(struct inode *inode, - struct iomap *iomap, loff_t pos) +static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, + loff_t pos) { - return iomap->type != IOMAP_MAPPED || - (iomap->flags & IOMAP_F_NEW) || - pos >= i_size_read(inode); + const struct iomap *srcmap = iomap_iter_srcmap(iter); + + return srcmap->type != IOMAP_MAPPED || + (srcmap->flags & IOMAP_F_NEW) || + pos >= i_size_read(iter->inode); } -static loff_t -iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_readpage_iter(const struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx, loff_t offset) { - struct iomap_readpage_ctx *ctx = data; + const struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos + offset; + loff_t length = iomap_length(iter) - offset; struct page *page = ctx->cur_page; struct iomap_page *iop; - bool same_page = false, is_contig = false; loff_t orig_pos = pos; unsigned poff, plen; sector_t sector; - if (iomap->type == IOMAP_INLINE) { - WARN_ON_ONCE(pos); - iomap_read_inline_data(inode, page, iomap); - return PAGE_SIZE; - } + if (iomap->type == IOMAP_INLINE) + return min(iomap_read_inline_data(iter, page), length); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(inode, page); - iomap_adjust_read_range(inode, iop, &pos, length, &poff, 
&plen); + iop = iomap_page_create(iter->inode, page); + iomap_adjust_read_range(iter->inode, iop, &pos, length, &poff, &plen); if (plen == 0) goto done; - if (iomap_block_needs_zeroing(inode, iomap, pos)) { + if (iomap_block_needs_zeroing(iter, pos)) { zero_user(page, poff, plen); iomap_set_range_uptodate(page, poff, plen); goto done; @@ -268,16 +275,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (iop) atomic_add(plen, &iop->read_bytes_pending); - /* Try to merge into a previous segment if we can */ sector = iomap_sector(iomap, pos); - if (ctx->bio && bio_end_sector(ctx->bio) == sector) { - if (__bio_try_merge_page(ctx->bio, page, plen, poff, - &same_page)) - goto done; - is_contig = true; - } - - if (!is_contig || bio_full(ctx->bio, plen)) { + if (!ctx->bio || + bio_end_sector(ctx->bio) != sector || + bio_add_page(ctx->bio, page, plen, poff) != plen) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); @@ -301,13 +302,12 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); ctx->bio->bi_end_io = iomap_read_end_io; + __bio_add_page(ctx->bio, page, plen, poff); } - - bio_add_page(ctx->bio, page, plen, poff); done: /* * Move the caller beyond our range so that it keeps making progress. - * For that we have to include any leading non-uptodate ranges, but + * For that, we have to include any leading non-uptodate ranges, but * we can skip trailing ones as they will be handled in the next * iteration. */ @@ -317,23 +317,23 @@ done: int iomap_readpage(struct page *page, const struct iomap_ops *ops) { - struct iomap_readpage_ctx ctx = { .cur_page = page }; - struct inode *inode = page->mapping->host; - unsigned poff; - loff_t ret; + struct iomap_iter iter = { + .inode = page->mapping->host, + .pos = page_offset(page), + .len = PAGE_SIZE, + }; + struct iomap_readpage_ctx ctx = { + .cur_page = page, + }; + int ret; trace_iomap_readpage(page->mapping->host, 1); - for (poff = 0; poff < PAGE_SIZE; poff += ret) { - ret = iomap_apply(inode, page_offset(page) + poff, - PAGE_SIZE - poff, 0, ops, &ctx, - iomap_readpage_actor); - if (ret <= 0) { - WARN_ON_ONCE(ret == 0); - SetPageError(page); - break; - } - } + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_readpage_iter(&iter, &ctx, 0); + + if (ret < 0) + SetPageError(page); if (ctx.bio) { submit_bio(ctx.bio); @@ -344,23 +344,22 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readahead and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page, we always * return 0 and just mark the page as PageError on errors. This - * should be cleaned up all through the stack eventually. + * should be cleaned up throughout the stack eventually. 
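[Editor's note] The readpage conversion above is the template for the rest of this series: the loop moves into iomap_iter(), and the filesystem side of the contract is untouched. For orientation, a minimal, hypothetical filesystem hook — my_iomap_begin()/my_iomap_ops are illustrative names, not part of this patch — assuming file data that is identity-mapped on disk:

static int my_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
		unsigned flags, struct iomap *iomap, struct iomap *srcmap)
{
	/*
	 * Illustration only: report a single mapped extent covering the
	 * whole request, with file offsets equal to disk offsets.  A real
	 * filesystem looks up its extent tree here and may return a
	 * shorter mapping; iomap_iter() simply calls back for the rest.
	 */
	iomap->type = IOMAP_MAPPED;
	iomap->offset = pos;
	iomap->length = length;
	iomap->addr = pos;			/* disk byte address */
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static const struct iomap_ops my_iomap_ops = {
	.iomap_begin = my_iomap_begin,
};

With that in place, ->readpage reduces to iomap_readpage(page, &my_iomap_ops); the while loop shown above does the rest.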
*/ return 0; } EXPORT_SYMBOL_GPL(iomap_readpage); -static loff_t -iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_readahead_iter(const struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx) { - struct iomap_readpage_ctx *ctx = data; + loff_t length = iomap_length(iter); loff_t done, ret; for (done = 0; done < length; done += ret) { - if (ctx->cur_page && offset_in_page(pos + done) == 0) { + if (ctx->cur_page && offset_in_page(iter->pos + done) == 0) { if (!ctx->cur_page_in_bio) unlock_page(ctx->cur_page); put_page(ctx->cur_page); @@ -370,8 +369,7 @@ iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page = readahead_page(ctx->rac); ctx->cur_page_in_bio = false; } - ret = iomap_readpage_actor(inode, pos + done, length - done, - ctx, iomap, srcmap); + ret = iomap_readpage_iter(iter, ctx, done); } return done; @@ -394,25 +392,19 @@ iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, */ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { - struct inode *inode = rac->mapping->host; - loff_t pos = readahead_pos(rac); - size_t length = readahead_length(rac); + struct iomap_iter iter = { + .inode = rac->mapping->host, + .pos = readahead_pos(rac), + .len = readahead_length(rac), + }; struct iomap_readpage_ctx ctx = { .rac = rac, }; - trace_iomap_readahead(inode, readahead_count(rac)); + trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); - while (length > 0) { - ssize_t ret = iomap_apply(inode, pos, length, 0, ops, - &ctx, iomap_readahead_actor); - if (ret <= 0) { - WARN_ON_ONCE(ret == 0); - break; - } - pos += ret; - length -= ret; - } + while (iomap_iter(&iter, ops) > 0) + iter.processed = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); @@ -467,7 +459,7 @@ iomap_releasepage(struct page *page, gfp_t gfp_mask) /* * mm accommodates an old ext3 case where clean pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to - * ->releasepage() via shrink_active_list(), skip those here. + * ->releasepage() via shrink_active_list(); skip those here. */ if (PageDirty(page) || PageWriteback(page)) return 0; @@ -482,7 +474,7 @@ iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) trace_iomap_invalidatepage(page->mapping->host, offset, len); /* - * If we are invalidating the entire page, clear the dirty state from it + * If we're invalidating the entire page, clear the dirty state from it * and release it to avoid unnecessary buildup of the LRU. 
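[Editor's note] As with readpage, the readahead conversion above is invisible to filesystems; both entry points keep their signatures. A sketch of the wiring, continuing the hypothetical my_iomap_ops from the previous note (my_* names are illustrative):

static int my_readpage(struct file *file, struct page *page)
{
	return iomap_readpage(page, &my_iomap_ops);
}

static void my_readahead(struct readahead_control *rac)
{
	iomap_readahead(rac, &my_iomap_ops);
}

static const struct address_space_operations my_aops = {
	.readpage	= my_readpage,
	.readahead	= my_readahead,
};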
*/ if (offset == 0 && len == PAGE_SIZE) { @@ -516,10 +508,6 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, EXPORT_SYMBOL_GPL(iomap_migrate_page); #endif /* CONFIG_MIGRATION */ -enum { - IOMAP_WRITE_F_UNSHARE = (1 << 0), -}; - static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -535,7 +523,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) static int iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, - unsigned plen, struct iomap *iomap) + unsigned plen, const struct iomap *iomap) { struct bio_vec bvec; struct bio bio; @@ -548,12 +536,12 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, return submit_bio_wait(&bio); } -static int -__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, - struct page *page, struct iomap *srcmap) +static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, + unsigned len, struct page *page) { - struct iomap_page *iop = iomap_page_create(inode, page); - loff_t block_size = i_blocksize(inode); + const struct iomap *srcmap = iomap_iter_srcmap(iter); + struct iomap_page *iop = iomap_page_create(iter->inode, page); + loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); unsigned from = offset_in_page(pos), to = from + len, poff, plen; @@ -563,18 +551,18 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, ClearPageError(page); do { - iomap_adjust_read_range(inode, iop, &block_start, + iomap_adjust_read_range(iter->inode, iop, &block_start, block_end - block_start, &poff, &plen); if (plen == 0) break; - if (!(flags & IOMAP_WRITE_F_UNSHARE) && + if (!(iter->flags & IOMAP_UNSHARE) && (from <= poff || from >= poff + plen) && (to <= poff || to >= poff + plen)) continue; - if (iomap_block_needs_zeroing(inode, srcmap, block_start)) { - if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE)) + if (iomap_block_needs_zeroing(iter, block_start)) { + if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE)) return -EIO; zero_user_segments(page, poff, from, to, poff + plen); } else { @@ -589,41 +577,54 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, return 0; } -static int -iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, struct iomap *iomap, struct iomap *srcmap) +static int iomap_write_begin_inline(const struct iomap_iter *iter, + struct page *page) { - const struct iomap_page_ops *page_ops = iomap->page_ops; + int ret; + + /* needs more work for the tailpacking case; disable for now */ + if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) + return -EIO; + ret = iomap_read_inline_data(iter, page); + if (ret < 0) + return ret; + return 0; +} + +static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, + unsigned len, struct page **pagep) +{ + const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + const struct iomap *srcmap = iomap_iter_srcmap(iter); struct page *page; int status = 0; - BUG_ON(pos + len > iomap->offset + iomap->length); - if (srcmap != iomap) + BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); + if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); if (fatal_signal_pending(current)) return -EINTR; if (page_ops && page_ops->page_prepare) { - status = page_ops->page_prepare(inode, pos, len, iomap); + status = page_ops->page_prepare(iter->inode, 
pos, len); if (status) return status; } - page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT, - AOP_FLAG_NOFS); + page = grab_cache_page_write_begin(iter->inode->i_mapping, + pos >> PAGE_SHIFT, AOP_FLAG_NOFS); if (!page) { status = -ENOMEM; goto out_no_page; } if (srcmap->type == IOMAP_INLINE) - iomap_read_inline_data(inode, page, srcmap); - else if (iomap->flags & IOMAP_F_BUFFER_HEAD) + status = iomap_write_begin_inline(iter, page); + else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(page, pos, len, NULL, srcmap); else - status = __iomap_write_begin(inode, pos, len, flags, page, - srcmap); + status = __iomap_write_begin(iter, pos, len, page); if (unlikely(status)) goto out_unlock; @@ -634,11 +635,11 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, out_unlock: unlock_page(page); put_page(page); - iomap_write_failed(inode, pos, len); + iomap_write_failed(iter->inode, pos, len); out_no_page: if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, 0, NULL, iomap); + page_ops->page_done(iter->inode, pos, 0, NULL); return status; } @@ -650,13 +651,13 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, /* * The blocks that were entirely written will now be uptodate, so we * don't have to worry about a readpage reading them and overwriting a - * partial write. However if we have encountered a short write and only + * partial write. However, if we've encountered a short write and only * partially written into a block, it will not be marked uptodate, so a * readpage might come in and destroy our partial write. * - * Do the simplest thing, and just treat any short write to a non - * uptodate page as a zero-length write, and force the caller to redo - * the whole thing. + * Do the simplest thing and just treat any short write to a + * non-uptodate page as a zero-length write, and force the caller to + * redo the whole thing. */ if (unlikely(copied < len && !PageUptodate(page))) return 0; @@ -665,39 +666,40 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, return copied; } -static size_t iomap_write_end_inline(struct inode *inode, struct page *page, - struct iomap *iomap, loff_t pos, size_t copied) +static size_t iomap_write_end_inline(const struct iomap_iter *iter, + struct page *page, loff_t pos, size_t copied) { + const struct iomap *iomap = &iter->iomap; void *addr; WARN_ON_ONCE(!PageUptodate(page)); - BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); + BUG_ON(!iomap_inline_data_valid(iomap)); flush_dcache_page(page); - addr = kmap_atomic(page); - memcpy(iomap->inline_data + pos, addr + pos, copied); - kunmap_atomic(addr); + addr = kmap_local_page(page) + pos; + memcpy(iomap_inline_data(iomap, pos), addr, copied); + kunmap_local(addr); - mark_inode_dirty(inode); + mark_inode_dirty(iter->inode); return copied; } /* Returns the number of bytes copied. May be 0. Cannot be an errno. 
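[Editor's note] The inline-data paths above also switch from kmap_atomic() to kmap_local_page(). The idiom, sketched generically (page, poff, src and len stand for the values used in the hunks above):

	void *addr;

	/*
	 * kmap_local mappings are CPU-local and strictly nested, but
	 * unlike kmap_atomic() they leave preemption and page faults
	 * enabled, so the copy in between may sleep or fault.
	 */
	addr = kmap_local_page(page) + poff;
	memcpy(addr, src, len);
	kunmap_local(addr);	/* any address inside the mapping is fine */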
*/ -static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, - size_t copied, struct page *page, struct iomap *iomap, - struct iomap *srcmap) +static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, + size_t copied, struct page *page) { - const struct iomap_page_ops *page_ops = iomap->page_ops; - loff_t old_size = inode->i_size; + const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t old_size = iter->inode->i_size; size_t ret; if (srcmap->type == IOMAP_INLINE) { - ret = iomap_write_end_inline(inode, page, iomap, pos, copied); + ret = iomap_write_end_inline(iter, page, pos, copied); } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { - ret = block_write_end(NULL, inode->i_mapping, pos, len, copied, - page, NULL); + ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, + copied, page, NULL); } else { - ret = __iomap_write_end(inode, pos, len, copied, page); + ret = __iomap_write_end(iter->inode, pos, len, copied, page); } /* @@ -706,29 +708,28 @@ static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, * preferably after I/O completion so that no stale data is exposed. */ if (pos + ret > old_size) { - i_size_write(inode, pos + ret); - iomap->flags |= IOMAP_F_SIZE_CHANGED; + i_size_write(iter->inode, pos + ret); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } unlock_page(page); if (old_size < pos) - pagecache_isize_extended(inode, old_size, pos); + pagecache_isize_extended(iter->inode, old_size, pos); if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, ret, page, iomap); + page_ops->page_done(iter->inode, pos, ret, page); put_page(page); if (ret < len) - iomap_write_failed(inode, pos, len); + iomap_write_failed(iter->inode, pos, len); return ret; } -static loff_t -iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - struct iov_iter *i = data; - long status = 0; + loff_t length = iomap_length(iter); + loff_t pos = iter->pos; ssize_t written = 0; + long status = 0; do { struct page *page; @@ -744,7 +745,7 @@ again: bytes = length; /* - * Bring in the user page that we will copy from _first_. + * Bring in the user page that we'll copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. @@ -754,18 +755,16 @@ again: break; } - status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, - srcmap); + status = iomap_write_begin(iter, pos, bytes, &page); if (unlikely(status)) break; - if (mapping_writably_mapped(inode->i_mapping)) + if (mapping_writably_mapped(iter->inode->i_mapping)) flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, bytes, i); - status = iomap_write_end(inode, pos, bytes, copied, page, iomap, - srcmap); + status = iomap_write_end(iter, pos, bytes, copied, page); if (unlikely(copied != status)) iov_iter_revert(i, copied - status); @@ -786,36 +785,38 @@ again: written += status; length -= status; - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (iov_iter_count(i) && length); return written ? 
written : status; } ssize_t -iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, +iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, const struct iomap_ops *ops) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - loff_t pos = iocb->ki_pos, ret = 0, written = 0; - - while (iov_iter_count(iter)) { - ret = iomap_apply(inode, pos, iov_iter_count(iter), - IOMAP_WRITE, ops, iter, iomap_write_actor); - if (ret <= 0) - break; - pos += ret; - written += ret; - } + struct iomap_iter iter = { + .inode = iocb->ki_filp->f_mapping->host, + .pos = iocb->ki_pos, + .len = iov_iter_count(i), + .flags = IOMAP_WRITE, + }; + int ret; - return written ? written : ret; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_write_iter(&iter, i); + if (iter.pos == iocb->ki_pos) + return ret; + return iter.pos - iocb->ki_pos; } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); -static loff_t -iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_unshare_iter(struct iomap_iter *iter) { + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); long status = 0; loff_t written = 0; @@ -831,13 +832,11 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct page *page; - status = iomap_write_begin(inode, pos, bytes, - IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap); + status = iomap_write_begin(iter, pos, bytes, &page); if (unlikely(status)) return status; - status = iomap_write_end(inode, pos, bytes, bytes, page, iomap, - srcmap); + status = iomap_write_end(iter, pos, bytes, bytes, page); if (WARN_ON_ONCE(status == 0)) return -EIO; @@ -847,7 +846,7 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, written += status; length -= status; - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (length); return written; @@ -857,44 +856,43 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops) { - loff_t ret; - - while (len) { - ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, - iomap_unshare_actor); - if (ret <= 0) - return ret; - pos += ret; - len -= ret; - } + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE | IOMAP_UNSHARE, + }; + int ret; - return 0; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_unshare_iter(&iter); + return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); -static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length, - struct iomap *iomap, struct iomap *srcmap) +static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length) { struct page *page; int status; unsigned offset = offset_in_page(pos); unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); - status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap); + status = iomap_write_begin(iter, pos, bytes, &page); if (status) return status; zero_user(page, offset, bytes); mark_page_accessed(page); - return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); + return iomap_write_end(iter, pos, bytes, bytes, page); } -static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, - loff_t length, void *data, struct iomap *iomap, - struct iomap *srcmap) 
+static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - bool *did_zero = data; + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); loff_t written = 0; /* already zeroed? we're done. */ @@ -904,10 +902,10 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, do { s64 bytes; - if (IS_DAX(inode)) + if (IS_DAX(iter->inode)) bytes = dax_iomap_zero(pos, length, iomap); else - bytes = iomap_zero(inode, pos, length, iomap, srcmap); + bytes = __iomap_zero_iter(iter, pos, length); if (bytes < 0) return bytes; @@ -925,19 +923,17 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops) { - loff_t ret; - - while (len > 0) { - ret = iomap_apply(inode, pos, len, IOMAP_ZERO, - ops, did_zero, iomap_zero_range_actor); - if (ret <= 0) - return ret; - - pos += ret; - len -= ret; - } + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_ZERO, + }; + int ret; - return 0; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_zero_iter(&iter, did_zero); + return ret; } EXPORT_SYMBOL_GPL(iomap_zero_range); @@ -955,15 +951,15 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t -iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_page_mkwrite_iter(struct iomap_iter *iter, + struct page *page) { - struct page *page = data; + loff_t length = iomap_length(iter); int ret; - if (iomap->flags & IOMAP_F_BUFFER_HEAD) { - ret = __block_write_begin_int(page, pos, length, NULL, iomap); + if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { + ret = __block_write_begin_int(page, iter->pos, length, NULL, + &iter->iomap); if (ret) return ret; block_commit_write(page, 0, length); @@ -977,29 +973,24 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { + struct iomap_iter iter = { + .inode = file_inode(vmf->vma->vm_file), + .flags = IOMAP_WRITE | IOMAP_FAULT, + }; struct page *page = vmf->page; - struct inode *inode = file_inode(vmf->vma->vm_file); - unsigned long length; - loff_t offset; ssize_t ret; lock_page(page); - ret = page_mkwrite_check_truncate(page, inode); + ret = page_mkwrite_check_truncate(page, iter.inode); if (ret < 0) goto out_unlock; - length = ret; - - offset = page_offset(page); - while (length > 0) { - ret = iomap_apply(inode, offset, length, - IOMAP_WRITE | IOMAP_FAULT, ops, page, - iomap_page_mkwrite_actor); - if (unlikely(ret <= 0)) - goto out_unlock; - offset += ret; - length -= ret; - } + iter.pos = page_offset(page); + iter.len = ret; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_page_mkwrite_iter(&iter, page); + if (ret < 0) + goto out_unlock; wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: @@ -1016,7 +1007,7 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page, if (error) { SetPageError(page); - mapping_set_error(inode->i_mapping, -EIO); + mapping_set_error(inode->i_mapping, error); } WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); @@ -1153,7 +1144,7 @@ static void iomap_writepage_end_bio(struct bio *bio) * Submit the final bio for an ioend. 
* * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we have marked paged for writeback + * the submission process has failed after we've marked pages for writeback * and unlocked them. In this situation, we need to fail the bio instead of * submitting it. This typically only happens on a filesystem shutdown. */ @@ -1168,7 +1159,7 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, error = wpc->ops->prepare_ioend(ioend, error); if (error) { /* - * If we are failing the IO now, just mark the ioend with an + * If we're failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately * as there is only one reference to the ioend at this point in * time. @@ -1210,7 +1201,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, /* * Allocate a new bio, and chain the old bio to the new one. * - * Note that we have to do perform the chaining in this unintuitive order + * Note that we have to perform the chaining in this unintuitive order * so that the bi_private linkage is set up in the right direction for the * traversal in iomap_finish_ioend(). */ @@ -1249,7 +1240,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, /* * Test to see if we have an existing ioend structure that we could append to - * first, otherwise finish off the current ioend and start another. + * first; otherwise finish off the current ioend and start another. */ static void iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, @@ -1259,7 +1250,6 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, sector_t sector = iomap_sector(&wpc->iomap, offset); unsigned len = i_blocksize(inode); unsigned poff = offset & (PAGE_SIZE - 1); - bool merged, same_page = false; if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) { if (wpc->ioend) @@ -1267,19 +1257,13 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc); } - merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, - &same_page); - if (iop) - atomic_add(len, &iop->write_bytes_pending); - - if (!merged) { - if (bio_full(wpc->ioend->io_bio, len)) { - wpc->ioend->io_bio = - iomap_chain_bio(wpc->ioend->io_bio); - } - bio_add_page(wpc->ioend->io_bio, page, len, poff); + if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) { + wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); + __bio_add_page(wpc->ioend->io_bio, page, len, poff); } + if (iop) + atomic_add(len, &iop->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, page, len); } @@ -1287,9 +1271,9 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate - * forward progress guarantees we need to provide. The current ioend we are - * adding blocks to is cached on the writepage context, and if the new block - * does not append to the cached ioend it will create a new ioend and cache that + * the forward progress guarantees we need to provide. The current ioend we're + * adding blocks to is cached in the writepage context, and if the new block + * doesn't append to the cached ioend, it will create a new ioend and cache that * instead. 
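[Editor's note] Both the read path and the writeback path in this file now drop the __bio_try_merge_page()/same_page dance and let bio_add_page() perform the merge-or-fit check itself. The shared idiom, with alloc_and_chain() as a hypothetical stand-in for the path-specific bio allocation:

	/*
	 * bio_add_page() returns the number of bytes actually added, so a
	 * short return means the bio is full or the page is discontiguous
	 * with it.  In that case start a new bio; __bio_add_page() cannot
	 * fail on a freshly allocated bio with a free vector slot.
	 */
	if (!bio || bio_end_sector(bio) != sector ||
	    bio_add_page(bio, page, len, poff) != len) {
		bio = alloc_and_chain(bio);	/* hypothetical helper */
		__bio_add_page(bio, page, len, poff);
	}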
* * If a new ioend is created and cached, the old ioend is returned and queued @@ -1351,7 +1335,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (unlikely(error)) { /* * Let the filesystem know what portion of the current page - * failed to map. If the page wasn't been added to ioend, it + * failed to map. If the page hasn't been added to ioend, it * won't be affected by I/O completion and we must unlock it * now. */ @@ -1368,7 +1352,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, unlock_page(page); /* - * Preserve the original error if there was one, otherwise catch + * Preserve the original error if there was one; catch * submission errors here and propagate into subsequent ioend * submissions. */ @@ -1395,8 +1379,8 @@ done: /* * Write out a dirty page. * - * For delalloc space on the page we need to allocate space and flush it. - * For unwritten space on the page we need to start the conversion to + * For delalloc space on the page, we need to allocate space and flush it. + * For unwritten space on the page, we need to start the conversion to * regular allocated space. */ static int @@ -1411,7 +1395,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE); /* - * Refuse to write the page out if we are called from reclaim context. + * Refuse to write the page out if we're called from reclaim context. * * This avoids stack overflows when called from deeply used stacks in * random callers for direct reclaim or memcg reclaim. We explicitly @@ -1456,20 +1440,20 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) unsigned offset_into_page = offset & (PAGE_SIZE - 1); /* - * Skip the page if it is fully outside i_size, e.g. due to a - * truncate operation that is in progress. We must redirty the + * Skip the page if it's fully outside i_size, e.g. due to a + * truncate operation that's in progress. We must redirty the * page so that reclaim stops reclaiming it. Otherwise * iomap_vm_releasepage() is called on it and gets confused. * - * Note that the end_index is unsigned long, it would overflow - * if the given offset is greater than 16TB on 32-bit system - * and if we do check the page is fully outside i_size or not - * via "if (page->index >= end_index + 1)" as "end_index + 1" - * will be evaluated to 0. Hence this page will be redirtied - * and be written out repeatedly which would result in an - * infinite loop, the user program that perform this operation - * will hang. Instead, we can verify this situation by checking - * if the page to write is totally beyond the i_size or if it's + * Note that the end_index is unsigned long. If the given + * offset is greater than 16TB on a 32-bit system then if we + * checked if the page is fully outside i_size with + * "if (page->index >= end_index + 1)", "end_index + 1" would + * overflow and evaluate to 0. Hence this page would be + * redirtied and written out repeatedly, which would result in + * an infinite loop; the user program performing this operation + * would hang. Instead, we can detect this situation by + * checking if the page is totally beyond i_size or if its * offset is just equal to the EOF. */ if (page->index > end_index || diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9398b8c31323..4ecd255e0511 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. 
- * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -59,19 +59,17 @@ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) } EXPORT_SYMBOL_GPL(iomap_dio_iopoll); -static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, - struct bio *bio, loff_t pos) +static void iomap_dio_submit_bio(const struct iomap_iter *iter, + struct iomap_dio *dio, struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); if (dio->iocb->ki_flags & IOCB_HIPRI) bio_set_polled(bio, dio->iocb); - dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev); if (dio->dops && dio->dops->submit_io) - dio->submit.cookie = dio->dops->submit_io( - file_inode(dio->iocb->ki_filp), - iomap, bio, pos); + dio->submit.cookie = dio->dops->submit_io(iter, bio, pos); else dio->submit.cookie = submit_bio(bio); } @@ -181,24 +179,23 @@ static void iomap_dio_bio_end_io(struct bio *bio) } } -static void -iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, - unsigned len) +static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, + loff_t pos, unsigned len) { struct page *page = ZERO_PAGE(0); int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = iomap_sector(iomap, pos); + bio_set_dev(bio, iter->iomap.bdev); + bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; get_page(page); __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - iomap_dio_submit_bio(dio, iomap, bio, pos); + iomap_dio_submit_bio(iter, dio, bio, pos); } /* @@ -206,8 +203,8 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, * mapping, and whether or not we want FUA. Note that we can end up * clearing the WRITE_FUA flag in the dio request. 
*/ -static inline unsigned int -iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua) +static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio, + const struct iomap *iomap, bool use_fua) { unsigned int opflags = REQ_SYNC | REQ_IDLE; @@ -229,13 +226,16 @@ iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua) return opflags; } -static loff_t -iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, - struct iomap_dio *dio, struct iomap *iomap) +static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, + struct iomap_dio *dio) { + const struct iomap *iomap = &iter->iomap; + struct inode *inode = iter->inode; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; unsigned int align = iov_iter_alignment(dio->submit.iter); + loff_t length = iomap_length(iter); + loff_t pos = iter->pos; unsigned int bio_opf; struct bio *bio; bool need_zeroout = false; @@ -286,7 +286,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos - pad, pad); + iomap_dio_zero(iter, dio, pos - pad, pad); } /* @@ -339,7 +339,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); - iomap_dio_submit_bio(dio, iomap, bio, pos); + iomap_dio_submit_bio(iter, dio, bio, pos); pos += n; } while (nr_pages); @@ -355,7 +355,7 @@ zero_tail: /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + iomap_dio_zero(iter, dio, pos, fs_block_size - pad); } out: /* Undo iter limitation to current extent */ @@ -365,65 +365,67 @@ out: return ret; } -static loff_t -iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio) +static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, + struct iomap_dio *dio) { - length = iov_iter_zero(length, dio->submit.iter); + loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); + dio->size += length; return length; } -static loff_t -iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, - struct iomap_dio *dio, struct iomap *iomap) +static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, + struct iomap_dio *dio) { + const struct iomap *iomap = &iomi->iomap; struct iov_iter *iter = dio->submit.iter; + void *inline_data = iomap_inline_data(iomap, iomi->pos); + loff_t length = iomap_length(iomi); + loff_t pos = iomi->pos; size_t copied; - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) + return -EIO; if (dio->flags & IOMAP_DIO_WRITE) { - loff_t size = inode->i_size; + loff_t size = iomi->inode->i_size; if (pos > size) - memset(iomap->inline_data + size, 0, pos - size); - copied = copy_from_iter(iomap->inline_data + pos, length, iter); + memset(iomap_inline_data(iomap, size), 0, pos - size); + copied = copy_from_iter(inline_data, length, iter); if (copied) { if (pos + copied > size) - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); + i_size_write(iomi->inode, pos + copied); + mark_inode_dirty(iomi->inode); } } else { - copied = copy_to_iter(iomap->inline_data + pos, length, iter); + copied = copy_to_iter(inline_data, length, iter); } dio->size += copied; return copied; } -static 
loff_t -iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_dio_iter(const struct iomap_iter *iter, + struct iomap_dio *dio) { - struct iomap_dio *dio = data; - - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_HOLE: if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) return -EIO; - return iomap_dio_hole_actor(length, dio); + return iomap_dio_hole_iter(iter, dio); case IOMAP_UNWRITTEN: if (!(dio->flags & IOMAP_DIO_WRITE)) - return iomap_dio_hole_actor(length, dio); - return iomap_dio_bio_actor(inode, pos, length, dio, iomap); + return iomap_dio_hole_iter(iter, dio); + return iomap_dio_bio_iter(iter, dio); case IOMAP_MAPPED: - return iomap_dio_bio_actor(inode, pos, length, dio, iomap); + return iomap_dio_bio_iter(iter, dio); case IOMAP_INLINE: - return iomap_dio_inline_actor(inode, pos, length, dio, iomap); + return iomap_dio_inline_iter(iter, dio); case IOMAP_DELALLOC: /* * DIO is not serialised against mmap() access at all, and so * if the page_mkwrite occurs between the writeback and the - * iomap_apply() call in the DIO path, then it will see the + * iomap_iter() call in the DIO path, then it will see the * DELALLOC block that the page-mkwrite allocated. */ pr_warn_ratelimited("Direct I/O collision with buffered writes! File: %pD4 Comm: %.20s\n", @@ -454,16 +456,19 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); - size_t count = iov_iter_count(iter); - loff_t pos = iocb->ki_pos; - loff_t end = iocb->ki_pos + count - 1, ret = 0; + struct iomap_iter iomi = { + .inode = inode, + .pos = iocb->ki_pos, + .len = iov_iter_count(iter), + .flags = IOMAP_DIRECT, + }; + loff_t end = iomi.pos + iomi.len - 1, ret = 0; bool wait_for_completion = is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT); - unsigned int iomap_flags = IOMAP_DIRECT; struct blk_plug plug; struct iomap_dio *dio; - if (!count) + if (!iomi.len) return NULL; dio = kmalloc(sizeof(*dio), GFP_KERNEL); @@ -484,29 +489,30 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.last_queue = NULL; if (iov_iter_rw(iter) == READ) { - if (pos >= dio->i_size) + if (iomi.pos >= dio->i_size) goto out_free_dio; if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, pos, end)) { + if (filemap_range_needs_writeback(mapping, iomi.pos, + end)) { ret = -EAGAIN; goto out_free_dio; } - iomap_flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_NOWAIT; } if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { - iomap_flags |= IOMAP_WRITE; + iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, pos, end)) { + if (filemap_range_has_page(mapping, iomi.pos, end)) { ret = -EAGAIN; goto out_free_dio; } - iomap_flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_NOWAIT; } /* for data sync or sync, we need sync completion processing */ @@ -525,12 +531,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; - if (pos >= dio->i_size || pos + count > dio->i_size) + if (iomi.pos >= dio->i_size || + iomi.pos + iomi.len > dio->i_size) goto out_free_dio; - iomap_flags |= IOMAP_OVERWRITE_ONLY; + iomi.flags |= IOMAP_OVERWRITE_ONLY; } - ret = filemap_write_and_wait_range(mapping, pos, end); + ret = filemap_write_and_wait_range(mapping, iomi.pos, end); if (ret) goto 
out_free_dio; @@ -540,9 +547,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * If this invalidation fails, let the caller fall back to * buffered I/O. */ - if (invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, - end >> PAGE_SHIFT)) { - trace_iomap_dio_invalidate_fail(inode, pos, count); + if (invalidate_inode_pages2_range(mapping, + iomi.pos >> PAGE_SHIFT, end >> PAGE_SHIFT)) { + trace_iomap_dio_invalidate_fail(inode, iomi.pos, + iomi.len); ret = -ENOTBLK; goto out_free_dio; } @@ -557,31 +565,23 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, inode_dio_begin(inode); blk_start_plug(&plug); - do { - ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio, - iomap_dio_actor); - if (ret <= 0) { - /* magic error code to fall back to buffered I/O */ - if (ret == -ENOTBLK) { - wait_for_completion = true; - ret = 0; - } - break; - } - pos += ret; - - if (iov_iter_rw(iter) == READ && pos >= dio->i_size) { - /* - * We only report that we've read data up to i_size. - * Revert iter to a state corresponding to that as - * some callers (such as splice code) rely on it. - */ - iov_iter_revert(iter, pos - dio->i_size); - break; - } - } while ((count = iov_iter_count(iter)) > 0); + while ((ret = iomap_iter(&iomi, ops)) > 0) + iomi.processed = iomap_dio_iter(&iomi, dio); blk_finish_plug(&plug); + /* + * We only report that we've read data up to i_size. + * Revert iter to a state corresponding to that as some callers (such + * as the splice code) rely on it. + */ + if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size) + iov_iter_revert(iter, iomi.pos - dio->i_size); + + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) { + wait_for_completion = true; + ret = 0; + } if (ret < 0) iomap_dio_set_error(dio, ret); diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index aab070df4a21..66cf267c68ae 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2021 Christoph Hellwig. 
*/ #include <linux/module.h> #include <linux/compiler.h> @@ -8,13 +8,8 @@ #include <linux/iomap.h> #include <linux/fiemap.h> -struct fiemap_ctx { - struct fiemap_extent_info *fi; - struct iomap prev; -}; - static int iomap_to_fiemap(struct fiemap_extent_info *fi, - struct iomap *iomap, u32 flags) + const struct iomap *iomap, u32 flags) { switch (iomap->type) { case IOMAP_HOLE: @@ -43,24 +38,22 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, iomap->length, flags); } -static loff_t -iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, + struct fiemap_extent_info *fi, struct iomap *prev) { - struct fiemap_ctx *ctx = data; - loff_t ret = length; + int ret; - if (iomap->type == IOMAP_HOLE) - return length; + if (iter->iomap.type == IOMAP_HOLE) + return iomap_length(iter); - ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); - ctx->prev = *iomap; + ret = iomap_to_fiemap(fi, prev, 0); + *prev = iter->iomap; switch (ret) { case 0: /* success */ - return length; + return iomap_length(iter); case 1: /* extent array full */ return 0; - default: + default: /* error */ return ret; } } @@ -68,73 +61,63 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, u64 start, u64 len, const struct iomap_ops *ops) { - struct fiemap_ctx ctx; - loff_t ret; - - memset(&ctx, 0, sizeof(ctx)); - ctx.fi = fi; - ctx.prev.type = IOMAP_HOLE; + struct iomap_iter iter = { + .inode = inode, + .pos = start, + .len = len, + .flags = IOMAP_REPORT, + }; + struct iomap prev = { + .type = IOMAP_HOLE, + }; + int ret; - ret = fiemap_prep(inode, fi, start, &len, 0); + ret = fiemap_prep(inode, fi, start, &iter.len, 0); if (ret) return ret; - while (len > 0) { - ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, - iomap_fiemap_actor); - /* inode with no (attribute) mapping will give ENOENT */ - if (ret == -ENOENT) - break; - if (ret < 0) - return ret; - if (ret == 0) - break; - - start += ret; - len -= ret; - } + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_fiemap_iter(&iter, fi, &prev); - if (ctx.prev.type != IOMAP_HOLE) { - ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); + if (prev.type != IOMAP_HOLE) { + ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); if (ret < 0) return ret; } + /* inode with no (attribute) mapping will give ENOENT */ + if (ret < 0 && ret != -ENOENT) + return ret; return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); -static loff_t -iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) -{ - sector_t *bno = data, addr; - - if (iomap->type == IOMAP_MAPPED) { - addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; - *bno = addr; - } - return 0; -} - /* legacy ->bmap interface. 0 is the error return (!) 
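[Editor's note] The "0 is the error return (!)" quirk below exists because iomap_bmap() backs the legacy FIBMAP ioctl, where a zero block is the only way to report both holes and failure. An illustrative user-space probe of that interface (typically needs CAP_SYS_RAWIO; print_block0() is not from this patch):

#include <sys/ioctl.h>
#include <linux/fs.h>		/* FIBMAP */
#include <stdio.h>

static int print_block0(int fd)
{
	int blk = 0;	/* in: logical block number; out: physical block */

	if (ioctl(fd, FIBMAP, &blk) < 0) {
		perror("FIBMAP");
		return -1;
	}
	printf("physical block: %d (0 means hole or error)\n", blk);
	return 0;
}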
*/ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops) { - struct inode *inode = mapping->host; - loff_t pos = bno << inode->i_blkbits; - unsigned blocksize = i_blocksize(inode); + struct iomap_iter iter = { + .inode = mapping->host, + .pos = (loff_t)bno << mapping->host->i_blkbits, + .len = i_blocksize(mapping->host), + .flags = IOMAP_REPORT, + }; + const unsigned int blkshift = mapping->host->i_blkbits - SECTOR_SHIFT; int ret; if (filemap_write_and_wait(mapping)) return 0; bno = 0; - ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno, - iomap_bmap_actor); + while ((ret = iomap_iter(&iter, ops)) > 0) { + if (iter.iomap.type == IOMAP_MAPPED) + bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; + /* leave iter.processed unset to abort loop */ + } if (ret) return 0; + return bno; } EXPORT_SYMBOL_GPL(iomap_bmap); diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c new file mode 100644 index 000000000000..a1c7592d2ade --- /dev/null +++ b/fs/iomap/iter.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (c) 2016-2021 Christoph Hellwig. + */ +#include <linux/fs.h> +#include <linux/iomap.h> +#include "trace.h" + +static inline int iomap_iter_advance(struct iomap_iter *iter) +{ + /* handle the previous iteration (if any) */ + if (iter->iomap.length) { + if (iter->processed <= 0) + return iter->processed; + if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) + return -EIO; + iter->pos += iter->processed; + iter->len -= iter->processed; + if (!iter->len) + return 0; + } + + /* clear the state for the next iteration */ + iter->processed = 0; + memset(&iter->iomap, 0, sizeof(iter->iomap)); + memset(&iter->srcmap, 0, sizeof(iter->srcmap)); + return 1; +} + +static inline void iomap_iter_done(struct iomap_iter *iter) +{ + WARN_ON_ONCE(iter->iomap.offset > iter->pos); + WARN_ON_ONCE(iter->iomap.length == 0); + WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); + + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); + if (iter->srcmap.type != IOMAP_HOLE) + trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); +} + +/** + * iomap_iter - iterate over a range of a file + * @iter: iteration structure + * @ops: iomap ops provided by the file system + * + * Iterate over filesystem-provided space mappings for the given file range. + * + * This function handles cleanup of resources acquired for iteration when the + * filesystem indicates there are no more space mappings, which means that this + * function must be called in a loop that continues as long as it returns a + * positive value. If 0 or a negative value is returned, the caller must not + * return to the loop body. Within the loop body, there are two ways to break out + * of the loop: leave @iter.processed unchanged, or set it to a negative + * errno. + */ +int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) +{ + int ret; + + if (iter->iomap.length && ops->iomap_end) { + ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), + iter->processed > 0 ?
iter->processed : 0, + iter->flags, &iter->iomap); + if (ret < 0 && !iter->processed) + return ret; + } + + trace_iomap_iter(iter, ops, _RET_IP_); + ret = iomap_iter_advance(iter); + if (ret <= 0) + return ret; + + ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, + &iter->iomap, &iter->srcmap); + if (ret < 0) + return ret; + iomap_iter_done(iter); + return 1; +} diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index ce6fb810854f..a845c012b50c 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Red Hat, Inc. - * Copyright (c) 2018 Christoph Hellwig. + * Copyright (c) 2018-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -10,21 +10,20 @@ #include <linux/pagemap.h> #include <linux/pagevec.h> -static loff_t -iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, + loff_t *hole_pos) { - loff_t offset = start; + loff_t length = iomap_length(iter); - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_UNWRITTEN: - offset = mapping_seek_hole_data(inode->i_mapping, start, - start + length, SEEK_HOLE); - if (offset == start + length) + *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, + iter->pos, iter->pos + length, SEEK_HOLE); + if (*hole_pos == iter->pos + length) return length; - fallthrough; + return 0; case IOMAP_HOLE: - *(loff_t *)data = offset; + *hole_pos = iter->pos; return 0; default: return length; @@ -32,70 +31,73 @@ iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, } loff_t -iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) { loff_t size = i_size_read(inode); - loff_t ret; + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .flags = IOMAP_REPORT, + }; + int ret; /* Nothing to be found before or beyond the end of the file. 
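[Editor's note] The iomap_iter() contract documented above (fs/iomap/iter.c) is what every consumer converted in this series follows. Spelled out as a schematic caller — my_process_mapping() is a placeholder for the per-mapping work, not a function from this patch:

	struct iomap_iter iter = {
		.inode	= inode,
		.pos	= pos,
		.len	= len,
		.flags	= IOMAP_REPORT,
	};
	int ret;

	while ((ret = iomap_iter(&iter, ops)) > 0) {
		/*
		 * A positive ->processed advances pos/len and continues the
		 * walk; leaving it 0 or setting a negative errno makes the
		 * next iomap_iter() call terminate the loop.
		 */
		iter.processed = my_process_mapping(&iter);
	}
	return ret;	/* 0 on a clean stop, negative errno on error */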
*/ - if (offset < 0 || offset >= size) + if (pos < 0 || pos >= size) return -ENXIO; - while (offset < size) { - ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT, - ops, &offset, iomap_seek_hole_actor); - if (ret < 0) - return ret; - if (ret == 0) - break; - offset += ret; - } - - return offset; + iter.len = size - pos; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_seek_hole_iter(&iter, &pos); + if (ret < 0) + return ret; + if (iter.len) /* found hole before EOF */ + return pos; + return size; } EXPORT_SYMBOL_GPL(iomap_seek_hole); -static loff_t -iomap_seek_data_actor(struct inode *inode, loff_t start, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_seek_data_iter(const struct iomap_iter *iter, + loff_t *hole_pos) { - loff_t offset = start; + loff_t length = iomap_length(iter); - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_HOLE: return length; case IOMAP_UNWRITTEN: - offset = mapping_seek_hole_data(inode->i_mapping, start, - start + length, SEEK_DATA); - if (offset < 0) + *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, + iter->pos, iter->pos + length, SEEK_DATA); + if (*hole_pos < 0) return length; - fallthrough; + return 0; default: - *(loff_t *)data = offset; + *hole_pos = iter->pos; return 0; } } loff_t -iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops) { loff_t size = i_size_read(inode); - loff_t ret; + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .flags = IOMAP_REPORT, + }; + int ret; /* Nothing to be found before or beyond the end of the file. */ - if (offset < 0 || offset >= size) + if (pos < 0 || pos >= size) return -ENXIO; - while (offset < size) { - ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT, - ops, &offset, iomap_seek_data_actor); - if (ret < 0) - return ret; - if (ret == 0) - return offset; - offset += ret; - } - + iter.len = size - pos; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_seek_data_iter(&iter, &pos); + if (ret < 0) + return ret; + if (iter.len) /* found data before EOF */ + return pos; /* We've reached the end of the file without finding data */ return -ENXIO; } diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 6250ca6a1f85..5fc0ac36dee3 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -31,11 +31,16 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) { struct iomap *iomap = &isi->iomap; unsigned long nr_pages; + unsigned long max_pages; uint64_t first_ppage; uint64_t first_ppage_reported; uint64_t next_ppage; int error; + if (unlikely(isi->nr_pages >= isi->sis->max)) + return 0; + max_pages = isi->sis->max - isi->nr_pages; + /* * Round the start up and the end down so that the physical * extent aligns to a page boundary. @@ -48,6 +53,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); /* * Calculate how much swap space we're adding; the first page contains @@ -88,13 +94,9 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) * swap only cares about contiguous page-aligned physical extents and makes no * distinction between written and unwritten extents. 
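[Editor's note] The clamp added to iomap_swapfile_add_extent() above keeps the accounting from overrunning the swap map. With illustrative numbers:

	/* isi->sis->max = 1024 pages, isi->nr_pages = 1000 added so far */
	max_pages = isi->sis->max - isi->nr_pages;	/* 24 pages left */
	nr_pages = next_ppage - first_ppage;		/* e.g. 80 pages */
	nr_pages = min(nr_pages, max_pages);		/* clamped to 24 */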
*/ -static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, - loff_t count, void *data, struct iomap *iomap, - struct iomap *srcmap) +static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, + struct iomap *iomap, struct iomap_swapfile_info *isi) { - struct iomap_swapfile_info *isi = data; - int error; - switch (iomap->type) { case IOMAP_MAPPED: case IOMAP_UNWRITTEN: @@ -125,12 +127,12 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, isi->iomap.length += iomap->length; } else { /* Otherwise, add the retained iomap and store this one. */ - error = iomap_swapfile_add_extent(isi); + int error = iomap_swapfile_add_extent(isi); if (error) return error; memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); } - return count; + return iomap_length(iter); } /* @@ -141,16 +143,19 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *pagespan, const struct iomap_ops *ops) { + struct inode *inode = swap_file->f_mapping->host; + struct iomap_iter iter = { + .inode = inode, + .pos = 0, + .len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE), + .flags = IOMAP_REPORT, + }; struct iomap_swapfile_info isi = { .sis = sis, .lowest_ppage = (sector_t)-1ULL, .file = swap_file, }; - struct address_space *mapping = swap_file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = 0; - loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); - loff_t ret; + int ret; /* * Persist all file mapping metadata so that we won't have any @@ -160,15 +165,10 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, if (ret) return ret; - while (len > 0) { - ret = iomap_apply(inode, pos, len, IOMAP_REPORT, - ops, &isi, iomap_swapfile_activate_actor); - if (ret <= 0) - return ret; - - pos += ret; - len -= ret; - } + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi); + if (ret < 0) + return ret; if (isi.iomap.length) { ret = iomap_swapfile_add_extent(&isi); diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index fdc7ae388476..65e39785c284 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -1,9 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (c) 2009-2019 Christoph Hellwig + * Copyright (c) 2009-2021 Christoph Hellwig * - * NOTE: none of these tracepoints shall be consider a stable kernel ABI + * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. + * + * Current conventions for printing numbers measuring specific units: + * + * offset: byte offset into a subcomponent of a file operation + * pos: file offset, in bytes + * length: length of a file operation, in bytes + * ino: inode number + * + * Numbers describing space allocations should be formatted in hexadecimal. 
*/ #undef TRACE_SYSTEM #define TRACE_SYSTEM iomap @@ -42,14 +51,14 @@ DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, - TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), + TP_PROTO(struct inode *inode, loff_t off, u64 len), TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, size) - __field(unsigned long, offset) - __field(unsigned int, length) + __field(loff_t, offset) + __field(u64, length) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; @@ -58,8 +67,7 @@ DECLARE_EVENT_CLASS(iomap_range_class, __entry->offset = off; __entry->length = len; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx " - "length %x", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx length 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -69,7 +77,7 @@ DECLARE_EVENT_CLASS(iomap_range_class, #define DEFINE_RANGE_EVENT(name) \ DEFINE_EVENT(iomap_range_class, name, \ - TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\ + TP_PROTO(struct inode *inode, loff_t off, u64 len),\ TP_ARGS(inode, off, len)) DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_releasepage); @@ -122,8 +130,8 @@ DECLARE_EVENT_CLASS(iomap_class, __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), - TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld " - "length %llu type %s flags %s", + TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx " + "length 0x%llx type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), @@ -138,36 +146,32 @@ DECLARE_EVENT_CLASS(iomap_class, DEFINE_EVENT(iomap_class, name, \ TP_PROTO(struct inode *inode, struct iomap *iomap), \ TP_ARGS(inode, iomap)) -DEFINE_IOMAP_EVENT(iomap_apply_dstmap); -DEFINE_IOMAP_EVENT(iomap_apply_srcmap); +DEFINE_IOMAP_EVENT(iomap_iter_dstmap); +DEFINE_IOMAP_EVENT(iomap_iter_srcmap); -TRACE_EVENT(iomap_apply, - TP_PROTO(struct inode *inode, loff_t pos, loff_t length, - unsigned int flags, const void *ops, void *actor, - unsigned long caller), - TP_ARGS(inode, pos, length, flags, ops, actor, caller), +TRACE_EVENT(iomap_iter, + TP_PROTO(struct iomap_iter *iter, const void *ops, + unsigned long caller), + TP_ARGS(iter, ops, caller), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, pos) - __field(loff_t, length) + __field(u64, length) __field(unsigned int, flags) __field(const void *, ops) - __field(void *, actor) __field(unsigned long, caller) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->pos = pos; - __entry->length = length; - __entry->flags = flags; + __entry->dev = iter->inode->i_sb->s_dev; + __entry->ino = iter->inode->i_ino; + __entry->pos = iter->pos; + __entry->length = iomap_length(iter); + __entry->flags = iter->flags; __entry->ops = ops; - __entry->actor = actor; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) " - "ops %ps caller %pS actor %ps", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, @@ -175,8 +179,7 @@ TRACE_EVENT(iomap_apply, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, - (void *)__entry->caller, - __entry->actor) + (void *)__entry->caller) ); #endif /* 
_IOMAP_TRACE_H */ diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 21edc423b79f..678e2c51b855 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -155,7 +155,6 @@ struct iso9660_options{ unsigned int overriderockperm:1; unsigned int uid_set:1; unsigned int gid_set:1; - unsigned int utf8:1; unsigned char map; unsigned char check; unsigned int blocksize; @@ -356,7 +355,6 @@ static int parse_options(char *options, struct iso9660_options *popt) popt->gid = GLOBAL_ROOT_GID; popt->uid = GLOBAL_ROOT_UID; popt->iocharset = NULL; - popt->utf8 = 0; popt->overriderockperm = 0; popt->session=-1; popt->sbsector=-1; @@ -389,10 +387,13 @@ static int parse_options(char *options, struct iso9660_options *popt) case Opt_cruft: popt->cruft = 1; break; +#ifdef CONFIG_JOLIET case Opt_utf8: - popt->utf8 = 1; + kfree(popt->iocharset); + popt->iocharset = kstrdup("utf8", GFP_KERNEL); + if (!popt->iocharset) + return 0; break; -#ifdef CONFIG_JOLIET case Opt_iocharset: kfree(popt->iocharset); popt->iocharset = match_strdup(&args[0]); @@ -495,7 +496,6 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) if (sbi->s_nocompress) seq_puts(m, ",nocompress"); if (sbi->s_overriderockperm) seq_puts(m, ",overriderockperm"); if (sbi->s_showassoc) seq_puts(m, ",showassoc"); - if (sbi->s_utf8) seq_puts(m, ",utf8"); if (sbi->s_check) seq_printf(m, ",check=%c", sbi->s_check); if (sbi->s_mapping) seq_printf(m, ",map=%c", sbi->s_mapping); @@ -518,9 +518,10 @@ static int isofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",fmode=%o", sbi->s_fmode); #ifdef CONFIG_JOLIET - if (sbi->s_nls_iocharset && - strcmp(sbi->s_nls_iocharset->charset, CONFIG_NLS_DEFAULT) != 0) + if (sbi->s_nls_iocharset) seq_printf(m, ",iocharset=%s", sbi->s_nls_iocharset->charset); + else + seq_puts(m, ",iocharset=utf8"); #endif return 0; } @@ -863,14 +864,13 @@ root_found: sbi->s_nls_iocharset = NULL; #ifdef CONFIG_JOLIET - if (joliet_level && opt.utf8 == 0) { + if (joliet_level) { char *p = opt.iocharset ? opt.iocharset : CONFIG_NLS_DEFAULT; - sbi->s_nls_iocharset = load_nls(p); - if (! sbi->s_nls_iocharset) { - /* Fail only if explicit charset specified */ - if (opt.iocharset) + if (strcmp(p, "utf8") != 0) { + sbi->s_nls_iocharset = opt.iocharset ? 
+ load_nls(opt.iocharset) : load_nls_default(); + if (!sbi->s_nls_iocharset) goto out_freesbi; - sbi->s_nls_iocharset = load_nls_default(); } } #endif @@ -886,7 +886,6 @@ root_found: sbi->s_gid = opt.gid; sbi->s_uid_set = opt.uid_set; sbi->s_gid_set = opt.gid_set; - sbi->s_utf8 = opt.utf8; sbi->s_nocompress = opt.nocompress; sbi->s_overriderockperm = opt.overriderockperm; /* diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 055ec6c586f7..dcdc191ed183 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -44,7 +44,6 @@ struct isofs_sb_info { unsigned char s_session; unsigned int s_high_sierra:1; unsigned int s_rock:2; - unsigned int s_utf8:1; unsigned int s_cruft:1; /* Broken disks with high byte of length * containing junk */ unsigned int s_nocompress:1; diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index be8b6a9d0b92..c0f04a1e7f69 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -41,14 +41,12 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) int get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) { - unsigned char utf8; struct nls_table *nls; unsigned char len = 0; - utf8 = ISOFS_SB(inode->i_sb)->s_utf8; nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; - if (utf8) { + if (!nls) { len = utf16s_to_utf8s((const wchar_t *) de->name, de->name_len[0] >> 1, UTF16_BIG_ENDIAN, outname, PAGE_SIZE); diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d47a0d96bf30..8ca3527189f8 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -179,8 +179,8 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) if (!jbd2_journal_has_csum_v2or3(j)) return 1; - tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - - sizeof(struct jbd2_journal_block_tail)); + tail = (struct jbd2_journal_block_tail *)((char *)buf + + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); provided = tail->t_checksum; tail->t_checksum = 0; calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); @@ -196,7 +196,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) static int count_tags(journal_t *journal, struct buffer_head *bh) { char * tagp; - journal_block_tag_t * tag; + journal_block_tag_t tag; int nr = 0, size = journal->j_blocksize; int tag_bytes = journal_tag_bytes(journal); @@ -206,14 +206,14 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data + tag_bytes) <= size) { - tag = (journal_block_tag_t *) tagp; + memcpy(&tag, tagp, sizeof(tag)); nr++; tagp += tag_bytes; - if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) + if (!(tag.t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) tagp += 16; - if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) + if (tag.t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) break; } @@ -433,9 +433,9 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) } static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, + journal_block_tag3_t *tag3, void *buf, __u32 sequence) { - journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; __u32 csum32; __be32 seq; @@ -496,7 +496,7 @@ static int do_one_pass(journal_t *journal, while (1) { int flags; char * tagp; - journal_block_tag_t * tag; + journal_block_tag_t tag; struct buffer_head * obh; struct buffer_head * nbh; @@ -613,8 +613,8 @@ static int do_one_pass(journal_t *journal, <= journal->j_blocksize - descr_csum_size) { unsigned long io_block; - tag = (journal_block_tag_t *) 
tagp; - flags = be16_to_cpu(tag->t_flags); + memcpy(&tag, tagp, sizeof(tag)); + flags = be16_to_cpu(tag.t_flags); io_block = next_log_block++; wrap(journal, next_log_block); @@ -632,7 +632,7 @@ static int do_one_pass(journal_t *journal, J_ASSERT(obh != NULL); blocknr = read_tag_block(journal, - tag); + &tag); /* If the block has been * revoked, then we're all done @@ -647,8 +647,8 @@ static int do_one_pass(journal_t *journal, /* Look for block corruption */ if (!jbd2_block_tag_csum_verify( - journal, tag, obh->b_data, - be32_to_cpu(tmp->h_sequence))) { + journal, &tag, (journal_block_tag3_t *)tagp, + obh->b_data, be32_to_cpu(tmp->h_sequence))) { brelse(obh); success = -EFSBADCRC; printk(KERN_ERR "JBD2: Invalid " @@ -760,7 +760,6 @@ static int do_one_pass(journal_t *journal, */ jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); - err = 0; brelse(bh); goto done; } @@ -897,7 +896,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, { jbd2_journal_revoke_header_t *header; int offset, max; - int csum_size = 0; + unsigned csum_size = 0; __u32 rcount; int record_len = 4; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8804e126805f..6a3caedd2285 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -223,9 +223,15 @@ static void sub_reserved_credits(journal_t *journal, int blocks) * with j_state_lock held for reading. Returns 0 if handle joined the running * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and * caller must retry. + * + * Note: because j_state_lock may be dropped depending on the return + * value, we need to fake out sparse so it doesn't complain about a + * locking imbalance. Callers of add_transaction_credits will need to + * make a similar accommodation.
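+ * (Concretely: the caller pairs a non-zero return with + * __release(&journal->j_state_lock) before retrying, as the hunk + * below does.)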
*/ static int add_transaction_credits(journal_t *journal, int blocks, int rsv_blocks) +__must_hold(&journal->j_state_lock) { transaction_t *t = journal->j_running_transaction; int needed; @@ -238,6 +244,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (t->t_state != T_RUNNING) { WARN_ON_ONCE(t->t_state >= T_FLUSH); wait_transaction_locked(journal); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -266,10 +273,12 @@ static int add_transaction_credits(journal_t *journal, int blocks, wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= journal->j_max_transaction_buffers); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } wait_transaction_locked(journal); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -293,6 +302,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, journal->j_max_transaction_buffers) __jbd2_log_wait_for_space(journal); write_unlock(&journal->j_state_lock); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -310,6 +320,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks <= journal->j_max_transaction_buffers / 2); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } return 0; @@ -413,8 +424,14 @@ repeat: if (!handle->h_reserved) { /* We may have dropped j_state_lock - restart in that case */ - if (add_transaction_credits(journal, blocks, rsv_blocks)) + if (add_transaction_credits(journal, blocks, rsv_blocks)) { + /* + * add_transaction_credits releases + * j_state_lock on a non-zero return + */ + __release(&journal->j_state_lock); goto repeat; + } } else { /* * We have handle reserved so we are allowed to join T_LOCKED @@ -1404,7 +1421,7 @@ void jbd2_journal_set_triggers(struct buffer_head *bh, { struct journal_head *jh = jbd2_journal_grab_journal_head(bh); - if (WARN_ON(!jh)) + if (WARN_ON_ONCE(!jh)) return; jh->b_triggers = type; jbd2_journal_put_journal_head(jh); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 55a79df70d24..e945e3484788 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -173,12 +173,15 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) return ERR_PTR(-EINVAL); } -struct posix_acl *jffs2_get_acl(struct inode *inode, int type) +struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu) { struct posix_acl *acl; char *value = NULL; int rc, xprefix; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 62c50da9d493..9d9fb7cf093e 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -27,7 +27,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -struct posix_acl *jffs2_get_acl(struct inode *inode, int type); +struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu); int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 43c285c3d2a7..a653f34c6e26 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -14,13 +14,16 @@ #include "jfs_xattr.h" #include "jfs_acl.h" -struct posix_acl *jfs_get_acl(struct inode *inode, int type) +struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu) { struct posix_acl *acl; char 
*ea_name; int size; char *value = NULL; + if (rcu) + return ERR_PTR(-ECHILD); + switch(type) { case ACL_TYPE_ACCESS: ea_name = XATTR_NAME_POSIX_ACL_ACCESS; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 7ae389a7a366..3de40286d31f 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -7,7 +7,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -struct posix_acl *jfs_get_acl(struct inode *inode, int type); +struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu); int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 33166ec90a11..ba581429bf7b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,7 @@ #include "kernfs-internal.h" -DEFINE_MUTEX(kernfs_mutex); +DECLARE_RWSEM(kernfs_rwsem); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ @@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ static bool kernfs_active(struct kernfs_node *kn) { - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held(&kernfs_rwsem); return atomic_read(&kn->active) >= 0; } @@ -340,7 +340,7 @@ static int kernfs_sd_compare(const struct kernfs_node *left, * @kn->parent->dir.children. * * Locking: - * mutex_lock(kernfs_mutex) + * kernfs_rwsem held exclusive * * RETURNS: * 0 on susccess -EEXIST on failure. @@ -372,6 +372,7 @@ static int kernfs_link_sibling(struct kernfs_node *kn) /* successfully added, account subdir number */ if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; + kernfs_inc_rev(kn->parent); return 0; } @@ -385,7 +386,7 @@ static int kernfs_link_sibling(struct kernfs_node *kn) * removed, %false if @kn wasn't on the rbtree. * * Locking: - * mutex_lock(kernfs_mutex) + * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { @@ -394,6 +395,7 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; + kernfs_inc_rev(kn->parent); rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); @@ -455,14 +457,14 @@ void kernfs_put_active(struct kernfs_node *kn) * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) - __releases(&kernfs_mutex) __acquires(&kernfs_mutex) + __releases(&kernfs_rwsem) __acquires(&kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); @@ -481,7 +483,7 @@ static void kernfs_drain(struct kernfs_node *kn) kernfs_drain_open_files(kn); - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); } /** @@ -720,7 +722,7 @@ int kernfs_add_one(struct kernfs_node *kn) bool has_ns; int ret; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); @@ -751,7 +753,7 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr->ia_mtime = ps_iattr->ia_ctime; } - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. 
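Two themes run through the kernfs changes above and below: every former kernfs_mutex critical section is classified as either a reader (pure lookup) or a writer (create/remove/rename), and directories gain a revision counter that is bumped on every sibling link/unlink so that negative dentries can later be revalidated cheaply. A minimal, self-contained sketch of the reader/writer split (illustrative only; the demo_* names and the list-based store are invented for the example, not taken from the patch):

	#include <linux/list.h>
	#include <linux/rwsem.h>
	#include <linux/string.h>

	struct demo_node {
		struct list_head sibling;
		const char *name;
	};

	static LIST_HEAD(demo_children);
	static DECLARE_RWSEM(demo_rwsem);

	/* lookup side: shared lock, many readers may run in parallel */
	static struct demo_node *demo_find(const char *name)
	{
		struct demo_node *n, *found = NULL;

		down_read(&demo_rwsem);
		list_for_each_entry(n, &demo_children, sibling) {
			if (!strcmp(n->name, name)) {
				found = n;
				break;
			}
		}
		up_read(&demo_rwsem);
		return found;
	}

	/* modification side: exclusive lock, as in kernfs_add_one() */
	static void demo_add(struct demo_node *n)
	{
		down_write(&demo_rwsem);
		list_add_tail(&n->sibling, &demo_children);
		up_write(&demo_rwsem);
	}

With the rwsem, concurrent lookup paths such as kernfs_iop_lookup() and kernfs_fop_readdir() only exclude writers instead of serializing against every other reader, which is the point of the conversion.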
@@ -765,7 +767,7 @@ int kernfs_add_one(struct kernfs_node *kn) return 0; out_unlock: - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return ret; } @@ -786,7 +788,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held(&kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", @@ -818,7 +820,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, size_t len; char *p, *name; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_read(&kernfs_rwsem); /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ spin_lock_irq(&kernfs_rename_lock); @@ -858,10 +860,10 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, { struct kernfs_node *kn; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return kn; } @@ -882,10 +884,10 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, { struct kernfs_node *kn; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return kn; } @@ -1037,12 +1039,34 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - /* Always perform fresh lookup for negatives */ - if (d_really_is_negative(dentry)) - goto out_bad_unlocked; + /* Negative hashed dentry? */ + if (d_really_is_negative(dentry)) { + struct kernfs_node *parent; + + /* If the kernfs parent node has changed discard and + * proceed to ->lookup. + */ + down_read(&kernfs_rwsem); + spin_lock(&dentry->d_lock); + parent = kernfs_dentry_node(dentry->d_parent); + if (parent) { + if (kernfs_dir_changed(parent, dentry)) { + spin_unlock(&dentry->d_lock); + up_read(&kernfs_rwsem); + return 0; + } + } + spin_unlock(&dentry->d_lock); + up_read(&kernfs_rwsem); + + /* The kernfs parent node hasn't changed, leave the + * dentry negative and return success. 
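+ * (The stamp checked by kernfs_dir_changed() is the parent's dir.rev, + * which kernfs_link_sibling() and kernfs_unlink_sibling() bump on each + * directory change and kernfs_iop_lookup() records via kernfs_set_rev().)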
+ */ + return 1; + } kn = kernfs_dentry_node(dentry); - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) @@ -1061,11 +1085,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return 1; out_bad: - mutex_unlock(&kernfs_mutex); -out_bad_unlocked: + up_read(&kernfs_rwsem); return 0; } @@ -1077,37 +1100,29 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret; struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; - struct inode *inode; + struct inode *inode = NULL; const void *ns = NULL; - mutex_lock(&kernfs_mutex); - + down_read(&kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); - - /* no such entry */ - if (!kn || !kernfs_active(kn)) { - ret = NULL; - goto out_unlock; - } - /* attach dentry and inode */ - inode = kernfs_get_inode(dir->i_sb, kn); - if (!inode) { - ret = ERR_PTR(-ENOMEM); - goto out_unlock; + if (kn && kernfs_active(kn)) { + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) + inode = ERR_PTR(-ENOMEM); } + /* Needed only for negative dentry validation */ + if (!inode) + kernfs_set_rev(parent, dentry); + up_read(&kernfs_rwsem); - /* instantiate and hash dentry */ - ret = d_splice_alias(inode, dentry); - out_unlock: - mutex_unlock(&kernfs_mutex); - return ret; + /* instantiate and hash (possibly negative) dentry */ + return d_splice_alias(inode, dentry); } static int kernfs_iop_mkdir(struct user_namespace *mnt_userns, @@ -1227,7 +1242,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, { struct rb_node *rbn; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) @@ -1263,7 +1278,7 @@ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { @@ -1277,14 +1292,14 @@ void kernfs_activate(struct kernfs_node *kn) pos->flags |= KERNFS_ACTIVATED; } - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_rwsem); /* * Short-circuit if non-root @kn has already finished removal. @@ -1307,7 +1322,7 @@ static void __kernfs_remove(struct kernfs_node *kn) pos = kernfs_leftmost_descendant(kn); /* - * kernfs_drain() drops kernfs_mutex temporarily and @pos's + * kernfs_drain() drops kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. @@ -1354,9 +1369,9 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); __kernfs_remove(kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); } /** @@ -1443,17 +1458,17 @@ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. 
Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored - * while holding kernfs_mutex. The ones which lost arbitration - * waits for SUICDED && drained which can happen only after the - * enclosing kernfs operation which executed the winning instance - * of kernfs_remove_self() finished. + * while kernfs_rwsem is held exclusive. The ones which lost + * arbitration wait for SUICIDED && drained which can happen only + * after the enclosing kernfs operation which executed the winning + * instance of kernfs_remove_self() finished. */ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; @@ -1471,9 +1486,9 @@ bool kernfs_remove_self(struct kernfs_node *kn) atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); schedule(); - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); @@ -1481,12 +1496,12 @@ bool kernfs_remove_self(struct kernfs_node *kn) } /* - * This must be done while holding kernfs_mutex; otherwise, waiting - * for SUICIDED && deactivated could finish prematurely. + * This must be done while kernfs_rwsem is held exclusive; otherwise, + * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return ret; } @@ -1510,13 +1525,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) __kernfs_remove(kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); if (kn) return 0; @@ -1542,7 +1557,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, if (!kn->parent) return -EINVAL; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || @@ -1596,7 +1611,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, error = 0; out: - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return error; } @@ -1671,7 +1686,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; @@ -1688,12 +1703,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) file->private_data = pos; kernfs_get(pos); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); } - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index c75719312147..60e2a86c535e 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -860,7 +860,7 @@ repeat: spin_unlock_irq(&kernfs_notify_lock); /* kick fsnotify */ - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; @@ -898,7 +898,7 @@ repeat: iput(inode); } - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); kernfs_put(kn); goto repeat; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 26f2aa3586f9..c0eae1725435 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@
-100,9 +100,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); ret = __kernfs_setattr(kn, iattr); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return ret; } @@ -116,7 +116,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!kn) return -EINVAL; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) goto out; @@ -129,7 +129,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(&init_user_ns, inode, iattr); out: - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return error; } @@ -185,11 +185,13 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); + spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); - mutex_unlock(&kernfs_mutex); - generic_fillattr(&init_user_ns, inode, stat); + spin_unlock(&inode->i_lock); + up_read(&kernfs_rwsem); + return 0; } @@ -272,17 +274,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct kernfs_node *kn; + int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); + spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); - mutex_unlock(&kernfs_mutex); + ret = generic_permission(&init_user_ns, inode, mask); + spin_unlock(&inode->i_lock); + up_read(&kernfs_rwsem); - return generic_permission(&init_user_ns, inode, mask); + return ret; } int kernfs_xattr_get(struct kernfs_node *kn, const char *name, diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index ccc3b44f6306..f9cc912c31e1 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -13,6 +13,7 @@ #include <linux/lockdep.h> #include <linux/fs.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/kernfs.h> @@ -69,7 +70,7 @@ struct kernfs_super_info { */ const void *ns; - /* anchored at kernfs_root->supers, protected by kernfs_mutex */ + /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) @@ -81,6 +82,25 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) return d_inode(dentry)->i_private; } +static inline void kernfs_set_rev(struct kernfs_node *parent, + struct dentry *dentry) +{ + dentry->d_time = parent->dir.rev; +} + +static inline void kernfs_inc_rev(struct kernfs_node *parent) +{ + parent->dir.rev++; +} + +static inline bool kernfs_dir_changed(struct kernfs_node *parent, + struct dentry *dentry) +{ + if (parent->dir.rev != dentry->d_time) + return true; + return false; +} + extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; @@ -102,7 +122,7 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ -extern struct mutex kernfs_mutex; +extern struct rw_semaphore kernfs_rwsem; extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 9dc7e7a64e10..f2f909d09f52 100644 --- a/fs/kernfs/mount.c +++ 
b/fs/kernfs/mount.c @@ -255,9 +255,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_shrink.seeks = 0; /* get root inode, initialize and unlock it */ - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; @@ -344,9 +344,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); list_add(&info->node, &info->root->supers); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); } fc->root = dget(sb->s_root); @@ -372,9 +372,9 @@ void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); list_del(&info->node); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); /* * Remove the superblock from fs_supers/s_instances diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 5432883d819f..c8f8e41b8411 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -116,9 +116,9 @@ static int kernfs_getlink(struct inode *inode, char *path) struct kernfs_node *target = kn->symlink.target_kn; int error; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); error = kernfs_get_target_path(parent, target, path); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return error; } diff --git a/fs/ksmbd/Kconfig b/fs/ksmbd/Kconfig new file mode 100644 index 000000000000..b83cbd756ae5 --- /dev/null +++ b/fs/ksmbd/Kconfig @@ -0,0 +1,68 @@ +config SMB_SERVER + tristate "SMB3 server support (EXPERIMENTAL)" + depends on INET + depends on MULTIUSER + depends on FILE_LOCKING + select NLS + select NLS_UTF8 + select CRYPTO + select CRYPTO_MD4 + select CRYPTO_MD5 + select CRYPTO_HMAC + select CRYPTO_ECB + select CRYPTO_LIB_DES + select CRYPTO_SHA256 + select CRYPTO_CMAC + select CRYPTO_SHA512 + select CRYPTO_AEAD2 + select CRYPTO_CCM + select CRYPTO_GCM + select ASN1 + select OID_REGISTRY + default n + help + Choose Y here if you want to allow SMB3-compliant clients + to access files residing on this system using the SMB3 protocol. + To compile the SMB3 server support as a module, + choose M here: the module will be called ksmbd. + + You may choose to use a Samba server instead, in which + case you can choose N here. + + You also need to install user space programs which can be found + in ksmbd-tools, available from + https://github.com/cifsd-team/ksmbd-tools. + More detail about how to run the ksmbd kernel server is + available in the README file + (https://github.com/cifsd-team/ksmbd-tools/blob/master/README). + + The ksmbd kernel server includes support for auto-negotiation, + Secure negotiate, Pre-authentication integrity, oplock/lease, + compound requests, multi-credit, packet signing, RDMA(smbdirect), + smb3 encryption, copy-offload, secure per-user session + establishment via NTLM or NTLMv2.
 +
+config SMB_SERVER_SMBDIRECT + bool "Support for SMB Direct protocol" + depends on SMB_SERVER=m && INFINIBAND && INFINIBAND_ADDR_TRANS || SMB_SERVER=y && INFINIBAND=y && INFINIBAND_ADDR_TRANS=y + select SG_POOL + default n + + help + Enables SMB Direct support for SMB 3.0, 3.02 and 3.1.1. + + SMB Direct allows transferring SMB packets over RDMA. If unsure, + say N.
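The SMB_SERVER_CHECK_CAP_NET_ADMIN option introduced just below gates startup of the server on a capability test. The call site is not part of the hunks quoted here, but the standard kernel idiom for such a gate looks like the following sketch (demo_start_server is an invented name):

	#include <linux/capability.h>
	#include <linux/errno.h>

	static int demo_start_server(void)
	{
		/* only processes with CAP_NET_ADMIN may start the server */
		if (!capable(CAP_NET_ADMIN))
			return -EPERM;
		return 0;
	}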
+ +config SMB_SERVER_CHECK_CAP_NET_ADMIN + bool "Enable network administration capability check" + depends on SMB_SERVER + default y + + help + Prevent unprivileged processes from starting the ksmbd kernel server. + +config SMB_SERVER_KERBEROS5 + bool "Support for Kerberos 5" + depends on SMB_SERVER + default n diff --git a/fs/ksmbd/Makefile b/fs/ksmbd/Makefile new file mode 100644 index 000000000000..7d6337a7dee4 --- /dev/null +++ b/fs/ksmbd/Makefile @@ -0,0 +1,20 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +# +# Makefile for Linux SMB3 kernel server +# +obj-$(CONFIG_SMB_SERVER) += ksmbd.o + +ksmbd-y := unicode.o auth.o vfs.o vfs_cache.o server.o ndr.o \ + misc.o oplock.o connection.o ksmbd_work.o crypto_ctx.o \ + mgmt/ksmbd_ida.o mgmt/user_config.o mgmt/share_config.o \ + mgmt/tree_connect.o mgmt/user_session.o smb_common.o \ + transport_tcp.o transport_ipc.o smbacl.o smb2pdu.o \ + smb2ops.o smb2misc.o ksmbd_spnego_negtokeninit.asn1.o \ + ksmbd_spnego_negtokentarg.asn1.o asn1.o + +$(obj)/asn1.o: $(obj)/ksmbd_spnego_negtokeninit.asn1.h $(obj)/ksmbd_spnego_negtokentarg.asn1.h + +$(obj)/ksmbd_spnego_negtokeninit.asn1.o: $(obj)/ksmbd_spnego_negtokeninit.asn1.c $(obj)/ksmbd_spnego_negtokeninit.asn1.h +$(obj)/ksmbd_spnego_negtokentarg.asn1.o: $(obj)/ksmbd_spnego_negtokentarg.asn1.c $(obj)/ksmbd_spnego_negtokentarg.asn1.h + +ksmbd-$(CONFIG_SMB_SERVER_SMBDIRECT) += transport_rdma.o diff --git a/fs/ksmbd/asn1.c b/fs/ksmbd/asn1.c new file mode 100644 index 000000000000..b014f4638610 --- /dev/null +++ b/fs/ksmbd/asn1.c @@ -0,0 +1,343 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * The ASN.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in + * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au).
+ */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/oid_registry.h> + +#include "glob.h" + +#include "asn1.h" +#include "connection.h" +#include "auth.h" +#include "ksmbd_spnego_negtokeninit.asn1.h" +#include "ksmbd_spnego_negtokentarg.asn1.h" + +#define SPNEGO_OID_LEN 7 +#define NTLMSSP_OID_LEN 10 +#define KRB5_OID_LEN 7 +#define KRB5U2U_OID_LEN 8 +#define MSKRB5_OID_LEN 7 +static unsigned long SPNEGO_OID[7] = { 1, 3, 6, 1, 5, 5, 2 }; +static unsigned long NTLMSSP_OID[10] = { 1, 3, 6, 1, 4, 1, 311, 2, 2, 10 }; +static unsigned long KRB5_OID[7] = { 1, 2, 840, 113554, 1, 2, 2 }; +static unsigned long KRB5U2U_OID[8] = { 1, 2, 840, 113554, 1, 2, 2, 3 }; +static unsigned long MSKRB5_OID[7] = { 1, 2, 840, 48018, 1, 2, 2 }; + +static char NTLMSSP_OID_STR[NTLMSSP_OID_LEN] = { 0x2b, 0x06, 0x01, 0x04, 0x01, + 0x82, 0x37, 0x02, 0x02, 0x0a }; + +static bool +asn1_subid_decode(const unsigned char **begin, const unsigned char *end, + unsigned long *subid) +{ + const unsigned char *ptr = *begin; + unsigned char ch; + + *subid = 0; + + do { + if (ptr >= end) + return false; + + ch = *ptr++; + *subid <<= 7; + *subid |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + + *begin = ptr; + return true; +} + +static bool asn1_oid_decode(const unsigned char *value, size_t vlen, + unsigned long **oid, size_t *oidlen) +{ + const unsigned char *iptr = value, *end = value + vlen; + unsigned long *optr; + unsigned long subid; + + vlen += 1; + if (vlen < 2 || vlen > UINT_MAX / sizeof(unsigned long)) + goto fail_nullify; + + *oid = kmalloc(vlen * sizeof(unsigned long), GFP_KERNEL); + if (!*oid) + return false; + + optr = *oid; + + if (!asn1_subid_decode(&iptr, end, &subid)) + goto fail; + + if (subid < 40) { + optr[0] = 0; + optr[1] = subid; + } else if (subid < 80) { + optr[0] = 1; + optr[1] = subid - 40; + } else { + optr[0] = 2; + optr[1] = subid - 80; + } + + *oidlen = 2; + optr += 2; + + while (iptr < end) { + if (++(*oidlen) > vlen) + goto fail; + + if (!asn1_subid_decode(&iptr, end, optr++)) + goto fail; + } + return true; + +fail: + kfree(*oid); +fail_nullify: + *oid = NULL; + return false; +} + +static bool oid_eq(unsigned long *oid1, unsigned int oid1len, + unsigned long *oid2, unsigned int oid2len) +{ + if (oid1len != oid2len) + return false; + + return memcmp(oid1, oid2, oid1len) == 0; +} + +int +ksmbd_decode_negTokenInit(unsigned char *security_blob, int length, + struct ksmbd_conn *conn) +{ + return asn1_ber_decoder(&ksmbd_spnego_negtokeninit_decoder, conn, + security_blob, length); +} + +int +ksmbd_decode_negTokenTarg(unsigned char *security_blob, int length, + struct ksmbd_conn *conn) +{ + return asn1_ber_decoder(&ksmbd_spnego_negtokentarg_decoder, conn, + security_blob, length); +} + +static int compute_asn_hdr_len_bytes(int len) +{ + if (len > 0xFFFFFF) + return 4; + else if (len > 0xFFFF) + return 3; + else if (len > 0xFF) + return 2; + else if (len > 0x7F) + return 1; + else + return 0; +} + +static void encode_asn_tag(char *buf, unsigned int *ofs, char tag, char seq, + int length) +{ + int i; + int index = *ofs; + char hdr_len = compute_asn_hdr_len_bytes(length); + int len = length + 2 + hdr_len; + + /* insert tag */ + buf[index++] = tag; + + if (!hdr_len) { + buf[index++] = len; + } else { + buf[index++] = 0x80 | hdr_len; + for (i = hdr_len - 1; i >= 0; i--) + buf[index++] = (len >> (i * 8)) & 0xFF; + } + + /* insert seq */ + len = len - (index - *ofs); + buf[index++] = seq; + + if (!hdr_len) { + 
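+ /* short form: encode the length in a single octet */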
buf[index++] = len; + } else { + buf[index++] = 0x80 | hdr_len; + for (i = hdr_len - 1; i >= 0; i--) + buf[index++] = (len >> (i * 8)) & 0xFF; + } + + *ofs += (index - *ofs); +} + +int build_spnego_ntlmssp_neg_blob(unsigned char **pbuffer, u16 *buflen, + char *ntlm_blob, int ntlm_blob_len) +{ + char *buf; + unsigned int ofs = 0; + int neg_result_len = 4 + compute_asn_hdr_len_bytes(1) * 2 + 1; + int oid_len = 4 + compute_asn_hdr_len_bytes(NTLMSSP_OID_LEN) * 2 + + NTLMSSP_OID_LEN; + int ntlmssp_len = 4 + compute_asn_hdr_len_bytes(ntlm_blob_len) * 2 + + ntlm_blob_len; + int total_len = 4 + compute_asn_hdr_len_bytes(neg_result_len + + oid_len + ntlmssp_len) * 2 + + neg_result_len + oid_len + ntlmssp_len; + + buf = kmalloc(total_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* insert main gss header */ + encode_asn_tag(buf, &ofs, 0xa1, 0x30, neg_result_len + oid_len + + ntlmssp_len); + + /* insert neg result */ + encode_asn_tag(buf, &ofs, 0xa0, 0x0a, 1); + buf[ofs++] = 1; + + /* insert oid */ + encode_asn_tag(buf, &ofs, 0xa1, 0x06, NTLMSSP_OID_LEN); + memcpy(buf + ofs, NTLMSSP_OID_STR, NTLMSSP_OID_LEN); + ofs += NTLMSSP_OID_LEN; + + /* insert response token - ntlmssp blob */ + encode_asn_tag(buf, &ofs, 0xa2, 0x04, ntlm_blob_len); + memcpy(buf + ofs, ntlm_blob, ntlm_blob_len); + ofs += ntlm_blob_len; + + *pbuffer = buf; + *buflen = total_len; + return 0; +} + +int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, + int neg_result) +{ + char *buf; + unsigned int ofs = 0; + int neg_result_len = 4 + compute_asn_hdr_len_bytes(1) * 2 + 1; + int total_len = 4 + compute_asn_hdr_len_bytes(neg_result_len) * 2 + + neg_result_len; + + buf = kmalloc(total_len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* insert main gss header */ + encode_asn_tag(buf, &ofs, 0xa1, 0x30, neg_result_len); + + /* insert neg result */ + encode_asn_tag(buf, &ofs, 0xa0, 0x0a, 1); + if (neg_result) + buf[ofs++] = 2; + else + buf[ofs++] = 0; + + *pbuffer = buf; + *buflen = total_len; + return 0; +} + +int ksmbd_gssapi_this_mech(void *context, size_t hdrlen, unsigned char tag, + const void *value, size_t vlen) +{ + unsigned long *oid; + size_t oidlen; + int err = 0; + + if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) { + err = -EBADMSG; + goto out; + } + + if (!oid_eq(oid, oidlen, SPNEGO_OID, SPNEGO_OID_LEN)) + err = -EBADMSG; + kfree(oid); +out: + if (err) { + char buf[50]; + + sprint_oid(value, vlen, buf, sizeof(buf)); + ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + } + return err; +} + +int ksmbd_neg_token_init_mech_type(void *context, size_t hdrlen, + unsigned char tag, const void *value, + size_t vlen) +{ + struct ksmbd_conn *conn = context; + unsigned long *oid; + size_t oidlen; + int mech_type; + char buf[50]; + + if (!asn1_oid_decode(value, vlen, &oid, &oidlen)) + goto fail; + + if (oid_eq(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) + mech_type = KSMBD_AUTH_NTLMSSP; + else if (oid_eq(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) + mech_type = KSMBD_AUTH_MSKRB5; + else if (oid_eq(oid, oidlen, KRB5_OID, KRB5_OID_LEN)) + mech_type = KSMBD_AUTH_KRB5; + else if (oid_eq(oid, oidlen, KRB5U2U_OID, KRB5U2U_OID_LEN)) + mech_type = KSMBD_AUTH_KRB5U2U; + else + goto fail; + + conn->auth_mechs |= mech_type; + if (conn->preferred_auth_mech == 0) + conn->preferred_auth_mech = mech_type; + + kfree(oid); + return 0; + +fail: + kfree(oid); + sprint_oid(value, vlen, buf, sizeof(buf)); + ksmbd_debug(AUTH, "Unexpected OID: %s\n", buf); + return -EBADMSG; +} + +int ksmbd_neg_token_init_mech_token(void *context, 
size_t hdrlen, + unsigned char tag, const void *value, + size_t vlen) +{ + struct ksmbd_conn *conn = context; + + conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL); + if (!conn->mechToken) + return -ENOMEM; + + memcpy(conn->mechToken, value, vlen); + conn->mechToken[vlen] = '\0'; + return 0; +} + +int ksmbd_neg_token_targ_resp_token(void *context, size_t hdrlen, + unsigned char tag, const void *value, + size_t vlen) +{ + struct ksmbd_conn *conn = context; + + conn->mechToken = kmalloc(vlen + 1, GFP_KERNEL); + if (!conn->mechToken) + return -ENOMEM; + + memcpy(conn->mechToken, value, vlen); + conn->mechToken[vlen] = '\0'; + return 0; +} diff --git a/fs/ksmbd/asn1.h b/fs/ksmbd/asn1.h new file mode 100644 index 000000000000..ce105f4ce305 --- /dev/null +++ b/fs/ksmbd/asn1.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * The ASN.1/BER parsing code is derived from ip_nat_snmp_basic.c which was in + * turn derived from the gxsnmp package by Gregory McLean & Jochen Friedrich + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#ifndef __ASN1_H__ +#define __ASN1_H__ + +int ksmbd_decode_negTokenInit(unsigned char *security_blob, int length, + struct ksmbd_conn *conn); +int ksmbd_decode_negTokenTarg(unsigned char *security_blob, int length, + struct ksmbd_conn *conn); +int build_spnego_ntlmssp_neg_blob(unsigned char **pbuffer, u16 *buflen, + char *ntlm_blob, int ntlm_blob_len); +int build_spnego_ntlmssp_auth_blob(unsigned char **pbuffer, u16 *buflen, + int neg_result); +#endif /* __ASN1_H__ */ diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c new file mode 100644 index 000000000000..de36f12070bf --- /dev/null +++ b/fs/ksmbd/auth.c @@ -0,0 +1,1364 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/backing-dev.h> +#include <linux/writeback.h> +#include <linux/uio.h> +#include <linux/xattr.h> +#include <crypto/hash.h> +#include <crypto/aead.h> +#include <linux/random.h> +#include <linux/scatterlist.h> + +#include "auth.h" +#include "glob.h" + +#include <linux/fips.h> +#include <crypto/des.h> + +#include "server.h" +#include "smb_common.h" +#include "connection.h" +#include "mgmt/user_session.h" +#include "mgmt/user_config.h" +#include "crypto_ctx.h" +#include "transport_ipc.h" + +/* + * Fixed format data defining the GSS header and the fixed string + * "not_defined_in_RFC4178@please_ignore", + * so the security blob data in the negotiate phase can be generated statically.
+ */ +static char NEGOTIATE_GSS_HEADER[AUTH_GSS_LENGTH] = { +#ifdef CONFIG_SMB_SERVER_KERBEROS5 + 0x60, 0x5e, 0x06, 0x06, 0x2b, 0x06, 0x01, 0x05, + 0x05, 0x02, 0xa0, 0x54, 0x30, 0x52, 0xa0, 0x24, + 0x30, 0x22, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, + 0xf7, 0x12, 0x01, 0x02, 0x02, 0x06, 0x09, 0x2a, + 0x86, 0x48, 0x82, 0xf7, 0x12, 0x01, 0x02, 0x02, + 0x06, 0x0a, 0x2b, 0x06, 0x01, 0x04, 0x01, 0x82, + 0x37, 0x02, 0x02, 0x0a, 0xa3, 0x2a, 0x30, 0x28, + 0xa0, 0x26, 0x1b, 0x24, 0x6e, 0x6f, 0x74, 0x5f, + 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x5f, + 0x69, 0x6e, 0x5f, 0x52, 0x46, 0x43, 0x34, 0x31, + 0x37, 0x38, 0x40, 0x70, 0x6c, 0x65, 0x61, 0x73, + 0x65, 0x5f, 0x69, 0x67, 0x6e, 0x6f, 0x72, 0x65 +#else + 0x60, 0x48, 0x06, 0x06, 0x2b, 0x06, 0x01, 0x05, + 0x05, 0x02, 0xa0, 0x3e, 0x30, 0x3c, 0xa0, 0x0e, + 0x30, 0x0c, 0x06, 0x0a, 0x2b, 0x06, 0x01, 0x04, + 0x01, 0x82, 0x37, 0x02, 0x02, 0x0a, 0xa3, 0x2a, + 0x30, 0x28, 0xa0, 0x26, 0x1b, 0x24, 0x6e, 0x6f, + 0x74, 0x5f, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, + 0x64, 0x5f, 0x69, 0x6e, 0x5f, 0x52, 0x46, 0x43, + 0x34, 0x31, 0x37, 0x38, 0x40, 0x70, 0x6c, 0x65, + 0x61, 0x73, 0x65, 0x5f, 0x69, 0x67, 0x6e, 0x6f, + 0x72, 0x65 +#endif +}; + +void ksmbd_copy_gss_neg_header(void *buf) +{ + memcpy(buf, NEGOTIATE_GSS_HEADER, AUTH_GSS_LENGTH); +} + +static void +str_to_key(unsigned char *str, unsigned char *key) +{ + int i; + + key[0] = str[0] >> 1; + key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2); + key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3); + key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4); + key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5); + key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6); + key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7); + key[7] = str[6] & 0x7F; + for (i = 0; i < 8; i++) + key[i] = (key[i] << 1); +} + +static int +smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) +{ + unsigned char key2[8]; + struct des_ctx ctx; + + if (fips_enabled) { + ksmbd_debug(AUTH, "FIPS compliance enabled: DES not permitted\n"); + return -ENOENT; + } + + str_to_key(key, key2); + des_expand_key(&ctx, key2, DES_KEY_SIZE); + des_encrypt(&ctx, out, in); + memzero_explicit(&ctx, sizeof(ctx)); + return 0; +} + +static int ksmbd_enc_p24(unsigned char *p21, const unsigned char *c8, unsigned char *p24) +{ + int rc; + + rc = smbhash(p24, c8, p21); + if (rc) + return rc; + rc = smbhash(p24 + 8, c8, p21 + 7); + if (rc) + return rc; + return smbhash(p24 + 16, c8, p21 + 14); +} + +/* produce a md4 message digest from data of length n bytes */ +static int ksmbd_enc_md4(unsigned char *md4_hash, unsigned char *link_str, + int link_len) +{ + int rc; + struct ksmbd_crypto_ctx *ctx; + + ctx = ksmbd_crypto_ctx_find_md4(); + if (!ctx) { + ksmbd_debug(AUTH, "Crypto md4 allocation error\n"); + return -ENOMEM; + } + + rc = crypto_shash_init(CRYPTO_MD4(ctx)); + if (rc) { + ksmbd_debug(AUTH, "Could not init md4 shash\n"); + goto out; + } + + rc = crypto_shash_update(CRYPTO_MD4(ctx), link_str, link_len); + if (rc) { + ksmbd_debug(AUTH, "Could not update with link_str\n"); + goto out; + } + + rc = crypto_shash_final(CRYPTO_MD4(ctx), md4_hash); + if (rc) + ksmbd_debug(AUTH, "Could not generate md4 hash\n"); +out: + ksmbd_release_crypto_ctx(ctx); + return rc; +} + +static int ksmbd_enc_update_sess_key(unsigned char *md5_hash, char *nonce, + char *server_challenge, int len) +{ + int rc; + struct ksmbd_crypto_ctx *ctx; + + ctx = ksmbd_crypto_ctx_find_md5(); + if (!ctx) { + ksmbd_debug(AUTH, "Crypto md5 allocation error\n"); + return -ENOMEM; + } + + rc = 
crypto_shash_init(CRYPTO_MD5(ctx)); + if (rc) { + ksmbd_debug(AUTH, "Could not init md5 shash\n"); + goto out; + } + + rc = crypto_shash_update(CRYPTO_MD5(ctx), server_challenge, len); + if (rc) { + ksmbd_debug(AUTH, "Could not update with challenge\n"); + goto out; + } + + rc = crypto_shash_update(CRYPTO_MD5(ctx), nonce, len); + if (rc) { + ksmbd_debug(AUTH, "Could not update with nonce\n"); + goto out; + } + + rc = crypto_shash_final(CRYPTO_MD5(ctx), md5_hash); + if (rc) + ksmbd_debug(AUTH, "Could not generate md5 hash\n"); +out: + ksmbd_release_crypto_ctx(ctx); + return rc; +} + +/** + * ksmbd_gen_sess_key() - function to generate session key + * @sess: session of connection + * @hash: source hash value to be used for find session key + * @hmac: source hmac value to be used for finding session key + * + */ +static int ksmbd_gen_sess_key(struct ksmbd_session *sess, char *hash, + char *hmac) +{ + struct ksmbd_crypto_ctx *ctx; + int rc; + + ctx = ksmbd_crypto_ctx_find_hmacmd5(); + if (!ctx) { + ksmbd_debug(AUTH, "could not crypto alloc hmacmd5\n"); + return -ENOMEM; + } + + rc = crypto_shash_setkey(CRYPTO_HMACMD5_TFM(ctx), + hash, + CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + ksmbd_debug(AUTH, "hmacmd5 set key fail error %d\n", rc); + goto out; + } + + rc = crypto_shash_init(CRYPTO_HMACMD5(ctx)); + if (rc) { + ksmbd_debug(AUTH, "could not init hmacmd5 error %d\n", rc); + goto out; + } + + rc = crypto_shash_update(CRYPTO_HMACMD5(ctx), + hmac, + SMB2_NTLMV2_SESSKEY_SIZE); + if (rc) { + ksmbd_debug(AUTH, "Could not update with response error %d\n", rc); + goto out; + } + + rc = crypto_shash_final(CRYPTO_HMACMD5(ctx), sess->sess_key); + if (rc) { + ksmbd_debug(AUTH, "Could not generate hmacmd5 hash error %d\n", rc); + goto out; + } + +out: + ksmbd_release_crypto_ctx(ctx); + return rc; +} + +static int calc_ntlmv2_hash(struct ksmbd_session *sess, char *ntlmv2_hash, + char *dname) +{ + int ret, len, conv_len; + wchar_t *domain = NULL; + __le16 *uniname = NULL; + struct ksmbd_crypto_ctx *ctx; + + ctx = ksmbd_crypto_ctx_find_hmacmd5(); + if (!ctx) { + ksmbd_debug(AUTH, "can't generate ntlmv2 hash\n"); + return -ENOMEM; + } + + ret = crypto_shash_setkey(CRYPTO_HMACMD5_TFM(ctx), + user_passkey(sess->user), + CIFS_ENCPWD_SIZE); + if (ret) { + ksmbd_debug(AUTH, "Could not set NT Hash as a key\n"); + goto out; + } + + ret = crypto_shash_init(CRYPTO_HMACMD5(ctx)); + if (ret) { + ksmbd_debug(AUTH, "could not init hmacmd5\n"); + goto out; + } + + /* convert user_name to unicode */ + len = strlen(user_name(sess->user)); + uniname = kzalloc(2 + UNICODE_LEN(len), GFP_KERNEL); + if (!uniname) { + ret = -ENOMEM; + goto out; + } + + conv_len = smb_strtoUTF16(uniname, user_name(sess->user), len, + sess->conn->local_nls); + if (conv_len < 0 || conv_len > len) { + ret = -EINVAL; + goto out; + } + UniStrupr(uniname); + + ret = crypto_shash_update(CRYPTO_HMACMD5(ctx), + (char *)uniname, + UNICODE_LEN(conv_len)); + if (ret) { + ksmbd_debug(AUTH, "Could not update with user\n"); + goto out; + } + + /* Convert domain name or conn name to unicode and uppercase */ + len = strlen(dname); + domain = kzalloc(2 + UNICODE_LEN(len), GFP_KERNEL); + if (!domain) { + ret = -ENOMEM; + goto out; + } + + conv_len = smb_strtoUTF16((__le16 *)domain, dname, len, + sess->conn->local_nls); + if (conv_len < 0 || conv_len > len) { + ret = -EINVAL; + goto out; + } + + ret = crypto_shash_update(CRYPTO_HMACMD5(ctx), + (char *)domain, + UNICODE_LEN(conv_len)); + if (ret) { + ksmbd_debug(AUTH, "Could not update with domain\n"); + goto out; + } + + 
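+ /* ntlmv2_hash = HMAC-MD5(NT hash, UTF-16LE(upper(username)) + UTF-16LE(domain)) */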
ret = crypto_shash_final(CRYPTO_HMACMD5(ctx), ntlmv2_hash); + if (ret) + ksmbd_debug(AUTH, "Could not generate md5 hash\n"); +out: + kfree(uniname); + kfree(domain); + ksmbd_release_crypto_ctx(ctx); + return ret; +} + +/** + * ksmbd_auth_ntlm() - NTLM authentication handler + * @sess: session of connection + * @pw_buf: NTLM challenge response + * @passkey: user password + * + * Return: 0 on success, error number on error + */ +int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf) +{ + int rc; + unsigned char p21[21]; + char key[CIFS_AUTH_RESP_SIZE]; + + memset(p21, '\0', 21); + memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE); + rc = ksmbd_enc_p24(p21, sess->ntlmssp.cryptkey, key); + if (rc) { + pr_err("password processing failed\n"); + return rc; + } + + ksmbd_enc_md4(sess->sess_key, user_passkey(sess->user), + CIFS_SMB1_SESSKEY_SIZE); + memcpy(sess->sess_key + CIFS_SMB1_SESSKEY_SIZE, key, + CIFS_AUTH_RESP_SIZE); + sess->sequence_number = 1; + + if (strncmp(pw_buf, key, CIFS_AUTH_RESP_SIZE) != 0) { + ksmbd_debug(AUTH, "ntlmv1 authentication failed\n"); + return -EINVAL; + } + + ksmbd_debug(AUTH, "ntlmv1 authentication pass\n"); + return 0; +} + +/** + * ksmbd_auth_ntlmv2() - NTLMv2 authentication handler + * @sess: session of connection + * @ntlmv2: NTLMv2 challenge response + * @blen: NTLMv2 blob length + * @domain_name: domain name + * + * Return: 0 on success, error number on error + */ +int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, + int blen, char *domain_name) +{ + char ntlmv2_hash[CIFS_ENCPWD_SIZE]; + char ntlmv2_rsp[CIFS_HMAC_MD5_HASH_SIZE]; + struct ksmbd_crypto_ctx *ctx; + char *construct = NULL; + int rc, len; + + ctx = ksmbd_crypto_ctx_find_hmacmd5(); + if (!ctx) { + ksmbd_debug(AUTH, "could not crypto alloc hmacmd5\n"); + return -ENOMEM; + } + + rc = calc_ntlmv2_hash(sess, ntlmv2_hash, domain_name); + if (rc) { + ksmbd_debug(AUTH, "could not get v2 hash rc %d\n", rc); + goto out; + } + + rc = crypto_shash_setkey(CRYPTO_HMACMD5_TFM(ctx), + ntlmv2_hash, + CIFS_HMAC_MD5_HASH_SIZE); + if (rc) { + ksmbd_debug(AUTH, "Could not set NTLMV2 Hash as a key\n"); + goto out; + } + + rc = crypto_shash_init(CRYPTO_HMACMD5(ctx)); + if (rc) { + ksmbd_debug(AUTH, "Could not init hmacmd5\n"); + goto out; + } + + len = CIFS_CRYPTO_KEY_SIZE + blen; + construct = kzalloc(len, GFP_KERNEL); + if (!construct) { + rc = -ENOMEM; + goto out; + } + + memcpy(construct, sess->ntlmssp.cryptkey, CIFS_CRYPTO_KEY_SIZE); + memcpy(construct + CIFS_CRYPTO_KEY_SIZE, &ntlmv2->blob_signature, blen); + + rc = crypto_shash_update(CRYPTO_HMACMD5(ctx), construct, len); + if (rc) { + ksmbd_debug(AUTH, "Could not update with response\n"); + goto out; + } + + rc = crypto_shash_final(CRYPTO_HMACMD5(ctx), ntlmv2_rsp); + if (rc) { + ksmbd_debug(AUTH, "Could not generate md5 hash\n"); + goto out; + } + + rc = ksmbd_gen_sess_key(sess, ntlmv2_hash, ntlmv2_rsp); + if (rc) { + ksmbd_debug(AUTH, "Could not generate sess key\n"); + goto out; + } + + if (memcmp(ntlmv2->ntlmv2_hash, ntlmv2_rsp, CIFS_HMAC_MD5_HASH_SIZE) != 0) + rc = -EINVAL; +out: + ksmbd_release_crypto_ctx(ctx); + kfree(construct); + return rc; +} + +/** + * __ksmbd_auth_ntlmv2() - NTLM2(extended security) authentication handler + * @sess: session of connection + * @client_nonce: client nonce from LM response. + * @ntlm_resp: ntlm response data from client. 
+ * + * Return: 0 on success, error number on error + */ +static int __ksmbd_auth_ntlmv2(struct ksmbd_session *sess, char *client_nonce, + char *ntlm_resp) +{ + char sess_key[CIFS_SMB1_SESSKEY_SIZE] = {0}; + int rc; + unsigned char p21[21]; + char key[CIFS_AUTH_RESP_SIZE]; + + rc = ksmbd_enc_update_sess_key(sess_key, + client_nonce, + (char *)sess->ntlmssp.cryptkey, 8); + if (rc) { + pr_err("password processing failed\n"); + goto out; + } + + memset(p21, '\0', 21); + memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE); + rc = ksmbd_enc_p24(p21, sess_key, key); + if (rc) { + pr_err("password processing failed\n"); + goto out; + } + + if (memcmp(ntlm_resp, key, CIFS_AUTH_RESP_SIZE) != 0) + rc = -EINVAL; +out: + return rc; +} + +/** + * ksmbd_decode_ntlmssp_auth_blob() - helper function to decode an + * authenticate blob + * @authblob: authenticate blob source pointer + * @usr: user details + * @sess: session of connection + * + * Return: 0 on success, error number on error + */ +int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, + int blob_len, struct ksmbd_session *sess) +{ + char *domain_name; + unsigned int lm_off, nt_off; + unsigned short nt_len; + int ret; + + if (blob_len < sizeof(struct authenticate_message)) { + ksmbd_debug(AUTH, "negotiate blob len %d too small\n", + blob_len); + return -EINVAL; + } + + if (memcmp(authblob->Signature, "NTLMSSP", 8)) { + ksmbd_debug(AUTH, "blob signature incorrect %s\n", + authblob->Signature); + return -EINVAL; + } + + lm_off = le32_to_cpu(authblob->LmChallengeResponse.BufferOffset); + nt_off = le32_to_cpu(authblob->NtChallengeResponse.BufferOffset); + nt_len = le16_to_cpu(authblob->NtChallengeResponse.Length); + + /* process NTLM authentication */ + if (nt_len == CIFS_AUTH_RESP_SIZE) { + if (le32_to_cpu(authblob->NegotiateFlags) & + NTLMSSP_NEGOTIATE_EXTENDED_SEC) + return __ksmbd_auth_ntlmv2(sess, (char *)authblob + + lm_off, (char *)authblob + nt_off); + else + return ksmbd_auth_ntlm(sess, (char *)authblob + + nt_off); + } + + /* TODO : use domain name imported from configuration file */ + domain_name = smb_strndup_from_utf16((const char *)authblob + + le32_to_cpu(authblob->DomainName.BufferOffset), + le16_to_cpu(authblob->DomainName.Length), true, + sess->conn->local_nls); + if (IS_ERR(domain_name)) + return PTR_ERR(domain_name); + + /* process NTLMv2 authentication */ + ksmbd_debug(AUTH, "decode_ntlmssp_authenticate_blob dname%s\n", + domain_name); + ret = ksmbd_auth_ntlmv2(sess, (struct ntlmv2_resp *)((char *)authblob + nt_off), + nt_len - CIFS_ENCPWD_SIZE, + domain_name); + kfree(domain_name); + return ret; +} + +/** + * ksmbd_decode_ntlmssp_neg_blob() - helper function to decode a + * negotiate blob + * @negblob: negotiate blob source pointer + * @rsp: response header pointer to be updated + * @sess: session of connection + * + */ +int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, + int blob_len, struct ksmbd_session *sess) +{ + if (blob_len < sizeof(struct negotiate_message)) { + ksmbd_debug(AUTH, "negotiate blob len %d too small\n", + blob_len); + return -EINVAL; + } + + if (memcmp(negblob->Signature, "NTLMSSP", 8)) { + ksmbd_debug(AUTH, "blob signature incorrect %s\n", + negblob->Signature); + return -EINVAL; + } + + sess->ntlmssp.client_flags = le32_to_cpu(negblob->NegotiateFlags); + return 0; +} + +/** + * ksmbd_build_ntlmssp_challenge_blob() - helper function to construct + * challenge blob + * @chgblob: challenge blob source pointer to initialize + * @rsp: response header pointer to
be updated + * @sess: session of connection + * + */ +unsigned int +ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, + struct ksmbd_session *sess) +{ + struct target_info *tinfo; + wchar_t *name; + __u8 *target_name; + unsigned int flags, blob_off, blob_len, type, target_info_len = 0; + int len, uni_len, conv_len; + int cflags = sess->ntlmssp.client_flags; + + memcpy(chgblob->Signature, NTLMSSP_SIGNATURE, 8); + chgblob->MessageType = NtLmChallenge; + + flags = NTLMSSP_NEGOTIATE_UNICODE | + NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_TARGET_TYPE_SERVER | + NTLMSSP_NEGOTIATE_TARGET_INFO; + + if (cflags & NTLMSSP_NEGOTIATE_SIGN) { + flags |= NTLMSSP_NEGOTIATE_SIGN; + flags |= cflags & (NTLMSSP_NEGOTIATE_128 | + NTLMSSP_NEGOTIATE_56); + } + + if (cflags & NTLMSSP_NEGOTIATE_ALWAYS_SIGN) + flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; + + if (cflags & NTLMSSP_REQUEST_TARGET) + flags |= NTLMSSP_REQUEST_TARGET; + + if (sess->conn->use_spnego && + (cflags & NTLMSSP_NEGOTIATE_EXTENDED_SEC)) + flags |= NTLMSSP_NEGOTIATE_EXTENDED_SEC; + + chgblob->NegotiateFlags = cpu_to_le32(flags); + len = strlen(ksmbd_netbios_name()); + name = kmalloc(2 + UNICODE_LEN(len), GFP_KERNEL); + if (!name) + return -ENOMEM; + + conv_len = smb_strtoUTF16((__le16 *)name, ksmbd_netbios_name(), len, + sess->conn->local_nls); + if (conv_len < 0 || conv_len > len) { + kfree(name); + return -EINVAL; + } + + uni_len = UNICODE_LEN(conv_len); + + blob_off = sizeof(struct challenge_message); + blob_len = blob_off + uni_len; + + chgblob->TargetName.Length = cpu_to_le16(uni_len); + chgblob->TargetName.MaximumLength = cpu_to_le16(uni_len); + chgblob->TargetName.BufferOffset = cpu_to_le32(blob_off); + + /* Initialize random conn challenge */ + get_random_bytes(sess->ntlmssp.cryptkey, sizeof(__u64)); + memcpy(chgblob->Challenge, sess->ntlmssp.cryptkey, + CIFS_CRYPTO_KEY_SIZE); + + /* Add Target Information to security buffer */ + chgblob->TargetInfoArray.BufferOffset = cpu_to_le32(blob_len); + + target_name = (__u8 *)chgblob + blob_off; + memcpy(target_name, name, uni_len); + tinfo = (struct target_info *)(target_name + uni_len); + + chgblob->TargetInfoArray.Length = 0; + /* Add target info list for NetBIOS/DNS settings */ + for (type = NTLMSSP_AV_NB_COMPUTER_NAME; + type <= NTLMSSP_AV_DNS_DOMAIN_NAME; type++) { + tinfo->Type = cpu_to_le16(type); + tinfo->Length = cpu_to_le16(uni_len); + memcpy(tinfo->Content, name, uni_len); + tinfo = (struct target_info *)((char *)tinfo + 4 + uni_len); + target_info_len += 4 + uni_len; + } + + /* Add terminator subblock */ + tinfo->Type = 0; + tinfo->Length = 0; + target_info_len += 4; + + chgblob->TargetInfoArray.Length = cpu_to_le16(target_info_len); + chgblob->TargetInfoArray.MaximumLength = cpu_to_le16(target_info_len); + blob_len += target_info_len; + kfree(name); + ksmbd_debug(AUTH, "NTLMSSP SecurityBufferLength %d\n", blob_len); + return blob_len; +} + +#ifdef CONFIG_SMB_SERVER_KERBEROS5 +int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, + int in_len, char *out_blob, int *out_len) +{ + struct ksmbd_spnego_authen_response *resp; + struct ksmbd_user *user = NULL; + int retval; + + resp = ksmbd_ipc_spnego_authen_request(in_blob, in_len); + if (!resp) { + ksmbd_debug(AUTH, "SPNEGO_AUTHEN_REQUEST failure\n"); + return -EINVAL; + } + + if (!(resp->login_response.status & KSMBD_USER_FLAG_OK)) { + ksmbd_debug(AUTH, "krb5 authentication failure\n"); + retval = -EPERM; + goto out; + } + + if (*out_len <= resp->spnego_blob_len) { + ksmbd_debug(AUTH, "buf len %d, but blob len %d\n", 
+ *out_len, resp->spnego_blob_len); + retval = -EINVAL; + goto out; + } + + if (resp->session_key_len > sizeof(sess->sess_key)) { + ksmbd_debug(AUTH, "session key is too long\n"); + retval = -EINVAL; + goto out; + } + + user = ksmbd_alloc_user(&resp->login_response); + if (!user) { + ksmbd_debug(AUTH, "login failure\n"); + retval = -ENOMEM; + goto out; + } + sess->user = user; + + memcpy(sess->sess_key, resp->payload, resp->session_key_len); + memcpy(out_blob, resp->payload + resp->session_key_len, + resp->spnego_blob_len); + *out_len = resp->spnego_blob_len; + retval = 0; +out: + kvfree(resp); + return retval; +} +#else +int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, + int in_len, char *out_blob, int *out_len) +{ + return -EOPNOTSUPP; +} +#endif + +/** + * ksmbd_sign_smb2_pdu() - function to generate packet signing + * @conn: connection + * @key: signing key + * @iov: buffer iov array + * @n_vec: number of iovecs + * @sig: signature value generated for client request packet + * + */ +int ksmbd_sign_smb2_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov, + int n_vec, char *sig) +{ + struct ksmbd_crypto_ctx *ctx; + int rc, i; + + ctx = ksmbd_crypto_ctx_find_hmacsha256(); + if (!ctx) { + ksmbd_debug(AUTH, "could not crypto alloc hmacsha256\n"); + return -ENOMEM; + } + + rc = crypto_shash_setkey(CRYPTO_HMACSHA256_TFM(ctx), + key, + SMB2_NTLMV2_SESSKEY_SIZE); + if (rc) + goto out; + + rc = crypto_shash_init(CRYPTO_HMACSHA256(ctx)); + if (rc) { + ksmbd_debug(AUTH, "hmacsha256 init error %d\n", rc); + goto out; + } + + for (i = 0; i < n_vec; i++) { + rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), + iov[i].iov_base, + iov[i].iov_len); + if (rc) { + ksmbd_debug(AUTH, "hmacsha256 update error %d\n", rc); + goto out; + } + } + + rc = crypto_shash_final(CRYPTO_HMACSHA256(ctx), sig); + if (rc) + ksmbd_debug(AUTH, "hmacsha256 generation error %d\n", rc); +out: + ksmbd_release_crypto_ctx(ctx); + return rc; +} + +/** + * ksmbd_sign_smb3_pdu() - function to generate packet signing + * @conn: connection + * @key: signing key + * @iov: buffer iov array + * @n_vec: number of iovecs + * @sig: signature value generated for client request packet + * + */ +int ksmbd_sign_smb3_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov, + int n_vec, char *sig) +{ + struct ksmbd_crypto_ctx *ctx; + int rc, i; + + ctx = ksmbd_crypto_ctx_find_cmacaes(); + if (!ctx) { + ksmbd_debug(AUTH, "could not crypto alloc cmac\n"); + return -ENOMEM; + } + + rc = crypto_shash_setkey(CRYPTO_CMACAES_TFM(ctx), + key, + SMB2_CMACAES_SIZE); + if (rc) + goto out; + + rc = crypto_shash_init(CRYPTO_CMACAES(ctx)); + if (rc) { + ksmbd_debug(AUTH, "cmacaes init error %d\n", rc); + goto out; + } + + for (i = 0; i < n_vec; i++) { + rc = crypto_shash_update(CRYPTO_CMACAES(ctx), + iov[i].iov_base, + iov[i].iov_len); + if (rc) { + ksmbd_debug(AUTH, "cmacaes update error %d\n", rc); + goto out; + } + } + + rc = crypto_shash_final(CRYPTO_CMACAES(ctx), sig); + if (rc) + ksmbd_debug(AUTH, "cmacaes generation error %d\n", rc); +out: + ksmbd_release_crypto_ctx(ctx); + return rc; +} + +struct derivation { + struct kvec label; + struct kvec context; + bool binding; +}; + +static int generate_key(struct ksmbd_session *sess, struct kvec label, + struct kvec context, __u8 *key, unsigned int key_size) +{ + unsigned char zero = 0x0; + __u8 i[4] = {0, 0, 0, 1}; + __u8 L128[4] = {0, 0, 0, 128}; + __u8 L256[4] = {0, 0, 1, 0}; + int rc; + unsigned char prfhash[SMB2_HMACSHA256_SIZE]; + unsigned char *hashptr = prfhash; +
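+ /* + * What follows is the SP800-108 KDF in counter mode that SMB3 uses for + * key derivation: HMAC-SHA256(session_key, i || label || 0x00 || context || L), + * with the counter i fixed at 1 and L at 128 or 256 bits depending on + * the negotiated cipher. + */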
struct ksmbd_crypto_ctx *ctx; + + memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); + memset(key, 0x0, key_size); + + ctx = ksmbd_crypto_ctx_find_hmacsha256(); + if (!ctx) { + ksmbd_debug(AUTH, "could not crypto alloc hmacsha256\n"); + return -ENOMEM; + } + + rc = crypto_shash_setkey(CRYPTO_HMACSHA256_TFM(ctx), + sess->sess_key, + SMB2_NTLMV2_SESSKEY_SIZE); + if (rc) + goto smb3signkey_ret; + + rc = crypto_shash_init(CRYPTO_HMACSHA256(ctx)); + if (rc) { + ksmbd_debug(AUTH, "hmacsha256 init error %d\n", rc); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), i, 4); + if (rc) { + ksmbd_debug(AUTH, "could not update with counter\n"); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), + label.iov_base, + label.iov_len); + if (rc) { + ksmbd_debug(AUTH, "could not update with label\n"); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), &zero, 1); + if (rc) { + ksmbd_debug(AUTH, "could not update with zero\n"); + goto smb3signkey_ret; + } + + rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), + context.iov_base, + context.iov_len); + if (rc) { + ksmbd_debug(AUTH, "could not update with context\n"); + goto smb3signkey_ret; + } + + if (sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM || + sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), L256, 4); + else + rc = crypto_shash_update(CRYPTO_HMACSHA256(ctx), L128, 4); + if (rc) { + ksmbd_debug(AUTH, "could not update with L\n"); + goto smb3signkey_ret; + } + + rc = crypto_shash_final(CRYPTO_HMACSHA256(ctx), hashptr); + if (rc) { + ksmbd_debug(AUTH, "Could not generate hmacsha256 hash error %d\n", + rc); + goto smb3signkey_ret; + } + + memcpy(key, hashptr, key_size); + +smb3signkey_ret: + ksmbd_release_crypto_ctx(ctx); + return rc; +} + +static int generate_smb3signingkey(struct ksmbd_session *sess, + struct ksmbd_conn *conn, + const struct derivation *signing) +{ + int rc; + struct channel *chann; + char *key; + + chann = lookup_chann_list(sess, conn); + if (!chann) + return 0; + + if (sess->conn->dialect >= SMB30_PROT_ID && signing->binding) + key = chann->smb3signingkey; + else + key = sess->smb3signingkey; + + rc = generate_key(sess, signing->label, signing->context, key, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + if (!(sess->conn->dialect >= SMB30_PROT_ID && signing->binding)) + memcpy(chann->smb3signingkey, key, SMB3_SIGN_KEY_SIZE); + + ksmbd_debug(AUTH, "dumping generated AES signing keys\n"); + ksmbd_debug(AUTH, "Session Id %llu\n", sess->id); + ksmbd_debug(AUTH, "Session Key %*ph\n", + SMB2_NTLMV2_SESSKEY_SIZE, sess->sess_key); + ksmbd_debug(AUTH, "Signing Key %*ph\n", + SMB3_SIGN_KEY_SIZE, key); + return 0; +} + +int ksmbd_gen_smb30_signingkey(struct ksmbd_session *sess, + struct ksmbd_conn *conn) +{ + struct derivation d; + + d.label.iov_base = "SMB2AESCMAC"; + d.label.iov_len = 12; + d.context.iov_base = "SmbSign"; + d.context.iov_len = 8; + d.binding = conn->binding; + + return generate_smb3signingkey(sess, conn, &d); +} + +int ksmbd_gen_smb311_signingkey(struct ksmbd_session *sess, + struct ksmbd_conn *conn) +{ + struct derivation d; + + d.label.iov_base = "SMBSigningKey"; + d.label.iov_len = 14; + if (conn->binding) { + struct preauth_session *preauth_sess; + + preauth_sess = ksmbd_preauth_session_lookup(conn, sess->id); + if (!preauth_sess) + return -ENOENT; + d.context.iov_base = preauth_sess->Preauth_HashValue; + } else { + d.context.iov_base = sess->Preauth_HashValue; + } + d.context.iov_len = 64;
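+ /* + * The 64-byte context is the SHA-512 preauth integrity hash of the + * negotiate/session-setup exchange; note that the label lengths above + * count the trailing NUL ("SMBSigningKey" is 13 characters, iov_len 14), + * since the KDF input includes it. + */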
+ d.binding = conn->binding; + + return generate_smb3signingkey(sess, conn, &d); +} + +struct derivation_twin { + struct derivation encryption; + struct derivation decryption; +}; + +static int generate_smb3encryptionkey(struct ksmbd_session *sess, + const struct derivation_twin *ptwin) +{ + int rc; + + rc = generate_key(sess, ptwin->encryption.label, + ptwin->encryption.context, sess->smb3encryptionkey, + SMB3_ENC_DEC_KEY_SIZE); + if (rc) + return rc; + + rc = generate_key(sess, ptwin->decryption.label, + ptwin->decryption.context, + sess->smb3decryptionkey, SMB3_ENC_DEC_KEY_SIZE); + if (rc) + return rc; + + ksmbd_debug(AUTH, "dumping generated AES encryption keys\n"); + ksmbd_debug(AUTH, "Cipher type %d\n", sess->conn->cipher_type); + ksmbd_debug(AUTH, "Session Id %llu\n", sess->id); + ksmbd_debug(AUTH, "Session Key %*ph\n", + SMB2_NTLMV2_SESSKEY_SIZE, sess->sess_key); + if (sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM || + sess->conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) { + ksmbd_debug(AUTH, "ServerIn Key %*ph\n", + SMB3_GCM256_CRYPTKEY_SIZE, sess->smb3encryptionkey); + ksmbd_debug(AUTH, "ServerOut Key %*ph\n", + SMB3_GCM256_CRYPTKEY_SIZE, sess->smb3decryptionkey); + } else { + ksmbd_debug(AUTH, "ServerIn Key %*ph\n", + SMB3_GCM128_CRYPTKEY_SIZE, sess->smb3encryptionkey); + ksmbd_debug(AUTH, "ServerOut Key %*ph\n", + SMB3_GCM128_CRYPTKEY_SIZE, sess->smb3decryptionkey); + } + return 0; +} + +int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess) +{ + struct derivation_twin twin; + struct derivation *d; + + d = &twin.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + d = &twin.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + d->context.iov_len = 10; + + return generate_smb3encryptionkey(sess, &twin); +} + +int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess) +{ + struct derivation_twin twin; + struct derivation *d; + + d = &twin.encryption; + d->label.iov_base = "SMBS2CCipherKey"; + d->label.iov_len = 16; + d->context.iov_base = sess->Preauth_HashValue; + d->context.iov_len = 64; + + d = &twin.decryption; + d->label.iov_base = "SMBC2SCipherKey"; + d->label.iov_len = 16; + d->context.iov_base = sess->Preauth_HashValue; + d->context.iov_len = 64; + + return generate_smb3encryptionkey(sess, &twin); +} + +int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf, + __u8 *pi_hash) +{ + int rc; + struct smb2_hdr *rcv_hdr = (struct smb2_hdr *)buf; + char *all_bytes_msg = (char *)&rcv_hdr->ProtocolId; + int msg_size = be32_to_cpu(rcv_hdr->smb2_buf_length); + struct ksmbd_crypto_ctx *ctx = NULL; + + if (conn->preauth_info->Preauth_HashId != + SMB2_PREAUTH_INTEGRITY_SHA512) + return -EINVAL; + + ctx = ksmbd_crypto_ctx_find_sha512(); + if (!ctx) { + ksmbd_debug(AUTH, "could not alloc sha512\n"); + return -ENOMEM; + } + + rc = crypto_shash_init(CRYPTO_SHA512(ctx)); + if (rc) { + ksmbd_debug(AUTH, "could not init shash\n"); + goto out; + } + + rc = crypto_shash_update(CRYPTO_SHA512(ctx), pi_hash, 64); + if (rc) { + ksmbd_debug(AUTH, "could not update with previous hash\n"); + goto out; + } + + rc = crypto_shash_update(CRYPTO_SHA512(ctx), all_bytes_msg, msg_size); + if (rc) { + ksmbd_debug(AUTH, "could not update with request\n"); + goto out; + } + + rc = crypto_shash_final(CRYPTO_SHA512(ctx), pi_hash); + if (rc) { + ksmbd_debug(AUTH, "Could not generate hash err : %d\n", rc); + goto out; + } +out:
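+ /* + * On success pi_hash now holds H' = SHA-512(H || full PDU); callers + * chain successive pre-session PDUs by passing the previous value + * back in. + */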
+ ksmbd_release_crypto_ctx(ctx); + return rc; +} + +int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len, + __u8 *pi_hash) +{ + int rc; + struct ksmbd_crypto_ctx *ctx = NULL; + + ctx = ksmbd_crypto_ctx_find_sha256(); + if (!ctx) { + ksmbd_debug(AUTH, "could not alloc sha256\n"); + return -ENOMEM; + } + + rc = crypto_shash_init(CRYPTO_SHA256(ctx)); + if (rc) { + ksmbd_debug(AUTH, "could not init shash\n"); + goto out; + } + + rc = crypto_shash_update(CRYPTO_SHA256(ctx), sd_buf, len); + if (rc) { + ksmbd_debug(AUTH, "could not update with sd_buf\n"); + goto out; + } + + rc = crypto_shash_final(CRYPTO_SHA256(ctx), pi_hash); + if (rc) { + ksmbd_debug(AUTH, "Could not generate hash err : %d\n", rc); + goto out; + } +out: + ksmbd_release_crypto_ctx(ctx); + return rc; +} + +static int ksmbd_get_encryption_key(struct ksmbd_conn *conn, __u64 ses_id, + int enc, u8 *key) +{ + struct ksmbd_session *sess; + u8 *ses_enc_key; + + sess = ksmbd_session_lookup_all(conn, ses_id); + if (!sess) + return -EINVAL; + + ses_enc_key = enc ? sess->smb3encryptionkey : + sess->smb3decryptionkey; + memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE); + + return 0; +} + +static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf, + unsigned int buflen) +{ + void *addr; + + if (is_vmalloc_addr(buf)) + addr = vmalloc_to_page(buf); + else + addr = virt_to_page(buf); + sg_set_page(sg, addr, buflen, offset_in_page(buf)); +} + +static struct scatterlist *ksmbd_init_sg(struct kvec *iov, unsigned int nvec, + u8 *sign) +{ + struct scatterlist *sg; + unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24; + int i, nr_entries[3] = {0}, total_entries = 0, sg_idx = 0; + + if (!nvec) + return NULL; + + for (i = 0; i < nvec - 1; i++) { + unsigned long kaddr = (unsigned long)iov[i + 1].iov_base; + + if (is_vmalloc_addr(iov[i + 1].iov_base)) { + nr_entries[i] = ((kaddr + iov[i + 1].iov_len + + PAGE_SIZE - 1) >> PAGE_SHIFT) - + (kaddr >> PAGE_SHIFT); + } else { + nr_entries[i]++; + } + total_entries += nr_entries[i]; + } + + /* Add two entries for transform header and signature */ + total_entries += 2; + + sg = kmalloc_array(total_entries, sizeof(struct scatterlist), GFP_KERNEL); + if (!sg) + return NULL; + + sg_init_table(sg, total_entries); + smb2_sg_set_buf(&sg[sg_idx++], iov[0].iov_base + 24, assoc_data_len); + for (i = 0; i < nvec - 1; i++) { + void *data = iov[i + 1].iov_base; + int len = iov[i + 1].iov_len; + + if (is_vmalloc_addr(data)) { + int j, offset = offset_in_page(data); + + for (j = 0; j < nr_entries[i]; j++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (!len) + break; + + if (bytes > len) + bytes = len; + + sg_set_page(&sg[sg_idx++], + vmalloc_to_page(data), bytes, + offset_in_page(data)); + + data += bytes; + len -= bytes; + offset = 0; + } + } else { + sg_set_page(&sg[sg_idx++], virt_to_page(data), len, + offset_in_page(data)); + } + } + smb2_sg_set_buf(&sg[sg_idx], sign, SMB2_SIGNATURE_SIZE); + return sg; +} + +int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, + unsigned int nvec, int enc) +{ + struct smb2_transform_hdr *tr_hdr = + (struct smb2_transform_hdr *)iov[0].iov_base; + unsigned int assoc_data_len = sizeof(struct smb2_transform_hdr) - 24; + int rc; + struct scatterlist *sg; + u8 sign[SMB2_SIGNATURE_SIZE] = {}; + u8 key[SMB3_ENC_DEC_KEY_SIZE]; + struct aead_request *req; + char *iv; + unsigned int iv_len; + struct crypto_aead *tfm; + unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); + struct ksmbd_crypto_ctx *ctx; + + rc =
ksmbd_get_encryption_key(conn, + le64_to_cpu(tr_hdr->SessionId), + enc, + key); + if (rc) { + pr_err("Could not get %scryption key\n", enc ? "en" : "de"); + return rc; + } + + if (conn->cipher_type == SMB2_ENCRYPTION_AES128_GCM || + conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + ctx = ksmbd_crypto_ctx_find_gcm(); + else + ctx = ksmbd_crypto_ctx_find_ccm(); + if (!ctx) { + pr_err("crypto alloc failed\n"); + return -ENOMEM; + } + + if (conn->cipher_type == SMB2_ENCRYPTION_AES128_GCM || + conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + tfm = CRYPTO_GCM(ctx); + else + tfm = CRYPTO_CCM(ctx); + + if (conn->cipher_type == SMB2_ENCRYPTION_AES256_CCM || + conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) + rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE); + else + rc = crypto_aead_setkey(tfm, key, SMB3_GCM128_CRYPTKEY_SIZE); + if (rc) { + pr_err("Failed to set aead key %d\n", rc); + goto free_ctx; + } + + rc = crypto_aead_setauthsize(tfm, SMB2_SIGNATURE_SIZE); + if (rc) { + pr_err("Failed to set authsize %d\n", rc); + goto free_ctx; + } + + req = aead_request_alloc(tfm, GFP_KERNEL); + if (!req) { + rc = -ENOMEM; + goto free_ctx; + } + + if (!enc) { + memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE); + crypt_len += SMB2_SIGNATURE_SIZE; + } + + sg = ksmbd_init_sg(iov, nvec, sign); + if (!sg) { + pr_err("Failed to init sg\n"); + rc = -ENOMEM; + goto free_req; + } + + iv_len = crypto_aead_ivsize(tfm); + iv = kzalloc(iv_len, GFP_KERNEL); + if (!iv) { + rc = -ENOMEM; + goto free_sg; + } + + if (conn->cipher_type == SMB2_ENCRYPTION_AES128_GCM || + conn->cipher_type == SMB2_ENCRYPTION_AES256_GCM) { + memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES_GCM_NONCE); + } else { + iv[0] = 3; + memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES_CCM_NONCE); + } + + aead_request_set_crypt(req, sg, sg, crypt_len, iv); + aead_request_set_ad(req, assoc_data_len); + aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); + + if (enc) + rc = crypto_aead_encrypt(req); + else + rc = crypto_aead_decrypt(req); + if (rc) + goto free_iv; + + if (enc) + memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); + +free_iv: + kfree(iv); +free_sg: + kfree(sg); +free_req: + kfree(req); +free_ctx: + ksmbd_release_crypto_ctx(ctx); + return rc; +} diff --git a/fs/ksmbd/auth.h b/fs/ksmbd/auth.h new file mode 100644 index 000000000000..9c2d4badd05d --- /dev/null +++ b/fs/ksmbd/auth.h @@ -0,0 +1,67 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. 
+ */ + +#ifndef __AUTH_H__ +#define __AUTH_H__ + +#include "ntlmssp.h" + +#ifdef CONFIG_SMB_SERVER_KERBEROS5 +#define AUTH_GSS_LENGTH 96 +#define AUTH_GSS_PADDING 0 +#else +#define AUTH_GSS_LENGTH 74 +#define AUTH_GSS_PADDING 6 +#endif + +#define CIFS_HMAC_MD5_HASH_SIZE (16) +#define CIFS_NTHASH_SIZE (16) + +/* + * Size of the ntlm client response + */ +#define CIFS_AUTH_RESP_SIZE 24 +#define CIFS_SMB1_SIGNATURE_SIZE 8 +#define CIFS_SMB1_SESSKEY_SIZE 16 + +#define KSMBD_AUTH_NTLMSSP 0x0001 +#define KSMBD_AUTH_KRB5 0x0002 +#define KSMBD_AUTH_MSKRB5 0x0004 +#define KSMBD_AUTH_KRB5U2U 0x0008 + +struct ksmbd_session; +struct ksmbd_conn; +struct kvec; + +int ksmbd_crypt_message(struct ksmbd_conn *conn, struct kvec *iov, + unsigned int nvec, int enc); +void ksmbd_copy_gss_neg_header(void *buf); +int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf); +int ksmbd_auth_ntlmv2(struct ksmbd_session *sess, struct ntlmv2_resp *ntlmv2, + int blen, char *domain_name); +int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, + int blob_len, struct ksmbd_session *sess); +int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob, + int blob_len, struct ksmbd_session *sess); +int +ksmbd_build_ntlmssp_challenge_blob(struct challenge_message *chgblob, + struct ksmbd_session *sess); +int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob, + int in_len, char *out_blob, int *out_len); +int ksmbd_sign_smb2_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov, + int n_vec, char *sig); +int ksmbd_sign_smb3_pdu(struct ksmbd_conn *conn, char *key, struct kvec *iov, + int n_vec, char *sig); +int ksmbd_gen_smb30_signingkey(struct ksmbd_session *sess, + struct ksmbd_conn *conn); +int ksmbd_gen_smb311_signingkey(struct ksmbd_session *sess, + struct ksmbd_conn *conn); +int ksmbd_gen_smb30_encryptionkey(struct ksmbd_session *sess); +int ksmbd_gen_smb311_encryptionkey(struct ksmbd_session *sess); +int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf, + __u8 *pi_hash); +int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len, + __u8 *pi_hash); +#endif diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c new file mode 100644 index 000000000000..af086d35398a --- /dev/null +++ b/fs/ksmbd/connection.c @@ -0,0 +1,413 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <namjae.jeon@protocolfreedom.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include <linux/mutex.h> +#include <linux/freezer.h> +#include <linux/module.h> + +#include "server.h" +#include "smb_common.h" +#include "mgmt/ksmbd_ida.h" +#include "connection.h" +#include "transport_tcp.h" +#include "transport_rdma.h" + +static DEFINE_MUTEX(init_lock); + +static struct ksmbd_conn_ops default_conn_ops; + +LIST_HEAD(conn_list); +DEFINE_RWLOCK(conn_list_lock); + +/** + * ksmbd_conn_free() - free resources of the connection instance + * + * @conn: connection instance to be cleaned up + * + * During the thread termination, the corresponding conn instance + * resources (sock/memory) are released and finally the conn object is freed.
+ */ +void ksmbd_conn_free(struct ksmbd_conn *conn) +{ + write_lock(&conn_list_lock); + list_del(&conn->conns_list); + write_unlock(&conn_list_lock); + + kvfree(conn->request_buf); + kfree(conn->preauth_info); + kfree(conn); +} + +/** + * ksmbd_conn_alloc() - initialize a new connection instance + * + * Return: ksmbd_conn struct on success, otherwise NULL + */ +struct ksmbd_conn *ksmbd_conn_alloc(void) +{ + struct ksmbd_conn *conn; + + conn = kzalloc(sizeof(struct ksmbd_conn), GFP_KERNEL); + if (!conn) + return NULL; + + conn->need_neg = true; + conn->status = KSMBD_SESS_NEW; + conn->local_nls = load_nls("utf8"); + if (!conn->local_nls) + conn->local_nls = load_nls_default(); + atomic_set(&conn->req_running, 0); + atomic_set(&conn->r_count, 0); + init_waitqueue_head(&conn->req_running_q); + INIT_LIST_HEAD(&conn->conns_list); + INIT_LIST_HEAD(&conn->sessions); + INIT_LIST_HEAD(&conn->requests); + INIT_LIST_HEAD(&conn->async_requests); + spin_lock_init(&conn->request_lock); + spin_lock_init(&conn->credits_lock); + ida_init(&conn->async_ida); + + spin_lock_init(&conn->llist_lock); + INIT_LIST_HEAD(&conn->lock_list); + + write_lock(&conn_list_lock); + list_add(&conn->conns_list, &conn_list); + write_unlock(&conn_list_lock); + return conn; +} + +bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c) +{ + struct ksmbd_conn *t; + bool ret = false; + + read_lock(&conn_list_lock); + list_for_each_entry(t, &conn_list, conns_list) { + if (memcmp(t->ClientGUID, c->ClientGUID, SMB2_CLIENT_GUID_SIZE)) + continue; + + ret = true; + break; + } + read_unlock(&conn_list_lock); + return ret; +} + +void ksmbd_conn_enqueue_request(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct list_head *requests_queue = NULL; + + if (conn->ops->get_cmd_val(work) != SMB2_CANCEL_HE) { + requests_queue = &conn->requests; + work->synchronous = true; + } + + if (requests_queue) { + atomic_inc(&conn->req_running); + spin_lock(&conn->request_lock); + list_add_tail(&work->request_entry, requests_queue); + spin_unlock(&conn->request_lock); + } +} + +int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + int ret = 1; + + if (list_empty(&work->request_entry) && + list_empty(&work->async_request_entry)) + return 0; + + if (!work->multiRsp) + atomic_dec(&conn->req_running); + spin_lock(&conn->request_lock); + if (!work->multiRsp) { + list_del_init(&work->request_entry); + if (!work->synchronous) + list_del_init(&work->async_request_entry); + ret = 0; + } + spin_unlock(&conn->request_lock); + + wake_up_all(&conn->req_running_q); + return ret; +} + +static void ksmbd_conn_lock(struct ksmbd_conn *conn) +{ + mutex_lock(&conn->srv_mutex); +} + +static void ksmbd_conn_unlock(struct ksmbd_conn *conn) +{ + mutex_unlock(&conn->srv_mutex); +} + +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn) +{ + wait_event(conn->req_running_q, atomic_read(&conn->req_running) < 2); +} + +int ksmbd_conn_write(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb_hdr *rsp_hdr = work->response_buf; + size_t len = 0; + int sent; + struct kvec iov[3]; + int iov_idx = 0; + + ksmbd_conn_try_dequeue_request(work); + if (!rsp_hdr) { + pr_err("NULL response header\n"); + return -EINVAL; + } + + if (work->tr_buf) { + iov[iov_idx] = (struct kvec) { work->tr_buf, + sizeof(struct smb2_transform_hdr) }; + len += iov[iov_idx++].iov_len; + } + + if (work->aux_payload_sz) { + iov[iov_idx] = (struct kvec) { rsp_hdr, work->resp_hdr_sz }; + len += iov[iov_idx++].iov_len;
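+ /* + * The read payload travels in its own vector so large data + * buffers never have to be copied into the response buffer. + */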
+ iov[iov_idx] = (struct kvec) { work->aux_payload_buf, work->aux_payload_sz }; + len += iov[iov_idx++].iov_len; + } else { + if (work->tr_buf) + iov[iov_idx].iov_len = work->resp_hdr_sz; + else + iov[iov_idx].iov_len = get_rfc1002_len(rsp_hdr) + 4; + iov[iov_idx].iov_base = rsp_hdr; + len += iov[iov_idx++].iov_len; + } + + ksmbd_conn_lock(conn); + sent = conn->transport->ops->writev(conn->transport, &iov[0], + iov_idx, len, + work->need_invalidate_rkey, + work->remote_key); + ksmbd_conn_unlock(conn); + + if (sent < 0) { + pr_err("Failed to send message: %d\n", sent); + return sent; + } + + return 0; +} + +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, + unsigned int buflen, u32 remote_key, u64 remote_offset, + u32 remote_len) +{ + int ret = -EINVAL; + + if (conn->transport->ops->rdma_read) + ret = conn->transport->ops->rdma_read(conn->transport, + buf, buflen, + remote_key, remote_offset, + remote_len); + return ret; +} + +int ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, + unsigned int buflen, u32 remote_key, + u64 remote_offset, u32 remote_len) +{ + int ret = -EINVAL; + + if (conn->transport->ops->rdma_write) + ret = conn->transport->ops->rdma_write(conn->transport, + buf, buflen, + remote_key, remote_offset, + remote_len); + return ret; +} + +bool ksmbd_conn_alive(struct ksmbd_conn *conn) +{ + if (!ksmbd_server_running()) + return false; + + if (conn->status == KSMBD_SESS_EXITING) + return false; + + if (kthread_should_stop()) + return false; + + if (atomic_read(&conn->stats.open_files_count) > 0) + return true; + + /* + * Stop the current session if the time since the last client request + * exceeds the user-configured deadtime and no files are open. + */ + if (server_conf.deadtime > 0 && + time_after(jiffies, conn->last_active + server_conf.deadtime)) { + ksmbd_debug(CONN, "No response from client in %lu minutes\n", + server_conf.deadtime / SMB_ECHO_INTERVAL); + return false; + } + return true; +} + +/** + * ksmbd_conn_handler_loop() - session thread to listen on new smb requests + * @p: connection instance + * + * One thread each per connection + * + * Return: 0 on success + */ +int ksmbd_conn_handler_loop(void *p) +{ + struct ksmbd_conn *conn = (struct ksmbd_conn *)p; + struct ksmbd_transport *t = conn->transport; + unsigned int pdu_size; + char hdr_buf[4] = {0,}; + int size; + + mutex_init(&conn->srv_mutex); + __module_get(THIS_MODULE); + + if (t->ops->prepare && t->ops->prepare(t)) + goto out; + + conn->last_active = jiffies; + while (ksmbd_conn_alive(conn)) { + if (try_to_freeze()) + continue; + + kvfree(conn->request_buf); + conn->request_buf = NULL; + + size = t->ops->read(t, hdr_buf, sizeof(hdr_buf)); + if (size != sizeof(hdr_buf)) + break; + + pdu_size = get_rfc1002_len(hdr_buf); + ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size); + + /* make sure we have enough to get to SMB header end */ + if (!ksmbd_pdu_size_has_room(pdu_size)) { + ksmbd_debug(CONN, "SMB request too short (%u bytes)\n", + pdu_size); + continue; + } + + /* 4 for rfc1002 length field */ + size = pdu_size + 4; + conn->request_buf = kvmalloc(size, GFP_KERNEL); + if (!conn->request_buf) + continue; + + memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf)); + if (!ksmbd_smb_request(conn)) + break; + + /* + * We already read 4 bytes to find out PDU size, now + * read in PDU + */ + size = t->ops->read(t, conn->request_buf + 4, pdu_size); + if (size < 0) { + pr_err("sock_read failed: %d\n", size); + break; + } + + if (size != pdu_size) { + pr_err("PDU error.
Read: %d, Expected: %d\n", + size, pdu_size); + continue; + } + + if (!default_conn_ops.process_fn) { + pr_err("No connection request callback\n"); + break; + } + + if (default_conn_ops.process_fn(conn)) { + pr_err("Cannot handle request\n"); + break; + } + } + +out: + /* Wait till all references to the Server object are dropped */ + while (atomic_read(&conn->r_count) > 0) + schedule_timeout(HZ); + + unload_nls(conn->local_nls); + if (default_conn_ops.terminate_fn) + default_conn_ops.terminate_fn(conn); + t->ops->disconnect(t); + module_put(THIS_MODULE); + return 0; +} + +void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops) +{ + default_conn_ops.process_fn = ops->process_fn; + default_conn_ops.terminate_fn = ops->terminate_fn; +} + +int ksmbd_conn_transport_init(void) +{ + int ret; + + mutex_lock(&init_lock); + ret = ksmbd_tcp_init(); + if (ret) { + pr_err("Failed to init TCP subsystem: %d\n", ret); + goto out; + } + + ret = ksmbd_rdma_init(); + if (ret) { + pr_err("Failed to init RDMA subsystem: %d\n", ret); + goto out; + } +out: + mutex_unlock(&init_lock); + return ret; +} + +static void stop_sessions(void) +{ + struct ksmbd_conn *conn; + +again: + read_lock(&conn_list_lock); + list_for_each_entry(conn, &conn_list, conns_list) { + struct task_struct *task; + + task = conn->transport->handler; + if (task) + ksmbd_debug(CONN, "Stop session handler %s/%d\n", + task->comm, task_pid_nr(task)); + conn->status = KSMBD_SESS_EXITING; + } + read_unlock(&conn_list_lock); + + if (!list_empty(&conn_list)) { + schedule_timeout_interruptible(HZ / 10); /* 100ms */ + goto again; + } +} + +void ksmbd_conn_transport_destroy(void) +{ + mutex_lock(&init_lock); + ksmbd_tcp_destroy(); + ksmbd_rdma_destroy(); + stop_sessions(); + mutex_unlock(&init_lock); +} diff --git a/fs/ksmbd/connection.h b/fs/ksmbd/connection.h new file mode 100644 index 000000000000..e5403c587a58 --- /dev/null +++ b/fs/ksmbd/connection.h @@ -0,0 +1,213 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#ifndef __KSMBD_CONNECTION_H__ +#define __KSMBD_CONNECTION_H__ + +#include <linux/list.h> +#include <linux/ip.h> +#include <net/sock.h> +#include <net/tcp.h> +#include <net/inet_connection_sock.h> +#include <net/request_sock.h> +#include <linux/kthread.h> +#include <linux/nls.h> + +#include "smb_common.h" +#include "ksmbd_work.h" + +#define KSMBD_SOCKET_BACKLOG 16 + +/* + * WARNING + * + * This is nothing but a HACK. Session status should move to channel + * or to session. As of now we have 1 tcp_conn : 1 ksmbd_session, but + * we need to change it to 1 tcp_conn : N ksmbd_sessions.
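+ * (Concretely: with SMB3 multichannel a second connection can bind to an + * existing session, so a per-connection status field cannot represent + * session state faithfully.)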
+ */ +enum { + KSMBD_SESS_NEW = 0, + KSMBD_SESS_GOOD, + KSMBD_SESS_EXITING, + KSMBD_SESS_NEED_RECONNECT, + KSMBD_SESS_NEED_NEGOTIATE +}; + +struct ksmbd_stats { + atomic_t open_files_count; + atomic64_t request_served; +}; + +struct ksmbd_transport; + +struct ksmbd_conn { + struct smb_version_values *vals; + struct smb_version_ops *ops; + struct smb_version_cmds *cmds; + unsigned int max_cmds; + struct mutex srv_mutex; + int status; + unsigned int cli_cap; + char *request_buf; + struct ksmbd_transport *transport; + struct nls_table *local_nls; + struct list_head conns_list; + /* smb session 1 per user */ + struct list_head sessions; + unsigned long last_active; + /* How many requests are running currently */ + atomic_t req_running; + /* References which are made for this Server object */ + atomic_t r_count; + unsigned short total_credits; + unsigned short max_credits; + spinlock_t credits_lock; + wait_queue_head_t req_running_q; + /* Lock to protect requests list */ + spinlock_t request_lock; + struct list_head requests; + struct list_head async_requests; + int connection_type; + struct ksmbd_stats stats; + char ClientGUID[SMB2_CLIENT_GUID_SIZE]; + union { + /* pending trans request table */ + struct trans_state *recent_trans; + /* Used by ntlmssp */ + char *ntlmssp_cryptkey; + }; + + spinlock_t llist_lock; + struct list_head lock_list; + + struct preauth_integrity_info *preauth_info; + + bool need_neg; + unsigned int auth_mechs; + unsigned int preferred_auth_mech; + bool sign; + bool use_spnego:1; + __u16 cli_sec_mode; + __u16 srv_sec_mode; + /* dialect index that server chose */ + __u16 dialect; + + char *mechToken; + + struct ksmbd_conn_ops *conn_ops; + + /* Preauth Session Table */ + struct list_head preauth_sess_table; + + struct sockaddr_storage peer_addr; + + /* Identifier for async message */ + struct ida async_ida; + + __le16 cipher_type; + __le16 compress_algorithm; + bool posix_ext_supported; + bool signing_negotiated; + __le16 signing_algorithm; + bool binding; +}; + +struct ksmbd_conn_ops { + int (*process_fn)(struct ksmbd_conn *conn); + int (*terminate_fn)(struct ksmbd_conn *conn); +}; + +struct ksmbd_transport_ops { + int (*prepare)(struct ksmbd_transport *t); + void (*disconnect)(struct ksmbd_transport *t); + int (*read)(struct ksmbd_transport *t, char *buf, unsigned int size); + int (*writev)(struct ksmbd_transport *t, struct kvec *iovs, int niov, + int size, bool need_invalidate_rkey, + unsigned int remote_key); + int (*rdma_read)(struct ksmbd_transport *t, void *buf, unsigned int len, + u32 remote_key, u64 remote_offset, u32 remote_len); + int (*rdma_write)(struct ksmbd_transport *t, void *buf, + unsigned int len, u32 remote_key, u64 remote_offset, + u32 remote_len); +}; + +struct ksmbd_transport { + struct ksmbd_conn *conn; + struct ksmbd_transport_ops *ops; + struct task_struct *handler; +}; + +#define KSMBD_TCP_RECV_TIMEOUT (7 * HZ) +#define KSMBD_TCP_SEND_TIMEOUT (5 * HZ) +#define KSMBD_TCP_PEER_SOCKADDR(c) ((struct sockaddr *)&((c)->peer_addr)) + +extern struct list_head conn_list; +extern rwlock_t conn_list_lock; + +bool ksmbd_conn_alive(struct ksmbd_conn *conn); +void ksmbd_conn_wait_idle(struct ksmbd_conn *conn); +struct ksmbd_conn *ksmbd_conn_alloc(void); +void ksmbd_conn_free(struct ksmbd_conn *conn); +bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c); +int ksmbd_conn_write(struct ksmbd_work *work); +int ksmbd_conn_rdma_read(struct ksmbd_conn *conn, void *buf, + unsigned int buflen, u32 remote_key, u64 remote_offset, + u32 remote_len); +int
ksmbd_conn_rdma_write(struct ksmbd_conn *conn, void *buf, + unsigned int buflen, u32 remote_key, u64 remote_offset, + u32 remote_len); +void ksmbd_conn_enqueue_request(struct ksmbd_work *work); +int ksmbd_conn_try_dequeue_request(struct ksmbd_work *work); +void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops); +int ksmbd_conn_handler_loop(void *p); +int ksmbd_conn_transport_init(void); +void ksmbd_conn_transport_destroy(void); + +/* + * WARNING + * + * This is a hack. We will move status to a proper place once we land + * multi-session support. + */ +static inline bool ksmbd_conn_good(struct ksmbd_work *work) +{ + return work->conn->status == KSMBD_SESS_GOOD; +} + +static inline bool ksmbd_conn_need_negotiate(struct ksmbd_work *work) +{ + return work->conn->status == KSMBD_SESS_NEED_NEGOTIATE; +} + +static inline bool ksmbd_conn_need_reconnect(struct ksmbd_work *work) +{ + return work->conn->status == KSMBD_SESS_NEED_RECONNECT; +} + +static inline bool ksmbd_conn_exiting(struct ksmbd_work *work) +{ + return work->conn->status == KSMBD_SESS_EXITING; +} + +static inline void ksmbd_conn_set_good(struct ksmbd_work *work) +{ + work->conn->status = KSMBD_SESS_GOOD; +} + +static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_work *work) +{ + work->conn->status = KSMBD_SESS_NEED_NEGOTIATE; +} + +static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_work *work) +{ + work->conn->status = KSMBD_SESS_NEED_RECONNECT; +} + +static inline void ksmbd_conn_set_exiting(struct ksmbd_work *work) +{ + work->conn->status = KSMBD_SESS_EXITING; +} +#endif /* __KSMBD_CONNECTION_H__ */ diff --git a/fs/ksmbd/crypto_ctx.c b/fs/ksmbd/crypto_ctx.c new file mode 100644 index 000000000000..5f4b1008d17e --- /dev/null +++ b/fs/ksmbd/crypto_ctx.c @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2019 Samsung Electronics Co., Ltd.
+ */ + +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <linux/wait.h> +#include <linux/sched.h> + +#include "glob.h" +#include "crypto_ctx.h" + +struct crypto_ctx_list { + spinlock_t ctx_lock; + int avail_ctx; + struct list_head idle_ctx; + wait_queue_head_t ctx_wait; +}; + +static struct crypto_ctx_list ctx_list; + +static inline void free_aead(struct crypto_aead *aead) +{ + if (aead) + crypto_free_aead(aead); +} + +static void free_shash(struct shash_desc *shash) +{ + if (shash) { + crypto_free_shash(shash->tfm); + kfree(shash); + } +} + +static struct crypto_aead *alloc_aead(int id) +{ + struct crypto_aead *tfm = NULL; + + switch (id) { + case CRYPTO_AEAD_AES_GCM: + tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + break; + case CRYPTO_AEAD_AES_CCM: + tfm = crypto_alloc_aead("ccm(aes)", 0, 0); + break; + default: + pr_err("Does not support encrypt aead(id : %d)\n", id); + return NULL; + } + + if (IS_ERR(tfm)) { + pr_err("Failed to alloc encrypt aead : %ld\n", PTR_ERR(tfm)); + return NULL; + } + + return tfm; +} + +static struct shash_desc *alloc_shash_desc(int id) +{ + struct crypto_shash *tfm = NULL; + struct shash_desc *shash; + + switch (id) { + case CRYPTO_SHASH_HMACMD5: + tfm = crypto_alloc_shash("hmac(md5)", 0, 0); + break; + case CRYPTO_SHASH_HMACSHA256: + tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); + break; + case CRYPTO_SHASH_CMACAES: + tfm = crypto_alloc_shash("cmac(aes)", 0, 0); + break; + case CRYPTO_SHASH_SHA256: + tfm = crypto_alloc_shash("sha256", 0, 0); + break; + case CRYPTO_SHASH_SHA512: + tfm = crypto_alloc_shash("sha512", 0, 0); + break; + case CRYPTO_SHASH_MD4: + tfm = crypto_alloc_shash("md4", 0, 0); + break; + case CRYPTO_SHASH_MD5: + tfm = crypto_alloc_shash("md5", 0, 0); + break; + default: + return NULL; + } + + if (IS_ERR(tfm)) + return NULL; + + shash = kzalloc(sizeof(*shash) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!shash) + crypto_free_shash(tfm); + else + shash->tfm = tfm; + return shash; +} + +static void ctx_free(struct ksmbd_crypto_ctx *ctx) +{ + int i; + + for (i = 0; i < CRYPTO_SHASH_MAX; i++) + free_shash(ctx->desc[i]); + for (i = 0; i < CRYPTO_AEAD_MAX; i++) + free_aead(ctx->ccmaes[i]); + kfree(ctx); +} + +static struct ksmbd_crypto_ctx *ksmbd_find_crypto_ctx(void) +{ + struct ksmbd_crypto_ctx *ctx; + + while (1) { + spin_lock(&ctx_list.ctx_lock); + if (!list_empty(&ctx_list.idle_ctx)) { + ctx = list_entry(ctx_list.idle_ctx.next, + struct ksmbd_crypto_ctx, + list); + list_del(&ctx->list); + spin_unlock(&ctx_list.ctx_lock); + return ctx; + } + + if (ctx_list.avail_ctx > num_online_cpus()) { + spin_unlock(&ctx_list.ctx_lock); + wait_event(ctx_list.ctx_wait, + !list_empty(&ctx_list.idle_ctx)); + continue; + } + + ctx_list.avail_ctx++; + spin_unlock(&ctx_list.ctx_lock); + + ctx = kzalloc(sizeof(struct ksmbd_crypto_ctx), GFP_KERNEL); + if (!ctx) { + spin_lock(&ctx_list.ctx_lock); + ctx_list.avail_ctx--; + spin_unlock(&ctx_list.ctx_lock); + wait_event(ctx_list.ctx_wait, + !list_empty(&ctx_list.idle_ctx)); + continue; + } + break; + } + return ctx; +} + +void ksmbd_release_crypto_ctx(struct ksmbd_crypto_ctx *ctx) +{ + if (!ctx) + return; + + spin_lock(&ctx_list.ctx_lock); + if (ctx_list.avail_ctx <= num_online_cpus()) { + list_add(&ctx->list, &ctx_list.idle_ctx); + spin_unlock(&ctx_list.ctx_lock); + wake_up(&ctx_list.ctx_wait); + return; + } + + ctx_list.avail_ctx--; + spin_unlock(&ctx_list.ctx_lock); + ctx_free(ctx); +} + +static struct ksmbd_crypto_ctx
*____crypto_shash_ctx_find(int id) +{ + struct ksmbd_crypto_ctx *ctx; + + if (id >= CRYPTO_SHASH_MAX) + return NULL; + + ctx = ksmbd_find_crypto_ctx(); + if (ctx->desc[id]) + return ctx; + + ctx->desc[id] = alloc_shash_desc(id); + if (ctx->desc[id]) + return ctx; + ksmbd_release_crypto_ctx(ctx); + return NULL; +} + +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacmd5(void) +{ + return ____crypto_shash_ctx_find(CRYPTO_SHASH_HMACMD5); +} + +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void) +{ + return ____crypto_shash_ctx_find(CRYPTO_SHASH_HMACSHA256); +} + +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void) +{ + return ____crypto_shash_ctx_find(CRYPTO_SHASH_CMACAES); +} + +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void) +{ + return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA256); +} + +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void) +{ + return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA512); +} + +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void) +{ + return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD4); +} + +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void) +{ + return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD5); +} + +static struct ksmbd_crypto_ctx *____crypto_aead_ctx_find(int id) +{ + struct ksmbd_crypto_ctx *ctx; + + if (id >= CRYPTO_AEAD_MAX) + return NULL; + + ctx = ksmbd_find_crypto_ctx(); + if (ctx->ccmaes[id]) + return ctx; + + ctx->ccmaes[id] = alloc_aead(id); + if (ctx->ccmaes[id]) + return ctx; + ksmbd_release_crypto_ctx(ctx); + return NULL; +} + +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void) +{ + return ____crypto_aead_ctx_find(CRYPTO_AEAD_AES_GCM); +} + +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void) +{ + return ____crypto_aead_ctx_find(CRYPTO_AEAD_AES_CCM); +} + +void ksmbd_crypto_destroy(void) +{ + struct ksmbd_crypto_ctx *ctx; + + while (!list_empty(&ctx_list.idle_ctx)) { + ctx = list_entry(ctx_list.idle_ctx.next, + struct ksmbd_crypto_ctx, + list); + list_del(&ctx->list); + ctx_free(ctx); + } +} + +int ksmbd_crypto_create(void) +{ + struct ksmbd_crypto_ctx *ctx; + + spin_lock_init(&ctx_list.ctx_lock); + INIT_LIST_HEAD(&ctx_list.idle_ctx); + init_waitqueue_head(&ctx_list.ctx_wait); + ctx_list.avail_ctx = 1; + + ctx = kzalloc(sizeof(struct ksmbd_crypto_ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + list_add(&ctx->list, &ctx_list.idle_ctx); + return 0; +} diff --git a/fs/ksmbd/crypto_ctx.h b/fs/ksmbd/crypto_ctx.h new file mode 100644 index 000000000000..ef11154b43df --- /dev/null +++ b/fs/ksmbd/crypto_ctx.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2019 Samsung Electronics Co., Ltd. 
+ */ + +#ifndef __CRYPTO_CTX_H__ +#define __CRYPTO_CTX_H__ + +#include <crypto/hash.h> +#include <crypto/aead.h> + +enum { + CRYPTO_SHASH_HMACMD5 = 0, + CRYPTO_SHASH_HMACSHA256, + CRYPTO_SHASH_CMACAES, + CRYPTO_SHASH_SHA256, + CRYPTO_SHASH_SHA512, + CRYPTO_SHASH_MD4, + CRYPTO_SHASH_MD5, + CRYPTO_SHASH_MAX, +}; + +enum { + CRYPTO_AEAD_AES_GCM = 16, + CRYPTO_AEAD_AES_CCM, + CRYPTO_AEAD_MAX, +}; + +enum { + CRYPTO_BLK_ECBDES = 32, + CRYPTO_BLK_MAX, +}; + +struct ksmbd_crypto_ctx { + struct list_head list; + + struct shash_desc *desc[CRYPTO_SHASH_MAX]; + struct crypto_aead *ccmaes[CRYPTO_AEAD_MAX]; +}; + +#define CRYPTO_HMACMD5(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]) +#define CRYPTO_HMACSHA256(c) ((c)->desc[CRYPTO_SHASH_HMACSHA256]) +#define CRYPTO_CMACAES(c) ((c)->desc[CRYPTO_SHASH_CMACAES]) +#define CRYPTO_SHA256(c) ((c)->desc[CRYPTO_SHASH_SHA256]) +#define CRYPTO_SHA512(c) ((c)->desc[CRYPTO_SHASH_SHA512]) +#define CRYPTO_MD4(c) ((c)->desc[CRYPTO_SHASH_MD4]) +#define CRYPTO_MD5(c) ((c)->desc[CRYPTO_SHASH_MD5]) + +#define CRYPTO_HMACMD5_TFM(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]->tfm) +#define CRYPTO_HMACSHA256_TFM(c)\ + ((c)->desc[CRYPTO_SHASH_HMACSHA256]->tfm) +#define CRYPTO_CMACAES_TFM(c) ((c)->desc[CRYPTO_SHASH_CMACAES]->tfm) +#define CRYPTO_SHA256_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA256]->tfm) +#define CRYPTO_SHA512_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA512]->tfm) +#define CRYPTO_MD4_TFM(c) ((c)->desc[CRYPTO_SHASH_MD4]->tfm) +#define CRYPTO_MD5_TFM(c) ((c)->desc[CRYPTO_SHASH_MD5]->tfm) + +#define CRYPTO_GCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_GCM]) +#define CRYPTO_CCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_CCM]) + +void ksmbd_release_crypto_ctx(struct ksmbd_crypto_ctx *ctx); +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacmd5(void); +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void); +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void); +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void); +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void); +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void); +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void); +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void); +struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void); +void ksmbd_crypto_destroy(void); +int ksmbd_crypto_create(void); + +#endif /* __CRYPTO_CTX_H__ */ diff --git a/fs/ksmbd/glob.h b/fs/ksmbd/glob.h new file mode 100644 index 000000000000..49a5a3afa118 --- /dev/null +++ b/fs/ksmbd/glob.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#ifndef __KSMBD_GLOB_H +#define __KSMBD_GLOB_H + +#include <linux/ctype.h> + +#include "unicode.h" +#include "vfs_cache.h" + +#define KSMBD_VERSION "3.1.9" + +extern int ksmbd_debug_types; + +#define KSMBD_DEBUG_SMB BIT(0) +#define KSMBD_DEBUG_AUTH BIT(1) +#define KSMBD_DEBUG_VFS BIT(2) +#define KSMBD_DEBUG_OPLOCK BIT(3) +#define KSMBD_DEBUG_IPC BIT(4) +#define KSMBD_DEBUG_CONN BIT(5) +#define KSMBD_DEBUG_RDMA BIT(6) +#define KSMBD_DEBUG_ALL (KSMBD_DEBUG_SMB | KSMBD_DEBUG_AUTH | \ + KSMBD_DEBUG_VFS | KSMBD_DEBUG_OPLOCK | \ + KSMBD_DEBUG_IPC | KSMBD_DEBUG_CONN | \ + KSMBD_DEBUG_RDMA) + +#ifdef pr_fmt +#undef pr_fmt +#endif + +#ifdef SUBMOD_NAME +#define pr_fmt(fmt) "ksmbd: " SUBMOD_NAME ": " fmt +#else +#define pr_fmt(fmt) "ksmbd: " fmt +#endif + +#define ksmbd_debug(type, fmt, ...) 
\ + do { \ + if (ksmbd_debug_types & KSMBD_DEBUG_##type) \ + pr_info(fmt, ##__VA_ARGS__); \ + } while (0) + +#define UNICODE_LEN(x) ((x) * 2) + +#endif /* __KSMBD_GLOB_H */ diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h new file mode 100644 index 000000000000..2fbe2bc1e093 --- /dev/null +++ b/fs/ksmbd/ksmbd_netlink.h @@ -0,0 +1,395 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + * + * linux-ksmbd-devel@lists.sourceforge.net + */ + +#ifndef _LINUX_KSMBD_SERVER_H +#define _LINUX_KSMBD_SERVER_H + +#include <linux/types.h> + +/* + * This is a userspace ABI to communicate data between ksmbd and user IPC + * daemon using netlink. This is added to track and cache user account DB + * and share configuration info from userspace. + * + * - KSMBD_EVENT_HEARTBEAT_REQUEST(ksmbd_heartbeat) + * This event is to check whether the user IPC daemon is alive. If the + * user IPC daemon is dead, ksmbd keeps existing connections until they + * disconnect, and new connections are denied. + * + * - KSMBD_EVENT_STARTING_UP(ksmbd_startup_request) + * This event is to receive the information that initializes the ksmbd + * server from the user IPC daemon and to start the server. The global + * section parameters are given from smb.conf as initialization + * information. + * + * - KSMBD_EVENT_SHUTTING_DOWN(ksmbd_shutdown_request) + * This event is to shutdown ksmbd server. + * + * - KSMBD_EVENT_LOGIN_REQUEST/RESPONSE(ksmbd_login_request/response) + * This event is to get user account info from the user IPC daemon. + * + * - KSMBD_EVENT_SHARE_CONFIG_REQUEST/RESPONSE(ksmbd_share_config_request/response) + * This event is to get net share configuration info. + * + * - KSMBD_EVENT_TREE_CONNECT_REQUEST/RESPONSE(ksmbd_tree_connect_request/response) + * This event is to get session and tree connect info. + * + * - KSMBD_EVENT_TREE_DISCONNECT_REQUEST(ksmbd_tree_disconnect_request) + * This event is to send tree disconnect info to user IPC daemon. + * + * - KSMBD_EVENT_LOGOUT_REQUEST(ksmbd_logout_request) + * This event is to send logout request to user IPC daemon. + * + * - KSMBD_EVENT_RPC_REQUEST/RESPONSE(ksmbd_rpc_command) + * This event is to have DCE/RPC requests like srvsvc, wkssvc, lsarpc, + * and samr processed in userspace. + * + * - KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST/RESPONSE(ksmbd_spnego_authen_request/response) + * This event is to have Kerberos authentication processed in + * userspace. + */ + +#define KSMBD_GENL_NAME "SMBD_GENL" +#define KSMBD_GENL_VERSION 0x01 + +#define KSMBD_REQ_MAX_ACCOUNT_NAME_SZ 48 +#define KSMBD_REQ_MAX_HASH_SZ 18 +#define KSMBD_REQ_MAX_SHARE_NAME 64 + +/* + * IPC heartbeat frame to check whether user IPC daemon is alive. + */ +struct ksmbd_heartbeat { + __u32 handle; +}; + +/* + * Global config flags.
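+ * (Advertised by the user-space daemon in ksmbd_startup_request.flags.)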
+ */ +#define KSMBD_GLOBAL_FLAG_INVALID (0) +#define KSMBD_GLOBAL_FLAG_SMB2_LEASES BIT(0) +#define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION BIT(1) +#define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL BIT(2) + +/* + * IPC request for ksmbd server startup + */ +struct ksmbd_startup_request { + __u32 flags; /* Flags for global config */ + __s32 signing; /* Signing enabled */ + __s8 min_prot[16]; /* The minimum SMB protocol version */ + __s8 max_prot[16]; /* The maximum SMB protocol version */ + __s8 netbios_name[16]; + __s8 work_group[64]; /* Workgroup */ + __s8 server_string[64]; /* Server string */ + __u16 tcp_port; /* tcp port */ + __u16 ipc_timeout; /* + * specifies the number of seconds + * server will wait for the userspace to + * reply to heartbeat frames. + */ + __u32 deadtime; /* Number of minutes of inactivity */ + __u32 file_max; /* Limits the maximum number of open files */ + __u32 smb2_max_write; /* MAX write size */ + __u32 smb2_max_read; /* MAX read size */ + __u32 smb2_max_trans; /* MAX trans size */ + __u32 share_fake_fscaps; /* + * Support some special application that + * makes QFSINFO calls to check whether + * we set the SPARSE_FILES bit (0x40). + */ + __u32 sub_auth[3]; /* Subauth value for Security ID */ + __u32 ifc_list_sz; /* interfaces list size */ + __s8 ____payload[]; +}; + +#define KSMBD_STARTUP_CONFIG_INTERFACES(s) ((s)->____payload) + +/* + * IPC request to shutdown ksmbd server. + */ +struct ksmbd_shutdown_request { + __s32 reserved; +}; + +/* + * IPC user login request. + */ +struct ksmbd_login_request { + __u32 handle; + __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ +}; + +/* + * IPC user login response. + */ +struct ksmbd_login_response { + __u32 handle; + __u32 gid; /* group id */ + __u32 uid; /* user id */ + __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ + __u16 status; + __u16 hash_sz; /* hash size */ + __s8 hash[KSMBD_REQ_MAX_HASH_SZ]; /* password hash */ +}; + +/* + * IPC request to fetch net share config. + */ +struct ksmbd_share_config_request { + __u32 handle; + __s8 share_name[KSMBD_REQ_MAX_SHARE_NAME]; /* share name */ +}; + +/* + * IPC response to the net share config request. + */ +struct ksmbd_share_config_response { + __u32 handle; + __u32 flags; + __u16 create_mask; + __u16 directory_mask; + __u16 force_create_mode; + __u16 force_directory_mode; + __u16 force_uid; + __u16 force_gid; + __u32 veto_list_sz; + __s8 ____payload[]; +}; + +#define KSMBD_SHARE_CONFIG_VETO_LIST(s) ((s)->____payload) + +static inline char * +ksmbd_share_config_path(struct ksmbd_share_config_response *sc) +{ + char *p = sc->____payload; + + if (sc->veto_list_sz) + p += sc->veto_list_sz + 1; + + return p; +} + +/* + * IPC request for tree connection. This request includes session and tree + * connect info from the client. + */ +struct ksmbd_tree_connect_request { + __u32 handle; + __u16 account_flags; + __u16 flags; + __u64 session_id; + __u64 connect_id; + __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; + __s8 share[KSMBD_REQ_MAX_SHARE_NAME]; + __s8 peer_addr[64]; +}; + +/* + * IPC Response structure for tree connection. + */ +struct ksmbd_tree_connect_response { + __u32 handle; + __u16 status; + __u16 connection_flags; +}; + +/* + * IPC Request structure to disconnect tree connection. + */ +struct ksmbd_tree_disconnect_request { + __u64 session_id; /* session id */ + __u64 connect_id; /* tree connection id */ +}; + +/* + * IPC Request structure to logout user account.
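+ * (Fire-and-forget: enum ksmbd_event defines no LOGOUT_RESPONSE, so the + * daemon sends no reply.)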
+ */ +struct ksmbd_logout_request { + __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ +}; + +/* + * RPC command structure to send rpc request like srvsvc or wkssvc to + * IPC user daemon. + */ +struct ksmbd_rpc_command { + __u32 handle; + __u32 flags; + __u32 payload_sz; + __u8 payload[]; +}; + +/* + * IPC Request Kerberos authentication + */ +struct ksmbd_spnego_authen_request { + __u32 handle; + __u16 spnego_blob_len; /* the length of spnego_blob */ + __u8 spnego_blob[0]; /* + * the GSS token from SecurityBuffer of + * SMB2 SESSION SETUP request + */ +}; + +/* + * Response data which includes the GSS token and the session key generated by + * the user daemon. + */ +struct ksmbd_spnego_authen_response { + __u32 handle; + struct ksmbd_login_response login_response; /* + * the login response with + * a user identified by the + * GSS token from a client + */ + __u16 session_key_len; /* the length of the session key */ + __u16 spnego_blob_len; /* + * the length of the GSS token which will be + * stored in SecurityBuffer of SMB2 SESSION + * SETUP response + */ + __u8 payload[]; /* session key + AP_REP */ +}; + +/* + * This is also used as a NETLINK attribute type value. + * + * NOTE: + * Response message type value should be equal to + * request message type value + 1. + */ +enum ksmbd_event { + KSMBD_EVENT_UNSPEC = 0, + KSMBD_EVENT_HEARTBEAT_REQUEST, + + KSMBD_EVENT_STARTING_UP, + KSMBD_EVENT_SHUTTING_DOWN, + + KSMBD_EVENT_LOGIN_REQUEST, + KSMBD_EVENT_LOGIN_RESPONSE = 5, + + KSMBD_EVENT_SHARE_CONFIG_REQUEST, + KSMBD_EVENT_SHARE_CONFIG_RESPONSE, + + KSMBD_EVENT_TREE_CONNECT_REQUEST, + KSMBD_EVENT_TREE_CONNECT_RESPONSE, + + KSMBD_EVENT_TREE_DISCONNECT_REQUEST = 10, + + KSMBD_EVENT_LOGOUT_REQUEST, + + KSMBD_EVENT_RPC_REQUEST, + KSMBD_EVENT_RPC_RESPONSE, + + KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, + KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15, + + KSMBD_EVENT_MAX +}; + +/* + * Enumeration for IPC tree connect status. + */ +enum KSMBD_TREE_CONN_STATUS { + KSMBD_TREE_CONN_STATUS_OK = 0, + KSMBD_TREE_CONN_STATUS_NOMEM, + KSMBD_TREE_CONN_STATUS_NO_SHARE, + KSMBD_TREE_CONN_STATUS_NO_USER, + KSMBD_TREE_CONN_STATUS_INVALID_USER, + KSMBD_TREE_CONN_STATUS_HOST_DENIED = 5, + KSMBD_TREE_CONN_STATUS_CONN_EXIST, + KSMBD_TREE_CONN_STATUS_TOO_MANY_CONNS, + KSMBD_TREE_CONN_STATUS_TOO_MANY_SESSIONS, + KSMBD_TREE_CONN_STATUS_ERROR, +}; + +/* + * User config flags. + */ +#define KSMBD_USER_FLAG_INVALID (0) +#define KSMBD_USER_FLAG_OK BIT(0) +#define KSMBD_USER_FLAG_BAD_PASSWORD BIT(1) +#define KSMBD_USER_FLAG_BAD_UID BIT(2) +#define KSMBD_USER_FLAG_BAD_USER BIT(3) +#define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4) + +/* + * Share config flags. + */ +#define KSMBD_SHARE_FLAG_INVALID (0) +#define KSMBD_SHARE_FLAG_AVAILABLE BIT(0) +#define KSMBD_SHARE_FLAG_BROWSEABLE BIT(1) +#define KSMBD_SHARE_FLAG_WRITEABLE BIT(2) +#define KSMBD_SHARE_FLAG_READONLY BIT(3) +#define KSMBD_SHARE_FLAG_GUEST_OK BIT(4) +#define KSMBD_SHARE_FLAG_GUEST_ONLY BIT(5) +#define KSMBD_SHARE_FLAG_STORE_DOS_ATTRS BIT(6) +#define KSMBD_SHARE_FLAG_OPLOCKS BIT(7) +#define KSMBD_SHARE_FLAG_PIPE BIT(8) +#define KSMBD_SHARE_FLAG_HIDE_DOT_FILES BIT(9) +#define KSMBD_SHARE_FLAG_INHERIT_OWNER BIT(10) +#define KSMBD_SHARE_FLAG_STREAMS BIT(11) +#define KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS BIT(12) +#define KSMBD_SHARE_FLAG_ACL_XATTR BIT(13) + +/* + * Tree connect request flags. + */ +#define KSMBD_TREE_CONN_FLAG_REQUEST_SMB1 (0) +#define KSMBD_TREE_CONN_FLAG_REQUEST_IPV6 BIT(0) +#define KSMBD_TREE_CONN_FLAG_REQUEST_SMB2 BIT(1) + +/* + * Tree connect flags.
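+ * (Returned to the server in ksmbd_tree_connect_response.connection_flags.)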
+ */ +#define KSMBD_TREE_CONN_FLAG_GUEST_ACCOUNT BIT(0) +#define KSMBD_TREE_CONN_FLAG_READ_ONLY BIT(1) +#define KSMBD_TREE_CONN_FLAG_WRITABLE BIT(2) +#define KSMBD_TREE_CONN_FLAG_ADMIN_ACCOUNT BIT(3) + +/* + * RPC over IPC. + */ +#define KSMBD_RPC_METHOD_RETURN BIT(0) +#define KSMBD_RPC_SRVSVC_METHOD_INVOKE BIT(1) +#define KSMBD_RPC_SRVSVC_METHOD_RETURN (KSMBD_RPC_SRVSVC_METHOD_INVOKE | KSMBD_RPC_METHOD_RETURN) +#define KSMBD_RPC_WKSSVC_METHOD_INVOKE BIT(2) +#define KSMBD_RPC_WKSSVC_METHOD_RETURN (KSMBD_RPC_WKSSVC_METHOD_INVOKE | KSMBD_RPC_METHOD_RETURN) +#define KSMBD_RPC_IOCTL_METHOD (BIT(3) | KSMBD_RPC_METHOD_RETURN) +#define KSMBD_RPC_OPEN_METHOD BIT(4) +#define KSMBD_RPC_WRITE_METHOD BIT(5) +#define KSMBD_RPC_READ_METHOD (BIT(6) | KSMBD_RPC_METHOD_RETURN) +#define KSMBD_RPC_CLOSE_METHOD BIT(7) +#define KSMBD_RPC_RAP_METHOD (BIT(8) | KSMBD_RPC_METHOD_RETURN) +#define KSMBD_RPC_RESTRICTED_CONTEXT BIT(9) +#define KSMBD_RPC_SAMR_METHOD_INVOKE BIT(10) +#define KSMBD_RPC_SAMR_METHOD_RETURN (KSMBD_RPC_SAMR_METHOD_INVOKE | KSMBD_RPC_METHOD_RETURN) +#define KSMBD_RPC_LSARPC_METHOD_INVOKE BIT(11) +#define KSMBD_RPC_LSARPC_METHOD_RETURN (KSMBD_RPC_LSARPC_METHOD_INVOKE | KSMBD_RPC_METHOD_RETURN) + +/* + * RPC status definitions. + */ +#define KSMBD_RPC_OK 0 +#define KSMBD_RPC_EBAD_FUNC 0x00000001 +#define KSMBD_RPC_EACCESS_DENIED 0x00000005 +#define KSMBD_RPC_EBAD_FID 0x00000006 +#define KSMBD_RPC_ENOMEM 0x00000008 +#define KSMBD_RPC_EBAD_DATA 0x0000000D +#define KSMBD_RPC_ENOTIMPLEMENTED 0x00000040 +#define KSMBD_RPC_EINVALID_PARAMETER 0x00000057 +#define KSMBD_RPC_EMORE_DATA 0x000000EA +#define KSMBD_RPC_EINVALID_LEVEL 0x0000007C +#define KSMBD_RPC_SOME_NOT_MAPPED 0x00000107 + +#define KSMBD_CONFIG_OPT_DISABLED 0 +#define KSMBD_CONFIG_OPT_ENABLED 1 +#define KSMBD_CONFIG_OPT_AUTO 2 +#define KSMBD_CONFIG_OPT_MANDATORY 3 + +#endif /* _LINUX_KSMBD_SERVER_H */ diff --git a/fs/ksmbd/ksmbd_spnego_negtokeninit.asn1 b/fs/ksmbd/ksmbd_spnego_negtokeninit.asn1 new file mode 100644 index 000000000000..0065f191b54b --- /dev/null +++ b/fs/ksmbd/ksmbd_spnego_negtokeninit.asn1 @@ -0,0 +1,31 @@ +GSSAPI ::= + [APPLICATION 0] IMPLICIT SEQUENCE { + thisMech + OBJECT IDENTIFIER ({ksmbd_gssapi_this_mech}), + negotiationToken + NegotiationToken + } + +MechType ::= OBJECT IDENTIFIER ({ksmbd_neg_token_init_mech_type}) + +MechTypeList ::= SEQUENCE OF MechType + +NegTokenInit ::= + SEQUENCE { + mechTypes + [0] MechTypeList, + reqFlags + [1] BIT STRING OPTIONAL, + mechToken + [2] OCTET STRING OPTIONAL ({ksmbd_neg_token_init_mech_token}), + mechListMIC + [3] OCTET STRING OPTIONAL + } + +NegotiationToken ::= + CHOICE { + negTokenInit + [0] NegTokenInit, + negTokenTarg + [1] ANY + } diff --git a/fs/ksmbd/ksmbd_spnego_negtokentarg.asn1 b/fs/ksmbd/ksmbd_spnego_negtokentarg.asn1 new file mode 100644 index 000000000000..1151933e7b9c --- /dev/null +++ b/fs/ksmbd/ksmbd_spnego_negtokentarg.asn1 @@ -0,0 +1,19 @@ +GSSAPI ::= + CHOICE { + negTokenInit + [0] ANY, + negTokenTarg + [1] NegTokenTarg + } + +NegTokenTarg ::= + SEQUENCE { + negResult + [0] ENUMERATED OPTIONAL, + supportedMech + [1] OBJECT IDENTIFIER OPTIONAL, + responseToken + [2] OCTET STRING OPTIONAL ({ksmbd_neg_token_targ_resp_token}), + mechListMIC + [3] OCTET STRING OPTIONAL + } diff --git a/fs/ksmbd/ksmbd_work.c b/fs/ksmbd/ksmbd_work.c new file mode 100644 index 000000000000..fd58eb4809f6 --- /dev/null +++ b/fs/ksmbd/ksmbd_work.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2019 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/workqueue.h> + +#include "server.h" +#include "connection.h" +#include "ksmbd_work.h" +#include "mgmt/ksmbd_ida.h" + +static struct kmem_cache *work_cache; +static struct workqueue_struct *ksmbd_wq; + +struct ksmbd_work *ksmbd_alloc_work_struct(void) +{ + struct ksmbd_work *work = kmem_cache_zalloc(work_cache, GFP_KERNEL); + + if (work) { + work->compound_fid = KSMBD_NO_FID; + work->compound_pfid = KSMBD_NO_FID; + INIT_LIST_HEAD(&work->request_entry); + INIT_LIST_HEAD(&work->async_request_entry); + INIT_LIST_HEAD(&work->fp_entry); + INIT_LIST_HEAD(&work->interim_entry); + } + return work; +} + +void ksmbd_free_work_struct(struct ksmbd_work *work) +{ + WARN_ON(work->saved_cred != NULL); + + kvfree(work->response_buf); + kvfree(work->aux_payload_buf); + kfree(work->tr_buf); + kvfree(work->request_buf); + if (work->async_id) + ksmbd_release_id(&work->conn->async_ida, work->async_id); + kmem_cache_free(work_cache, work); +} + +void ksmbd_work_pool_destroy(void) +{ + kmem_cache_destroy(work_cache); +} + +int ksmbd_work_pool_init(void) +{ + work_cache = kmem_cache_create("ksmbd_work_cache", + sizeof(struct ksmbd_work), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!work_cache) + return -ENOMEM; + return 0; +} + +int ksmbd_workqueue_init(void) +{ + ksmbd_wq = alloc_workqueue("ksmbd-io", 0, 0); + if (!ksmbd_wq) + return -ENOMEM; + return 0; +} + +void ksmbd_workqueue_destroy(void) +{ + flush_workqueue(ksmbd_wq); + destroy_workqueue(ksmbd_wq); + ksmbd_wq = NULL; +} + +bool ksmbd_queue_work(struct ksmbd_work *work) +{ + return queue_work(ksmbd_wq, &work->work); +} diff --git a/fs/ksmbd/ksmbd_work.h b/fs/ksmbd/ksmbd_work.h new file mode 100644 index 000000000000..f7156bc50049 --- /dev/null +++ b/fs/ksmbd/ksmbd_work.h @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2019 Samsung Electronics Co., Ltd. + */ + +#ifndef __KSMBD_WORK_H__ +#define __KSMBD_WORK_H__ + +#include <linux/ctype.h> +#include <linux/workqueue.h> + +struct ksmbd_conn; +struct ksmbd_session; +struct ksmbd_tree_connect; + +enum { + KSMBD_WORK_ACTIVE = 0, + KSMBD_WORK_CANCELLED, + KSMBD_WORK_CLOSED, +}; + +/* one of these for every pending CIFS request on the connection */ +struct ksmbd_work { + /* Server corresponding to this mid */ + struct ksmbd_conn *conn; + struct ksmbd_session *sess; + struct ksmbd_tree_connect *tcon; + + /* Pointer to received SMB header */ + void *request_buf; + /* Response buffer */ + void *response_buf; + + /* Read data buffer */ + void *aux_payload_buf; + + /* Next cmd hdr in compound req buf */ + int next_smb2_rcv_hdr_off; + /* Next cmd hdr in compound rsp buf */ + int next_smb2_rsp_hdr_off; + + /* + * Current local FID assigned to the compound response if an SMB2 + * CREATE command is present in the compound request + */ + u64 compound_fid; + u64 compound_pfid; + u64 compound_sid; + + const struct cred *saved_cred; + + /* Number of granted credits */ + unsigned int credits_granted; + + /* response smb header size */ + unsigned int resp_hdr_sz; + unsigned int response_sz; + /* Read data count */ + unsigned int aux_payload_sz; + + void *tr_buf; + + unsigned char state; + /* Multiple responses for one request e.g.
SMB ECHO */ + bool multiRsp:1; + /* No response for cancelled request */ + bool send_no_response:1; + /* Request is encrypted */ + bool encrypted:1; + /* Is this SYNC or ASYNC ksmbd_work */ + bool syncronous:1; + bool need_invalidate_rkey:1; + + unsigned int remote_key; + /* cancel works */ + int async_id; + void **cancel_argv; + void (*cancel_fn)(void **argv); + + struct work_struct work; + /* List head at conn->requests */ + struct list_head request_entry; + /* List head at conn->async_requests */ + struct list_head async_request_entry; + struct list_head fp_entry; + struct list_head interim_entry; +}; + +/** + * ksmbd_resp_buf_next - Get next buffer on compound response. + * @work: smb work containing response buffer + */ +static inline void *ksmbd_resp_buf_next(struct ksmbd_work *work) +{ + return work->response_buf + work->next_smb2_rsp_hdr_off; +} + +/** + * ksmbd_req_buf_next - Get next buffer on compound request. + * @work: smb work containing request buffer + */ +static inline void *ksmbd_req_buf_next(struct ksmbd_work *work) +{ + return work->request_buf + work->next_smb2_rcv_hdr_off; +} + +struct ksmbd_work *ksmbd_alloc_work_struct(void); +void ksmbd_free_work_struct(struct ksmbd_work *work); + +void ksmbd_work_pool_destroy(void); +int ksmbd_work_pool_init(void); + +int ksmbd_workqueue_init(void); +void ksmbd_workqueue_destroy(void); +bool ksmbd_queue_work(struct ksmbd_work *work); + +#endif /* __KSMBD_WORK_H__ */
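With the allocation, queueing, and teardown entry points declared above (and implemented in ksmbd_work.c), the intended per-request flow can be sketched as follows. This is an illustration only, assuming ksmbd_work_pool_init() and ksmbd_workqueue_init() have already run; smb_handler() and dispatch_request() are hypothetical names, not functions from this patch:

/* Illustration only: how a connection might hand one request to the pool. */
static void smb_handler(struct work_struct *ws)
{
	struct ksmbd_work *work = container_of(ws, struct ksmbd_work, work);

	/* ... parse work->request_buf, fill work->response_buf ... */
	ksmbd_free_work_struct(work);	/* also kvfree()s the buffers */
}

static int dispatch_request(struct ksmbd_conn *conn, void *req_buf)
{
	struct ksmbd_work *work = ksmbd_alloc_work_struct();

	if (!work)
		return -ENOMEM;
	work->conn = conn;
	/* req_buf is assumed kvmalloc'ed, since the free path uses kvfree() */
	work->request_buf = req_buf;
	INIT_WORK(&work->work, smb_handler);
	return ksmbd_queue_work(work) ? 0 : -EBUSY;
}

diff --git a/fs/ksmbd/mgmt/ksmbd_ida.c b/fs/ksmbd/mgmt/ksmbd_ida.c new file mode 100644 index 000000000000..54194d959a5e --- /dev/null +++ b/fs/ksmbd/mgmt/ksmbd_ida.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include "ksmbd_ida.h" + +static inline int __acquire_id(struct ida *ida, int from, int to) +{ + return ida_simple_get(ida, from, to, GFP_KERNEL); +} + +int ksmbd_acquire_smb2_tid(struct ida *ida) +{ + int id; + + id = __acquire_id(ida, 1, 0xFFFFFFFF); + + return id; +} + +int ksmbd_acquire_smb2_uid(struct ida *ida) +{ + int id; + + id = __acquire_id(ida, 1, 0); + if (id == 0xFFFE) + id = __acquire_id(ida, 1, 0); + + return id; +} + +int ksmbd_acquire_async_msg_id(struct ida *ida) +{ + return __acquire_id(ida, 1, 0); +} + +int ksmbd_acquire_id(struct ida *ida) +{ + return __acquire_id(ida, 0, 0); +} + +void ksmbd_release_id(struct ida *ida, int id) +{ + ida_simple_remove(ida, id); +} diff --git a/fs/ksmbd/mgmt/ksmbd_ida.h b/fs/ksmbd/mgmt/ksmbd_ida.h new file mode 100644 index 000000000000..2bc07b16cfde --- /dev/null +++ b/fs/ksmbd/mgmt/ksmbd_ida.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#ifndef __KSMBD_IDA_MANAGEMENT_H__ +#define __KSMBD_IDA_MANAGEMENT_H__ + +#include <linux/slab.h> +#include <linux/idr.h> + +/* + * 2.2.1.6.7 TID Generation + * The value 0xFFFF MUST NOT be used as a valid TID. All other + * possible values for TID, including zero (0x0000), are valid. + * The value 0xFFFF is used to specify all TIDs or no TID, + * depending upon the context in which it is used. + */ +int ksmbd_acquire_smb2_tid(struct ida *ida); + +/* + * 2.2.1.6.8 UID Generation + * The value 0xFFFE was declared reserved in the LAN Manager 1.0 + * documentation, so a value of 0xFFFE SHOULD NOT be used as a + * valid UID.<21> All other possible values for a UID, excluding + * zero (0x0000), are valid.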
+ */ +int ksmbd_acquire_smb2_uid(struct ida *ida); +int ksmbd_acquire_async_msg_id(struct ida *ida); + +int ksmbd_acquire_id(struct ida *ida); + +void ksmbd_release_id(struct ida *ida, int id); +#endif /* __KSMBD_IDA_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/mgmt/share_config.c b/fs/ksmbd/mgmt/share_config.c new file mode 100644 index 000000000000..cb72d30f5b71 --- /dev/null +++ b/fs/ksmbd/mgmt/share_config.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include <linux/list.h> +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/parser.h> +#include <linux/namei.h> +#include <linux/sched.h> +#include <linux/mm.h> + +#include "share_config.h" +#include "user_config.h" +#include "user_session.h" +#include "../transport_ipc.h" + +#define SHARE_HASH_BITS 3 +static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS); +static DECLARE_RWSEM(shares_table_lock); + +struct ksmbd_veto_pattern { + char *pattern; + struct list_head list; +}; + +static unsigned int share_name_hash(char *name) +{ + return jhash(name, strlen(name), 0); +} + +static void kill_share(struct ksmbd_share_config *share) +{ + while (!list_empty(&share->veto_list)) { + struct ksmbd_veto_pattern *p; + + p = list_entry(share->veto_list.next, + struct ksmbd_veto_pattern, + list); + list_del(&p->list); + kfree(p->pattern); + kfree(p); + } + + if (share->path) + path_put(&share->vfs_path); + kfree(share->name); + kfree(share->path); + kfree(share); +} + +void __ksmbd_share_config_put(struct ksmbd_share_config *share) +{ + down_write(&shares_table_lock); + hash_del(&share->hlist); + up_write(&shares_table_lock); + + kill_share(share); +} + +static struct ksmbd_share_config * +__get_share_config(struct ksmbd_share_config *share) +{ + if (!atomic_inc_not_zero(&share->refcount)) + return NULL; + return share; +} + +static struct ksmbd_share_config *__share_lookup(char *name) +{ + struct ksmbd_share_config *share; + unsigned int key = share_name_hash(name); + + hash_for_each_possible(shares_table, share, hlist, key) { + if (!strcmp(name, share->name)) + return share; + } + return NULL; +} + +static int parse_veto_list(struct ksmbd_share_config *share, + char *veto_list, + int veto_list_sz) +{ + int sz = 0; + + if (!veto_list_sz) + return 0; + + while (veto_list_sz > 0) { + struct ksmbd_veto_pattern *p; + + sz = strlen(veto_list); + if (!sz) + break; + + p = kzalloc(sizeof(struct ksmbd_veto_pattern), GFP_KERNEL); + if (!p) + return -ENOMEM; + + p->pattern = kstrdup(veto_list, GFP_KERNEL); + if (!p->pattern) { + kfree(p); + return -ENOMEM; + } + + list_add(&p->list, &share->veto_list); + + veto_list += sz + 1; + veto_list_sz -= (sz + 1); + } + + return 0; +} + +static struct ksmbd_share_config *share_config_request(char *name) +{ + struct ksmbd_share_config_response *resp; + struct ksmbd_share_config *share = NULL; + struct ksmbd_share_config *lookup; + int ret; + + resp = ksmbd_ipc_share_config_request(name); + if (!resp) + return NULL; + + if (resp->flags == KSMBD_SHARE_FLAG_INVALID) + goto out; + + share = kzalloc(sizeof(struct ksmbd_share_config), GFP_KERNEL); + if (!share) + goto out; + + share->flags = resp->flags; + atomic_set(&share->refcount, 1); + INIT_LIST_HEAD(&share->veto_list); + share->name = kstrdup(name, GFP_KERNEL); + + if (!test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) { + share->path = kstrdup(ksmbd_share_config_path(resp), + GFP_KERNEL); + if (share->path) + share->path_sz = strlen(share->path); + 
share->create_mask = resp->create_mask; + share->directory_mask = resp->directory_mask; + share->force_create_mode = resp->force_create_mode; + share->force_directory_mode = resp->force_directory_mode; + share->force_uid = resp->force_uid; + share->force_gid = resp->force_gid; + ret = parse_veto_list(share, + KSMBD_SHARE_CONFIG_VETO_LIST(resp), + resp->veto_list_sz); + if (!ret && share->path) { + ret = kern_path(share->path, 0, &share->vfs_path); + if (ret) { + ksmbd_debug(SMB, "failed to access '%s'\n", + share->path); + /* Avoid path_put() in kill_share() */ + kfree(share->path); + share->path = NULL; + } + } + if (ret || !share->name) { + kill_share(share); + share = NULL; + goto out; + } + } + + down_write(&shares_table_lock); + lookup = __share_lookup(name); + if (lookup) + lookup = __get_share_config(lookup); + if (!lookup) { + hash_add(shares_table, &share->hlist, share_name_hash(name)); + } else { + kill_share(share); + share = lookup; + } + up_write(&shares_table_lock); + +out: + kvfree(resp); + return share; +} + +static void strtolower(char *share_name) +{ + while (*share_name) { + *share_name = tolower(*share_name); + share_name++; + } +} + +struct ksmbd_share_config *ksmbd_share_config_get(char *name) +{ + struct ksmbd_share_config *share; + + strtolower(name); + + down_read(&shares_table_lock); + share = __share_lookup(name); + if (share) + share = __get_share_config(share); + up_read(&shares_table_lock); + + if (share) + return share; + return share_config_request(name); +} + +bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, + const char *filename) +{ + struct ksmbd_veto_pattern *p; + + list_for_each_entry(p, &share->veto_list, list) { + if (match_wildcard(p->pattern, filename)) + return true; + } + return false; +} + +void ksmbd_share_configs_cleanup(void) +{ + struct ksmbd_share_config *share; + struct hlist_node *tmp; + int i; + + down_write(&shares_table_lock); + hash_for_each_safe(shares_table, i, tmp, share, hlist) { + hash_del(&share->hlist); + kill_share(share); + } + up_write(&shares_table_lock); +} diff --git a/fs/ksmbd/mgmt/share_config.h b/fs/ksmbd/mgmt/share_config.h new file mode 100644 index 000000000000..953befc94e84 --- /dev/null +++ b/fs/ksmbd/mgmt/share_config.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd.
+ */ + +#ifndef __SHARE_CONFIG_MANAGEMENT_H__ +#define __SHARE_CONFIG_MANAGEMENT_H__ + +#include <linux/workqueue.h> +#include <linux/hashtable.h> +#include <linux/path.h> + +struct ksmbd_share_config { + char *name; + char *path; + + unsigned int path_sz; + unsigned int flags; + struct list_head veto_list; + + struct path vfs_path; + + atomic_t refcount; + struct hlist_node hlist; + unsigned short create_mask; + unsigned short directory_mask; + unsigned short force_create_mode; + unsigned short force_directory_mode; + unsigned short force_uid; + unsigned short force_gid; +}; + +#define KSMBD_SHARE_INVALID_UID ((__u16)-1) +#define KSMBD_SHARE_INVALID_GID ((__u16)-1) + +static inline int share_config_create_mode(struct ksmbd_share_config *share, + umode_t posix_mode) +{ + if (!share->force_create_mode) { + if (!posix_mode) + return share->create_mask; + else + return posix_mode & share->create_mask; + } + return share->force_create_mode & share->create_mask; +} + +static inline int share_config_directory_mode(struct ksmbd_share_config *share, + umode_t posix_mode) +{ + if (!share->force_directory_mode) { + if (!posix_mode) + return share->directory_mask; + else + return posix_mode & share->directory_mask; + } + + return share->force_directory_mode & share->directory_mask; +} + +static inline int test_share_config_flag(struct ksmbd_share_config *share, + int flag) +{ + return share->flags & flag; +} + +void __ksmbd_share_config_put(struct ksmbd_share_config *share); + +static inline void ksmbd_share_config_put(struct ksmbd_share_config *share) +{ + if (!atomic_dec_and_test(&share->refcount)) + return; + __ksmbd_share_config_put(share); +} + +struct ksmbd_share_config *ksmbd_share_config_get(char *name); +bool ksmbd_share_veto_filename(struct ksmbd_share_config *share, + const char *filename); +void ksmbd_share_configs_cleanup(void); + +#endif /* __SHARE_CONFIG_MANAGEMENT_H__ */
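ksmbd_share_config_get() transparently falls back to an IPC round trip when the share is not cached, and every successful lookup must be balanced with ksmbd_share_config_put(). A sketch of the expected usage (the wrapper below is illustrative, not part of the patch):

/*
 * Illustration only: look up a share, test a flag, drop the reference.
 * Note that 'name' must be a writable string, since the lookup
 * lowercases it in place via strtolower().
 */
static bool share_is_readonly(char *name)
{
	struct ksmbd_share_config *share = ksmbd_share_config_get(name);
	bool ro;

	if (!share)
		return false;
	ro = test_share_config_flag(share, KSMBD_SHARE_FLAG_READONLY);
	ksmbd_share_config_put(share);
	return ro;
}

diff --git a/fs/ksmbd/mgmt/tree_connect.c b/fs/ksmbd/mgmt/tree_connect.c new file mode 100644 index 000000000000..0d28e723a28c --- /dev/null +++ b/fs/ksmbd/mgmt/tree_connect.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd.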
+ */ + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/xarray.h> + +#include "../transport_ipc.h" +#include "../connection.h" + +#include "tree_connect.h" +#include "user_config.h" +#include "share_config.h" +#include "user_session.h" + +struct ksmbd_tree_conn_status +ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name) +{ + struct ksmbd_tree_conn_status status = {-EINVAL, NULL}; + struct ksmbd_tree_connect_response *resp = NULL; + struct ksmbd_share_config *sc; + struct ksmbd_tree_connect *tree_conn = NULL; + struct sockaddr *peer_addr; + int ret; + + sc = ksmbd_share_config_get(share_name); + if (!sc) + return status; + + tree_conn = kzalloc(sizeof(struct ksmbd_tree_connect), GFP_KERNEL); + if (!tree_conn) { + status.ret = -ENOMEM; + goto out_error; + } + + tree_conn->id = ksmbd_acquire_tree_conn_id(sess); + if (tree_conn->id < 0) { + status.ret = -EINVAL; + goto out_error; + } + + peer_addr = KSMBD_TCP_PEER_SOCKADDR(sess->conn); + resp = ksmbd_ipc_tree_connect_request(sess, + sc, + tree_conn, + peer_addr); + if (!resp) { + status.ret = -EINVAL; + goto out_error; + } + + status.ret = resp->status; + if (status.ret != KSMBD_TREE_CONN_STATUS_OK) + goto out_error; + + tree_conn->flags = resp->connection_flags; + tree_conn->user = sess->user; + tree_conn->share_conf = sc; + status.tree_conn = tree_conn; + + ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn, + GFP_KERNEL)); + if (ret) { + status.ret = -ENOMEM; + goto out_error; + } + kvfree(resp); + return status; + +out_error: + if (tree_conn) + ksmbd_release_tree_conn_id(sess, tree_conn->id); + ksmbd_share_config_put(sc); + kfree(tree_conn); + kvfree(resp); + return status; +} + +int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, + struct ksmbd_tree_connect *tree_conn) +{ + int ret; + + ret = ksmbd_ipc_tree_disconnect_request(sess->id, tree_conn->id); + ksmbd_release_tree_conn_id(sess, tree_conn->id); + xa_erase(&sess->tree_conns, tree_conn->id); + ksmbd_share_config_put(tree_conn->share_conf); + kfree(tree_conn); + return ret; +} + +struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess, + unsigned int id) +{ + return xa_load(&sess->tree_conns, id); +} + +struct ksmbd_share_config *ksmbd_tree_conn_share(struct ksmbd_session *sess, + unsigned int id) +{ + struct ksmbd_tree_connect *tc; + + tc = ksmbd_tree_conn_lookup(sess, id); + if (tc) + return tc->share_conf; + return NULL; +} + +int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess) +{ + int ret = 0; + struct ksmbd_tree_connect *tc; + unsigned long id; + + xa_for_each(&sess->tree_conns, id, tc) + ret |= ksmbd_tree_conn_disconnect(sess, tc); + xa_destroy(&sess->tree_conns); + return ret; +}
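ksmbd_tree_conn_connect() reports failure both through kernel error codes and through the KSMBD_TREE_CONN_STATUS_* values filled in by the user daemon, so callers check status.ret rather than a plain pointer. A hedged sketch of that calling convention (connect_share() is an illustrative name, and the errno mapping is arbitrary):

/* Illustration only: establish a tree connect and return its id. */
static int connect_share(struct ksmbd_session *sess, char *share_name)
{
	struct ksmbd_tree_conn_status status;

	status = ksmbd_tree_conn_connect(sess, share_name);
	if (status.ret != KSMBD_TREE_CONN_STATUS_OK)
		return -EACCES;	/* map the daemon status to an errno as needed */
	return status.tree_conn->id;
}

diff --git a/fs/ksmbd/mgmt/tree_connect.h b/fs/ksmbd/mgmt/tree_connect.h new file mode 100644 index 000000000000..18e2a996e0aa --- /dev/null +++ b/fs/ksmbd/mgmt/tree_connect.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd.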
+ */ + +#ifndef __TREE_CONNECT_MANAGEMENT_H__ +#define __TREE_CONNECT_MANAGEMENT_H__ + +#include <linux/hashtable.h> + +#include "../ksmbd_netlink.h" + +struct ksmbd_share_config; +struct ksmbd_user; + +struct ksmbd_tree_connect { + int id; + + unsigned int flags; + struct ksmbd_share_config *share_conf; + struct ksmbd_user *user; + + struct list_head list; + + int maximal_access; + bool posix_extensions; +}; + +struct ksmbd_tree_conn_status { + unsigned int ret; + struct ksmbd_tree_connect *tree_conn; +}; + +static inline int test_tree_conn_flag(struct ksmbd_tree_connect *tree_conn, + int flag) +{ + return tree_conn->flags & flag; +} + +struct ksmbd_session; + +struct ksmbd_tree_conn_status +ksmbd_tree_conn_connect(struct ksmbd_session *sess, char *share_name); + +int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, + struct ksmbd_tree_connect *tree_conn); + +struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess, + unsigned int id); + +struct ksmbd_share_config *ksmbd_tree_conn_share(struct ksmbd_session *sess, + unsigned int id); + +int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess); + +#endif /* __TREE_CONNECT_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c new file mode 100644 index 000000000000..d21629ae5c89 --- /dev/null +++ b/fs/ksmbd/mgmt/user_config.c @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include <linux/slab.h> +#include <linux/mm.h> + +#include "user_config.h" +#include "../transport_ipc.h" + +struct ksmbd_user *ksmbd_login_user(const char *account) +{ + struct ksmbd_login_response *resp; + struct ksmbd_user *user = NULL; + + resp = ksmbd_ipc_login_request(account); + if (!resp) + return NULL; + + if (!(resp->status & KSMBD_USER_FLAG_OK)) + goto out; + + user = ksmbd_alloc_user(resp); +out: + kvfree(resp); + return user; +} + +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) +{ + struct ksmbd_user *user = NULL; + + user = kmalloc(sizeof(struct ksmbd_user), GFP_KERNEL); + if (!user) + return NULL; + + user->name = kstrdup(resp->account, GFP_KERNEL); + user->flags = resp->status; + user->gid = resp->gid; + user->uid = resp->uid; + user->passkey_sz = resp->hash_sz; + user->passkey = kmalloc(resp->hash_sz, GFP_KERNEL); + if (user->passkey) + memcpy(user->passkey, resp->hash, resp->hash_sz); + + if (!user->name || !user->passkey) { + kfree(user->name); + kfree(user->passkey); + kfree(user); + user = NULL; + } + return user; +} + +void ksmbd_free_user(struct ksmbd_user *user) +{ + ksmbd_ipc_logout_request(user->name); + kfree(user->name); + kfree(user->passkey); + kfree(user); +} + +int ksmbd_anonymous_user(struct ksmbd_user *user) +{ + if (user->name[0] == '\0') + return 1; + return 0; +}
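ksmbd_login_user() bundles the IPC round trip and user construction; the returned object owns its name and password hash and is released with ksmbd_free_user(), which also notifies the daemon. A short usage sketch (check_account() is illustrative, not part of the patch):

/* Illustration only: resolve an account and check for a guest mapping. */
static int check_account(const char *account)
{
	struct ksmbd_user *user = ksmbd_login_user(account);

	if (!user)
		return -EPERM;
	if (user_guest(user))
		pr_info("%s mapped to guest\n", user_name(user));
	ksmbd_free_user(user);	/* frees name/passkey, sends logout IPC */
	return 0;
}

diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h new file mode 100644 index 000000000000..b2bb074a0150 --- /dev/null +++ b/fs/ksmbd/mgmt/user_config.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd.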
+ */ + +#ifndef __USER_CONFIG_MANAGEMENT_H__ +#define __USER_CONFIG_MANAGEMENT_H__ + +#include "../glob.h" + +struct ksmbd_user { + unsigned short flags; + + unsigned int uid; + unsigned int gid; + + char *name; + + size_t passkey_sz; + char *passkey; +}; + +static inline bool user_guest(struct ksmbd_user *user) +{ + return user->flags & KSMBD_USER_FLAG_GUEST_ACCOUNT; +} + +static inline void set_user_flag(struct ksmbd_user *user, int flag) +{ + user->flags |= flag; +} + +static inline int test_user_flag(struct ksmbd_user *user, int flag) +{ + return user->flags & flag; +} + +static inline void set_user_guest(struct ksmbd_user *user) +{ +} + +static inline char *user_passkey(struct ksmbd_user *user) +{ + return user->passkey; +} + +static inline char *user_name(struct ksmbd_user *user) +{ + return user->name; +} + +static inline unsigned int user_uid(struct ksmbd_user *user) +{ + return user->uid; +} + +static inline unsigned int user_gid(struct ksmbd_user *user) +{ + return user->gid; +} + +struct ksmbd_user *ksmbd_login_user(const char *account); +struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp); +void ksmbd_free_user(struct ksmbd_user *user); +int ksmbd_anonymous_user(struct ksmbd_user *user); +#endif /* __USER_CONFIG_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/mgmt/user_session.c b/fs/ksmbd/mgmt/user_session.c new file mode 100644 index 000000000000..8d8ffd8c6f19 --- /dev/null +++ b/fs/ksmbd/mgmt/user_session.c @@ -0,0 +1,369 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/xarray.h> + +#include "ksmbd_ida.h" +#include "user_session.h" +#include "user_config.h" +#include "tree_connect.h" +#include "../transport_ipc.h" +#include "../connection.h" +#include "../vfs_cache.h" + +static DEFINE_IDA(session_ida); + +#define SESSION_HASH_BITS 3 +static DEFINE_HASHTABLE(sessions_table, SESSION_HASH_BITS); +static DECLARE_RWSEM(sessions_table_lock); + +struct ksmbd_session_rpc { + int id; + unsigned int method; + struct list_head list; +}; + +static void free_channel_list(struct ksmbd_session *sess) +{ + struct channel *chann, *tmp; + + list_for_each_entry_safe(chann, tmp, &sess->ksmbd_chann_list, + chann_list) { + list_del(&chann->chann_list); + kfree(chann); + } +} + +static void __session_rpc_close(struct ksmbd_session *sess, + struct ksmbd_session_rpc *entry) +{ + struct ksmbd_rpc_command *resp; + + resp = ksmbd_rpc_close(sess, entry->id); + if (!resp) + pr_err("Unable to close RPC pipe %d\n", entry->id); + + kvfree(resp); + ksmbd_rpc_id_free(entry->id); + kfree(entry); +} + +static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess) +{ + struct ksmbd_session_rpc *entry; + + while (!list_empty(&sess->rpc_handle_list)) { + entry = list_entry(sess->rpc_handle_list.next, + struct ksmbd_session_rpc, + list); + + list_del(&entry->list); + __session_rpc_close(sess, entry); + } +} + +static int __rpc_method(char *rpc_name) +{ + if (!strcmp(rpc_name, "\\srvsvc") || !strcmp(rpc_name, "srvsvc")) + return KSMBD_RPC_SRVSVC_METHOD_INVOKE; + + if (!strcmp(rpc_name, "\\wkssvc") || !strcmp(rpc_name, "wkssvc")) + return KSMBD_RPC_WKSSVC_METHOD_INVOKE; + + if (!strcmp(rpc_name, "LANMAN") || !strcmp(rpc_name, "lanman")) + return KSMBD_RPC_RAP_METHOD; + + if (!strcmp(rpc_name, "\\samr") || !strcmp(rpc_name, "samr")) + return KSMBD_RPC_SAMR_METHOD_INVOKE; + + if (!strcmp(rpc_name, "\\lsarpc") || !strcmp(rpc_name, "lsarpc")) + 
return KSMBD_RPC_LSARPC_METHOD_INVOKE; + + pr_err("Unsupported RPC: %s\n", rpc_name); + return 0; +} + +int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) +{ + struct ksmbd_session_rpc *entry; + struct ksmbd_rpc_command *resp; + int method; + + method = __rpc_method(rpc_name); + if (!method) + return -EINVAL; + + entry = kzalloc(sizeof(struct ksmbd_session_rpc), GFP_KERNEL); + if (!entry) + return -EINVAL; + + list_add(&entry->list, &sess->rpc_handle_list); + entry->method = method; + entry->id = ksmbd_ipc_id_alloc(); + if (entry->id < 0) + goto error; + + resp = ksmbd_rpc_open(sess, entry->id); + if (!resp) + goto error; + + kvfree(resp); + return entry->id; +error: + list_del(&entry->list); + kfree(entry); + return -EINVAL; +} + +void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id) +{ + struct ksmbd_session_rpc *entry; + + list_for_each_entry(entry, &sess->rpc_handle_list, list) { + if (entry->id == id) { + list_del(&entry->list); + __session_rpc_close(sess, entry); + break; + } + } +} + +int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) +{ + struct ksmbd_session_rpc *entry; + + list_for_each_entry(entry, &sess->rpc_handle_list, list) { + if (entry->id == id) + return entry->method; + } + return 0; +} + +void ksmbd_session_destroy(struct ksmbd_session *sess) +{ + if (!sess) + return; + + if (!atomic_dec_and_test(&sess->refcnt)) + return; + + list_del(&sess->sessions_entry); + + down_write(&sessions_table_lock); + hash_del(&sess->hlist); + up_write(&sessions_table_lock); + + if (sess->user) + ksmbd_free_user(sess->user); + + ksmbd_tree_conn_session_logoff(sess); + ksmbd_destroy_file_table(&sess->file_table); + ksmbd_session_rpc_clear_list(sess); + free_channel_list(sess); + kfree(sess->Preauth_HashValue); + ksmbd_release_id(&session_ida, sess->id); + kfree(sess); +} + +static struct ksmbd_session *__session_lookup(unsigned long long id) +{ + struct ksmbd_session *sess; + + hash_for_each_possible(sessions_table, sess, hlist, id) { + if (id == sess->id) + return sess; + } + return NULL; +} + +void ksmbd_session_register(struct ksmbd_conn *conn, + struct ksmbd_session *sess) +{ + sess->conn = conn; + list_add(&sess->sessions_entry, &conn->sessions); +} + +void ksmbd_sessions_deregister(struct ksmbd_conn *conn) +{ + struct ksmbd_session *sess; + + while (!list_empty(&conn->sessions)) { + sess = list_entry(conn->sessions.next, + struct ksmbd_session, + sessions_entry); + + ksmbd_session_destroy(sess); + } +} + +static bool ksmbd_session_id_match(struct ksmbd_session *sess, + unsigned long long id) +{ + return sess->id == id; +} + +struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, + unsigned long long id) +{ + struct ksmbd_session *sess = NULL; + + list_for_each_entry(sess, &conn->sessions, sessions_entry) { + if (ksmbd_session_id_match(sess, id)) + return sess; + } + return NULL; +} + +int get_session(struct ksmbd_session *sess) +{ + return atomic_inc_not_zero(&sess->refcnt); +} + +void put_session(struct ksmbd_session *sess) +{ + if (atomic_dec_and_test(&sess->refcnt)) + pr_err("get/%s seems to be mismatched.", __func__); +} + +struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id) +{ + struct ksmbd_session *sess; + + down_read(&sessions_table_lock); + sess = __session_lookup(id); + if (sess) { + if (!get_session(sess)) + sess = NULL; + } + up_read(&sessions_table_lock); + + return sess; +} + +struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn, + unsigned long long id) +{ + struct 
ksmbd_session *sess; + + sess = ksmbd_session_lookup(conn, id); + if (!sess && conn->binding) + sess = ksmbd_session_lookup_slowpath(id); + return sess; +} + +struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, + u64 sess_id) +{ + struct preauth_session *sess; + + sess = kmalloc(sizeof(struct preauth_session), GFP_KERNEL); + if (!sess) + return NULL; + + sess->id = sess_id; + memcpy(sess->Preauth_HashValue, conn->preauth_info->Preauth_HashValue, + PREAUTH_HASHVALUE_SIZE); + list_add(&sess->preauth_entry, &conn->preauth_sess_table); + + return sess; +} + +static bool ksmbd_preauth_session_id_match(struct preauth_session *sess, + unsigned long long id) +{ + return sess->id == id; +} + +struct preauth_session *ksmbd_preauth_session_lookup(struct ksmbd_conn *conn, + unsigned long long id) +{ + struct preauth_session *sess = NULL; + + list_for_each_entry(sess, &conn->preauth_sess_table, preauth_entry) { + if (ksmbd_preauth_session_id_match(sess, id)) + return sess; + } + return NULL; +} + +static int __init_smb2_session(struct ksmbd_session *sess) +{ + int id = ksmbd_acquire_smb2_uid(&session_ida); + + if (id < 0) + return -EINVAL; + sess->id = id; + return 0; +} + +static struct ksmbd_session *__session_create(int protocol) +{ + struct ksmbd_session *sess; + int ret; + + sess = kzalloc(sizeof(struct ksmbd_session), GFP_KERNEL); + if (!sess) + return NULL; + + if (ksmbd_init_file_table(&sess->file_table)) + goto error; + + set_session_flag(sess, protocol); + INIT_LIST_HEAD(&sess->sessions_entry); + xa_init(&sess->tree_conns); + INIT_LIST_HEAD(&sess->ksmbd_chann_list); + INIT_LIST_HEAD(&sess->rpc_handle_list); + sess->sequence_number = 1; + atomic_set(&sess->refcnt, 1); + + switch (protocol) { + case CIFDS_SESSION_FLAG_SMB2: + ret = __init_smb2_session(sess); + break; + default: + ret = -EINVAL; + break; + } + + if (ret) + goto error; + + ida_init(&sess->tree_conn_ida); + + if (protocol == CIFDS_SESSION_FLAG_SMB2) { + down_write(&sessions_table_lock); + hash_add(sessions_table, &sess->hlist, sess->id); + up_write(&sessions_table_lock); + } + return sess; + +error: + ksmbd_session_destroy(sess); + return NULL; +} + +struct ksmbd_session *ksmbd_smb2_session_create(void) +{ + return __session_create(CIFDS_SESSION_FLAG_SMB2); +} + +int ksmbd_acquire_tree_conn_id(struct ksmbd_session *sess) +{ + int id = -EINVAL; + + if (test_session_flag(sess, CIFDS_SESSION_FLAG_SMB2)) + id = ksmbd_acquire_smb2_tid(&sess->tree_conn_ida); + + return id; +} + +void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id) +{ + if (id >= 0) + ksmbd_release_id(&sess->tree_conn_ida, id); +}
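Sessions looked up through the slow path are reference-counted: ksmbd_session_lookup_slowpath() only returns a session after get_session() succeeds under the table lock, and the caller must pair it with put_session(). A minimal sketch of that contract (touch_session() is an illustrative name):

/* Illustration only: take and drop a session reference by id. */
static int touch_session(unsigned long long id)
{
	struct ksmbd_session *sess = ksmbd_session_lookup_slowpath(id);

	if (!sess)
		return -ENOENT;
	/* ... use sess while the reference is held ... */
	put_session(sess);
	return 0;
}

diff --git a/fs/ksmbd/mgmt/user_session.h b/fs/ksmbd/mgmt/user_session.h new file mode 100644 index 000000000000..82289c3cbd2b --- /dev/null +++ b/fs/ksmbd/mgmt/user_session.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd.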
+ */ + +#ifndef __USER_SESSION_MANAGEMENT_H__ +#define __USER_SESSION_MANAGEMENT_H__ + +#include <linux/hashtable.h> +#include <linux/xarray.h> + +#include "../smb_common.h" +#include "../ntlmssp.h" + +#define CIFDS_SESSION_FLAG_SMB2 BIT(1) + +#define PREAUTH_HASHVALUE_SIZE 64 + +struct ksmbd_file_table; + +struct channel { + __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; + struct ksmbd_conn *conn; + struct list_head chann_list; +}; + +struct preauth_session { + __u8 Preauth_HashValue[PREAUTH_HASHVALUE_SIZE]; + u64 id; + struct list_head preauth_entry; +}; + +struct ksmbd_session { + u64 id; + + struct ksmbd_user *user; + struct ksmbd_conn *conn; + unsigned int sequence_number; + unsigned int flags; + + bool sign; + bool enc; + bool is_anonymous; + + int state; + __u8 *Preauth_HashValue; + + struct ntlmssp_auth ntlmssp; + char sess_key[CIFS_KEY_SIZE]; + + struct hlist_node hlist; + struct list_head ksmbd_chann_list; + struct xarray tree_conns; + struct ida tree_conn_ida; + struct list_head rpc_handle_list; + + __u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_ENC_DEC_KEY_SIZE]; + __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; + + struct list_head sessions_entry; + struct ksmbd_file_table file_table; + atomic_t refcnt; +}; + +static inline int test_session_flag(struct ksmbd_session *sess, int bit) +{ + return sess->flags & bit; +} + +static inline void set_session_flag(struct ksmbd_session *sess, int bit) +{ + sess->flags |= bit; +} + +static inline void clear_session_flag(struct ksmbd_session *sess, int bit) +{ + sess->flags &= ~bit; +} + +struct ksmbd_session *ksmbd_smb2_session_create(void); + +void ksmbd_session_destroy(struct ksmbd_session *sess); + +struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id); +struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, + unsigned long long id); +void ksmbd_session_register(struct ksmbd_conn *conn, + struct ksmbd_session *sess); +void ksmbd_sessions_deregister(struct ksmbd_conn *conn); +struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn, + unsigned long long id); +struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, + u64 sess_id); +struct preauth_session *ksmbd_preauth_session_lookup(struct ksmbd_conn *conn, + unsigned long long id); + +int ksmbd_acquire_tree_conn_id(struct ksmbd_session *sess); +void ksmbd_release_tree_conn_id(struct ksmbd_session *sess, int id); + +int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name); +void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id); +int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id); +int get_session(struct ksmbd_session *sess); +void put_session(struct ksmbd_session *sess); +#endif /* __USER_SESSION_MANAGEMENT_H__ */ diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c new file mode 100644 index 000000000000..0b307ca28a19 --- /dev/null +++ b/fs/ksmbd/misc.c @@ -0,0 +1,338 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include <linux/kernel.h> +#include <linux/xattr.h> +#include <linux/fs.h> + +#include "misc.h" +#include "smb_common.h" +#include "connection.h" +#include "vfs.h" + +#include "mgmt/share_config.h" + +/** + * match_pattern() - compare a string with a pattern which might include + * wildcard '*' and '?' 
+ * TODO : implement consideration about DOS_DOT, DOS_QM and DOS_STAR + * + * @str: string to compare with a pattern + * @len: string length + * @pattern: pattern string which might include wildcard '*' and '?' + * + * Return: 1 if the pattern matches the string, otherwise 0 + */ +int match_pattern(const char *str, size_t len, const char *pattern) +{ + const char *s = str; + const char *p = pattern; + bool star = false; + + while (*s && len) { + switch (*p) { + case '?': + s++; + len--; + p++; + break; + case '*': + star = true; + str = s; + if (!*++p) + return true; + pattern = p; + break; + default: + if (tolower(*s) == tolower(*p)) { + s++; + len--; + p++; + } else { + if (!star) + return false; + str++; + s = str; + p = pattern; + } + break; + } + } + + if (*p == '*') + ++p; + return !*p; +} + +/* + * is_char_allowed() - check for valid character + * @ch: input character to be checked + * + * Return: 1 if char is allowed, otherwise 0 + */ +static inline int is_char_allowed(char ch) +{ + /* check for control chars, wildcards etc. */ + if (!(ch & 0x80) && + (ch <= 0x1f || + ch == '?' || ch == '"' || ch == '<' || + ch == '>' || ch == '|' || ch == '*')) + return 0; + + return 1; +} + +int ksmbd_validate_filename(char *filename) +{ + while (*filename) { + char c = *filename; + + filename++; + if (!is_char_allowed(c)) { + ksmbd_debug(VFS, "File name validation failed: 0x%x\n", c); + return -ENOENT; + } + } + + return 0; +} + +static int ksmbd_validate_stream_name(char *stream_name) +{ + while (*stream_name) { + char c = *stream_name; + + stream_name++; + if (c == '/' || c == ':' || c == '\\') { + pr_err("Stream name validation failed: %c\n", c); + return -ENOENT; + } + } + + return 0; +} + +int parse_stream_name(char *filename, char **stream_name, int *s_type) +{ + char *stream_type; + char *s_name; + int rc = 0; + + s_name = filename; + filename = strsep(&s_name, ":"); + ksmbd_debug(SMB, "filename : %s, streams : %s\n", filename, s_name); + if (strchr(s_name, ':')) { + stream_type = s_name; + s_name = strsep(&stream_type, ":"); + + rc = ksmbd_validate_stream_name(s_name); + if (rc < 0) { + rc = -ENOENT; + goto out; + } + + ksmbd_debug(SMB, "stream name : %s, stream type : %s\n", s_name, + stream_type); + if (!strncasecmp("$data", stream_type, 5)) + *s_type = DATA_STREAM; + else if (!strncasecmp("$index_allocation", stream_type, 17)) + *s_type = DIR_STREAM; + else + rc = -ENOENT; + } + + *stream_name = s_name; +out: + return rc; +} + +/** + * convert_to_nt_pathname() - extract and return the Windows path string + * with the share directory prefix removed from the file path + * @filename: unix filename + * @sharepath: share path string + * + * Return: Windows path string, or NULL on allocation failure + */ +char *convert_to_nt_pathname(char *filename, char *sharepath) +{ + char *ab_pathname; + int len, name_len; + + name_len = strlen(filename); + ab_pathname = kmalloc(name_len, GFP_KERNEL); + if (!ab_pathname) + return NULL; + + ab_pathname[0] = '\\'; + ab_pathname[1] = '\0'; + + len = strlen(sharepath); + if (!strncmp(filename, sharepath, len) && name_len != len) { + strscpy(ab_pathname, &filename[len], name_len); + ksmbd_conv_path_to_windows(ab_pathname); + } + + return ab_pathname; +} + +int get_nlink(struct kstat *st) +{ + int nlink; + + nlink = st->nlink; + if (S_ISDIR(st->mode)) + nlink--; + + return nlink; +} + +void ksmbd_conv_path_to_unix(char *path) +{ + strreplace(path, '\\', '/'); +} + +void ksmbd_strip_last_slash(char *path) +{ + int len = strlen(path); + + while (len && path[len - 1] ==
'/') { + path[len - 1] = '\0'; + len--; + } +} + +void ksmbd_conv_path_to_windows(char *path) +{ + strreplace(path, '/', '\\'); +} + +/** + * ksmbd_extract_sharename() - get share name from tree connect request + * @treename: buffer containing tree name and share name + * + * Return: share name on success, otherwise error + */ +char *ksmbd_extract_sharename(char *treename) +{ + char *name = treename; + char *dst; + char *pos = strrchr(name, '\\'); + + if (pos) + name = (pos + 1); + + /* caller has to free the memory */ + dst = kstrdup(name, GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + return dst; +} + +/** + * convert_to_unix_name() - convert windows name to unix format + * @share: share config of the matching share + * @name: windows-style name to be converted + * + * Return: converted name on success, otherwise NULL + */ +char *convert_to_unix_name(struct ksmbd_share_config *share, char *name) +{ + int no_slash = 0, name_len, path_len; + char *new_name; + + if (name[0] == '/') + name++; + + path_len = share->path_sz; + name_len = strlen(name); + new_name = kmalloc(path_len + name_len + 2, GFP_KERNEL); + if (!new_name) + return new_name; + + memcpy(new_name, share->path, path_len); + if (new_name[path_len - 1] != '/') { + new_name[path_len] = '/'; + no_slash = 1; + } + + memcpy(new_name + path_len + no_slash, name, name_len); + path_len += name_len + no_slash; + new_name[path_len] = 0x00; + return new_name; +} + +char *ksmbd_convert_dir_info_name(struct ksmbd_dir_info *d_info, + const struct nls_table *local_nls, + int *conv_len) +{ + char *conv; + int sz = min(4 * d_info->name_len, PATH_MAX); + + if (!sz) + return NULL; + + conv = kmalloc(sz, GFP_KERNEL); + if (!conv) + return NULL; + + /* XXX */ + *conv_len = smbConvertToUTF16((__le16 *)conv, d_info->name, + d_info->name_len, local_nls, 0); + *conv_len *= 2; + + /* We allocate a buffer twice as big as needed. */ + conv[*conv_len] = 0x00; + conv[*conv_len + 1] = 0x00; + return conv; +} + +/* + * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units) + * into Unix UTC (based 1970-01-01, in seconds). + */ +struct timespec64 ksmbd_NTtimeToUnix(__le64 ntutc) +{ + struct timespec64 ts; + + /* Subtract the NTFS time offset, then convert to 1s intervals. */ + s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; + u64 abs_t; + + /* + * Unfortunately we cannot use normal 64-bit division on a 32-bit + * arch; the alternative, do_div, does not work with negative + * numbers, so those have to be special-cased. + */ + if (t < 0) { + abs_t = -t; + ts.tv_nsec = do_div(abs_t, 10000000) * 100; + ts.tv_nsec = -ts.tv_nsec; + ts.tv_sec = -abs_t; + } else { + abs_t = t; + ts.tv_nsec = do_div(abs_t, 10000000) * 100; + ts.tv_sec = abs_t; + } + + return ts; +} + +/* Convert the Unix UTC into NT UTC. */ +inline u64 ksmbd_UnixTimeToNT(struct timespec64 t) +{ + /* Convert to 100ns intervals and then add the NTFS time offset. */ + return (u64)t.tv_sec * 10000000 + t.tv_nsec / 100 + NTFS_TIME_OFFSET; +} + +inline long long ksmbd_systime(void) +{ + struct timespec64 ts; + + ktime_get_real_ts64(&ts); + return ksmbd_UnixTimeToNT(ts); +}
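The two conversions above are exact inverses up to the 100 ns granularity of NT time. A small worked illustration of the round trip (values made up; nt_time_example() is not part of the patch):

/* Illustration only: round-trip a Unix timestamp through NT time.
 * 2009-02-13 23:31:30 UTC is 1234567890 seconds after the Unix epoch. */
static void nt_time_example(void)
{
	struct timespec64 ts = { .tv_sec = 1234567890, .tv_nsec = 500 };
	u64 nt = ksmbd_UnixTimeToNT(ts);  /* 12345678900000005 + NTFS_TIME_OFFSET */
	struct timespec64 back = ksmbd_NTtimeToUnix(cpu_to_le64(nt));

	/* back.tv_sec == 1234567890 and back.tv_nsec == 500 */
}

diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h new file mode 100644 index 000000000000..af8717d4d85b --- /dev/null +++ b/fs/ksmbd/misc.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd.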
+ */ + +#ifndef __KSMBD_MISC_H__ +#define __KSMBD_MISC_H__ + +struct ksmbd_share_config; +struct nls_table; +struct kstat; +struct ksmbd_file; + +int match_pattern(const char *str, size_t len, const char *pattern); +int ksmbd_validate_filename(char *filename); +int parse_stream_name(char *filename, char **stream_name, int *s_type); +char *convert_to_nt_pathname(char *filename, char *sharepath); +int get_nlink(struct kstat *st); +void ksmbd_conv_path_to_unix(char *path); +void ksmbd_strip_last_slash(char *path); +void ksmbd_conv_path_to_windows(char *path); +char *ksmbd_extract_sharename(char *treename); +char *convert_to_unix_name(struct ksmbd_share_config *share, char *name); + +#define KSMBD_DIR_INFO_ALIGNMENT 8 +struct ksmbd_dir_info; +char *ksmbd_convert_dir_info_name(struct ksmbd_dir_info *d_info, + const struct nls_table *local_nls, + int *conv_len); + +#define NTFS_TIME_OFFSET ((u64)(369 * 365 + 89) * 24 * 3600 * 10000000) +struct timespec64 ksmbd_NTtimeToUnix(__le64 ntutc); +u64 ksmbd_UnixTimeToNT(struct timespec64 t); +long long ksmbd_systime(void); +#endif /* __KSMBD_MISC_H__ */ diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c new file mode 100644 index 000000000000..2243a2c64b37 --- /dev/null +++ b/fs/ksmbd/ndr.c @@ -0,0 +1,345 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2021 Samsung Electronics Co., Ltd. + * Author(s): Namjae Jeon <linkinjeon@kernel.org> + */ + +#include <linux/fs.h> + +#include "glob.h" +#include "ndr.h" + +static inline char *ndr_get_field(struct ndr *n) +{ + return n->data + n->offset; +} + +static int try_to_realloc_ndr_blob(struct ndr *n, size_t sz) +{ + char *data; + + data = krealloc(n->data, n->offset + sz + 1024, GFP_KERNEL); + if (!data) + return -ENOMEM; + + n->data = data; + n->length += 1024; + memset(n->data + n->offset, 0, 1024); + return 0; +} + +static void ndr_write_int16(struct ndr *n, __u16 value) +{ + if (n->length <= n->offset + sizeof(value)) + try_to_realloc_ndr_blob(n, sizeof(value)); + + *(__le16 *)ndr_get_field(n) = cpu_to_le16(value); + n->offset += sizeof(value); +} + +static void ndr_write_int32(struct ndr *n, __u32 value) +{ + if (n->length <= n->offset + sizeof(value)) + try_to_realloc_ndr_blob(n, sizeof(value)); + + *(__le32 *)ndr_get_field(n) = cpu_to_le32(value); + n->offset += sizeof(value); +} + +static void ndr_write_int64(struct ndr *n, __u64 value) +{ + if (n->length <= n->offset + sizeof(value)) + try_to_realloc_ndr_blob(n, sizeof(value)); + + *(__le64 *)ndr_get_field(n) = cpu_to_le64(value); + n->offset += sizeof(value); +} + +static int ndr_write_bytes(struct ndr *n, void *value, size_t sz) +{ + if (n->length <= n->offset + sz) + try_to_realloc_ndr_blob(n, sz); + + memcpy(ndr_get_field(n), value, sz); + n->offset += sz; + return 0; +} + +static int ndr_write_string(struct ndr *n, char *value) +{ + size_t sz; + + sz = strlen(value) + 1; + if (n->length <= n->offset + sz) + try_to_realloc_ndr_blob(n, sz); + + memcpy(ndr_get_field(n), value, sz); + n->offset += sz; + n->offset = ALIGN(n->offset, 2); + return 0; +} + +static int ndr_read_string(struct ndr *n, void *value, size_t sz) +{ + int len = strnlen(ndr_get_field(n), sz); + + memcpy(value, ndr_get_field(n), len); + len++; + n->offset += len; + n->offset = ALIGN(n->offset, 2); + return 0; +} + +static int ndr_read_bytes(struct ndr *n, void *value, size_t sz) +{ + memcpy(value, ndr_get_field(n), sz); + n->offset += sz; + return 0; +} + +static __u16 ndr_read_int16(struct ndr *n) +{ + __u16 ret; + + ret = le16_to_cpu(*(__le16 *)ndr_get_field(n)); 
+ n->offset += sizeof(__u16); + return ret; +} + +static __u32 ndr_read_int32(struct ndr *n) +{ + __u32 ret; + + ret = le32_to_cpu(*(__le32 *)ndr_get_field(n)); + n->offset += sizeof(__u32); + return ret; +} + +static __u64 ndr_read_int64(struct ndr *n) +{ + __u64 ret; + + ret = le64_to_cpu(*(__le64 *)ndr_get_field(n)); + n->offset += sizeof(__u64); + return ret; +} + +int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) +{ + char hex_attr[12] = {0}; + + n->offset = 0; + n->length = 1024; + n->data = kzalloc(n->length, GFP_KERNEL); + if (!n->data) + return -ENOMEM; + + if (da->version == 3) { + snprintf(hex_attr, 10, "0x%x", da->attr); + ndr_write_string(n, hex_attr); + } else { + ndr_write_string(n, ""); + } + ndr_write_int16(n, da->version); + ndr_write_int32(n, da->version); + + ndr_write_int32(n, da->flags); + ndr_write_int32(n, da->attr); + if (da->version == 3) { + ndr_write_int32(n, da->ea_size); + ndr_write_int64(n, da->size); + ndr_write_int64(n, da->alloc_size); + } else { + ndr_write_int64(n, da->itime); + } + ndr_write_int64(n, da->create_time); + if (da->version == 3) + ndr_write_int64(n, da->change_time); + return 0; +} + +int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) +{ + char *hex_attr; + int version2; + + hex_attr = kzalloc(n->length, GFP_KERNEL); + if (!hex_attr) + return -ENOMEM; + + n->offset = 0; + ndr_read_string(n, hex_attr, n->length); + kfree(hex_attr); + da->version = ndr_read_int16(n); + + if (da->version != 3 && da->version != 4) { + pr_err("v%d version is not supported\n", da->version); + return -EINVAL; + } + + version2 = ndr_read_int32(n); + if (da->version != version2) { + pr_err("ndr version mismatched(version: %d, version2: %d)\n", + da->version, version2); + return -EINVAL; + } + + ndr_read_int32(n); + da->attr = ndr_read_int32(n); + if (da->version == 4) { + da->itime = ndr_read_int64(n); + da->create_time = ndr_read_int64(n); + } else { + ndr_read_int32(n); + ndr_read_int64(n); + ndr_read_int64(n); + da->create_time = ndr_read_int64(n); + ndr_read_int64(n); + } + + return 0; +} + +static int ndr_encode_posix_acl_entry(struct ndr *n, struct xattr_smb_acl *acl) +{ + int i; + + ndr_write_int32(n, acl->count); + n->offset = ALIGN(n->offset, 8); + ndr_write_int32(n, acl->count); + ndr_write_int32(n, 0); + + for (i = 0; i < acl->count; i++) { + n->offset = ALIGN(n->offset, 8); + ndr_write_int16(n, acl->entries[i].type); + ndr_write_int16(n, acl->entries[i].type); + + if (acl->entries[i].type == SMB_ACL_USER) { + n->offset = ALIGN(n->offset, 8); + ndr_write_int64(n, acl->entries[i].uid); + } else if (acl->entries[i].type == SMB_ACL_GROUP) { + n->offset = ALIGN(n->offset, 8); + ndr_write_int64(n, acl->entries[i].gid); + } + + /* push permission */ + ndr_write_int32(n, acl->entries[i].perm); + } + + return 0; +} + +int ndr_encode_posix_acl(struct ndr *n, + struct user_namespace *user_ns, + struct inode *inode, + struct xattr_smb_acl *acl, + struct xattr_smb_acl *def_acl) +{ + int ref_id = 0x00020000; + + n->offset = 0; + n->length = 1024; + n->data = kzalloc(n->length, GFP_KERNEL); + if (!n->data) + return -ENOMEM; + + if (acl) { + /* ACL ACCESS */ + ndr_write_int32(n, ref_id); + ref_id += 4; + } else { + ndr_write_int32(n, 0); + } + + if (def_acl) { + /* DEFAULT ACL ACCESS */ + ndr_write_int32(n, ref_id); + ref_id += 4; + } else { + ndr_write_int32(n, 0); + } + + ndr_write_int64(n, from_kuid(user_ns, inode->i_uid)); + ndr_write_int64(n, from_kgid(user_ns, inode->i_gid)); + ndr_write_int32(n, inode->i_mode); + + if (acl) { + 
ndr_encode_posix_acl_entry(n, acl); + if (def_acl) + ndr_encode_posix_acl_entry(n, def_acl); + } + return 0; +} + +int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) +{ + int ref_id = 0x00020004; + + n->offset = 0; + n->length = 2048; + n->data = kzalloc(n->length, GFP_KERNEL); + if (!n->data) + return -ENOMEM; + + ndr_write_int16(n, acl->version); + ndr_write_int32(n, acl->version); + ndr_write_int16(n, 2); + ndr_write_int32(n, ref_id); + + /* push hash type and hash 64bytes */ + ndr_write_int16(n, acl->hash_type); + ndr_write_bytes(n, acl->hash, XATTR_SD_HASH_SIZE); + ndr_write_bytes(n, acl->desc, acl->desc_len); + ndr_write_int64(n, acl->current_time); + ndr_write_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + + /* push ndr for security descriptor */ + ndr_write_bytes(n, acl->sd_buf, acl->sd_size); + + return 0; +} + +int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) +{ + int version2; + + n->offset = 0; + acl->version = ndr_read_int16(n); + if (acl->version != 4) { + pr_err("v%d version is not supported\n", acl->version); + return -EINVAL; + } + + version2 = ndr_read_int32(n); + if (acl->version != version2) { + pr_err("ndr version mismatched(version: %d, version2: %d)\n", + acl->version, version2); + return -EINVAL; + } + + /* Read Level */ + ndr_read_int16(n); + /* Read Ref Id */ + ndr_read_int32(n); + acl->hash_type = ndr_read_int16(n); + ndr_read_bytes(n, acl->hash, XATTR_SD_HASH_SIZE); + + ndr_read_bytes(n, acl->desc, 10); + if (strncmp(acl->desc, "posix_acl", 9)) { + pr_err("Invalid acl description : %s\n", acl->desc); + return -EINVAL; + } + + /* Read Time */ + ndr_read_int64(n); + /* Read Posix ACL hash */ + ndr_read_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + acl->sd_size = n->length - n->offset; + acl->sd_buf = kzalloc(acl->sd_size, GFP_KERNEL); + if (!acl->sd_buf) + return -ENOMEM; + + ndr_read_bytes(n, acl->sd_buf, acl->sd_size); + + return 0; +} diff --git a/fs/ksmbd/ndr.h b/fs/ksmbd/ndr.h new file mode 100644 index 000000000000..60ca265d1bb0 --- /dev/null +++ b/fs/ksmbd/ndr.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2020 Samsung Electronics Co., Ltd. + * Author(s): Namjae Jeon <linkinjeon@kernel.org> + */ + +struct ndr { + char *data; + int offset; + int length; +}; + +#define NDR_NTSD_OFFSETOF 0xA0 + +int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da); +int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da); +int ndr_encode_posix_acl(struct ndr *n, struct user_namespace *user_ns, + struct inode *inode, struct xattr_smb_acl *acl, + struct xattr_smb_acl *def_acl); +int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl); +int ndr_encode_v3_ntacl(struct ndr *n, struct xattr_ntacl *acl); +int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl);
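The encode helpers above own the blob lifecycle: they allocate n->data, grow it on demand via krealloc, and leave the number of valid bytes in n->offset for the caller to store (for example in an xattr). A hedged sketch of building a version 4 DOS attribute blob with this API (field values are made up; error handling abbreviated):

/* Illustration only: encode a version 4 DOS attribute blob. */
static int build_dos_attr_blob(void)
{
	struct xattr_dos_attrib da = {
		.version	= 4,
		.attr		= 0x20,		/* FILE_ATTRIBUTE_ARCHIVE */
		.itime		= 0,
		.create_time	= ksmbd_systime(),
	};
	struct ndr n;
	int ret;

	ret = ndr_encode_dos_attr(&n, &da);	/* allocates n.data */
	if (!ret) {
		/* n.data now holds n.offset valid bytes of payload. */
		kfree(n.data);
	}
	return ret;
}

diff --git a/fs/ksmbd/nterr.h b/fs/ksmbd/nterr.h new file mode 100644 index 000000000000..2f358f88a018 --- /dev/null +++ b/fs/ksmbd/nterr.h @@ -0,0 +1,543 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Unix SMB/Netbios implementation. + * Version 1.9. + * NT error code constants + * Copyright (C) Andrew Tridgell 1992-2000 + * Copyright (C) John H Terpstra 1996-2000 + * Copyright (C) Luke Kenneth Casson Leighton 1996-2000 + * Copyright (C) Paul Ashton 1998-2000 + */ + +#ifndef _NTERR_H +#define _NTERR_H + +/* Win32 Status codes.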
*/ +#define NT_STATUS_MORE_ENTRIES 0x0105 +#define NT_ERROR_INVALID_PARAMETER 0x0057 +#define NT_ERROR_INSUFFICIENT_BUFFER 0x007a +#define NT_STATUS_1804 0x070c +#define NT_STATUS_NOTIFY_ENUM_DIR 0x010c +#define NT_STATUS_INVALID_LOCK_RANGE (0xC0000000 | 0x01a1) +/* + * Win32 Error codes extracted using a loop in smbclient then printing a netmon + * sniff to a file. + */ + +#define NT_STATUS_OK 0x0000 +#define NT_STATUS_SOME_UNMAPPED 0x0107 +#define NT_STATUS_BUFFER_OVERFLOW 0x80000005 +#define NT_STATUS_NO_MORE_ENTRIES 0x8000001a +#define NT_STATUS_MEDIA_CHANGED 0x8000001c +#define NT_STATUS_END_OF_MEDIA 0x8000001e +#define NT_STATUS_MEDIA_CHECK 0x80000020 +#define NT_STATUS_NO_DATA_DETECTED 0x8000001c +#define NT_STATUS_STOPPED_ON_SYMLINK 0x8000002d +#define NT_STATUS_DEVICE_REQUIRES_CLEANING 0x80000288 +#define NT_STATUS_DEVICE_DOOR_OPEN 0x80000288 +#define NT_STATUS_UNSUCCESSFUL (0xC0000000 | 0x0001) +#define NT_STATUS_NOT_IMPLEMENTED (0xC0000000 | 0x0002) +#define NT_STATUS_INVALID_INFO_CLASS (0xC0000000 | 0x0003) +#define NT_STATUS_INFO_LENGTH_MISMATCH (0xC0000000 | 0x0004) +#define NT_STATUS_ACCESS_VIOLATION (0xC0000000 | 0x0005) +#define NT_STATUS_IN_PAGE_ERROR (0xC0000000 | 0x0006) +#define NT_STATUS_PAGEFILE_QUOTA (0xC0000000 | 0x0007) +#define NT_STATUS_INVALID_HANDLE (0xC0000000 | 0x0008) +#define NT_STATUS_BAD_INITIAL_STACK (0xC0000000 | 0x0009) +#define NT_STATUS_BAD_INITIAL_PC (0xC0000000 | 0x000a) +#define NT_STATUS_INVALID_CID (0xC0000000 | 0x000b) +#define NT_STATUS_TIMER_NOT_CANCELED (0xC0000000 | 0x000c) +#define NT_STATUS_INVALID_PARAMETER (0xC0000000 | 0x000d) +#define NT_STATUS_NO_SUCH_DEVICE (0xC0000000 | 0x000e) +#define NT_STATUS_NO_SUCH_FILE (0xC0000000 | 0x000f) +#define NT_STATUS_INVALID_DEVICE_REQUEST (0xC0000000 | 0x0010) +#define NT_STATUS_END_OF_FILE (0xC0000000 | 0x0011) +#define NT_STATUS_WRONG_VOLUME (0xC0000000 | 0x0012) +#define NT_STATUS_NO_MEDIA_IN_DEVICE (0xC0000000 | 0x0013) +#define NT_STATUS_UNRECOGNIZED_MEDIA (0xC0000000 | 0x0014) +#define NT_STATUS_NONEXISTENT_SECTOR (0xC0000000 | 0x0015) +#define NT_STATUS_MORE_PROCESSING_REQUIRED (0xC0000000 | 0x0016) +#define NT_STATUS_NO_MEMORY (0xC0000000 | 0x0017) +#define NT_STATUS_CONFLICTING_ADDRESSES (0xC0000000 | 0x0018) +#define NT_STATUS_NOT_MAPPED_VIEW (0xC0000000 | 0x0019) +#define NT_STATUS_UNABLE_TO_FREE_VM (0x80000000 | 0x001a) +#define NT_STATUS_UNABLE_TO_DELETE_SECTION (0xC0000000 | 0x001b) +#define NT_STATUS_INVALID_SYSTEM_SERVICE (0xC0000000 | 0x001c) +#define NT_STATUS_ILLEGAL_INSTRUCTION (0xC0000000 | 0x001d) +#define NT_STATUS_INVALID_LOCK_SEQUENCE (0xC0000000 | 0x001e) +#define NT_STATUS_INVALID_VIEW_SIZE (0xC0000000 | 0x001f) +#define NT_STATUS_INVALID_FILE_FOR_SECTION (0xC0000000 | 0x0020) +#define NT_STATUS_ALREADY_COMMITTED (0xC0000000 | 0x0021) +#define NT_STATUS_ACCESS_DENIED (0xC0000000 | 0x0022) +#define NT_STATUS_BUFFER_TOO_SMALL (0xC0000000 | 0x0023) +#define NT_STATUS_OBJECT_TYPE_MISMATCH (0xC0000000 | 0x0024) +#define NT_STATUS_NONCONTINUABLE_EXCEPTION (0xC0000000 | 0x0025) +#define NT_STATUS_INVALID_DISPOSITION (0xC0000000 | 0x0026) +#define NT_STATUS_UNWIND (0xC0000000 | 0x0027) +#define NT_STATUS_BAD_STACK (0xC0000000 | 0x0028) +#define NT_STATUS_INVALID_UNWIND_TARGET (0xC0000000 | 0x0029) +#define NT_STATUS_NOT_LOCKED (0xC0000000 | 0x002a) +#define NT_STATUS_PARITY_ERROR (0xC0000000 | 0x002b) +#define NT_STATUS_UNABLE_TO_DECOMMIT_VM (0xC0000000 | 0x002c) +#define NT_STATUS_NOT_COMMITTED (0xC0000000 | 0x002d) +#define NT_STATUS_INVALID_PORT_ATTRIBUTES (0xC0000000 | 
0x002e) +#define NT_STATUS_PORT_MESSAGE_TOO_LONG (0xC0000000 | 0x002f) +#define NT_STATUS_INVALID_PARAMETER_MIX (0xC0000000 | 0x0030) +#define NT_STATUS_INVALID_QUOTA_LOWER (0xC0000000 | 0x0031) +#define NT_STATUS_DISK_CORRUPT_ERROR (0xC0000000 | 0x0032) +#define NT_STATUS_OBJECT_NAME_INVALID (0xC0000000 | 0x0033) +#define NT_STATUS_OBJECT_NAME_NOT_FOUND (0xC0000000 | 0x0034) +#define NT_STATUS_OBJECT_NAME_COLLISION (0xC0000000 | 0x0035) +#define NT_STATUS_HANDLE_NOT_WAITABLE (0xC0000000 | 0x0036) +#define NT_STATUS_PORT_DISCONNECTED (0xC0000000 | 0x0037) +#define NT_STATUS_DEVICE_ALREADY_ATTACHED (0xC0000000 | 0x0038) +#define NT_STATUS_OBJECT_PATH_INVALID (0xC0000000 | 0x0039) +#define NT_STATUS_OBJECT_PATH_NOT_FOUND (0xC0000000 | 0x003a) +#define NT_STATUS_OBJECT_PATH_SYNTAX_BAD (0xC0000000 | 0x003b) +#define NT_STATUS_DATA_OVERRUN (0xC0000000 | 0x003c) +#define NT_STATUS_DATA_LATE_ERROR (0xC0000000 | 0x003d) +#define NT_STATUS_DATA_ERROR (0xC0000000 | 0x003e) +#define NT_STATUS_CRC_ERROR (0xC0000000 | 0x003f) +#define NT_STATUS_SECTION_TOO_BIG (0xC0000000 | 0x0040) +#define NT_STATUS_PORT_CONNECTION_REFUSED (0xC0000000 | 0x0041) +#define NT_STATUS_INVALID_PORT_HANDLE (0xC0000000 | 0x0042) +#define NT_STATUS_SHARING_VIOLATION (0xC0000000 | 0x0043) +#define NT_STATUS_QUOTA_EXCEEDED (0xC0000000 | 0x0044) +#define NT_STATUS_INVALID_PAGE_PROTECTION (0xC0000000 | 0x0045) +#define NT_STATUS_MUTANT_NOT_OWNED (0xC0000000 | 0x0046) +#define NT_STATUS_SEMAPHORE_LIMIT_EXCEEDED (0xC0000000 | 0x0047) +#define NT_STATUS_PORT_ALREADY_SET (0xC0000000 | 0x0048) +#define NT_STATUS_SECTION_NOT_IMAGE (0xC0000000 | 0x0049) +#define NT_STATUS_SUSPEND_COUNT_EXCEEDED (0xC0000000 | 0x004a) +#define NT_STATUS_THREAD_IS_TERMINATING (0xC0000000 | 0x004b) +#define NT_STATUS_BAD_WORKING_SET_LIMIT (0xC0000000 | 0x004c) +#define NT_STATUS_INCOMPATIBLE_FILE_MAP (0xC0000000 | 0x004d) +#define NT_STATUS_SECTION_PROTECTION (0xC0000000 | 0x004e) +#define NT_STATUS_EAS_NOT_SUPPORTED (0xC0000000 | 0x004f) +#define NT_STATUS_EA_TOO_LARGE (0xC0000000 | 0x0050) +#define NT_STATUS_NONEXISTENT_EA_ENTRY (0xC0000000 | 0x0051) +#define NT_STATUS_NO_EAS_ON_FILE (0xC0000000 | 0x0052) +#define NT_STATUS_EA_CORRUPT_ERROR (0xC0000000 | 0x0053) +#define NT_STATUS_FILE_LOCK_CONFLICT (0xC0000000 | 0x0054) +#define NT_STATUS_LOCK_NOT_GRANTED (0xC0000000 | 0x0055) +#define NT_STATUS_DELETE_PENDING (0xC0000000 | 0x0056) +#define NT_STATUS_CTL_FILE_NOT_SUPPORTED (0xC0000000 | 0x0057) +#define NT_STATUS_UNKNOWN_REVISION (0xC0000000 | 0x0058) +#define NT_STATUS_REVISION_MISMATCH (0xC0000000 | 0x0059) +#define NT_STATUS_INVALID_OWNER (0xC0000000 | 0x005a) +#define NT_STATUS_INVALID_PRIMARY_GROUP (0xC0000000 | 0x005b) +#define NT_STATUS_NO_IMPERSONATION_TOKEN (0xC0000000 | 0x005c) +#define NT_STATUS_CANT_DISABLE_MANDATORY (0xC0000000 | 0x005d) +#define NT_STATUS_NO_LOGON_SERVERS (0xC0000000 | 0x005e) +#define NT_STATUS_NO_SUCH_LOGON_SESSION (0xC0000000 | 0x005f) +#define NT_STATUS_NO_SUCH_PRIVILEGE (0xC0000000 | 0x0060) +#define NT_STATUS_PRIVILEGE_NOT_HELD (0xC0000000 | 0x0061) +#define NT_STATUS_INVALID_ACCOUNT_NAME (0xC0000000 | 0x0062) +#define NT_STATUS_USER_EXISTS (0xC0000000 | 0x0063) +#define NT_STATUS_NO_SUCH_USER (0xC0000000 | 0x0064) +#define NT_STATUS_GROUP_EXISTS (0xC0000000 | 0x0065) +#define NT_STATUS_NO_SUCH_GROUP (0xC0000000 | 0x0066) +#define NT_STATUS_MEMBER_IN_GROUP (0xC0000000 | 0x0067) +#define NT_STATUS_MEMBER_NOT_IN_GROUP (0xC0000000 | 0x0068) +#define NT_STATUS_LAST_ADMIN (0xC0000000 | 0x0069) +#define 
NT_STATUS_WRONG_PASSWORD (0xC0000000 | 0x006a) +#define NT_STATUS_ILL_FORMED_PASSWORD (0xC0000000 | 0x006b) +#define NT_STATUS_PASSWORD_RESTRICTION (0xC0000000 | 0x006c) +#define NT_STATUS_LOGON_FAILURE (0xC0000000 | 0x006d) +#define NT_STATUS_ACCOUNT_RESTRICTION (0xC0000000 | 0x006e) +#define NT_STATUS_INVALID_LOGON_HOURS (0xC0000000 | 0x006f) +#define NT_STATUS_INVALID_WORKSTATION (0xC0000000 | 0x0070) +#define NT_STATUS_PASSWORD_EXPIRED (0xC0000000 | 0x0071) +#define NT_STATUS_ACCOUNT_DISABLED (0xC0000000 | 0x0072) +#define NT_STATUS_NONE_MAPPED (0xC0000000 | 0x0073) +#define NT_STATUS_TOO_MANY_LUIDS_REQUESTED (0xC0000000 | 0x0074) +#define NT_STATUS_LUIDS_EXHAUSTED (0xC0000000 | 0x0075) +#define NT_STATUS_INVALID_SUB_AUTHORITY (0xC0000000 | 0x0076) +#define NT_STATUS_INVALID_ACL (0xC0000000 | 0x0077) +#define NT_STATUS_INVALID_SID (0xC0000000 | 0x0078) +#define NT_STATUS_INVALID_SECURITY_DESCR (0xC0000000 | 0x0079) +#define NT_STATUS_PROCEDURE_NOT_FOUND (0xC0000000 | 0x007a) +#define NT_STATUS_INVALID_IMAGE_FORMAT (0xC0000000 | 0x007b) +#define NT_STATUS_NO_TOKEN (0xC0000000 | 0x007c) +#define NT_STATUS_BAD_INHERITANCE_ACL (0xC0000000 | 0x007d) +#define NT_STATUS_RANGE_NOT_LOCKED (0xC0000000 | 0x007e) +#define NT_STATUS_DISK_FULL (0xC0000000 | 0x007f) +#define NT_STATUS_SERVER_DISABLED (0xC0000000 | 0x0080) +#define NT_STATUS_SERVER_NOT_DISABLED (0xC0000000 | 0x0081) +#define NT_STATUS_TOO_MANY_GUIDS_REQUESTED (0xC0000000 | 0x0082) +#define NT_STATUS_GUIDS_EXHAUSTED (0xC0000000 | 0x0083) +#define NT_STATUS_INVALID_ID_AUTHORITY (0xC0000000 | 0x0084) +#define NT_STATUS_AGENTS_EXHAUSTED (0xC0000000 | 0x0085) +#define NT_STATUS_INVALID_VOLUME_LABEL (0xC0000000 | 0x0086) +#define NT_STATUS_SECTION_NOT_EXTENDED (0xC0000000 | 0x0087) +#define NT_STATUS_NOT_MAPPED_DATA (0xC0000000 | 0x0088) +#define NT_STATUS_RESOURCE_DATA_NOT_FOUND (0xC0000000 | 0x0089) +#define NT_STATUS_RESOURCE_TYPE_NOT_FOUND (0xC0000000 | 0x008a) +#define NT_STATUS_RESOURCE_NAME_NOT_FOUND (0xC0000000 | 0x008b) +#define NT_STATUS_ARRAY_BOUNDS_EXCEEDED (0xC0000000 | 0x008c) +#define NT_STATUS_FLOAT_DENORMAL_OPERAND (0xC0000000 | 0x008d) +#define NT_STATUS_FLOAT_DIVIDE_BY_ZERO (0xC0000000 | 0x008e) +#define NT_STATUS_FLOAT_INEXACT_RESULT (0xC0000000 | 0x008f) +#define NT_STATUS_FLOAT_INVALID_OPERATION (0xC0000000 | 0x0090) +#define NT_STATUS_FLOAT_OVERFLOW (0xC0000000 | 0x0091) +#define NT_STATUS_FLOAT_STACK_CHECK (0xC0000000 | 0x0092) +#define NT_STATUS_FLOAT_UNDERFLOW (0xC0000000 | 0x0093) +#define NT_STATUS_INTEGER_DIVIDE_BY_ZERO (0xC0000000 | 0x0094) +#define NT_STATUS_INTEGER_OVERFLOW (0xC0000000 | 0x0095) +#define NT_STATUS_PRIVILEGED_INSTRUCTION (0xC0000000 | 0x0096) +#define NT_STATUS_TOO_MANY_PAGING_FILES (0xC0000000 | 0x0097) +#define NT_STATUS_FILE_INVALID (0xC0000000 | 0x0098) +#define NT_STATUS_ALLOTTED_SPACE_EXCEEDED (0xC0000000 | 0x0099) +#define NT_STATUS_INSUFFICIENT_RESOURCES (0xC0000000 | 0x009a) +#define NT_STATUS_DFS_EXIT_PATH_FOUND (0xC0000000 | 0x009b) +#define NT_STATUS_DEVICE_DATA_ERROR (0xC0000000 | 0x009c) +#define NT_STATUS_DEVICE_NOT_CONNECTED (0xC0000000 | 0x009d) +#define NT_STATUS_DEVICE_POWER_FAILURE (0xC0000000 | 0x009e) +#define NT_STATUS_FREE_VM_NOT_AT_BASE (0xC0000000 | 0x009f) +#define NT_STATUS_MEMORY_NOT_ALLOCATED (0xC0000000 | 0x00a0) +#define NT_STATUS_WORKING_SET_QUOTA (0xC0000000 | 0x00a1) +#define NT_STATUS_MEDIA_WRITE_PROTECTED (0xC0000000 | 0x00a2) +#define NT_STATUS_DEVICE_NOT_READY (0xC0000000 | 0x00a3) +#define NT_STATUS_INVALID_GROUP_ATTRIBUTES (0xC0000000 | 0x00a4) +#define 
NT_STATUS_BAD_IMPERSONATION_LEVEL (0xC0000000 | 0x00a5) +#define NT_STATUS_CANT_OPEN_ANONYMOUS (0xC0000000 | 0x00a6) +#define NT_STATUS_BAD_VALIDATION_CLASS (0xC0000000 | 0x00a7) +#define NT_STATUS_BAD_TOKEN_TYPE (0xC0000000 | 0x00a8) +#define NT_STATUS_BAD_MASTER_BOOT_RECORD (0xC0000000 | 0x00a9) +#define NT_STATUS_INSTRUCTION_MISALIGNMENT (0xC0000000 | 0x00aa) +#define NT_STATUS_INSTANCE_NOT_AVAILABLE (0xC0000000 | 0x00ab) +#define NT_STATUS_PIPE_NOT_AVAILABLE (0xC0000000 | 0x00ac) +#define NT_STATUS_INVALID_PIPE_STATE (0xC0000000 | 0x00ad) +#define NT_STATUS_PIPE_BUSY (0xC0000000 | 0x00ae) +#define NT_STATUS_ILLEGAL_FUNCTION (0xC0000000 | 0x00af) +#define NT_STATUS_PIPE_DISCONNECTED (0xC0000000 | 0x00b0) +#define NT_STATUS_PIPE_CLOSING (0xC0000000 | 0x00b1) +#define NT_STATUS_PIPE_CONNECTED (0xC0000000 | 0x00b2) +#define NT_STATUS_PIPE_LISTENING (0xC0000000 | 0x00b3) +#define NT_STATUS_INVALID_READ_MODE (0xC0000000 | 0x00b4) +#define NT_STATUS_IO_TIMEOUT (0xC0000000 | 0x00b5) +#define NT_STATUS_FILE_FORCED_CLOSED (0xC0000000 | 0x00b6) +#define NT_STATUS_PROFILING_NOT_STARTED (0xC0000000 | 0x00b7) +#define NT_STATUS_PROFILING_NOT_STOPPED (0xC0000000 | 0x00b8) +#define NT_STATUS_COULD_NOT_INTERPRET (0xC0000000 | 0x00b9) +#define NT_STATUS_FILE_IS_A_DIRECTORY (0xC0000000 | 0x00ba) +#define NT_STATUS_NOT_SUPPORTED (0xC0000000 | 0x00bb) +#define NT_STATUS_REMOTE_NOT_LISTENING (0xC0000000 | 0x00bc) +#define NT_STATUS_DUPLICATE_NAME (0xC0000000 | 0x00bd) +#define NT_STATUS_BAD_NETWORK_PATH (0xC0000000 | 0x00be) +#define NT_STATUS_NETWORK_BUSY (0xC0000000 | 0x00bf) +#define NT_STATUS_DEVICE_DOES_NOT_EXIST (0xC0000000 | 0x00c0) +#define NT_STATUS_TOO_MANY_COMMANDS (0xC0000000 | 0x00c1) +#define NT_STATUS_ADAPTER_HARDWARE_ERROR (0xC0000000 | 0x00c2) +#define NT_STATUS_INVALID_NETWORK_RESPONSE (0xC0000000 | 0x00c3) +#define NT_STATUS_UNEXPECTED_NETWORK_ERROR (0xC0000000 | 0x00c4) +#define NT_STATUS_BAD_REMOTE_ADAPTER (0xC0000000 | 0x00c5) +#define NT_STATUS_PRINT_QUEUE_FULL (0xC0000000 | 0x00c6) +#define NT_STATUS_NO_SPOOL_SPACE (0xC0000000 | 0x00c7) +#define NT_STATUS_PRINT_CANCELLED (0xC0000000 | 0x00c8) +#define NT_STATUS_NETWORK_NAME_DELETED (0xC0000000 | 0x00c9) +#define NT_STATUS_NETWORK_ACCESS_DENIED (0xC0000000 | 0x00ca) +#define NT_STATUS_BAD_DEVICE_TYPE (0xC0000000 | 0x00cb) +#define NT_STATUS_BAD_NETWORK_NAME (0xC0000000 | 0x00cc) +#define NT_STATUS_TOO_MANY_NAMES (0xC0000000 | 0x00cd) +#define NT_STATUS_TOO_MANY_SESSIONS (0xC0000000 | 0x00ce) +#define NT_STATUS_SHARING_PAUSED (0xC0000000 | 0x00cf) +#define NT_STATUS_REQUEST_NOT_ACCEPTED (0xC0000000 | 0x00d0) +#define NT_STATUS_REDIRECTOR_PAUSED (0xC0000000 | 0x00d1) +#define NT_STATUS_NET_WRITE_FAULT (0xC0000000 | 0x00d2) +#define NT_STATUS_PROFILING_AT_LIMIT (0xC0000000 | 0x00d3) +#define NT_STATUS_NOT_SAME_DEVICE (0xC0000000 | 0x00d4) +#define NT_STATUS_FILE_RENAMED (0xC0000000 | 0x00d5) +#define NT_STATUS_VIRTUAL_CIRCUIT_CLOSED (0xC0000000 | 0x00d6) +#define NT_STATUS_NO_SECURITY_ON_OBJECT (0xC0000000 | 0x00d7) +#define NT_STATUS_CANT_WAIT (0xC0000000 | 0x00d8) +#define NT_STATUS_PIPE_EMPTY (0xC0000000 | 0x00d9) +#define NT_STATUS_CANT_ACCESS_DOMAIN_INFO (0xC0000000 | 0x00da) +#define NT_STATUS_CANT_TERMINATE_SELF (0xC0000000 | 0x00db) +#define NT_STATUS_INVALID_SERVER_STATE (0xC0000000 | 0x00dc) +#define NT_STATUS_INVALID_DOMAIN_STATE (0xC0000000 | 0x00dd) +#define NT_STATUS_INVALID_DOMAIN_ROLE (0xC0000000 | 0x00de) +#define NT_STATUS_NO_SUCH_DOMAIN (0xC0000000 | 0x00df) +#define NT_STATUS_DOMAIN_EXISTS (0xC0000000 | 0x00e0) 
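/*
 * Editor's note: every constant in this header follows the NTSTATUS
 * layout from [MS-ERREF]: the top two bits carry the severity
 * (0xC0000000 == error, 0x80000000 == warning, 0 == success) and the
 * low 16 bits the code proper, which is why the values are written as
 * (0xC0000000 | code). A minimal sketch of a decoder, as hypothetical
 * helpers that are not part of this header:
 *
 *	static inline bool nt_status_is_error(u32 status)
 *	{
 *		return (status >> 30) == 3;	// 0xC... severity class
 *	}
 *
 *	static inline u16 nt_status_code(u32 status)
 *	{
 *		return status & 0xffff;		// low 16 bits: the code
 *	}
 */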
+#define NT_STATUS_DOMAIN_LIMIT_EXCEEDED (0xC0000000 | 0x00e1) +#define NT_STATUS_OPLOCK_NOT_GRANTED (0xC0000000 | 0x00e2) +#define NT_STATUS_INVALID_OPLOCK_PROTOCOL (0xC0000000 | 0x00e3) +#define NT_STATUS_INTERNAL_DB_CORRUPTION (0xC0000000 | 0x00e4) +#define NT_STATUS_INTERNAL_ERROR (0xC0000000 | 0x00e5) +#define NT_STATUS_GENERIC_NOT_MAPPED (0xC0000000 | 0x00e6) +#define NT_STATUS_BAD_DESCRIPTOR_FORMAT (0xC0000000 | 0x00e7) +#define NT_STATUS_INVALID_USER_BUFFER (0xC0000000 | 0x00e8) +#define NT_STATUS_UNEXPECTED_IO_ERROR (0xC0000000 | 0x00e9) +#define NT_STATUS_UNEXPECTED_MM_CREATE_ERR (0xC0000000 | 0x00ea) +#define NT_STATUS_UNEXPECTED_MM_MAP_ERROR (0xC0000000 | 0x00eb) +#define NT_STATUS_UNEXPECTED_MM_EXTEND_ERR (0xC0000000 | 0x00ec) +#define NT_STATUS_NOT_LOGON_PROCESS (0xC0000000 | 0x00ed) +#define NT_STATUS_LOGON_SESSION_EXISTS (0xC0000000 | 0x00ee) +#define NT_STATUS_INVALID_PARAMETER_1 (0xC0000000 | 0x00ef) +#define NT_STATUS_INVALID_PARAMETER_2 (0xC0000000 | 0x00f0) +#define NT_STATUS_INVALID_PARAMETER_3 (0xC0000000 | 0x00f1) +#define NT_STATUS_INVALID_PARAMETER_4 (0xC0000000 | 0x00f2) +#define NT_STATUS_INVALID_PARAMETER_5 (0xC0000000 | 0x00f3) +#define NT_STATUS_INVALID_PARAMETER_6 (0xC0000000 | 0x00f4) +#define NT_STATUS_INVALID_PARAMETER_7 (0xC0000000 | 0x00f5) +#define NT_STATUS_INVALID_PARAMETER_8 (0xC0000000 | 0x00f6) +#define NT_STATUS_INVALID_PARAMETER_9 (0xC0000000 | 0x00f7) +#define NT_STATUS_INVALID_PARAMETER_10 (0xC0000000 | 0x00f8) +#define NT_STATUS_INVALID_PARAMETER_11 (0xC0000000 | 0x00f9) +#define NT_STATUS_INVALID_PARAMETER_12 (0xC0000000 | 0x00fa) +#define NT_STATUS_REDIRECTOR_NOT_STARTED (0xC0000000 | 0x00fb) +#define NT_STATUS_REDIRECTOR_STARTED (0xC0000000 | 0x00fc) +#define NT_STATUS_STACK_OVERFLOW (0xC0000000 | 0x00fd) +#define NT_STATUS_NO_SUCH_PACKAGE (0xC0000000 | 0x00fe) +#define NT_STATUS_BAD_FUNCTION_TABLE (0xC0000000 | 0x00ff) +#define NT_STATUS_DIRECTORY_NOT_EMPTY (0xC0000000 | 0x0101) +#define NT_STATUS_FILE_CORRUPT_ERROR (0xC0000000 | 0x0102) +#define NT_STATUS_NOT_A_DIRECTORY (0xC0000000 | 0x0103) +#define NT_STATUS_BAD_LOGON_SESSION_STATE (0xC0000000 | 0x0104) +#define NT_STATUS_LOGON_SESSION_COLLISION (0xC0000000 | 0x0105) +#define NT_STATUS_NAME_TOO_LONG (0xC0000000 | 0x0106) +#define NT_STATUS_FILES_OPEN (0xC0000000 | 0x0107) +#define NT_STATUS_CONNECTION_IN_USE (0xC0000000 | 0x0108) +#define NT_STATUS_MESSAGE_NOT_FOUND (0xC0000000 | 0x0109) +#define NT_STATUS_PROCESS_IS_TERMINATING (0xC0000000 | 0x010a) +#define NT_STATUS_INVALID_LOGON_TYPE (0xC0000000 | 0x010b) +#define NT_STATUS_NO_GUID_TRANSLATION (0xC0000000 | 0x010c) +#define NT_STATUS_CANNOT_IMPERSONATE (0xC0000000 | 0x010d) +#define NT_STATUS_IMAGE_ALREADY_LOADED (0xC0000000 | 0x010e) +#define NT_STATUS_ABIOS_NOT_PRESENT (0xC0000000 | 0x010f) +#define NT_STATUS_ABIOS_LID_NOT_EXIST (0xC0000000 | 0x0110) +#define NT_STATUS_ABIOS_LID_ALREADY_OWNED (0xC0000000 | 0x0111) +#define NT_STATUS_ABIOS_NOT_LID_OWNER (0xC0000000 | 0x0112) +#define NT_STATUS_ABIOS_INVALID_COMMAND (0xC0000000 | 0x0113) +#define NT_STATUS_ABIOS_INVALID_LID (0xC0000000 | 0x0114) +#define NT_STATUS_ABIOS_SELECTOR_NOT_AVAILABLE (0xC0000000 | 0x0115) +#define NT_STATUS_ABIOS_INVALID_SELECTOR (0xC0000000 | 0x0116) +#define NT_STATUS_NO_LDT (0xC0000000 | 0x0117) +#define NT_STATUS_INVALID_LDT_SIZE (0xC0000000 | 0x0118) +#define NT_STATUS_INVALID_LDT_OFFSET (0xC0000000 | 0x0119) +#define NT_STATUS_INVALID_LDT_DESCRIPTOR (0xC0000000 | 0x011a) +#define NT_STATUS_INVALID_IMAGE_NE_FORMAT (0xC0000000 | 0x011b) +#define 
NT_STATUS_RXACT_INVALID_STATE (0xC0000000 | 0x011c) +#define NT_STATUS_RXACT_COMMIT_FAILURE (0xC0000000 | 0x011d) +#define NT_STATUS_MAPPED_FILE_SIZE_ZERO (0xC0000000 | 0x011e) +#define NT_STATUS_TOO_MANY_OPENED_FILES (0xC0000000 | 0x011f) +#define NT_STATUS_CANCELLED (0xC0000000 | 0x0120) +#define NT_STATUS_CANNOT_DELETE (0xC0000000 | 0x0121) +#define NT_STATUS_INVALID_COMPUTER_NAME (0xC0000000 | 0x0122) +#define NT_STATUS_FILE_DELETED (0xC0000000 | 0x0123) +#define NT_STATUS_SPECIAL_ACCOUNT (0xC0000000 | 0x0124) +#define NT_STATUS_SPECIAL_GROUP (0xC0000000 | 0x0125) +#define NT_STATUS_SPECIAL_USER (0xC0000000 | 0x0126) +#define NT_STATUS_MEMBERS_PRIMARY_GROUP (0xC0000000 | 0x0127) +#define NT_STATUS_FILE_CLOSED (0xC0000000 | 0x0128) +#define NT_STATUS_TOO_MANY_THREADS (0xC0000000 | 0x0129) +#define NT_STATUS_THREAD_NOT_IN_PROCESS (0xC0000000 | 0x012a) +#define NT_STATUS_TOKEN_ALREADY_IN_USE (0xC0000000 | 0x012b) +#define NT_STATUS_PAGEFILE_QUOTA_EXCEEDED (0xC0000000 | 0x012c) +#define NT_STATUS_COMMITMENT_LIMIT (0xC0000000 | 0x012d) +#define NT_STATUS_INVALID_IMAGE_LE_FORMAT (0xC0000000 | 0x012e) +#define NT_STATUS_INVALID_IMAGE_NOT_MZ (0xC0000000 | 0x012f) +#define NT_STATUS_INVALID_IMAGE_PROTECT (0xC0000000 | 0x0130) +#define NT_STATUS_INVALID_IMAGE_WIN_16 (0xC0000000 | 0x0131) +#define NT_STATUS_LOGON_SERVER_CONFLICT (0xC0000000 | 0x0132) +#define NT_STATUS_TIME_DIFFERENCE_AT_DC (0xC0000000 | 0x0133) +#define NT_STATUS_SYNCHRONIZATION_REQUIRED (0xC0000000 | 0x0134) +#define NT_STATUS_DLL_NOT_FOUND (0xC0000000 | 0x0135) +#define NT_STATUS_OPEN_FAILED (0xC0000000 | 0x0136) +#define NT_STATUS_IO_PRIVILEGE_FAILED (0xC0000000 | 0x0137) +#define NT_STATUS_ORDINAL_NOT_FOUND (0xC0000000 | 0x0138) +#define NT_STATUS_ENTRYPOINT_NOT_FOUND (0xC0000000 | 0x0139) +#define NT_STATUS_CONTROL_C_EXIT (0xC0000000 | 0x013a) +#define NT_STATUS_LOCAL_DISCONNECT (0xC0000000 | 0x013b) +#define NT_STATUS_REMOTE_DISCONNECT (0xC0000000 | 0x013c) +#define NT_STATUS_REMOTE_RESOURCES (0xC0000000 | 0x013d) +#define NT_STATUS_LINK_FAILED (0xC0000000 | 0x013e) +#define NT_STATUS_LINK_TIMEOUT (0xC0000000 | 0x013f) +#define NT_STATUS_INVALID_CONNECTION (0xC0000000 | 0x0140) +#define NT_STATUS_INVALID_ADDRESS (0xC0000000 | 0x0141) +#define NT_STATUS_DLL_INIT_FAILED (0xC0000000 | 0x0142) +#define NT_STATUS_MISSING_SYSTEMFILE (0xC0000000 | 0x0143) +#define NT_STATUS_UNHANDLED_EXCEPTION (0xC0000000 | 0x0144) +#define NT_STATUS_APP_INIT_FAILURE (0xC0000000 | 0x0145) +#define NT_STATUS_PAGEFILE_CREATE_FAILED (0xC0000000 | 0x0146) +#define NT_STATUS_NO_PAGEFILE (0xC0000000 | 0x0147) +#define NT_STATUS_INVALID_LEVEL (0xC0000000 | 0x0148) +#define NT_STATUS_WRONG_PASSWORD_CORE (0xC0000000 | 0x0149) +#define NT_STATUS_ILLEGAL_FLOAT_CONTEXT (0xC0000000 | 0x014a) +#define NT_STATUS_PIPE_BROKEN (0xC0000000 | 0x014b) +#define NT_STATUS_REGISTRY_CORRUPT (0xC0000000 | 0x014c) +#define NT_STATUS_REGISTRY_IO_FAILED (0xC0000000 | 0x014d) +#define NT_STATUS_NO_EVENT_PAIR (0xC0000000 | 0x014e) +#define NT_STATUS_UNRECOGNIZED_VOLUME (0xC0000000 | 0x014f) +#define NT_STATUS_SERIAL_NO_DEVICE_INITED (0xC0000000 | 0x0150) +#define NT_STATUS_NO_SUCH_ALIAS (0xC0000000 | 0x0151) +#define NT_STATUS_MEMBER_NOT_IN_ALIAS (0xC0000000 | 0x0152) +#define NT_STATUS_MEMBER_IN_ALIAS (0xC0000000 | 0x0153) +#define NT_STATUS_ALIAS_EXISTS (0xC0000000 | 0x0154) +#define NT_STATUS_LOGON_NOT_GRANTED (0xC0000000 | 0x0155) +#define NT_STATUS_TOO_MANY_SECRETS (0xC0000000 | 0x0156) +#define NT_STATUS_SECRET_TOO_LONG (0xC0000000 | 0x0157) +#define 
NT_STATUS_INTERNAL_DB_ERROR (0xC0000000 | 0x0158) +#define NT_STATUS_FULLSCREEN_MODE (0xC0000000 | 0x0159) +#define NT_STATUS_TOO_MANY_CONTEXT_IDS (0xC0000000 | 0x015a) +#define NT_STATUS_LOGON_TYPE_NOT_GRANTED (0xC0000000 | 0x015b) +#define NT_STATUS_NOT_REGISTRY_FILE (0xC0000000 | 0x015c) +#define NT_STATUS_NT_CROSS_ENCRYPTION_REQUIRED (0xC0000000 | 0x015d) +#define NT_STATUS_DOMAIN_CTRLR_CONFIG_ERROR (0xC0000000 | 0x015e) +#define NT_STATUS_FT_MISSING_MEMBER (0xC0000000 | 0x015f) +#define NT_STATUS_ILL_FORMED_SERVICE_ENTRY (0xC0000000 | 0x0160) +#define NT_STATUS_ILLEGAL_CHARACTER (0xC0000000 | 0x0161) +#define NT_STATUS_UNMAPPABLE_CHARACTER (0xC0000000 | 0x0162) +#define NT_STATUS_UNDEFINED_CHARACTER (0xC0000000 | 0x0163) +#define NT_STATUS_FLOPPY_VOLUME (0xC0000000 | 0x0164) +#define NT_STATUS_FLOPPY_ID_MARK_NOT_FOUND (0xC0000000 | 0x0165) +#define NT_STATUS_FLOPPY_WRONG_CYLINDER (0xC0000000 | 0x0166) +#define NT_STATUS_FLOPPY_UNKNOWN_ERROR (0xC0000000 | 0x0167) +#define NT_STATUS_FLOPPY_BAD_REGISTERS (0xC0000000 | 0x0168) +#define NT_STATUS_DISK_RECALIBRATE_FAILED (0xC0000000 | 0x0169) +#define NT_STATUS_DISK_OPERATION_FAILED (0xC0000000 | 0x016a) +#define NT_STATUS_DISK_RESET_FAILED (0xC0000000 | 0x016b) +#define NT_STATUS_SHARED_IRQ_BUSY (0xC0000000 | 0x016c) +#define NT_STATUS_FT_ORPHANING (0xC0000000 | 0x016d) +#define NT_STATUS_PARTITION_FAILURE (0xC0000000 | 0x0172) +#define NT_STATUS_INVALID_BLOCK_LENGTH (0xC0000000 | 0x0173) +#define NT_STATUS_DEVICE_NOT_PARTITIONED (0xC0000000 | 0x0174) +#define NT_STATUS_UNABLE_TO_LOCK_MEDIA (0xC0000000 | 0x0175) +#define NT_STATUS_UNABLE_TO_UNLOAD_MEDIA (0xC0000000 | 0x0176) +#define NT_STATUS_EOM_OVERFLOW (0xC0000000 | 0x0177) +#define NT_STATUS_NO_MEDIA (0xC0000000 | 0x0178) +#define NT_STATUS_NO_SUCH_MEMBER (0xC0000000 | 0x017a) +#define NT_STATUS_INVALID_MEMBER (0xC0000000 | 0x017b) +#define NT_STATUS_KEY_DELETED (0xC0000000 | 0x017c) +#define NT_STATUS_NO_LOG_SPACE (0xC0000000 | 0x017d) +#define NT_STATUS_TOO_MANY_SIDS (0xC0000000 | 0x017e) +#define NT_STATUS_LM_CROSS_ENCRYPTION_REQUIRED (0xC0000000 | 0x017f) +#define NT_STATUS_KEY_HAS_CHILDREN (0xC0000000 | 0x0180) +#define NT_STATUS_CHILD_MUST_BE_VOLATILE (0xC0000000 | 0x0181) +#define NT_STATUS_DEVICE_CONFIGURATION_ERROR (0xC0000000 | 0x0182) +#define NT_STATUS_DRIVER_INTERNAL_ERROR (0xC0000000 | 0x0183) +#define NT_STATUS_INVALID_DEVICE_STATE (0xC0000000 | 0x0184) +#define NT_STATUS_IO_DEVICE_ERROR (0xC0000000 | 0x0185) +#define NT_STATUS_DEVICE_PROTOCOL_ERROR (0xC0000000 | 0x0186) +#define NT_STATUS_BACKUP_CONTROLLER (0xC0000000 | 0x0187) +#define NT_STATUS_LOG_FILE_FULL (0xC0000000 | 0x0188) +#define NT_STATUS_TOO_LATE (0xC0000000 | 0x0189) +#define NT_STATUS_NO_TRUST_LSA_SECRET (0xC0000000 | 0x018a) +#define NT_STATUS_NO_TRUST_SAM_ACCOUNT (0xC0000000 | 0x018b) +#define NT_STATUS_TRUSTED_DOMAIN_FAILURE (0xC0000000 | 0x018c) +#define NT_STATUS_TRUSTED_RELATIONSHIP_FAILURE (0xC0000000 | 0x018d) +#define NT_STATUS_EVENTLOG_FILE_CORRUPT (0xC0000000 | 0x018e) +#define NT_STATUS_EVENTLOG_CANT_START (0xC0000000 | 0x018f) +#define NT_STATUS_TRUST_FAILURE (0xC0000000 | 0x0190) +#define NT_STATUS_MUTANT_LIMIT_EXCEEDED (0xC0000000 | 0x0191) +#define NT_STATUS_NETLOGON_NOT_STARTED (0xC0000000 | 0x0192) +#define NT_STATUS_ACCOUNT_EXPIRED (0xC0000000 | 0x0193) +#define NT_STATUS_POSSIBLE_DEADLOCK (0xC0000000 | 0x0194) +#define NT_STATUS_NETWORK_CREDENTIAL_CONFLICT (0xC0000000 | 0x0195) +#define NT_STATUS_REMOTE_SESSION_LIMIT (0xC0000000 | 0x0196) +#define NT_STATUS_EVENTLOG_FILE_CHANGED 
(0xC0000000 | 0x0197) +#define NT_STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT (0xC0000000 | 0x0198) +#define NT_STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT (0xC0000000 | 0x0199) +#define NT_STATUS_NOLOGON_SERVER_TRUST_ACCOUNT (0xC0000000 | 0x019a) +#define NT_STATUS_DOMAIN_TRUST_INCONSISTENT (0xC0000000 | 0x019b) +#define NT_STATUS_FS_DRIVER_REQUIRED (0xC0000000 | 0x019c) +#define NT_STATUS_NO_USER_SESSION_KEY (0xC0000000 | 0x0202) +#define NT_STATUS_USER_SESSION_DELETED (0xC0000000 | 0x0203) +#define NT_STATUS_RESOURCE_LANG_NOT_FOUND (0xC0000000 | 0x0204) +#define NT_STATUS_INSUFF_SERVER_RESOURCES (0xC0000000 | 0x0205) +#define NT_STATUS_INVALID_BUFFER_SIZE (0xC0000000 | 0x0206) +#define NT_STATUS_INVALID_ADDRESS_COMPONENT (0xC0000000 | 0x0207) +#define NT_STATUS_INVALID_ADDRESS_WILDCARD (0xC0000000 | 0x0208) +#define NT_STATUS_TOO_MANY_ADDRESSES (0xC0000000 | 0x0209) +#define NT_STATUS_ADDRESS_ALREADY_EXISTS (0xC0000000 | 0x020a) +#define NT_STATUS_ADDRESS_CLOSED (0xC0000000 | 0x020b) +#define NT_STATUS_CONNECTION_DISCONNECTED (0xC0000000 | 0x020c) +#define NT_STATUS_CONNECTION_RESET (0xC0000000 | 0x020d) +#define NT_STATUS_TOO_MANY_NODES (0xC0000000 | 0x020e) +#define NT_STATUS_TRANSACTION_ABORTED (0xC0000000 | 0x020f) +#define NT_STATUS_TRANSACTION_TIMED_OUT (0xC0000000 | 0x0210) +#define NT_STATUS_TRANSACTION_NO_RELEASE (0xC0000000 | 0x0211) +#define NT_STATUS_TRANSACTION_NO_MATCH (0xC0000000 | 0x0212) +#define NT_STATUS_TRANSACTION_RESPONDED (0xC0000000 | 0x0213) +#define NT_STATUS_TRANSACTION_INVALID_ID (0xC0000000 | 0x0214) +#define NT_STATUS_TRANSACTION_INVALID_TYPE (0xC0000000 | 0x0215) +#define NT_STATUS_NOT_SERVER_SESSION (0xC0000000 | 0x0216) +#define NT_STATUS_NOT_CLIENT_SESSION (0xC0000000 | 0x0217) +#define NT_STATUS_CANNOT_LOAD_REGISTRY_FILE (0xC0000000 | 0x0218) +#define NT_STATUS_DEBUG_ATTACH_FAILED (0xC0000000 | 0x0219) +#define NT_STATUS_SYSTEM_PROCESS_TERMINATED (0xC0000000 | 0x021a) +#define NT_STATUS_DATA_NOT_ACCEPTED (0xC0000000 | 0x021b) +#define NT_STATUS_NO_BROWSER_SERVERS_FOUND (0xC0000000 | 0x021c) +#define NT_STATUS_VDM_HARD_ERROR (0xC0000000 | 0x021d) +#define NT_STATUS_DRIVER_CANCEL_TIMEOUT (0xC0000000 | 0x021e) +#define NT_STATUS_REPLY_MESSAGE_MISMATCH (0xC0000000 | 0x021f) +#define NT_STATUS_MAPPED_ALIGNMENT (0xC0000000 | 0x0220) +#define NT_STATUS_IMAGE_CHECKSUM_MISMATCH (0xC0000000 | 0x0221) +#define NT_STATUS_LOST_WRITEBEHIND_DATA (0xC0000000 | 0x0222) +#define NT_STATUS_CLIENT_SERVER_PARAMETERS_INVALID (0xC0000000 | 0x0223) +#define NT_STATUS_PASSWORD_MUST_CHANGE (0xC0000000 | 0x0224) +#define NT_STATUS_NOT_FOUND (0xC0000000 | 0x0225) +#define NT_STATUS_NOT_TINY_STREAM (0xC0000000 | 0x0226) +#define NT_STATUS_RECOVERY_FAILURE (0xC0000000 | 0x0227) +#define NT_STATUS_STACK_OVERFLOW_READ (0xC0000000 | 0x0228) +#define NT_STATUS_FAIL_CHECK (0xC0000000 | 0x0229) +#define NT_STATUS_DUPLICATE_OBJECTID (0xC0000000 | 0x022a) +#define NT_STATUS_OBJECTID_EXISTS (0xC0000000 | 0x022b) +#define NT_STATUS_CONVERT_TO_LARGE (0xC0000000 | 0x022c) +#define NT_STATUS_RETRY (0xC0000000 | 0x022d) +#define NT_STATUS_FOUND_OUT_OF_SCOPE (0xC0000000 | 0x022e) +#define NT_STATUS_ALLOCATE_BUCKET (0xC0000000 | 0x022f) +#define NT_STATUS_PROPSET_NOT_FOUND (0xC0000000 | 0x0230) +#define NT_STATUS_MARSHALL_OVERFLOW (0xC0000000 | 0x0231) +#define NT_STATUS_INVALID_VARIANT (0xC0000000 | 0x0232) +#define NT_STATUS_DOMAIN_CONTROLLER_NOT_FOUND (0xC0000000 | 0x0233) +#define NT_STATUS_ACCOUNT_LOCKED_OUT (0xC0000000 | 0x0234) +#define NT_STATUS_HANDLE_NOT_CLOSABLE (0xC0000000 | 0x0235) +#define 
NT_STATUS_CONNECTION_REFUSED (0xC0000000 | 0x0236) +#define NT_STATUS_GRACEFUL_DISCONNECT (0xC0000000 | 0x0237) +#define NT_STATUS_ADDRESS_ALREADY_ASSOCIATED (0xC0000000 | 0x0238) +#define NT_STATUS_ADDRESS_NOT_ASSOCIATED (0xC0000000 | 0x0239) +#define NT_STATUS_CONNECTION_INVALID (0xC0000000 | 0x023a) +#define NT_STATUS_CONNECTION_ACTIVE (0xC0000000 | 0x023b) +#define NT_STATUS_NETWORK_UNREACHABLE (0xC0000000 | 0x023c) +#define NT_STATUS_HOST_UNREACHABLE (0xC0000000 | 0x023d) +#define NT_STATUS_PROTOCOL_UNREACHABLE (0xC0000000 | 0x023e) +#define NT_STATUS_PORT_UNREACHABLE (0xC0000000 | 0x023f) +#define NT_STATUS_REQUEST_ABORTED (0xC0000000 | 0x0240) +#define NT_STATUS_CONNECTION_ABORTED (0xC0000000 | 0x0241) +#define NT_STATUS_BAD_COMPRESSION_BUFFER (0xC0000000 | 0x0242) +#define NT_STATUS_USER_MAPPED_FILE (0xC0000000 | 0x0243) +#define NT_STATUS_AUDIT_FAILED (0xC0000000 | 0x0244) +#define NT_STATUS_TIMER_RESOLUTION_NOT_SET (0xC0000000 | 0x0245) +#define NT_STATUS_CONNECTION_COUNT_LIMIT (0xC0000000 | 0x0246) +#define NT_STATUS_LOGIN_TIME_RESTRICTION (0xC0000000 | 0x0247) +#define NT_STATUS_LOGIN_WKSTA_RESTRICTION (0xC0000000 | 0x0248) +#define NT_STATUS_IMAGE_MP_UP_MISMATCH (0xC0000000 | 0x0249) +#define NT_STATUS_INSUFFICIENT_LOGON_INFO (0xC0000000 | 0x0250) +#define NT_STATUS_BAD_DLL_ENTRYPOINT (0xC0000000 | 0x0251) +#define NT_STATUS_BAD_SERVICE_ENTRYPOINT (0xC0000000 | 0x0252) +#define NT_STATUS_LPC_REPLY_LOST (0xC0000000 | 0x0253) +#define NT_STATUS_IP_ADDRESS_CONFLICT1 (0xC0000000 | 0x0254) +#define NT_STATUS_IP_ADDRESS_CONFLICT2 (0xC0000000 | 0x0255) +#define NT_STATUS_REGISTRY_QUOTA_LIMIT (0xC0000000 | 0x0256) +#define NT_STATUS_PATH_NOT_COVERED (0xC0000000 | 0x0257) +#define NT_STATUS_NO_CALLBACK_ACTIVE (0xC0000000 | 0x0258) +#define NT_STATUS_LICENSE_QUOTA_EXCEEDED (0xC0000000 | 0x0259) +#define NT_STATUS_PWD_TOO_SHORT (0xC0000000 | 0x025a) +#define NT_STATUS_PWD_TOO_RECENT (0xC0000000 | 0x025b) +#define NT_STATUS_PWD_HISTORY_CONFLICT (0xC0000000 | 0x025c) +#define NT_STATUS_PLUGPLAY_NO_DEVICE (0xC0000000 | 0x025e) +#define NT_STATUS_UNSUPPORTED_COMPRESSION (0xC0000000 | 0x025f) +#define NT_STATUS_INVALID_HW_PROFILE (0xC0000000 | 0x0260) +#define NT_STATUS_INVALID_PLUGPLAY_DEVICE_PATH (0xC0000000 | 0x0261) +#define NT_STATUS_DRIVER_ORDINAL_NOT_FOUND (0xC0000000 | 0x0262) +#define NT_STATUS_DRIVER_ENTRYPOINT_NOT_FOUND (0xC0000000 | 0x0263) +#define NT_STATUS_RESOURCE_NOT_OWNED (0xC0000000 | 0x0264) +#define NT_STATUS_TOO_MANY_LINKS (0xC0000000 | 0x0265) +#define NT_STATUS_QUOTA_LIST_INCONSISTENT (0xC0000000 | 0x0266) +#define NT_STATUS_FILE_IS_OFFLINE (0xC0000000 | 0x0267) +#define NT_STATUS_NETWORK_SESSION_EXPIRED (0xC0000000 | 0x035c) +#define NT_STATUS_NO_SUCH_JOB (0xC0000000 | 0xEDE) /* scheduler */ +#define NT_STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP (0xC0000000 | 0x5D0000) +#define NT_STATUS_PENDING 0x00000103 +#endif /* _NTERR_H */ diff --git a/fs/ksmbd/ntlmssp.h b/fs/ksmbd/ntlmssp.h new file mode 100644 index 000000000000..adaf4c0cbe8f --- /dev/null +++ b/fs/ksmbd/ntlmssp.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* + * Copyright (c) International Business Machines Corp., 2002,2007 + * Author(s): Steve French (sfrench@us.ibm.com) + */ + +#ifndef __KSMBD_NTLMSSP_H +#define __KSMBD_NTLMSSP_H + +#define NTLMSSP_SIGNATURE "NTLMSSP" + +/* Security blob target info data */ +#define TGT_Name "KSMBD" + +/* + * Size of the crypto key returned on the negotiate SMB in bytes + */ +#define CIFS_CRYPTO_KEY_SIZE (8) +#define CIFS_KEY_SIZE (40) + +/* + * Size of 
encrypted user password in bytes + */ +#define CIFS_ENCPWD_SIZE (16) +#define CIFS_CPHTXT_SIZE (16) + +/* Message Types */ +#define NtLmNegotiate cpu_to_le32(1) +#define NtLmChallenge cpu_to_le32(2) +#define NtLmAuthenticate cpu_to_le32(3) +#define UnknownMessage cpu_to_le32(8) + +/* Negotiate Flags */ +#define NTLMSSP_NEGOTIATE_UNICODE 0x01 /* Text strings are unicode */ +#define NTLMSSP_NEGOTIATE_OEM 0x02 /* Text strings are in OEM */ +#define NTLMSSP_REQUEST_TARGET 0x04 /* Srv returns its auth realm */ +/* define reserved9 0x08 */ +#define NTLMSSP_NEGOTIATE_SIGN 0x0010 /* Request signing capability */ +#define NTLMSSP_NEGOTIATE_SEAL 0x0020 /* Request confidentiality */ +#define NTLMSSP_NEGOTIATE_DGRAM 0x0040 +#define NTLMSSP_NEGOTIATE_LM_KEY 0x0080 /* Use LM session key */ +/* defined reserved 8 0x0100 */ +#define NTLMSSP_NEGOTIATE_NTLM 0x0200 /* NTLM authentication */ +#define NTLMSSP_NEGOTIATE_NT_ONLY 0x0400 /* Lanman not allowed */ +#define NTLMSSP_ANONYMOUS 0x0800 +#define NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED 0x1000 /* reserved6 */ +#define NTLMSSP_NEGOTIATE_WORKSTATION_SUPPLIED 0x2000 +#define NTLMSSP_NEGOTIATE_LOCAL_CALL 0x4000 /* client/server same machine */ +#define NTLMSSP_NEGOTIATE_ALWAYS_SIGN 0x8000 /* Sign. All security levels */ +#define NTLMSSP_TARGET_TYPE_DOMAIN 0x10000 +#define NTLMSSP_TARGET_TYPE_SERVER 0x20000 +#define NTLMSSP_TARGET_TYPE_SHARE 0x40000 +#define NTLMSSP_NEGOTIATE_EXTENDED_SEC 0x80000 /* NB:not related to NTLMv2 pwd*/ +/* #define NTLMSSP_REQUEST_INIT_RESP 0x100000 */ +#define NTLMSSP_NEGOTIATE_IDENTIFY 0x100000 +#define NTLMSSP_REQUEST_ACCEPT_RESP 0x200000 /* reserved5 */ +#define NTLMSSP_REQUEST_NON_NT_KEY 0x400000 +#define NTLMSSP_NEGOTIATE_TARGET_INFO 0x800000 +/* #define reserved4 0x1000000 */ +#define NTLMSSP_NEGOTIATE_VERSION 0x2000000 /* we do not set */ +/* #define reserved3 0x4000000 */ +/* #define reserved2 0x8000000 */ +/* #define reserved1 0x10000000 */ +#define NTLMSSP_NEGOTIATE_128 0x20000000 +#define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 +#define NTLMSSP_NEGOTIATE_56 0x80000000 + +/* Define AV Pair Field IDs */ +enum av_field_type { + NTLMSSP_AV_EOL = 0, + NTLMSSP_AV_NB_COMPUTER_NAME, + NTLMSSP_AV_NB_DOMAIN_NAME, + NTLMSSP_AV_DNS_COMPUTER_NAME, + NTLMSSP_AV_DNS_DOMAIN_NAME, + NTLMSSP_AV_DNS_TREE_NAME, + NTLMSSP_AV_FLAGS, + NTLMSSP_AV_TIMESTAMP, + NTLMSSP_AV_RESTRICTION, + NTLMSSP_AV_TARGET_NAME, + NTLMSSP_AV_CHANNEL_BINDINGS +}; + +/* Although typedefs are not commonly used for structure definitions */ +/* in the Linux kernel, in this particular case they are useful */ +/* to more closely match the standards document for NTLMSSP from */ +/* OpenGroup and to make the code more closely match the standard in */ +/* appearance */ + +struct security_buffer { + __le16 Length; + __le16 MaximumLength; + __le32 BufferOffset; /* offset to buffer */ +} __packed; + +struct target_info { + __le16 Type; + __le16 Length; + __u8 Content[0]; +} __packed; + +struct negotiate_message { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmNegotiate = 1 */ + __le32 NegotiateFlags; + struct security_buffer DomainName; /* RFC 1001 style and ASCII */ + struct security_buffer WorkstationName; /* RFC 1001 and ASCII */ + /* + * struct security_buffer for version info not present since we + * do not set the version is present flag + */ + char DomainString[0]; + /* followed by WorkstationString */ +} __packed; + +struct challenge_message { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmChallenge = 2 */ + struct 
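/*
 * Editor's note: every security_buffer in these messages is a
 * (Length, MaximumLength, BufferOffset) triple pointing into the
 * NTLMSSP blob, so each field must be bounds-checked against the blob
 * before use. A minimal sketch, assuming blob/blob_len describe the
 * whole received message (hypothetical helper, not part of this
 * header):
 *
 *	static const void *secbuf_ptr(const void *blob, u32 blob_len,
 *				      const struct security_buffer *sb)
 *	{
 *		u32 off = le32_to_cpu(sb->BufferOffset);
 *		u16 len = le16_to_cpu(sb->Length);
 *
 *		if (off > blob_len || len > blob_len - off)
 *			return NULL;	// reject out-of-range buffers
 *		return (const char *)blob + off;
 *	}
 */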
security_buffer TargetName; + __le32 NegotiateFlags; + __u8 Challenge[CIFS_CRYPTO_KEY_SIZE]; + __u8 Reserved[8]; + struct security_buffer TargetInfoArray; + /* + * struct security_buffer for version info not present since we + * do not set the version is present flag + */ +} __packed; + +struct authenticate_message { + __u8 Signature[sizeof(NTLMSSP_SIGNATURE)]; + __le32 MessageType; /* NtLmAuthenticate = 3 */ + struct security_buffer LmChallengeResponse; + struct security_buffer NtChallengeResponse; + struct security_buffer DomainName; + struct security_buffer UserName; + struct security_buffer WorkstationName; + struct security_buffer SessionKey; + __le32 NegotiateFlags; + /* + * struct security_buffer for version info not present since we + * do not set the version is present flag + */ + char UserString[0]; +} __packed; + +struct ntlmv2_resp { + char ntlmv2_hash[CIFS_ENCPWD_SIZE]; + __le32 blob_signature; + __u32 reserved; + __le64 time; + __u64 client_chal; /* random */ + __u32 reserved2; + /* array of name entries could follow ending in minimum 4 byte struct */ +} __packed; + +/* per smb session structure/fields */ +struct ntlmssp_auth { + /* whether session key is per smb session */ + bool sesskey_per_smbsess; + /* sent by client in type 1 ntlmssp exchange */ + __u32 client_flags; + /* sent by server in type 2 ntlmssp exchange */ + __u32 conn_flags; + /* sent to server */ + unsigned char ciphertext[CIFS_CPHTXT_SIZE]; + /* used by ntlmssp */ + char cryptkey[CIFS_CRYPTO_KEY_SIZE]; +}; +#endif /* __KSMBD_NTLMSSP_H */ diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c new file mode 100644 index 000000000000..6ace6c2f22dc --- /dev/null +++ b/fs/ksmbd/oplock.c @@ -0,0 +1,1709 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/moduleparam.h> + +#include "glob.h" +#include "oplock.h" + +#include "smb_common.h" +#include "smbstatus.h" +#include "connection.h" +#include "mgmt/user_session.h" +#include "mgmt/share_config.h" +#include "mgmt/tree_connect.h" + +static LIST_HEAD(lease_table_list); +static DEFINE_RWLOCK(lease_list_lock); + +/** + * alloc_opinfo() - allocate a new opinfo object for oplock info + * @work: smb work + * @id: fid of open file + * @Tid: tree id of connection + * + * Return: allocated opinfo object on success, otherwise NULL + */ +static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, + u64 id, __u16 Tid) +{ + struct ksmbd_session *sess = work->sess; + struct oplock_info *opinfo; + + opinfo = kzalloc(sizeof(struct oplock_info), GFP_KERNEL); + if (!opinfo) + return NULL; + + opinfo->sess = sess; + opinfo->conn = sess->conn; + opinfo->level = SMB2_OPLOCK_LEVEL_NONE; + opinfo->op_state = OPLOCK_STATE_NONE; + opinfo->pending_break = 0; + opinfo->fid = id; + opinfo->Tid = Tid; + INIT_LIST_HEAD(&opinfo->op_entry); + INIT_LIST_HEAD(&opinfo->interim_list); + init_waitqueue_head(&opinfo->oplock_q); + init_waitqueue_head(&opinfo->oplock_brk); + atomic_set(&opinfo->refcount, 1); + atomic_set(&opinfo->breaking_cnt, 0); + + return opinfo; +} + +static void lease_add_list(struct oplock_info *opinfo) +{ + struct lease_table *lb = opinfo->o_lease->l_lb; + + spin_lock(&lb->lb_lock); + list_add_rcu(&opinfo->lease_entry, &lb->lease_list); + spin_unlock(&lb->lb_lock); +} + +static void lease_del_list(struct oplock_info *opinfo) +{ + struct lease_table *lb = opinfo->o_lease->l_lb; + + if (!lb) + return; + + spin_lock(&lb->lb_lock); + if (list_empty(&opinfo->lease_entry)) { + spin_unlock(&lb->lb_lock); + return; + } + + list_del_init(&opinfo->lease_entry); + opinfo->o_lease->l_lb = NULL; + spin_unlock(&lb->lb_lock); +} + +static void lb_add(struct lease_table *lb) +{ + write_lock(&lease_list_lock); + list_add(&lb->l_entry, &lease_table_list); + write_unlock(&lease_list_lock); +} + +static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx) +{ + struct lease *lease; + + lease = kmalloc(sizeof(struct lease), GFP_KERNEL); + if (!lease) + return -ENOMEM; + + memcpy(lease->lease_key, lctx->lease_key, SMB2_LEASE_KEY_SIZE); + lease->state = lctx->req_state; + lease->new_state = 0; + lease->flags = lctx->flags; + lease->duration = lctx->duration; + memcpy(lease->parent_lease_key, lctx->parent_lease_key, SMB2_LEASE_KEY_SIZE); + lease->version = lctx->version; + lease->epoch = 0; + INIT_LIST_HEAD(&opinfo->lease_entry); + opinfo->o_lease = lease; + + return 0; +} + +static void free_lease(struct oplock_info *opinfo) +{ + struct lease *lease; + + lease = opinfo->o_lease; + kfree(lease); +} + +static void free_opinfo(struct oplock_info *opinfo) +{ + if (opinfo->is_lease) + free_lease(opinfo); + kfree(opinfo); +} + +static inline void opinfo_free_rcu(struct rcu_head *rcu_head) +{ + struct oplock_info *opinfo; + + opinfo = container_of(rcu_head, struct oplock_info, rcu_head); + free_opinfo(opinfo); +} + +struct oplock_info *opinfo_get(struct ksmbd_file *fp) +{ + struct oplock_info *opinfo; + + rcu_read_lock(); + opinfo = rcu_dereference(fp->f_opinfo); + if (opinfo && !atomic_inc_not_zero(&opinfo->refcount)) + opinfo = NULL; + rcu_read_unlock(); + + return opinfo; +} + +static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) +{ + struct oplock_info *opinfo; + + if (list_empty(&ci->m_op_list)) + return NULL; + + rcu_read_lock(); + opinfo = 
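/*
 * Editor's note: opinfo_get() above and opinfo_put() below follow the
 * usual RCU-protected refcount idiom: lookups run under
 * rcu_read_lock() and take a reference with atomic_inc_not_zero() so
 * an object whose count has already hit zero is never resurrected,
 * and the final put defers the free through call_rcu(). The generic
 * shape of the lookup side, as a sketch:
 *
 *	rcu_read_lock();
 *	obj = rcu_dereference(ptr);
 *	if (obj && !atomic_inc_not_zero(&obj->refcount))
 *		obj = NULL;		// lost the race with the freer
 *	rcu_read_unlock();
 */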
list_first_or_null_rcu(&ci->m_op_list, struct oplock_info, + op_entry); + if (opinfo && !atomic_inc_not_zero(&opinfo->refcount)) + opinfo = NULL; + rcu_read_unlock(); + + return opinfo; +} + +void opinfo_put(struct oplock_info *opinfo) +{ + if (!atomic_dec_and_test(&opinfo->refcount)) + return; + + call_rcu(&opinfo->rcu_head, opinfo_free_rcu); +} + +static void opinfo_add(struct oplock_info *opinfo) +{ + struct ksmbd_inode *ci = opinfo->o_fp->f_ci; + + write_lock(&ci->m_lock); + list_add_rcu(&opinfo->op_entry, &ci->m_op_list); + write_unlock(&ci->m_lock); +} + +static void opinfo_del(struct oplock_info *opinfo) +{ + struct ksmbd_inode *ci = opinfo->o_fp->f_ci; + + if (opinfo->is_lease) { + write_lock(&lease_list_lock); + lease_del_list(opinfo); + write_unlock(&lease_list_lock); + } + write_lock(&ci->m_lock); + list_del_rcu(&opinfo->op_entry); + write_unlock(&ci->m_lock); +} + +static unsigned long opinfo_count(struct ksmbd_file *fp) +{ + if (ksmbd_stream_fd(fp)) + return atomic_read(&fp->f_ci->sop_count); + else + return atomic_read(&fp->f_ci->op_count); +} + +static void opinfo_count_inc(struct ksmbd_file *fp) +{ + if (ksmbd_stream_fd(fp)) + return atomic_inc(&fp->f_ci->sop_count); + else + return atomic_inc(&fp->f_ci->op_count); +} + +static void opinfo_count_dec(struct ksmbd_file *fp) +{ + if (ksmbd_stream_fd(fp)) + return atomic_dec(&fp->f_ci->sop_count); + else + return atomic_dec(&fp->f_ci->op_count); +} + +/** + * opinfo_write_to_read() - convert a write oplock to read oplock + * @opinfo: current oplock info + * + * Return: 0 on success, otherwise -EINVAL + */ +int opinfo_write_to_read(struct oplock_info *opinfo) +{ + struct lease *lease = opinfo->o_lease; + + if (!(opinfo->level == SMB2_OPLOCK_LEVEL_BATCH || + opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE)) { + pr_err("bad oplock(0x%x)\n", opinfo->level); + if (opinfo->is_lease) + pr_err("lease state(0x%x)\n", lease->state); + return -EINVAL; + } + opinfo->level = SMB2_OPLOCK_LEVEL_II; + + if (opinfo->is_lease) + lease->state = lease->new_state; + return 0; +} + +/** + * opinfo_read_handle_to_read() - convert a read/handle oplock to read oplock + * @opinfo: current oplock info + * + * Return: 0 on success, otherwise -EINVAL + */ +int opinfo_read_handle_to_read(struct oplock_info *opinfo) +{ + struct lease *lease = opinfo->o_lease; + + lease->state = lease->new_state; + opinfo->level = SMB2_OPLOCK_LEVEL_II; + return 0; +} + +/** + * opinfo_write_to_none() - convert a write oplock to none + * @opinfo: current oplock info + * + * Return: 0 on success, otherwise -EINVAL + */ +int opinfo_write_to_none(struct oplock_info *opinfo) +{ + struct lease *lease = opinfo->o_lease; + + if (!(opinfo->level == SMB2_OPLOCK_LEVEL_BATCH || + opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE)) { + pr_err("bad oplock(0x%x)\n", opinfo->level); + if (opinfo->is_lease) + pr_err("lease state(0x%x)\n", lease->state); + return -EINVAL; + } + opinfo->level = SMB2_OPLOCK_LEVEL_NONE; + if (opinfo->is_lease) + lease->state = lease->new_state; + return 0; +} + +/** + * opinfo_read_to_none() - convert a read oplock to none + * @opinfo: current oplock info + * + * Return: 0 on success, otherwise -EINVAL + */ +int opinfo_read_to_none(struct oplock_info *opinfo) +{ + struct lease *lease = opinfo->o_lease; + + if (opinfo->level != SMB2_OPLOCK_LEVEL_II) { + pr_err("bad oplock(0x%x)\n", opinfo->level); + if (opinfo->is_lease) + pr_err("lease state(0x%x)\n", lease->state); + return -EINVAL; + } + opinfo->level = SMB2_OPLOCK_LEVEL_NONE; + if (opinfo->is_lease) + lease->state = 
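/*
 * Editor's note: the conversion helpers above and the grant/upgrade
 * helpers below all move along the same lease-state to oplock-level
 * mapping; as a sketch (R/H/W = READ/HANDLE/WRITE caching):
 *
 *	R|W|H -> SMB2_OPLOCK_LEVEL_BATCH
 *	R|W   -> SMB2_OPLOCK_LEVEL_EXCLUSIVE
 *	R|H   -> SMB2_OPLOCK_LEVEL_II
 *	R     -> SMB2_OPLOCK_LEVEL_II
 *	none  -> SMB2_OPLOCK_LEVEL_NONE
 *
 * The *_to_read/*_to_none helpers only ever move down this table;
 * lease_read_to_write() below is the one upgrade path.
 */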
lease->new_state; + return 0; +} + +/** + * lease_read_to_write() - upgrade lease state from read to write + * @opinfo: current lease info + * + * Return: 0 on success, otherwise -EINVAL + */ +int lease_read_to_write(struct oplock_info *opinfo) +{ + struct lease *lease = opinfo->o_lease; + + if (!(lease->state & SMB2_LEASE_READ_CACHING_LE)) { + ksmbd_debug(OPLOCK, "bad lease state(0x%x)\n", lease->state); + return -EINVAL; + } + + lease->new_state = SMB2_LEASE_NONE_LE; + lease->state |= SMB2_LEASE_WRITE_CACHING_LE; + if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE) + opinfo->level = SMB2_OPLOCK_LEVEL_BATCH; + else + opinfo->level = SMB2_OPLOCK_LEVEL_EXCLUSIVE; + return 0; +} + +/** + * lease_none_upgrade() - upgrade lease state from none + * @opinfo: current lease info + * @new_state: new lease state + * + * Return: 0 on success, otherwise -EINVAL + */ +static int lease_none_upgrade(struct oplock_info *opinfo, __le32 new_state) +{ + struct lease *lease = opinfo->o_lease; + + if (!(lease->state == SMB2_LEASE_NONE_LE)) { + ksmbd_debug(OPLOCK, "bad lease state(0x%x)\n", lease->state); + return -EINVAL; + } + + lease->new_state = SMB2_LEASE_NONE_LE; + lease->state = new_state; + if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE) + if (lease->state & SMB2_LEASE_WRITE_CACHING_LE) + opinfo->level = SMB2_OPLOCK_LEVEL_BATCH; + else + opinfo->level = SMB2_OPLOCK_LEVEL_II; + else if (lease->state & SMB2_LEASE_WRITE_CACHING_LE) + opinfo->level = SMB2_OPLOCK_LEVEL_EXCLUSIVE; + else if (lease->state & SMB2_LEASE_READ_CACHING_LE) + opinfo->level = SMB2_OPLOCK_LEVEL_II; + + return 0; +} + +/** + * close_id_del_oplock() - release oplock object at file close time + * @fp: ksmbd file pointer + */ +void close_id_del_oplock(struct ksmbd_file *fp) +{ + struct oplock_info *opinfo; + + if (S_ISDIR(file_inode(fp->filp)->i_mode)) + return; + + opinfo = opinfo_get(fp); + if (!opinfo) + return; + + opinfo_del(opinfo); + + rcu_assign_pointer(fp->f_opinfo, NULL); + if (opinfo->op_state == OPLOCK_ACK_WAIT) { + opinfo->op_state = OPLOCK_CLOSING; + wake_up_interruptible_all(&opinfo->oplock_q); + if (opinfo->is_lease) { + atomic_set(&opinfo->breaking_cnt, 0); + wake_up_interruptible_all(&opinfo->oplock_brk); + } + } + + opinfo_count_dec(fp); + atomic_dec(&opinfo->refcount); + opinfo_put(opinfo); +} + +/** + * grant_write_oplock() - grant exclusive/batch oplock or write lease + * @opinfo_new: new oplock info object + * @req_oplock: request oplock + * @lctx: lease context information + * + * Return: 0 + */ +static void grant_write_oplock(struct oplock_info *opinfo_new, int req_oplock, + struct lease_ctx_info *lctx) +{ + struct lease *lease = opinfo_new->o_lease; + + if (req_oplock == SMB2_OPLOCK_LEVEL_BATCH) + opinfo_new->level = SMB2_OPLOCK_LEVEL_BATCH; + else + opinfo_new->level = SMB2_OPLOCK_LEVEL_EXCLUSIVE; + + if (lctx) { + lease->state = lctx->req_state; + memcpy(lease->lease_key, lctx->lease_key, SMB2_LEASE_KEY_SIZE); + } +} + +/** + * grant_read_oplock() - grant level2 oplock or read lease + * @opinfo_new: new oplock info object + * @lctx: lease context information + * + * Return: 0 + */ +static void grant_read_oplock(struct oplock_info *opinfo_new, + struct lease_ctx_info *lctx) +{ + struct lease *lease = opinfo_new->o_lease; + + opinfo_new->level = SMB2_OPLOCK_LEVEL_II; + + if (lctx) { + lease->state = SMB2_LEASE_READ_CACHING_LE; + if (lctx->req_state & SMB2_LEASE_HANDLE_CACHING_LE) + lease->state |= SMB2_LEASE_HANDLE_CACHING_LE; + memcpy(lease->lease_key, lctx->lease_key, SMB2_LEASE_KEY_SIZE); + } +} + +/** + * 
grant_none_oplock() - grant none oplock or none lease + * @opinfo_new: new oplock info object + * @lctx: lease context information + * + * Return: 0 + */ +static void grant_none_oplock(struct oplock_info *opinfo_new, + struct lease_ctx_info *lctx) +{ + struct lease *lease = opinfo_new->o_lease; + + opinfo_new->level = SMB2_OPLOCK_LEVEL_NONE; + + if (lctx) { + lease->state = 0; + memcpy(lease->lease_key, lctx->lease_key, SMB2_LEASE_KEY_SIZE); + } +} + +static inline int compare_guid_key(struct oplock_info *opinfo, + const char *guid1, const char *key1) +{ + const char *guid2, *key2; + + guid2 = opinfo->conn->ClientGUID; + key2 = opinfo->o_lease->lease_key; + if (!memcmp(guid1, guid2, SMB2_CLIENT_GUID_SIZE) && + !memcmp(key1, key2, SMB2_LEASE_KEY_SIZE)) + return 1; + + return 0; +} + +/** + * same_client_has_lease() - check whether current lease request is + * from lease owner of file + * @ci: master file pointer + * @client_guid: Client GUID + * @lctx: lease context information + * + * Return: oplock(lease) object on success, otherwise NULL + */ +static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci, + char *client_guid, + struct lease_ctx_info *lctx) +{ + int ret; + struct lease *lease; + struct oplock_info *opinfo; + struct oplock_info *m_opinfo = NULL; + + if (!lctx) + return NULL; + + /* + * Compare lease key and client_guid to know request from same owner + * of same client + */ + read_lock(&ci->m_lock); + list_for_each_entry(opinfo, &ci->m_op_list, op_entry) { + if (!opinfo->is_lease) + continue; + read_unlock(&ci->m_lock); + lease = opinfo->o_lease; + + ret = compare_guid_key(opinfo, client_guid, lctx->lease_key); + if (ret) { + m_opinfo = opinfo; + /* skip upgrading lease about breaking lease */ + if (atomic_read(&opinfo->breaking_cnt)) { + read_lock(&ci->m_lock); + continue; + } + + /* upgrading lease */ + if ((atomic_read(&ci->op_count) + + atomic_read(&ci->sop_count)) == 1) { + if (lease->state == + (lctx->req_state & lease->state)) { + lease->state |= lctx->req_state; + if (lctx->req_state & + SMB2_LEASE_WRITE_CACHING_LE) + lease_read_to_write(opinfo); + } + } else if ((atomic_read(&ci->op_count) + + atomic_read(&ci->sop_count)) > 1) { + if (lctx->req_state == + (SMB2_LEASE_READ_CACHING_LE | + SMB2_LEASE_HANDLE_CACHING_LE)) + lease->state = lctx->req_state; + } + + if (lctx->req_state && lease->state == + SMB2_LEASE_NONE_LE) + lease_none_upgrade(opinfo, lctx->req_state); + } + read_lock(&ci->m_lock); + } + read_unlock(&ci->m_lock); + + return m_opinfo; +} + +static void wait_for_break_ack(struct oplock_info *opinfo) +{ + int rc = 0; + + rc = wait_event_interruptible_timeout(opinfo->oplock_q, + opinfo->op_state == OPLOCK_STATE_NONE || + opinfo->op_state == OPLOCK_CLOSING, + OPLOCK_WAIT_TIME); + + /* is this a timeout ? */ + if (!rc) { + if (opinfo->is_lease) + opinfo->o_lease->state = SMB2_LEASE_NONE_LE; + opinfo->level = SMB2_OPLOCK_LEVEL_NONE; + opinfo->op_state = OPLOCK_STATE_NONE; + } +} + +static void wake_up_oplock_break(struct oplock_info *opinfo) +{ + clear_bit_unlock(0, &opinfo->pending_break); + /* memory barrier is needed for wake_up_bit() */ + smp_mb__after_atomic(); + wake_up_bit(&opinfo->pending_break, 0); +} + +static int oplock_break_pending(struct oplock_info *opinfo, int req_op_level) +{ + while (test_and_set_bit(0, &opinfo->pending_break)) { + wait_on_bit(&opinfo->pending_break, 0, TASK_UNINTERRUPTIBLE); + + /* Not immediately break to none. 
*/ + opinfo->open_trunc = 0; + + if (opinfo->op_state == OPLOCK_CLOSING) + return -ENOENT; + else if (!opinfo->is_lease && opinfo->level <= req_op_level) + return 1; + } + + if (!opinfo->is_lease && opinfo->level <= req_op_level) { + wake_up_oplock_break(opinfo); + return 1; + } + return 0; +} + +static inline int allocate_oplock_break_buf(struct ksmbd_work *work) +{ + work->response_buf = kzalloc(MAX_CIFS_SMALL_BUFFER_SIZE, GFP_KERNEL); + if (!work->response_buf) + return -ENOMEM; + work->response_sz = MAX_CIFS_SMALL_BUFFER_SIZE; + return 0; +} + +/** + * __smb2_oplock_break_noti() - send smb2 oplock break cmd from conn + * to client + * @wk: smb work object + * + * There are two ways this function can be called: 1) on file open, when we + * break from an exclusive/batch oplock to a level II oplock, and 2) on file + * write/truncate, when we break from a level II oplock to no oplock. + * work->request_buf contains oplock_info. + */ +static void __smb2_oplock_break_noti(struct work_struct *wk) +{ + struct smb2_oplock_break *rsp = NULL; + struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); + struct ksmbd_conn *conn = work->conn; + struct oplock_break_info *br_info = work->request_buf; + struct smb2_hdr *rsp_hdr; + struct ksmbd_file *fp; + + fp = ksmbd_lookup_durable_fd(br_info->fid); + if (!fp) { + atomic_dec(&conn->r_count); + ksmbd_free_work_struct(work); + return; + } + + if (allocate_oplock_break_buf(work)) { + pr_err("smb2_allocate_rsp_buf failed! "); + atomic_dec(&conn->r_count); + ksmbd_fd_put(work, fp); + ksmbd_free_work_struct(work); + return; + } + + rsp_hdr = work->response_buf; + memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); + rsp_hdr->smb2_buf_length = + cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals)); + rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; + rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; + rsp_hdr->CreditRequest = cpu_to_le16(0); + rsp_hdr->Command = SMB2_OPLOCK_BREAK; + rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR); + rsp_hdr->NextCommand = 0; + rsp_hdr->MessageId = cpu_to_le64(-1); + rsp_hdr->Id.SyncId.ProcessId = 0; + rsp_hdr->Id.SyncId.TreeId = 0; + rsp_hdr->SessionId = 0; + memset(rsp_hdr->Signature, 0, 16); + + rsp = work->response_buf; + + rsp->StructureSize = cpu_to_le16(24); + if (!br_info->open_trunc && + (br_info->level == SMB2_OPLOCK_LEVEL_BATCH || + br_info->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE)) + rsp->OplockLevel = SMB2_OPLOCK_LEVEL_II; + else + rsp->OplockLevel = SMB2_OPLOCK_LEVEL_NONE; + rsp->Reserved = 0; + rsp->Reserved2 = 0; + rsp->PersistentFid = cpu_to_le64(fp->persistent_id); + rsp->VolatileFid = cpu_to_le64(fp->volatile_id); + + inc_rfc1001_len(rsp, 24); + + ksmbd_debug(OPLOCK, + "sending oplock break v_id %llu p_id = %llu lock level = %d\n", + rsp->VolatileFid, rsp->PersistentFid, rsp->OplockLevel); + + ksmbd_fd_put(work, fp); + ksmbd_conn_write(work); + ksmbd_free_work_struct(work); + atomic_dec(&conn->r_count); +} + +/** + * smb2_oplock_break_noti() - send smb2 exclusive/batch to level2 oplock + * break command from server to client + * @opinfo: oplock info object + * + * Return: 0 on success, otherwise error + */ +static int smb2_oplock_break_noti(struct oplock_info *opinfo) +{ + struct ksmbd_conn *conn = opinfo->conn; + struct oplock_break_info *br_info; + int ret = 0; + struct ksmbd_work *work = ksmbd_alloc_work_struct(); + + if (!work) + return -ENOMEM; + + br_info = kmalloc(sizeof(struct oplock_break_info), GFP_KERNEL); + if (!br_info) { + ksmbd_free_work_struct(work); + return -ENOMEM; + } + + br_info->level = opinfo->level; + 
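/*
 * Editor's note: conn->r_count is incremented before the break work
 * is queued and decremented by the worker on every exit path, pinning
 * the connection while a notification is in flight. The generic shape
 * of the hand-off, as a sketch:
 *
 *	atomic_inc(&conn->r_count);	// pin the connection
 *	INIT_WORK(&work->work, __smb2_oplock_break_noti);
 *	ksmbd_queue_work(work);		// worker drops r_count when done
 */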
br_info->fid = opinfo->fid; + br_info->open_trunc = opinfo->open_trunc; + + work->request_buf = (char *)br_info; + work->conn = conn; + work->sess = opinfo->sess; + + atomic_inc(&conn->r_count); + if (opinfo->op_state == OPLOCK_ACK_WAIT) { + INIT_WORK(&work->work, __smb2_oplock_break_noti); + ksmbd_queue_work(work); + + wait_for_break_ack(opinfo); + } else { + __smb2_oplock_break_noti(&work->work); + if (opinfo->level == SMB2_OPLOCK_LEVEL_II) + opinfo->level = SMB2_OPLOCK_LEVEL_NONE; + } + return ret; +} + +/** + * __smb2_lease_break_noti() - send lease break command from server + * to client + * @wk: smb work object + */ +static void __smb2_lease_break_noti(struct work_struct *wk) +{ + struct smb2_lease_break *rsp = NULL; + struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); + struct lease_break_info *br_info = work->request_buf; + struct ksmbd_conn *conn = work->conn; + struct smb2_hdr *rsp_hdr; + + if (allocate_oplock_break_buf(work)) { + ksmbd_debug(OPLOCK, "smb2_allocate_rsp_buf failed! "); + ksmbd_free_work_struct(work); + atomic_dec(&conn->r_count); + return; + } + + rsp_hdr = work->response_buf; + memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); + rsp_hdr->smb2_buf_length = + cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals)); + rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; + rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; + rsp_hdr->CreditRequest = cpu_to_le16(0); + rsp_hdr->Command = SMB2_OPLOCK_BREAK; + rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR); + rsp_hdr->NextCommand = 0; + rsp_hdr->MessageId = cpu_to_le64(-1); + rsp_hdr->Id.SyncId.ProcessId = 0; + rsp_hdr->Id.SyncId.TreeId = 0; + rsp_hdr->SessionId = 0; + memset(rsp_hdr->Signature, 0, 16); + + rsp = work->response_buf; + rsp->StructureSize = cpu_to_le16(44); + rsp->Epoch = br_info->epoch; + rsp->Flags = 0; + + if (br_info->curr_state & (SMB2_LEASE_WRITE_CACHING_LE | + SMB2_LEASE_HANDLE_CACHING_LE)) + rsp->Flags = SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED; + + memcpy(rsp->LeaseKey, br_info->lease_key, SMB2_LEASE_KEY_SIZE); + rsp->CurrentLeaseState = br_info->curr_state; + rsp->NewLeaseState = br_info->new_state; + rsp->BreakReason = 0; + rsp->AccessMaskHint = 0; + rsp->ShareMaskHint = 0; + + inc_rfc1001_len(rsp, 44); + + ksmbd_conn_write(work); + ksmbd_free_work_struct(work); + atomic_dec(&conn->r_count); +} + +/** + * smb2_lease_break_noti() - break lease when a new client requests + * a write lease + * @opinfo: contains lease state information + * + * Return: 0 on success, otherwise error + */ +static int smb2_lease_break_noti(struct oplock_info *opinfo) +{ + struct ksmbd_conn *conn = opinfo->conn; + struct list_head *tmp, *t; + struct ksmbd_work *work; + struct lease_break_info *br_info; + struct lease *lease = opinfo->o_lease; + + work = ksmbd_alloc_work_struct(); + if (!work) + return -ENOMEM; + + br_info = kmalloc(sizeof(struct lease_break_info), GFP_KERNEL); + if (!br_info) { + ksmbd_free_work_struct(work); + return -ENOMEM; + } + + br_info->curr_state = lease->state; + br_info->new_state = lease->new_state; + if (lease->version == 2) + br_info->epoch = cpu_to_le16(++lease->epoch); + else + br_info->epoch = 0; + memcpy(br_info->lease_key, lease->lease_key, SMB2_LEASE_KEY_SIZE); + + work->request_buf = (char *)br_info; + work->conn = conn; + work->sess = opinfo->sess; + + atomic_inc(&conn->r_count); + if (opinfo->op_state == OPLOCK_ACK_WAIT) { + list_for_each_safe(tmp, t, &opinfo->interim_list) { + struct ksmbd_work *in_work; + + in_work = list_entry(tmp, struct ksmbd_work, + interim_entry); + 
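/*
 * Editor's note: every open request parked on this lease's
 * interim_list is first turned into an async command and answered
 * with an interim STATUS_PENDING (the two calls below), so waiting
 * clients do not time out while the break ack is outstanding.
 */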
setup_async_work(in_work, NULL, NULL); + smb2_send_interim_resp(in_work, STATUS_PENDING); + list_del(&in_work->interim_entry); + } + INIT_WORK(&work->work, __smb2_lease_break_noti); + ksmbd_queue_work(work); + wait_for_break_ack(opinfo); + } else { + __smb2_lease_break_noti(&work->work); + if (opinfo->o_lease->new_state == SMB2_LEASE_NONE_LE) { + opinfo->level = SMB2_OPLOCK_LEVEL_NONE; + opinfo->o_lease->state = SMB2_LEASE_NONE_LE; + } + } + return 0; +} + +static void wait_lease_breaking(struct oplock_info *opinfo) +{ + if (!opinfo->is_lease) + return; + + wake_up_interruptible_all(&opinfo->oplock_brk); + if (atomic_read(&opinfo->breaking_cnt)) { + int ret = 0; + + ret = wait_event_interruptible_timeout(opinfo->oplock_brk, + atomic_read(&opinfo->breaking_cnt) == 0, + HZ); + if (!ret) + atomic_set(&opinfo->breaking_cnt, 0); + } +} + +static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level) +{ + int err = 0; + + /* Need to break exclusive/batch oplock, write lease or overwrite_if */ + ksmbd_debug(OPLOCK, + "request to send oplock(level : 0x%x) break notification\n", + brk_opinfo->level); + + if (brk_opinfo->is_lease) { + struct lease *lease = brk_opinfo->o_lease; + + atomic_inc(&brk_opinfo->breaking_cnt); + + err = oplock_break_pending(brk_opinfo, req_op_level); + if (err) + return err < 0 ? err : 0; + + if (brk_opinfo->open_trunc) { + /* + * A create with overwrite disposition triggers the + * lease break to none. + */ + lease->new_state = SMB2_LEASE_NONE_LE; + } else { + if (lease->state & SMB2_LEASE_WRITE_CACHING_LE) { + if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE) + lease->new_state = + SMB2_LEASE_READ_CACHING_LE | + SMB2_LEASE_HANDLE_CACHING_LE; + else + lease->new_state = + SMB2_LEASE_READ_CACHING_LE; + } else { + if (lease->state & SMB2_LEASE_HANDLE_CACHING_LE) + lease->new_state = + SMB2_LEASE_READ_CACHING_LE; + else + lease->new_state = SMB2_LEASE_NONE_LE; + } + } + + if (lease->state & (SMB2_LEASE_WRITE_CACHING_LE | + SMB2_LEASE_HANDLE_CACHING_LE)) + brk_opinfo->op_state = OPLOCK_ACK_WAIT; + else + atomic_dec(&brk_opinfo->breaking_cnt); + } else { + err = oplock_break_pending(brk_opinfo, req_op_level); + if (err) + return err < 0 ? 
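/*
 * Editor's note: new_state selection in the lease branch above, as a
 * sketch: an open that truncates breaks straight to NONE; otherwise a
 * write-caching lease is broken to R|H (or plain R without handle
 * caching), and a handle-caching lease without write is broken to R.
 * Only breaks from write/handle-caching states require a client ack
 * (OPLOCK_ACK_WAIT); a pure read lease is dropped without waiting.
 */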
err : 0; + + if (brk_opinfo->level == SMB2_OPLOCK_LEVEL_BATCH || + brk_opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE) + brk_opinfo->op_state = OPLOCK_ACK_WAIT; + } + + if (brk_opinfo->is_lease) + err = smb2_lease_break_noti(brk_opinfo); + else + err = smb2_oplock_break_noti(brk_opinfo); + + ksmbd_debug(OPLOCK, "oplock granted = %d\n", brk_opinfo->level); + if (brk_opinfo->op_state == OPLOCK_CLOSING) + err = -ENOENT; + wake_up_oplock_break(brk_opinfo); + + wait_lease_breaking(brk_opinfo); + + return err; +} + +void destroy_lease_table(struct ksmbd_conn *conn) +{ + struct lease_table *lb, *lbtmp; + struct oplock_info *opinfo; + + write_lock(&lease_list_lock); + if (list_empty(&lease_table_list)) { + write_unlock(&lease_list_lock); + return; + } + + list_for_each_entry_safe(lb, lbtmp, &lease_table_list, l_entry) { + if (conn && memcmp(lb->client_guid, conn->ClientGUID, + SMB2_CLIENT_GUID_SIZE)) + continue; +again: + rcu_read_lock(); + list_for_each_entry_rcu(opinfo, &lb->lease_list, + lease_entry) { + rcu_read_unlock(); + lease_del_list(opinfo); + goto again; + } + rcu_read_unlock(); + list_del(&lb->l_entry); + kfree(lb); + } + write_unlock(&lease_list_lock); +} + +int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci, + struct lease_ctx_info *lctx) +{ + struct oplock_info *opinfo; + int err = 0; + struct lease_table *lb; + + if (!lctx) + return err; + + read_lock(&lease_list_lock); + if (list_empty(&lease_table_list)) { + read_unlock(&lease_list_lock); + return 0; + } + + list_for_each_entry(lb, &lease_table_list, l_entry) { + if (!memcmp(lb->client_guid, sess->conn->ClientGUID, + SMB2_CLIENT_GUID_SIZE)) + goto found; + } + read_unlock(&lease_list_lock); + + return 0; + +found: + rcu_read_lock(); + list_for_each_entry_rcu(opinfo, &lb->lease_list, lease_entry) { + if (!atomic_inc_not_zero(&opinfo->refcount)) + continue; + rcu_read_unlock(); + if (opinfo->o_fp->f_ci == ci) + goto op_next; + err = compare_guid_key(opinfo, sess->conn->ClientGUID, + lctx->lease_key); + if (err) { + err = -EINVAL; + ksmbd_debug(OPLOCK, + "found same lease key is already used in other files\n"); + opinfo_put(opinfo); + goto out; + } +op_next: + opinfo_put(opinfo); + rcu_read_lock(); + } + rcu_read_unlock(); + +out: + read_unlock(&lease_list_lock); + return err; +} + +static void copy_lease(struct oplock_info *op1, struct oplock_info *op2) +{ + struct lease *lease1 = op1->o_lease; + struct lease *lease2 = op2->o_lease; + + op2->level = op1->level; + lease2->state = lease1->state; + memcpy(lease2->lease_key, lease1->lease_key, + SMB2_LEASE_KEY_SIZE); + lease2->duration = lease1->duration; + lease2->flags = lease1->flags; +} + +static int add_lease_global_list(struct oplock_info *opinfo) +{ + struct lease_table *lb; + + read_lock(&lease_list_lock); + list_for_each_entry(lb, &lease_table_list, l_entry) { + if (!memcmp(lb->client_guid, opinfo->conn->ClientGUID, + SMB2_CLIENT_GUID_SIZE)) { + opinfo->o_lease->l_lb = lb; + lease_add_list(opinfo); + read_unlock(&lease_list_lock); + return 0; + } + } + read_unlock(&lease_list_lock); + + lb = kmalloc(sizeof(struct lease_table), GFP_KERNEL); + if (!lb) + return -ENOMEM; + + memcpy(lb->client_guid, opinfo->conn->ClientGUID, + SMB2_CLIENT_GUID_SIZE); + INIT_LIST_HEAD(&lb->lease_list); + spin_lock_init(&lb->lb_lock); + opinfo->o_lease->l_lb = lb; + lease_add_list(opinfo); + lb_add(lb); + return 0; +} + +static void set_oplock_level(struct oplock_info *opinfo, int level, + struct lease_ctx_info *lctx) +{ + switch (level) { + case SMB2_OPLOCK_LEVEL_BATCH: + case 
SMB2_OPLOCK_LEVEL_EXCLUSIVE: + grant_write_oplock(opinfo, level, lctx); + break; + case SMB2_OPLOCK_LEVEL_II: + grant_read_oplock(opinfo, lctx); + break; + default: + grant_none_oplock(opinfo, lctx); + break; + } +} + +/** + * smb_grant_oplock() - handle oplock/lease request on file open + * @work: smb work + * @req_op_level: oplock level + * @pid: id of open file + * @fp: ksmbd file pointer + * @tid: Tree id of connection + * @lctx: lease context information on file open + * @share_ret: share mode + * + * Return: 0 on success, otherwise error + */ +int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, + struct ksmbd_file *fp, __u16 tid, + struct lease_ctx_info *lctx, int share_ret) +{ + struct ksmbd_session *sess = work->sess; + int err = 0; + struct oplock_info *opinfo = NULL, *prev_opinfo = NULL; + struct ksmbd_inode *ci = fp->f_ci; + bool prev_op_has_lease; + __le32 prev_op_state = 0; + + /* directory leases are not supported */ + if (S_ISDIR(file_inode(fp->filp)->i_mode)) + return 0; + + opinfo = alloc_opinfo(work, pid, tid); + if (!opinfo) + return -ENOMEM; + + if (lctx) { + err = alloc_lease(opinfo, lctx); + if (err) + goto err_out; + opinfo->is_lease = 1; + } + + /* ci does not have any oplock */ + if (!opinfo_count(fp)) + goto set_lev; + + /* grant no oplock if the second open is a truncate */ + if (fp->attrib_only && fp->cdoption != FILE_OVERWRITE_IF_LE && + fp->cdoption != FILE_OVERWRITE_LE && + fp->cdoption != FILE_SUPERSEDE_LE) { + req_op_level = SMB2_OPLOCK_LEVEL_NONE; + goto set_lev; + } + + if (lctx) { + struct oplock_info *m_opinfo; + + /* is a lease already granted? */ + m_opinfo = same_client_has_lease(ci, sess->conn->ClientGUID, + lctx); + if (m_opinfo) { + copy_lease(m_opinfo, opinfo); + if (atomic_read(&m_opinfo->breaking_cnt)) + opinfo->o_lease->flags = + SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE; + goto out; + } + } + prev_opinfo = opinfo_get_list(ci); + if (!prev_opinfo || + (prev_opinfo->level == SMB2_OPLOCK_LEVEL_NONE && lctx)) + goto set_lev; + prev_op_has_lease = prev_opinfo->is_lease; + if (prev_op_has_lease) + prev_op_state = prev_opinfo->o_lease->state; + + if (share_ret < 0 && + prev_opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE) { + err = share_ret; + opinfo_put(prev_opinfo); + goto err_out; + } + + if (prev_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH && + prev_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) { + opinfo_put(prev_opinfo); + goto op_break_not_needed; + } + + list_add(&work->interim_entry, &prev_opinfo->interim_list); + err = oplock_break(prev_opinfo, SMB2_OPLOCK_LEVEL_II); + opinfo_put(prev_opinfo); + if (err == -ENOENT) + goto set_lev; + /* check whether all oplocks were freed by close */ + else if (err < 0) + goto err_out; + +op_break_not_needed: + if (share_ret < 0) { + err = share_ret; + goto err_out; + } + + if (req_op_level != SMB2_OPLOCK_LEVEL_NONE) + req_op_level = SMB2_OPLOCK_LEVEL_II; + + /* grant fixed oplock on stacked locking between lease and oplock */ + if (prev_op_has_lease && !lctx) + if (prev_op_state & SMB2_LEASE_HANDLE_CACHING_LE) + req_op_level = SMB2_OPLOCK_LEVEL_NONE; + + if (!prev_op_has_lease && lctx) { + req_op_level = SMB2_OPLOCK_LEVEL_II; + lctx->req_state = SMB2_LEASE_READ_CACHING_LE; + } + +set_lev: + set_oplock_level(opinfo, req_op_level, lctx); + +out: + rcu_assign_pointer(fp->f_opinfo, opinfo); + opinfo->o_fp = fp; + + opinfo_count_inc(fp); + opinfo_add(opinfo); + if (opinfo->is_lease) { + err = add_lease_global_list(opinfo); + if (err) + goto err_out; + } + + return 0; +err_out: + free_opinfo(opinfo); + return err; +}
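Editorial aside: the grant logic in smb_grant_oplock() reduces to a small policy: the first opener gets the level it requested; once another opener exists, an existing batch/exclusive holder is broken down to level II and the newcomer itself gets at most level II. A minimal user-space sketch of just that rule, with illustrative names (the lease paths, interim lists and locking are deliberately omitted):

/* illustrative sketch, not ksmbd code */
#include <stdio.h>

enum op_level { OP_NONE = 0x00, OP_II = 0x01, OP_EXCLUSIVE = 0x08, OP_BATCH = 0x09 };

/* what a new opener is granted, given the strongest existing holder */
static enum op_level grant_policy(enum op_level holder, enum op_level requested)
{
	if (requested == OP_NONE)
		return OP_NONE;
	if (holder == OP_NONE)
		return requested;	/* first opener keeps its request */
	/* an existing batch/exclusive holder is first broken down to
	 * level II, and the newcomer then gets at most level II too */
	return OP_II;
}

int main(void)
{
	printf("granted 0x%x\n", grant_policy(OP_BATCH, OP_EXCLUSIVE)); /* 0x1 */
	return 0;
}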
+ +/** + * smb_break_all_write_oplock() - break batch/exclusive oplock to level2 + * @work: smb work + * @fp: ksmbd file pointer + * @is_trunc: truncate on open + */ +static void smb_break_all_write_oplock(struct ksmbd_work *work, + struct ksmbd_file *fp, int is_trunc) +{ + struct oplock_info *brk_opinfo; + + brk_opinfo = opinfo_get_list(fp->f_ci); + if (!brk_opinfo) + return; + if (brk_opinfo->level != SMB2_OPLOCK_LEVEL_BATCH && + brk_opinfo->level != SMB2_OPLOCK_LEVEL_EXCLUSIVE) { + opinfo_put(brk_opinfo); + return; + } + + brk_opinfo->open_trunc = is_trunc; + list_add(&work->interim_entry, &brk_opinfo->interim_list); + oplock_break(brk_opinfo, SMB2_OPLOCK_LEVEL_II); + opinfo_put(brk_opinfo); +} + +/** + * smb_break_all_levII_oplock() - send level2 oplock or read lease break command + * from server to client + * @work: smb work + * @fp: ksmbd file pointer + * @is_trunc: truncate on open + */ +void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, + int is_trunc) +{ + struct oplock_info *op, *brk_op; + struct ksmbd_inode *ci; + struct ksmbd_conn *conn = work->sess->conn; + + if (!test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_OPLOCKS)) + return; + + ci = fp->f_ci; + op = opinfo_get(fp); + + rcu_read_lock(); + list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) { + if (!atomic_inc_not_zero(&brk_op->refcount)) + continue; + rcu_read_unlock(); + if (brk_op->is_lease && (brk_op->o_lease->state & + (~(SMB2_LEASE_READ_CACHING_LE | + SMB2_LEASE_HANDLE_CACHING_LE)))) { + ksmbd_debug(OPLOCK, "unexpected lease state(0x%x)\n", + brk_op->o_lease->state); + goto next; + } else if (brk_op->level != + SMB2_OPLOCK_LEVEL_II) { + ksmbd_debug(OPLOCK, "unexpected oplock(0x%x)\n", + brk_op->level); + goto next; + } + + /* skip oplocks that are being broken to none */ + if (brk_op->is_lease && + brk_op->o_lease->new_state == SMB2_LEASE_NONE_LE && + atomic_read(&brk_op->breaking_cnt)) + goto next; + + if (op && op->is_lease && brk_op->is_lease && + !memcmp(conn->ClientGUID, brk_op->conn->ClientGUID, + SMB2_CLIENT_GUID_SIZE) && + !memcmp(op->o_lease->lease_key, brk_op->o_lease->lease_key, + SMB2_LEASE_KEY_SIZE)) + goto next; + brk_op->open_trunc = is_trunc; + oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE); +next: + opinfo_put(brk_op); + rcu_read_lock(); + } + rcu_read_unlock(); + + if (op) + opinfo_put(op); +} + +/** + * smb_break_all_oplock() - break both batch/exclusive and level2 oplock + * @work: smb work + * @fp: ksmbd file pointer + */ +void smb_break_all_oplock(struct ksmbd_work *work, struct ksmbd_file *fp) +{ + if (!test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_OPLOCKS)) + return; + + smb_break_all_write_oplock(work, fp, 1); + smb_break_all_levII_oplock(work, fp, 1); +} + +/** + * smb2_map_lease_to_oplock() - map lease state to corresponding oplock type + * @lease_state: lease type + * + * Return: 0 if no mapping, otherwise corresponding oplock type + */ +__u8 smb2_map_lease_to_oplock(__le32 lease_state) +{ + if (lease_state == (SMB2_LEASE_HANDLE_CACHING_LE | + SMB2_LEASE_READ_CACHING_LE | + SMB2_LEASE_WRITE_CACHING_LE)) { + return SMB2_OPLOCK_LEVEL_BATCH; + } else if (lease_state != SMB2_LEASE_WRITE_CACHING_LE && + lease_state & SMB2_LEASE_WRITE_CACHING_LE) { + if (!(lease_state & SMB2_LEASE_HANDLE_CACHING_LE)) + return SMB2_OPLOCK_LEVEL_EXCLUSIVE; + } else if (lease_state & SMB2_LEASE_READ_CACHING_LE) { + return SMB2_OPLOCK_LEVEL_II; + } + return 0; +} + +/** + * create_lease_buf() - create lease context for open cmd response + * @rbuf: buffer in which to
create the lease context response + * @lease: buffer holding the parsed lease state information + */ +void create_lease_buf(u8 *rbuf, struct lease *lease) +{ + char *LeaseKey = (char *)&lease->lease_key; + + if (lease->version == 2) { + struct create_lease_v2 *buf = (struct create_lease_v2 *)rbuf; + char *ParentLeaseKey = (char *)&lease->parent_lease_key; + + memset(buf, 0, sizeof(struct create_lease_v2)); + buf->lcontext.LeaseKeyLow = *((__le64 *)LeaseKey); + buf->lcontext.LeaseKeyHigh = *((__le64 *)(LeaseKey + 8)); + buf->lcontext.LeaseFlags = lease->flags; + buf->lcontext.LeaseState = lease->state; + buf->lcontext.ParentLeaseKeyLow = *((__le64 *)ParentLeaseKey); + buf->lcontext.ParentLeaseKeyHigh = *((__le64 *)(ParentLeaseKey + 8)); + buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_lease_v2, lcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_lease_v2, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + buf->Name[0] = 'R'; + buf->Name[1] = 'q'; + buf->Name[2] = 'L'; + buf->Name[3] = 's'; + } else { + struct create_lease *buf = (struct create_lease *)rbuf; + + memset(buf, 0, sizeof(struct create_lease)); + buf->lcontext.LeaseKeyLow = *((__le64 *)LeaseKey); + buf->lcontext.LeaseKeyHigh = *((__le64 *)(LeaseKey + 8)); + buf->lcontext.LeaseFlags = lease->flags; + buf->lcontext.LeaseState = lease->state; + buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_lease, lcontext)); + buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context)); + buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_lease, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + buf->Name[0] = 'R'; + buf->Name[1] = 'q'; + buf->Name[2] = 'L'; + buf->Name[3] = 's'; + } +} + +/** + * parse_lease_state() - parse lease context contained in file open request + * @open_req: buffer containing smb2 file open(create) request + * + * Return: allocated lease context on success, NULL if no lease context is + * found or allocation fails + */ +struct lease_ctx_info *parse_lease_state(void *open_req) +{ + char *data_offset; + struct create_context *cc; + unsigned int next = 0; + char *name; + bool found = false; + struct smb2_create_req *req = (struct smb2_create_req *)open_req; + struct lease_ctx_info *lreq = kzalloc(sizeof(struct lease_ctx_info), + GFP_KERNEL); + if (!lreq) + return NULL; + + data_offset = (char *)req + 4 + le32_to_cpu(req->CreateContextsOffset); + cc = (struct create_context *)data_offset; + do { + cc = (struct create_context *)((char *)cc + next); + name = le16_to_cpu(cc->NameOffset) + (char *)cc; + if (le16_to_cpu(cc->NameLength) != 4 || + strncmp(name, SMB2_CREATE_REQUEST_LEASE, 4)) { + next = le32_to_cpu(cc->Next); + continue; + } + found = true; + break; + } while (next != 0); + + if (found) { + if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) { + struct create_lease_v2 *lc = (struct create_lease_v2 *)cc; + + *((__le64 *)lreq->lease_key) = lc->lcontext.LeaseKeyLow; + *((__le64 *)(lreq->lease_key + 8)) = lc->lcontext.LeaseKeyHigh; + lreq->req_state = lc->lcontext.LeaseState; + lreq->flags = lc->lcontext.LeaseFlags; + lreq->duration = lc->lcontext.LeaseDuration; + *((__le64 *)lreq->parent_lease_key) = lc->lcontext.ParentLeaseKeyLow; + *((__le64 *)(lreq->parent_lease_key + 8)) = lc->lcontext.ParentLeaseKeyHigh; + lreq->version = 2; + } else { + struct create_lease *lc = (struct create_lease *)cc; + + *((__le64 *)lreq->lease_key) = lc->lcontext.LeaseKeyLow; + *((__le64 *)(lreq->lease_key + 8)) = lc->lcontext.LeaseKeyHigh; + lreq->req_state = lc->lcontext.LeaseState; + lreq->flags = lc->lcontext.LeaseFlags; + lreq->duration = lc->lcontext.LeaseDuration; + lreq->version = 1; + } + return lreq; + } + + kfree(lreq); + return NULL; +} + +/** + * smb2_find_context_vals() - find a particular context info in open request + * @open_req: buffer containing smb2 file open(create) request + * @tag: context name to search for + * + * Return: pointer to requested context, NULL if @tag context not found + * or error pointer if name length is invalid. + */ +struct create_context *smb2_find_context_vals(void *open_req, const char *tag) +{ + char *data_offset; + struct create_context *cc; + unsigned int next = 0; + char *name; + struct smb2_create_req *req = (struct smb2_create_req *)open_req; + + data_offset = (char *)req + 4 + le32_to_cpu(req->CreateContextsOffset); + cc = (struct create_context *)data_offset; + do { + int val; + + cc = (struct create_context *)((char *)cc + next); + name = le16_to_cpu(cc->NameOffset) + (char *)cc; + val = le16_to_cpu(cc->NameLength); + if (val < 4) + return ERR_PTR(-EINVAL); + + if (memcmp(name, tag, val) == 0) + return cc; + next = le32_to_cpu(cc->Next); + } while (next != 0); + + return NULL; +} + +/** + * create_durable_rsp_buf() - create durable handle context + * @cc: buffer to create durable context response + */ +void create_durable_rsp_buf(char *cc) +{ + struct create_durable_rsp *buf; + + buf = (struct create_durable_rsp *)cc; + memset(buf, 0, sizeof(struct create_durable_rsp)); + buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_durable_rsp, Data)); + buf->ccontext.DataLength = cpu_to_le32(8); + buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_durable_rsp, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_DURABLE_HANDLE_RESPONSE is "DHnQ" */ + buf->Name[0] = 'D'; + buf->Name[1] = 'H'; + buf->Name[2] = 'n'; + buf->Name[3] = 'Q'; +} + +/** + * create_durable_v2_rsp_buf() - create durable handle v2 context + * @cc: buffer to create durable context response + * @fp: ksmbd file pointer + */ +void create_durable_v2_rsp_buf(char *cc, struct ksmbd_file *fp) +{ + struct create_durable_v2_rsp *buf; + + buf = (struct create_durable_v2_rsp *)cc; + memset(buf, 0, sizeof(struct create_durable_rsp)); + buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_durable_rsp, Data)); + buf->ccontext.DataLength = cpu_to_le32(8); + buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_durable_rsp, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_DURABLE_HANDLE_RESPONSE_V2 is "DH2Q" */ + buf->Name[0] = 'D'; + buf->Name[1] = 'H'; + buf->Name[2] = '2'; + buf->Name[3] = 'Q'; + + buf->Timeout = cpu_to_le32(fp->durable_timeout); +}
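Editorial aside: the *_rsp_buf helpers here all follow the same create-context layout: a fixed header whose NameOffset/NameLength locate a 4-byte tag and whose DataOffset/DataLength locate the payload, with both offsets relative to the start of the context. A minimal stand-alone sketch of that offset arithmetic, using simplified host-endian fields and illustrative names (not the kernel's structures):

/* illustrative sketch, not ksmbd code */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct cc_hdr {			/* simplified create-context header */
	uint32_t Next;		/* offset of next context, 0 if last */
	uint16_t NameOffset;	/* both offsets are relative to the  */
	uint16_t NameLength;	/* start of this context             */
	uint16_t Reserved;
	uint16_t DataOffset;
	uint32_t DataLength;
};

struct cc_mxac {		/* hypothetical "MxAc"-style context */
	struct cc_hdr hdr;
	char Name[8];		/* 4-byte tag plus padding */
	uint32_t QueryStatus;
	uint32_t MaximalAccess;
};

int main(void)
{
	struct cc_mxac c;

	memset(&c, 0, sizeof(c));
	c.hdr.NameOffset = offsetof(struct cc_mxac, Name);
	c.hdr.NameLength = 4;
	memcpy(c.Name, "MxAc", 4);
	c.hdr.DataOffset = offsetof(struct cc_mxac, QueryStatus);
	c.hdr.DataLength = 8;	/* QueryStatus + MaximalAccess */
	printf("name@%u data@%u len=%u\n", (unsigned)c.hdr.NameOffset,
	       (unsigned)c.hdr.DataOffset, (unsigned)c.hdr.DataLength);
	return 0;
}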
+ +/** + * create_mxac_rsp_buf() - create query maximal access context + * @cc: buffer to create maximal access context response + * @maximal_access: maximal access + */ +void create_mxac_rsp_buf(char *cc, int maximal_access) +{ + struct create_mxac_rsp *buf; + + buf = (struct create_mxac_rsp *)cc; + memset(buf, 0, sizeof(struct create_mxac_rsp)); + buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_mxac_rsp, QueryStatus)); + buf->ccontext.DataLength = cpu_to_le32(8); + buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_mxac_rsp, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_QUERY_MAXIMAL_ACCESS_RESPONSE is "MxAc" */ + buf->Name[0] = 'M'; + buf->Name[1] = 'x'; + buf->Name[2] = 'A'; + buf->Name[3] = 'c'; + + buf->QueryStatus = STATUS_SUCCESS; + buf->MaximalAccess = cpu_to_le32(maximal_access); +} + +void create_disk_id_rsp_buf(char *cc, __u64 file_id, __u64 vol_id) +{ + struct create_disk_id_rsp *buf; + + buf = (struct create_disk_id_rsp *)cc; + memset(buf, 0, sizeof(struct create_disk_id_rsp)); + buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_disk_id_rsp, DiskFileId)); + buf->ccontext.DataLength = cpu_to_le32(32); + buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_mxac_rsp, Name)); + buf->ccontext.NameLength = cpu_to_le16(4); + /* SMB2_CREATE_QUERY_ON_DISK_ID_RESPONSE is "QFid" */ + buf->Name[0] = 'Q'; + buf->Name[1] = 'F'; + buf->Name[2] = 'i'; + buf->Name[3] = 'd'; + + buf->DiskFileId = cpu_to_le64(file_id); + buf->VolumeId = cpu_to_le64(vol_id); +} + +/** + * create_posix_rsp_buf() - create posix extension context + * @cc: buffer for the posix create context response + * @fp: ksmbd file pointer + */ +void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) +{ + struct create_posix_rsp *buf; + struct inode *inode = file_inode(fp->filp); + struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + + buf = (struct create_posix_rsp *)cc; + memset(buf, 0, sizeof(struct create_posix_rsp)); + buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_posix_rsp, nlink)); + buf->ccontext.DataLength = cpu_to_le32(52); + buf->ccontext.NameOffset = cpu_to_le16(offsetof (struct create_posix_rsp, Name)); + buf->ccontext.NameLength = cpu_to_le16(POSIX_CTXT_DATA_LEN); + /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */ + buf->Name[0] = 0x93; + buf->Name[1] = 0xAD; + buf->Name[2] = 0x25; + buf->Name[3] = 0x50; + buf->Name[4] = 0x9C; + buf->Name[5] = 0xB4; + buf->Name[6] = 0x11; + buf->Name[7] = 0xE7; + buf->Name[8] = 0xB4; + buf->Name[9] = 0x23; + buf->Name[10] = 0x83; + buf->Name[11] = 0xDE; + buf->Name[12] = 0x96; + buf->Name[13] = 0x8B; + buf->Name[14] = 0xCD; + buf->Name[15] = 0x7C; + + buf->nlink = cpu_to_le32(inode->i_nlink); + buf->reparse_tag = cpu_to_le32(fp->volatile_id); + buf->mode = cpu_to_le32(inode->i_mode); + id_to_sid(from_kuid(user_ns, inode->i_uid), + SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]); + id_to_sid(from_kgid(user_ns, inode->i_gid), + SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]); +} + +/* + * Find lease object(opinfo) for given lease key/fid from lease + * break/file close path. + */ +/** + * lookup_lease_in_table() - find a matching lease info object + * @conn: connection instance + * @lease_key: lease key to be searched for + * + * Return: opinfo if found matching opinfo, otherwise NULL + */ +struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn, + char *lease_key) +{ + struct oplock_info *opinfo = NULL, *ret_op = NULL; + struct lease_table *lt; + int ret; + + read_lock(&lease_list_lock); + list_for_each_entry(lt, &lease_table_list, l_entry) { + if (!memcmp(lt->client_guid, conn->ClientGUID, + SMB2_CLIENT_GUID_SIZE)) + goto found; + } + + read_unlock(&lease_list_lock); + return NULL; + +found: + rcu_read_lock(); + list_for_each_entry_rcu(opinfo, &lt->lease_list, lease_entry) { + if (!atomic_inc_not_zero(&opinfo->refcount)) + continue; + rcu_read_unlock(); + if (!opinfo->op_state || opinfo->op_state == OPLOCK_CLOSING) + goto op_next; + if (!(opinfo->o_lease->state & + (SMB2_LEASE_HANDLE_CACHING_LE | + SMB2_LEASE_WRITE_CACHING_LE))) + goto op_next; + ret = compare_guid_key(opinfo, conn->ClientGUID, + lease_key); + if (ret) { + ksmbd_debug(OPLOCK, "found opinfo\n"); + ret_op = opinfo; + goto out; + } +op_next: + opinfo_put(opinfo); + rcu_read_lock(); + } + rcu_read_unlock(); + +out: + read_unlock(&lease_list_lock); + return ret_op; +} + +int smb2_check_durable_oplock(struct ksmbd_file *fp, + struct lease_ctx_info *lctx, char *name) +{ + struct oplock_info *opinfo = opinfo_get(fp); + int ret = 0; + + if (opinfo && opinfo->is_lease) { + if (!lctx) { + pr_err("open does not include lease\n"); + ret = -EBADF; + goto out; + } + if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key, + SMB2_LEASE_KEY_SIZE)) { + pr_err("invalid lease key\n"); + ret = -EBADF; + goto out; + } + if (name && strcmp(fp->filename, name)) { + pr_err("invalid name reconnect %s\n", name); + ret = -EINVAL; + goto out; + } + } +out: + if (opinfo) + opinfo_put(opinfo); + return ret; +}
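Editorial aside: lookup_lease_in_table() is a two-step match, first the client GUID selects the per-client lease_table, then the 16-byte lease key selects a lease within it. A minimal stand-alone sketch of that matching (a flat array with illustrative names; the kernel uses RCU-protected lists with refcounting, omitted here):

/* illustrative sketch, not ksmbd code */
#include <stdio.h>
#include <string.h>

#define GUID_SIZE 16
#define KEY_SIZE  16

struct lease_rec {			/* flat stand-in for the per-client */
	unsigned char guid[GUID_SIZE];	/* lease_table and its lease_list  */
	unsigned char key[KEY_SIZE];
	int state;
};

/* match the owning client first, then the lease key within that client */
static struct lease_rec *find_lease(struct lease_rec *tbl, int n,
				    const unsigned char *guid,
				    const unsigned char *key)
{
	int i;

	for (i = 0; i < n; i++)
		if (!memcmp(tbl[i].guid, guid, GUID_SIZE) &&
		    !memcmp(tbl[i].key, key, KEY_SIZE))
			return &tbl[i];
	return NULL;
}

int main(void)
{
	struct lease_rec tbl[1] = { { { 1 }, { 2 }, 3 } };

	printf("%s\n", find_lease(tbl, 1, tbl[0].guid, tbl[0].key) ?
	       "found" : "missing");
	return 0;
}

diff --git a/fs/ksmbd/oplock.h b/fs/ksmbd/oplock.h new file mode 100644 index 000000000000..119b8047cfbd --- /dev/null +++ b/fs/ksmbd/oplock.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd.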
+ */ + +#ifndef __KSMBD_OPLOCK_H +#define __KSMBD_OPLOCK_H + +#include "smb_common.h" + +#define OPLOCK_WAIT_TIME (35 * HZ) + +/* SMB2 Oplock levels */ +#define SMB2_OPLOCK_LEVEL_NONE 0x00 +#define SMB2_OPLOCK_LEVEL_II 0x01 +#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 +#define SMB2_OPLOCK_LEVEL_BATCH 0x09 +#define SMB2_OPLOCK_LEVEL_LEASE 0xFF + +/* Oplock states */ +#define OPLOCK_STATE_NONE 0x00 +#define OPLOCK_ACK_WAIT 0x01 +#define OPLOCK_CLOSING 0x02 + +#define OPLOCK_WRITE_TO_READ 0x01 +#define OPLOCK_READ_HANDLE_TO_READ 0x02 +#define OPLOCK_WRITE_TO_NONE 0x04 +#define OPLOCK_READ_TO_NONE 0x08 + +#define SMB2_LEASE_KEY_SIZE 16 + +struct lease_ctx_info { + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; + __le32 req_state; + __le32 flags; + __le64 duration; + __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE]; + int version; +}; + +struct lease_table { + char client_guid[SMB2_CLIENT_GUID_SIZE]; + struct list_head lease_list; + struct list_head l_entry; + spinlock_t lb_lock; +}; + +struct lease { + __u8 lease_key[SMB2_LEASE_KEY_SIZE]; + __le32 state; + __le32 new_state; + __le32 flags; + __le64 duration; + __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE]; + int version; + unsigned short epoch; + struct lease_table *l_lb; +}; + +struct oplock_info { + struct ksmbd_conn *conn; + struct ksmbd_session *sess; + struct ksmbd_work *work; + struct ksmbd_file *o_fp; + int level; + int op_state; + unsigned long pending_break; + u64 fid; + atomic_t breaking_cnt; + atomic_t refcount; + __u16 Tid; + bool is_lease; + bool open_trunc; /* truncate on open */ + struct lease *o_lease; + struct list_head interim_list; + struct list_head op_entry; + struct list_head lease_entry; + wait_queue_head_t oplock_q; /* Other server threads */ + wait_queue_head_t oplock_brk; /* oplock breaking wait */ + struct rcu_head rcu_head; +}; + +struct lease_break_info { + __le32 curr_state; + __le32 new_state; + __le16 epoch; + char lease_key[SMB2_LEASE_KEY_SIZE]; +}; + +struct oplock_break_info { + int level; + int open_trunc; + int fid; +}; + +int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, + u64 pid, struct ksmbd_file *fp, __u16 tid, + struct lease_ctx_info *lctx, int share_ret); +void smb_break_all_levII_oplock(struct ksmbd_work *work, + struct ksmbd_file *fp, int is_trunc); +int opinfo_write_to_read(struct oplock_info *opinfo); +int opinfo_read_handle_to_read(struct oplock_info *opinfo); +int opinfo_write_to_none(struct oplock_info *opinfo); +int opinfo_read_to_none(struct oplock_info *opinfo); +void close_id_del_oplock(struct ksmbd_file *fp); +void smb_break_all_oplock(struct ksmbd_work *work, struct ksmbd_file *fp); +struct oplock_info *opinfo_get(struct ksmbd_file *fp); +void opinfo_put(struct oplock_info *opinfo); + +/* Lease related functions */ +void create_lease_buf(u8 *rbuf, struct lease *lease); +struct lease_ctx_info *parse_lease_state(void *open_req); +__u8 smb2_map_lease_to_oplock(__le32 lease_state); +int lease_read_to_write(struct oplock_info *opinfo); + +/* Durable related functions */ +void create_durable_rsp_buf(char *cc); +void create_durable_v2_rsp_buf(char *cc, struct ksmbd_file *fp); +void create_mxac_rsp_buf(char *cc, int maximal_access); +void create_disk_id_rsp_buf(char *cc, __u64 file_id, __u64 vol_id); +void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp); +struct create_context *smb2_find_context_vals(void *open_req, const char *str); +struct oplock_info *lookup_lease_in_table(struct ksmbd_conn *conn, + char *lease_key); +int find_same_lease_key(struct ksmbd_session *sess, struct ksmbd_inode *ci, + 
struct lease_ctx_info *lctx); +void destroy_lease_table(struct ksmbd_conn *conn); +int smb2_check_durable_oplock(struct ksmbd_file *fp, + struct lease_ctx_info *lctx, char *name); +#endif /* __KSMBD_OPLOCK_H */ diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c new file mode 100644 index 000000000000..e6a9f6aa47eb --- /dev/null +++ b/fs/ksmbd/server.c @@ -0,0 +1,633 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include "glob.h" +#include "oplock.h" +#include "misc.h" +#include <linux/sched/signal.h> +#include <linux/workqueue.h> +#include <linux/sysfs.h> +#include <linux/module.h> +#include <linux/moduleparam.h> + +#include "server.h" +#include "smb_common.h" +#include "smbstatus.h" +#include "connection.h" +#include "transport_ipc.h" +#include "mgmt/user_session.h" +#include "crypto_ctx.h" +#include "auth.h" + +int ksmbd_debug_types; + +struct ksmbd_server_config server_conf; + +enum SERVER_CTRL_TYPE { + SERVER_CTRL_TYPE_INIT, + SERVER_CTRL_TYPE_RESET, +}; + +struct server_ctrl_struct { + int type; + struct work_struct ctrl_work; +}; + +static DEFINE_MUTEX(ctrl_lock); + +static int ___server_conf_set(int idx, char *val) +{ + if (idx >= ARRAY_SIZE(server_conf.conf)) + return -EINVAL; + + if (!val || val[0] == 0x00) + return -EINVAL; + + kfree(server_conf.conf[idx]); + server_conf.conf[idx] = kstrdup(val, GFP_KERNEL); + if (!server_conf.conf[idx]) + return -ENOMEM; + return 0; +} + +int ksmbd_set_netbios_name(char *v) +{ + return ___server_conf_set(SERVER_CONF_NETBIOS_NAME, v); +} + +int ksmbd_set_server_string(char *v) +{ + return ___server_conf_set(SERVER_CONF_SERVER_STRING, v); +} + +int ksmbd_set_work_group(char *v) +{ + return ___server_conf_set(SERVER_CONF_WORK_GROUP, v); +} + +char *ksmbd_netbios_name(void) +{ + return server_conf.conf[SERVER_CONF_NETBIOS_NAME]; +} + +char *ksmbd_server_string(void) +{ + return server_conf.conf[SERVER_CONF_SERVER_STRING]; +} + +char *ksmbd_work_group(void) +{ + return server_conf.conf[SERVER_CONF_WORK_GROUP]; +} + +/** + * check_conn_state() - check state of server thread connection + * @work: smb work containing server thread information + * + * Return: 0 on valid connection, otherwise 1 to reconnect + */ +static inline int check_conn_state(struct ksmbd_work *work) +{ + struct smb_hdr *rsp_hdr; + + if (ksmbd_conn_exiting(work) || ksmbd_conn_need_reconnect(work)) { + rsp_hdr = work->response_buf; + rsp_hdr->Status.CifsError = STATUS_CONNECTION_DISCONNECTED; + return 1; + } + return 0; +} + +#define SERVER_HANDLER_CONTINUE 0 +#define SERVER_HANDLER_ABORT 1 + +static int __process_request(struct ksmbd_work *work, struct ksmbd_conn *conn, + u16 *cmd) +{ + struct smb_version_cmds *cmds; + u16 command; + int ret; + + if (check_conn_state(work)) + return SERVER_HANDLER_CONTINUE; + + if (ksmbd_verify_smb_message(work)) + return SERVER_HANDLER_ABORT; + + command = conn->ops->get_cmd_val(work); + *cmd = command; + +andx_again: + if (command >= conn->max_cmds) { + conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + return SERVER_HANDLER_CONTINUE; + } + + cmds = &conn->cmds[command]; + if (!cmds->proc) { + ksmbd_debug(SMB, "*** not implemented yet cmd = %x\n", command); + conn->ops->set_rsp_status(work, STATUS_NOT_IMPLEMENTED); + return SERVER_HANDLER_CONTINUE; + } + + if (work->sess && conn->ops->is_sign_req(work, command)) { + ret = conn->ops->check_sign_req(work); + if (!ret) { + conn->ops->set_rsp_status(work, 
STATUS_ACCESS_DENIED); + return SERVER_HANDLER_CONTINUE; + } + } + + ret = cmds->proc(work); + + if (ret < 0) + ksmbd_debug(CONN, "Failed to process %u [%d]\n", command, ret); + /* AndX commands - chained request can return positive values */ + else if (ret > 0) { + command = ret; + *cmd = command; + goto andx_again; + } + + if (work->send_no_response) + return SERVER_HANDLER_ABORT; + return SERVER_HANDLER_CONTINUE; +} + +static void __handle_ksmbd_work(struct ksmbd_work *work, + struct ksmbd_conn *conn) +{ + u16 command = 0; + int rc; + + if (conn->ops->allocate_rsp_buf(work)) + return; + + if (conn->ops->is_transform_hdr && + conn->ops->is_transform_hdr(work->request_buf)) { + rc = conn->ops->decrypt_req(work); + if (rc < 0) { + conn->ops->set_rsp_status(work, STATUS_DATA_ERROR); + goto send; + } + + work->encrypted = true; + } + + rc = conn->ops->init_rsp_hdr(work); + if (rc) { + /* either uid or tid is not correct */ + conn->ops->set_rsp_status(work, STATUS_INVALID_HANDLE); + goto send; + } + + if (conn->ops->check_user_session) { + rc = conn->ops->check_user_session(work); + if (rc < 0) { + command = conn->ops->get_cmd_val(work); + conn->ops->set_rsp_status(work, + STATUS_USER_SESSION_DELETED); + goto send; + } else if (rc > 0) { + rc = conn->ops->get_ksmbd_tcon(work); + if (rc < 0) { + conn->ops->set_rsp_status(work, + STATUS_NETWORK_NAME_DELETED); + goto send; + } + } + } + + do { + rc = __process_request(work, conn, &command); + if (rc == SERVER_HANDLER_ABORT) + break; + + /* + * Call smb2_set_rsp_credits() function to set number of credits + * granted in hdr of smb2 response. + */ + if (conn->ops->set_rsp_credits) { + spin_lock(&conn->credits_lock); + rc = conn->ops->set_rsp_credits(work); + spin_unlock(&conn->credits_lock); + if (rc < 0) { + conn->ops->set_rsp_status(work, + STATUS_INVALID_PARAMETER); + goto send; + } + } + + if (work->sess && + (work->sess->sign || smb3_11_final_sess_setup_resp(work) || + conn->ops->is_sign_req(work, command))) + conn->ops->set_sign_rsp(work); + } while (is_chained_smb2_message(work)); + + if (work->send_no_response) + return; + +send: + smb3_preauth_hash_rsp(work); + if (work->sess && work->sess->enc && work->encrypted && + conn->ops->encrypt_resp) { + rc = conn->ops->encrypt_resp(work); + if (rc < 0) { + conn->ops->set_rsp_status(work, STATUS_DATA_ERROR); + goto send; + } + } + + ksmbd_conn_write(work); +} + +/** + * handle_ksmbd_work() - process pending smb work requests + * @wk: smb work containing request command buffer + * + * called by kworker threads to process remaining smb work requests + */ +static void handle_ksmbd_work(struct work_struct *wk) +{ + struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); + struct ksmbd_conn *conn = work->conn; + + atomic64_inc(&conn->stats.request_served); + + __handle_ksmbd_work(work, conn); + + ksmbd_conn_try_dequeue_request(work); + ksmbd_free_work_struct(work); + atomic_dec(&conn->r_count); +}
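Editorial aside: the dispatch above is the standard kernel workqueue pattern, embed a work_struct in the request, queue it, and recover the request in the handler with container_of(). A minimal sketch of that pattern as a toy module (names like toy_req are illustrative, not ksmbd's; ksmbd queues onto its own workqueue rather than the system one):

/* illustrative sketch, not ksmbd code */
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct toy_req {
	int id;
	struct work_struct work;
};

static void toy_handler(struct work_struct *w)
{
	/* recover the request from its embedded work item */
	struct toy_req *req = container_of(w, struct toy_req, work);

	pr_info("handled request %d\n", req->id);
	kfree(req);
}

static int __init toy_init(void)
{
	struct toy_req *req = kzalloc(sizeof(*req), GFP_KERNEL);

	if (!req)
		return -ENOMEM;
	req->id = 1;
	INIT_WORK(&req->work, toy_handler);
	schedule_work(&req->work);	/* runs on a system kworker thread */
	return 0;
}

static void __exit toy_exit(void)
{
	flush_scheduled_work();
}

module_init(toy_init);
module_exit(toy_exit);
MODULE_LICENSE("GPL");

+ +/** + * queue_ksmbd_work() - queue a smb request to worker thread queue + * for processing the smb command and sending the response + * @conn: connection instance + * + * Read remaining data from the socket, then create and submit work.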
+ */ +static int queue_ksmbd_work(struct ksmbd_conn *conn) +{ + struct ksmbd_work *work; + + work = ksmbd_alloc_work_struct(); + if (!work) { + pr_err("allocation for work failed\n"); + return -ENOMEM; + } + + work->conn = conn; + work->request_buf = conn->request_buf; + conn->request_buf = NULL; + + if (ksmbd_init_smb_server(work)) { + ksmbd_free_work_struct(work); + return -EINVAL; + } + + ksmbd_conn_enqueue_request(work); + atomic_inc(&conn->r_count); + /* update activity on connection */ + conn->last_active = jiffies; + INIT_WORK(&work->work, handle_ksmbd_work); + ksmbd_queue_work(work); + return 0; +} + +static int ksmbd_server_process_request(struct ksmbd_conn *conn) +{ + return queue_ksmbd_work(conn); +} + +static int ksmbd_server_terminate_conn(struct ksmbd_conn *conn) +{ + ksmbd_sessions_deregister(conn); + destroy_lease_table(conn); + return 0; +} + +static void ksmbd_server_tcp_callbacks_init(void) +{ + struct ksmbd_conn_ops ops; + + ops.process_fn = ksmbd_server_process_request; + ops.terminate_fn = ksmbd_server_terminate_conn; + + ksmbd_conn_init_server_callbacks(&ops); +} + +static void server_conf_free(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(server_conf.conf); i++) { + kfree(server_conf.conf[i]); + server_conf.conf[i] = NULL; + } +} + +static int server_conf_init(void) +{ + WRITE_ONCE(server_conf.state, SERVER_STATE_STARTING_UP); + server_conf.enforced_signing = 0; + server_conf.min_protocol = ksmbd_min_protocol(); + server_conf.max_protocol = ksmbd_max_protocol(); + server_conf.auth_mechs = KSMBD_AUTH_NTLMSSP; +#ifdef CONFIG_SMB_SERVER_KERBEROS5 + server_conf.auth_mechs |= KSMBD_AUTH_KRB5 | + KSMBD_AUTH_MSKRB5; +#endif + return 0; +} + +static void server_ctrl_handle_init(struct server_ctrl_struct *ctrl) +{ + int ret; + + ret = ksmbd_conn_transport_init(); + if (ret) { + server_queue_ctrl_reset_work(); + return; + } + + WRITE_ONCE(server_conf.state, SERVER_STATE_RUNNING); +} + +static void server_ctrl_handle_reset(struct server_ctrl_struct *ctrl) +{ + ksmbd_ipc_soft_reset(); + ksmbd_conn_transport_destroy(); + server_conf_free(); + server_conf_init(); + WRITE_ONCE(server_conf.state, SERVER_STATE_STARTING_UP); +} + +static void server_ctrl_handle_work(struct work_struct *work) +{ + struct server_ctrl_struct *ctrl; + + ctrl = container_of(work, struct server_ctrl_struct, ctrl_work); + + mutex_lock(&ctrl_lock); + switch (ctrl->type) { + case SERVER_CTRL_TYPE_INIT: + server_ctrl_handle_init(ctrl); + break; + case SERVER_CTRL_TYPE_RESET: + server_ctrl_handle_reset(ctrl); + break; + default: + pr_err("Unknown server work type: %d\n", ctrl->type); + } + mutex_unlock(&ctrl_lock); + kfree(ctrl); + module_put(THIS_MODULE); +} + +static int __queue_ctrl_work(int type) +{ + struct server_ctrl_struct *ctrl; + + ctrl = kmalloc(sizeof(struct server_ctrl_struct), GFP_KERNEL); + if (!ctrl) + return -ENOMEM; + + __module_get(THIS_MODULE); + ctrl->type = type; + INIT_WORK(&ctrl->ctrl_work, server_ctrl_handle_work); + queue_work(system_long_wq, &ctrl->ctrl_work); + return 0; +} + +int server_queue_ctrl_init_work(void) +{ + return __queue_ctrl_work(SERVER_CTRL_TYPE_INIT); +} + +int server_queue_ctrl_reset_work(void) +{ + return __queue_ctrl_work(SERVER_CTRL_TYPE_RESET); +} + +static ssize_t stats_show(struct class *class, struct class_attribute *attr, + char *buf) +{ + /* + * Inc this each time you change stats output format, + * so user space will know what to do. 
+ */ + static int stats_version = 2; + static const char * const state[] = { + "startup", + "running", + "reset", + "shutdown" + }; + + ssize_t sz = scnprintf(buf, PAGE_SIZE, "%d %s %d %lu\n", stats_version, + state[server_conf.state], server_conf.tcp_port, + server_conf.ipc_last_active / HZ); + return sz; +} + +static ssize_t kill_server_store(struct class *class, + struct class_attribute *attr, const char *buf, + size_t len) +{ + if (!sysfs_streq(buf, "hard")) + return len; + + pr_info("kill command received\n"); + mutex_lock(&ctrl_lock); + WRITE_ONCE(server_conf.state, SERVER_STATE_RESETTING); + __module_get(THIS_MODULE); + server_ctrl_handle_reset(NULL); + module_put(THIS_MODULE); + mutex_unlock(&ctrl_lock); + return len; +} + +static const char * const debug_type_strings[] = {"smb", "auth", "vfs", + "oplock", "ipc", "conn", + "rdma"}; + +static ssize_t debug_show(struct class *class, struct class_attribute *attr, + char *buf) +{ + ssize_t sz = 0; + int i, pos = 0; + + for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) { + if ((ksmbd_debug_types >> i) & 1) { + pos = scnprintf(buf + sz, + PAGE_SIZE - sz, + "[%s] ", + debug_type_strings[i]); + } else { + pos = scnprintf(buf + sz, + PAGE_SIZE - sz, + "%s ", + debug_type_strings[i]); + } + sz += pos; + } + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); + return sz; +} + +static ssize_t debug_store(struct class *class, struct class_attribute *attr, + const char *buf, size_t len) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(debug_type_strings); i++) { + if (sysfs_streq(buf, "all")) { + if (ksmbd_debug_types == KSMBD_DEBUG_ALL) + ksmbd_debug_types = 0; + else + ksmbd_debug_types = KSMBD_DEBUG_ALL; + break; + } + + if (sysfs_streq(buf, debug_type_strings[i])) { + if (ksmbd_debug_types & (1 << i)) + ksmbd_debug_types &= ~(1 << i); + else + ksmbd_debug_types |= (1 << i); + break; + } + } + + return len; +} + +static CLASS_ATTR_RO(stats); +static CLASS_ATTR_WO(kill_server); +static CLASS_ATTR_RW(debug); + +static struct attribute *ksmbd_control_class_attrs[] = { + &class_attr_stats.attr, + &class_attr_kill_server.attr, + &class_attr_debug.attr, + NULL, +}; +ATTRIBUTE_GROUPS(ksmbd_control_class); + +static struct class ksmbd_control_class = { + .name = "ksmbd-control", + .owner = THIS_MODULE, + .class_groups = ksmbd_control_class_groups, +}; + +static int ksmbd_server_shutdown(void) +{ + WRITE_ONCE(server_conf.state, SERVER_STATE_SHUTTING_DOWN); + + class_unregister(&ksmbd_control_class); + ksmbd_workqueue_destroy(); + ksmbd_ipc_release(); + ksmbd_conn_transport_destroy(); + ksmbd_crypto_destroy(); + ksmbd_free_global_file_table(); + destroy_lease_table(NULL); + ksmbd_work_pool_destroy(); + ksmbd_exit_file_cache(); + server_conf_free(); + return 0; +} + +static int __init ksmbd_server_init(void) +{ + int ret; + + ret = class_register(&ksmbd_control_class); + if (ret) { + pr_err("Unable to register ksmbd-control class\n"); + return ret; + } + + ksmbd_server_tcp_callbacks_init(); + + ret = server_conf_init(); + if (ret) + goto err_unregister; + + ret = ksmbd_work_pool_init(); + if (ret) + goto err_unregister; + + ret = ksmbd_init_file_cache(); + if (ret) + goto err_destroy_work_pools; + + ret = ksmbd_ipc_init(); + if (ret) + goto err_exit_file_cache; + + ret = ksmbd_init_global_file_table(); + if (ret) + goto err_ipc_release; + + ret = ksmbd_inode_hash_init(); + if (ret) + goto err_destroy_file_table; + + ret = ksmbd_crypto_create(); + if (ret) + goto err_release_inode_hash; + + ret = ksmbd_workqueue_init(); + if (ret) + goto err_crypto_destroy; + 
return 0; + +err_crypto_destroy: + ksmbd_crypto_destroy(); +err_release_inode_hash: + ksmbd_release_inode_hash(); +err_destroy_file_table: + ksmbd_free_global_file_table(); +err_ipc_release: + ksmbd_ipc_release(); +err_exit_file_cache: + ksmbd_exit_file_cache(); +err_destroy_work_pools: + ksmbd_work_pool_destroy(); +err_unregister: + class_unregister(&ksmbd_control_class); + + return ret; +} + +/** + * ksmbd_server_exit() - shutdown forker thread and free memory at module exit + */ +static void __exit ksmbd_server_exit(void) +{ + ksmbd_server_shutdown(); + ksmbd_release_inode_hash(); +} + +MODULE_AUTHOR("Namjae Jeon <linkinjeon@kernel.org>"); +MODULE_VERSION(KSMBD_VERSION); +MODULE_DESCRIPTION("Linux kernel CIFS/SMB SERVER"); +MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: ecb"); +MODULE_SOFTDEP("pre: hmac"); +MODULE_SOFTDEP("pre: md4"); +MODULE_SOFTDEP("pre: md5"); +MODULE_SOFTDEP("pre: nls"); +MODULE_SOFTDEP("pre: aes"); +MODULE_SOFTDEP("pre: cmac"); +MODULE_SOFTDEP("pre: sha256"); +MODULE_SOFTDEP("pre: sha512"); +MODULE_SOFTDEP("pre: aead2"); +MODULE_SOFTDEP("pre: ccm"); +MODULE_SOFTDEP("pre: gcm"); +module_init(ksmbd_server_init) +module_exit(ksmbd_server_exit) diff --git a/fs/ksmbd/server.h b/fs/ksmbd/server.h new file mode 100644 index 000000000000..ac9d932f8c8a --- /dev/null +++ b/fs/ksmbd/server.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#ifndef __SERVER_H__ +#define __SERVER_H__ + +#include "smbacl.h" + +/* + * Server state type + */ +enum { + SERVER_STATE_STARTING_UP, + SERVER_STATE_RUNNING, + SERVER_STATE_RESETTING, + SERVER_STATE_SHUTTING_DOWN, +}; + +/* + * Server global config string index + */ +enum { + SERVER_CONF_NETBIOS_NAME, + SERVER_CONF_SERVER_STRING, + SERVER_CONF_WORK_GROUP, +}; + +struct ksmbd_server_config { + unsigned int flags; + unsigned int state; + short signing; + short enforced_signing; + short min_protocol; + short max_protocol; + unsigned short tcp_port; + unsigned short ipc_timeout; + unsigned long ipc_last_active; + unsigned long deadtime; + unsigned int share_fake_fscaps; + struct smb_sid domain_sid; + unsigned int auth_mechs; + + char *conf[SERVER_CONF_WORK_GROUP + 1]; +}; + +extern struct ksmbd_server_config server_conf; + +int ksmbd_set_netbios_name(char *v); +int ksmbd_set_server_string(char *v); +int ksmbd_set_work_group(char *v); + +char *ksmbd_netbios_name(void); +char *ksmbd_server_string(void); +char *ksmbd_work_group(void); + +static inline int ksmbd_server_running(void) +{ + return READ_ONCE(server_conf.state) == SERVER_STATE_RUNNING; +} + +static inline int ksmbd_server_configurable(void) +{ + return READ_ONCE(server_conf.state) < SERVER_STATE_RESETTING; +} + +int server_queue_ctrl_init_work(void); +int server_queue_ctrl_reset_work(void); +#endif /* __SERVER_H__ */ diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c new file mode 100644 index 000000000000..9aa46bb3e10d --- /dev/null +++ b/fs/ksmbd/smb2misc.c @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include "glob.h" +#include "nterr.h" +#include "smb2pdu.h" +#include "smb_common.h" +#include "smbstatus.h" +#include "mgmt/user_session.h" +#include "connection.h" + +static int check_smb2_hdr(struct smb2_hdr *hdr) +{ + /* + * Make sure that this really is an SMB2 request, not a response. + */ + if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) + return 1; + return 0; +} + +/* + * The following table defines the expected "StructureSize" of SMB2 requests + * in order by SMB2 command. This is similar to "wct" in SMB/CIFS requests. + * + * Note that commands are defined in smb2pdu.h in le16 but the array below is + * indexed by command in host byte order + */ +static const __le16 smb2_req_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { + /* SMB2_NEGOTIATE */ cpu_to_le16(36), + /* SMB2_SESSION_SETUP */ cpu_to_le16(25), + /* SMB2_LOGOFF */ cpu_to_le16(4), + /* SMB2_TREE_CONNECT */ cpu_to_le16(9), + /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4), + /* SMB2_CREATE */ cpu_to_le16(57), + /* SMB2_CLOSE */ cpu_to_le16(24), + /* SMB2_FLUSH */ cpu_to_le16(24), + /* SMB2_READ */ cpu_to_le16(49), + /* SMB2_WRITE */ cpu_to_le16(49), + /* SMB2_LOCK */ cpu_to_le16(48), + /* SMB2_IOCTL */ cpu_to_le16(57), + /* SMB2_CANCEL */ cpu_to_le16(4), + /* SMB2_ECHO */ cpu_to_le16(4), + /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(33), + /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(32), + /* SMB2_QUERY_INFO */ cpu_to_le16(41), + /* SMB2_SET_INFO */ cpu_to_le16(33), + /* use 44 for lease break */ + /* SMB2_OPLOCK_BREAK */ cpu_to_le16(36) +}; + +/* + * The size of the variable area depends on the offset and length fields + * located in different fields for various SMB2 requests. SMB2 requests + * with no variable length info show an offset of zero for the offset field. + */ +static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { + /* SMB2_NEGOTIATE */ true, + /* SMB2_SESSION_SETUP */ true, + /* SMB2_LOGOFF */ false, + /* SMB2_TREE_CONNECT */ true, + /* SMB2_TREE_DISCONNECT */ false, + /* SMB2_CREATE */ true, + /* SMB2_CLOSE */ false, + /* SMB2_FLUSH */ false, + /* SMB2_READ */ true, + /* SMB2_WRITE */ true, + /* SMB2_LOCK */ true, + /* SMB2_IOCTL */ true, + /* SMB2_CANCEL */ false, /* BB CHECK this not listed in documentation */ + /* SMB2_ECHO */ false, + /* SMB2_QUERY_DIRECTORY */ true, + /* SMB2_CHANGE_NOTIFY */ false, + /* SMB2_QUERY_INFO */ true, + /* SMB2_SET_INFO */ true, + /* SMB2_OPLOCK_BREAK */ false +};
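Editorial aside: the fixed-size validation driven by smb2_req_struct_sizes is a plain table lookup. A minimal stand-alone sketch of that check, using a hypothetical host-endian subset of the table above (the error-packet and lease-break special cases handled later in ksmbd_smb2_check_message are omitted):

/* illustrative sketch, not ksmbd code */
#include <stdint.h>
#include <stdio.h>

/* a few expected fixed sizes, taken from the table above */
static const uint16_t expect[] = {
	[0] = 36,	/* NEGOTIATE */
	[5] = 57,	/* CREATE */
	[8] = 49,	/* READ */
	[9] = 49,	/* WRITE */
};

static int fixed_size_ok(unsigned int cmd, uint16_t got)
{
	if (cmd >= sizeof(expect) / sizeof(expect[0]) || !expect[cmd])
		return 0;	/* unknown command: reject */
	return got == expect[cmd];
}

int main(void)
{
	printf("CREATE with 57: %s\n", fixed_size_ok(5, 57) ? "ok" : "bad");
	printf("READ with 48: %s\n", fixed_size_ok(8, 48) ? "ok" : "bad");
	return 0;
}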
+ +/* + * Returns the pointer to the beginning of the data area. Length of the data + * area and the offset to it (from the beginning of the SMB) are also returned. + */ +static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) +{ + *off = 0; + *len = 0; + + /* error requests do not have a data area */ + if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED && + (((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE) + return NULL; + + /* + * Following commands have data areas so we have to get the location + * of the data buffer offset and data buffer length for the particular + * command. + */ + switch (hdr->Command) { + case SMB2_SESSION_SETUP: + *off = le16_to_cpu(((struct smb2_sess_setup_req *)hdr)->SecurityBufferOffset); + *len = le16_to_cpu(((struct smb2_sess_setup_req *)hdr)->SecurityBufferLength); + break; + case SMB2_TREE_CONNECT: + *off = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset); + *len = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathLength); + break; + case SMB2_CREATE: + { + if (((struct smb2_create_req *)hdr)->CreateContextsLength) { + *off = le32_to_cpu(((struct smb2_create_req *) hdr)->CreateContextsOffset); + *len = le32_to_cpu(((struct smb2_create_req *) hdr)->CreateContextsLength); + break; + } + + *off = le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset); + *len = le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength); + break; + } + case SMB2_QUERY_INFO: + *off = le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset); + *len = le32_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferLength); + break; + case SMB2_SET_INFO: + *off = le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset); + *len = le32_to_cpu(((struct smb2_set_info_req *)hdr)->BufferLength); + break; + case SMB2_READ: + *off = le16_to_cpu(((struct smb2_read_req *)hdr)->ReadChannelInfoOffset); + *len = le16_to_cpu(((struct smb2_read_req *)hdr)->ReadChannelInfoLength); + break; + case SMB2_WRITE: + if (((struct smb2_write_req *)hdr)->DataOffset) { + *off = le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset); + *len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length); + break; + } + + *off = le16_to_cpu(((struct smb2_write_req *)hdr)->WriteChannelInfoOffset); + *len = le16_to_cpu(((struct smb2_write_req *)hdr)->WriteChannelInfoLength); + break; + case SMB2_QUERY_DIRECTORY: + *off = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset); + *len = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameLength); + break; + case SMB2_LOCK: + { + int lock_count; + + /* + * the smb2_lock request size of 48 already includes a single + * smb2_lock_element structure. + */ + lock_count = le16_to_cpu(((struct smb2_lock_req *)hdr)->LockCount) - 1; + if (lock_count > 0) { + *off = __SMB2_HEADER_STRUCTURE_SIZE + 48; + *len = sizeof(struct smb2_lock_element) * lock_count; + } + break; + } + case SMB2_IOCTL: + *off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset); + *len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount); + + break; + default: + ksmbd_debug(SMB, "no length check for command\n"); + break; + } + + /* + * Invalid length or offset probably means data area is invalid, but + * we have little choice but to ignore the data area in this case. + */ + if (*off > 4096) { + ksmbd_debug(SMB, "offset %d too large, data area ignored\n", + *off); + *len = 0; + *off = 0; + } else if (*off < 0) { + ksmbd_debug(SMB, + "negative offset %d to data invalid ignore data area\n", + *off); + *off = 0; + *len = 0; + } else if (*len < 0) { + ksmbd_debug(SMB, + "negative data length %d invalid, data area ignored\n", + *len); + *len = 0; + } else if (*len > 128 * 1024) { + ksmbd_debug(SMB, "data area larger than 128K: %d\n", *len); + *len = 0; + } + + /* return pointer to beginning of data area, ie offset from SMB start */ + if ((*off != 0) && (*len != 0)) + return (char *)hdr + *off; + else + return NULL; +} + +/* + * Calculate the size of the SMB message based on the fixed header + * portion, the number of word parameters and the data portion of the message.
+ */ +static unsigned int smb2_calc_size(void *buf) +{ + struct smb2_pdu *pdu = (struct smb2_pdu *)buf; + struct smb2_hdr *hdr = &pdu->hdr; + int offset; /* the offset from the beginning of SMB to data area */ + int data_length; /* the length of the variable length data area */ + /* Structure Size has already been checked to make sure it is 64 */ + int len = le16_to_cpu(hdr->StructureSize); + + /* + * StructureSize2, ie length of fixed parameter area has already + * been checked to make sure it is the correct length. + */ + len += le16_to_cpu(pdu->StructureSize2); + + if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false) + goto calc_size_exit; + + smb2_get_data_area_len(&offset, &data_length, hdr); + ksmbd_debug(SMB, "SMB2 data length %d offset %d\n", data_length, + offset); + + if (data_length > 0) { + /* + * Check to make sure that data area begins after fixed area, + * Note that last byte of the fixed area is part of data area + * for some commands, typically those with odd StructureSize, + * so we must add one to the calculation. + */ + if (offset + 1 < len) + ksmbd_debug(SMB, + "data area offset %d overlaps SMB2 header %d\n", + offset + 1, len); + else + len = offset + data_length; + } +calc_size_exit: + ksmbd_debug(SMB, "SMB2 len %d\n", len); + return len; +} + +static inline int smb2_query_info_req_len(struct smb2_query_info_req *h) +{ + return le32_to_cpu(h->InputBufferLength) + + le32_to_cpu(h->OutputBufferLength); +} + +static inline int smb2_set_info_req_len(struct smb2_set_info_req *h) +{ + return le32_to_cpu(h->BufferLength); +} + +static inline int smb2_read_req_len(struct smb2_read_req *h) +{ + return le32_to_cpu(h->Length); +} + +static inline int smb2_write_req_len(struct smb2_write_req *h) +{ + return le32_to_cpu(h->Length); +} + +static inline int smb2_query_dir_req_len(struct smb2_query_directory_req *h) +{ + return le32_to_cpu(h->OutputBufferLength); +} + +static inline int smb2_ioctl_req_len(struct smb2_ioctl_req *h) +{ + return le32_to_cpu(h->InputCount) + + le32_to_cpu(h->OutputCount); +} + +static inline int smb2_ioctl_resp_len(struct smb2_ioctl_req *h) +{ + return le32_to_cpu(h->MaxInputResponse) + + le32_to_cpu(h->MaxOutputResponse); +} + +static int smb2_validate_credit_charge(struct smb2_hdr *hdr) +{ + int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; + int credit_charge = le16_to_cpu(hdr->CreditCharge); + void *__hdr = hdr; + + switch (hdr->Command) { + case SMB2_QUERY_INFO: + req_len = smb2_query_info_req_len(__hdr); + break; + case SMB2_SET_INFO: + req_len = smb2_set_info_req_len(__hdr); + break; + case SMB2_READ: + req_len = smb2_read_req_len(__hdr); + break; + case SMB2_WRITE: + req_len = smb2_write_req_len(__hdr); + break; + case SMB2_QUERY_DIRECTORY: + req_len = smb2_query_dir_req_len(__hdr); + break; + case SMB2_IOCTL: + req_len = smb2_ioctl_req_len(__hdr); + expect_resp_len = smb2_ioctl_resp_len(__hdr); + break; + default: + return 0; + } + + credit_charge = max(1, credit_charge); + max_len = max(req_len, expect_resp_len); + calc_credit_num = DIV_ROUND_UP(max_len, SMB2_MAX_BUFFER_SIZE); + + if (credit_charge < calc_credit_num) { + pr_err("Insufficient credit charge, given: %d, needed: %d\n", + credit_charge, calc_credit_num); + return 1; + } + + return 0; +} + +int ksmbd_smb2_check_message(struct ksmbd_work *work) +{ + struct smb2_pdu *pdu = work->request_buf; + struct smb2_hdr *hdr = &pdu->hdr; + int command; + __u32 clc_len; /* calculated length */ + __u32 len = get_rfc1002_len(pdu); + + if (work->next_smb2_rcv_hdr_off) { + pdu = 
ksmbd_req_buf_next(work); + hdr = &pdu->hdr; + } + + if (le32_to_cpu(hdr->NextCommand) > 0) { + len = le32_to_cpu(hdr->NextCommand); + } else if (work->next_smb2_rcv_hdr_off) { + len -= work->next_smb2_rcv_hdr_off; + len = round_up(len, 8); + } + + if (check_smb2_hdr(hdr)) + return 1; + + if (hdr->StructureSize != SMB2_HEADER_STRUCTURE_SIZE) { + ksmbd_debug(SMB, "Illegal structure size %u\n", + le16_to_cpu(hdr->StructureSize)); + return 1; + } + + command = le16_to_cpu(hdr->Command); + if (command >= NUMBER_OF_SMB2_COMMANDS) { + ksmbd_debug(SMB, "Illegal SMB2 command %d\n", command); + return 1; + } + + if (smb2_req_struct_sizes[command] != pdu->StructureSize2) { + if (command != SMB2_OPLOCK_BREAK_HE && + (hdr->Status == 0 || pdu->StructureSize2 != SMB2_ERROR_STRUCTURE_SIZE2_LE)) { + /* error packets have 9 byte structure size */ + ksmbd_debug(SMB, + "Illegal request size %u for command %d\n", + le16_to_cpu(pdu->StructureSize2), command); + return 1; + } else if (command == SMB2_OPLOCK_BREAK_HE && + hdr->Status == 0 && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_20 && + le16_to_cpu(pdu->StructureSize2) != OP_BREAK_STRUCT_SIZE_21) { + /* special case for SMB2.1 lease break message */ + ksmbd_debug(SMB, + "Illegal request size %d for oplock break\n", + le16_to_cpu(pdu->StructureSize2)); + return 1; + } + } + + if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && + smb2_validate_credit_charge(hdr)) { + work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + return 1; + } + + clc_len = smb2_calc_size(hdr); + if (len != clc_len) { + /* server can return one byte more due to implied bcc[0] */ + if (clc_len == len + 1) + return 0; + + /* + * Some windows servers (win2016) will also pad the final + * PDU in a compound to 8 bytes. + */ + if (ALIGN(clc_len, 8) == len) + return 0; + + /* + * Windows clients also pad up to 8 bytes when compounding. + * If the pad is longer than eight bytes, log the behavior + * (once), since it may indicate a problem, but allow it and + * continue since the frame is parseable. + */ + if (clc_len < len) { + ksmbd_debug(SMB, + "cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n", + len, clc_len, command, + le64_to_cpu(hdr->MessageId)); + return 0; + } + + if (command == SMB2_LOCK_HE && len == 88) + return 0; + + ksmbd_debug(SMB, + "cli req too short, len %d not %d. cmd:%d mid:%llu\n", + len, clc_len, command, + le64_to_cpu(hdr->MessageId)); + + return 1; + } + + return 0; +} + +int smb2_negotiate_request(struct ksmbd_work *work) +{ + return ksmbd_smb_negotiate_common(work, SMB2_NEGOTIATE_HE); +}
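Editorial aside: the tolerance rules applied to the received length (len) versus the calculated length (clc_len) above are easy to get wrong, so here is a stand-alone sketch of just that comparison (illustrative names; the SMB2_LOCK 88-byte special case is omitted):

/* illustrative sketch, not ksmbd code */
#include <stdio.h>

#define ALIGN8(x) (((x) + 7U) & ~7U)

/* mirrors the tolerances above: exact match, implied bcc[0], 8-byte
 * compound padding, and longer-than-expected padding are all accepted */
static int length_ok(unsigned int len, unsigned int clc_len)
{
	if (len == clc_len)
		return 1;
	if (clc_len == len + 1)		/* implied bcc[0] */
		return 1;
	if (ALIGN8(clc_len) == len)	/* final PDU padded to 8 bytes */
		return 1;
	return clc_len < len;		/* extra padding: tolerated */
}

int main(void)
{
	printf("%d %d %d\n", length_ok(88, 88), length_ok(88, 89),
	       length_ok(88, 84));	/* all accepted */
	return 0;
}

diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c new file mode 100644 index 000000000000..197473871aa4 --- /dev/null +++ b/fs/ksmbd/smb2ops.c @@ -0,0 +1,312 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd.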
+ */ + +#include <linux/slab.h> +#include "glob.h" +#include "smb2pdu.h" + +#include "auth.h" +#include "connection.h" +#include "smb_common.h" +#include "server.h" + +static struct smb_version_values smb21_server_values = { + .version_string = SMB21_VERSION_STRING, + .protocol_id = SMB21_PROT_ID, + .capabilities = SMB2_GLOBAL_CAP_LARGE_MTU, + .max_read_size = SMB21_DEFAULT_IOSIZE, + .max_write_size = SMB21_DEFAULT_IOSIZE, + .max_trans_size = SMB21_DEFAULT_IOSIZE, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .create_lease_size = sizeof(struct create_lease), + .create_durable_size = sizeof(struct create_durable_rsp), + .create_mxac_size = sizeof(struct create_mxac_rsp), + .create_disk_id_size = sizeof(struct create_disk_id_rsp), + .create_posix_size = sizeof(struct create_posix_rsp), +}; + +static struct smb_version_values smb30_server_values = { + .version_string = SMB30_VERSION_STRING, + .protocol_id = SMB30_PROT_ID, + .capabilities = SMB2_GLOBAL_CAP_LARGE_MTU, + .max_read_size = SMB3_DEFAULT_IOSIZE, + .max_write_size = SMB3_DEFAULT_IOSIZE, + .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .create_lease_size = sizeof(struct create_lease_v2), + .create_durable_size = sizeof(struct create_durable_rsp), + .create_durable_v2_size = sizeof(struct create_durable_v2_rsp), + .create_mxac_size = sizeof(struct create_mxac_rsp), + .create_disk_id_size = sizeof(struct create_disk_id_rsp), + .create_posix_size = sizeof(struct create_posix_rsp), +}; + +static struct smb_version_values smb302_server_values = { + .version_string = SMB302_VERSION_STRING, + .protocol_id = SMB302_PROT_ID, + .capabilities = SMB2_GLOBAL_CAP_LARGE_MTU, + .max_read_size = SMB3_DEFAULT_IOSIZE, + .max_write_size = SMB3_DEFAULT_IOSIZE, + .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .create_lease_size = sizeof(struct create_lease_v2), + .create_durable_size = sizeof(struct create_durable_rsp), + .create_durable_v2_size = sizeof(struct create_durable_v2_rsp), + .create_mxac_size = sizeof(struct create_mxac_rsp), + .create_disk_id_size = sizeof(struct create_disk_id_rsp), + .create_posix_size = sizeof(struct create_posix_rsp), +}; + +static struct smb_version_values smb311_server_values = { + .version_string = SMB311_VERSION_STRING, + .protocol_id = SMB311_PROT_ID, + .capabilities = SMB2_GLOBAL_CAP_LARGE_MTU, + .max_read_size = SMB3_DEFAULT_IOSIZE, + .max_write_size = 
SMB3_DEFAULT_IOSIZE, + .max_trans_size = SMB3_DEFAULT_TRANS_SIZE, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE, + .shared_lock_type = SMB2_LOCKFLAG_SHARED, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .create_lease_size = sizeof(struct create_lease_v2), + .create_durable_size = sizeof(struct create_durable_rsp), + .create_durable_v2_size = sizeof(struct create_durable_v2_rsp), + .create_mxac_size = sizeof(struct create_mxac_rsp), + .create_disk_id_size = sizeof(struct create_disk_id_rsp), + .create_posix_size = sizeof(struct create_posix_rsp), +}; + +static struct smb_version_ops smb2_0_server_ops = { + .get_cmd_val = get_smb2_cmd_val, + .init_rsp_hdr = init_smb2_rsp_hdr, + .set_rsp_status = set_smb2_rsp_status, + .allocate_rsp_buf = smb2_allocate_rsp_buf, + .set_rsp_credits = smb2_set_rsp_credits, + .check_user_session = smb2_check_user_session, + .get_ksmbd_tcon = smb2_get_ksmbd_tcon, + .is_sign_req = smb2_is_sign_req, + .check_sign_req = smb2_check_sign_req, + .set_sign_rsp = smb2_set_sign_rsp +}; + +static struct smb_version_ops smb3_0_server_ops = { + .get_cmd_val = get_smb2_cmd_val, + .init_rsp_hdr = init_smb2_rsp_hdr, + .set_rsp_status = set_smb2_rsp_status, + .allocate_rsp_buf = smb2_allocate_rsp_buf, + .set_rsp_credits = smb2_set_rsp_credits, + .check_user_session = smb2_check_user_session, + .get_ksmbd_tcon = smb2_get_ksmbd_tcon, + .is_sign_req = smb2_is_sign_req, + .check_sign_req = smb3_check_sign_req, + .set_sign_rsp = smb3_set_sign_rsp, + .generate_signingkey = ksmbd_gen_smb30_signingkey, + .generate_encryptionkey = ksmbd_gen_smb30_encryptionkey, + .is_transform_hdr = smb3_is_transform_hdr, + .decrypt_req = smb3_decrypt_req, + .encrypt_resp = smb3_encrypt_resp +}; + +static struct smb_version_ops smb3_11_server_ops = { + .get_cmd_val = get_smb2_cmd_val, + .init_rsp_hdr = init_smb2_rsp_hdr, + .set_rsp_status = set_smb2_rsp_status, + .allocate_rsp_buf = smb2_allocate_rsp_buf, + .set_rsp_credits = smb2_set_rsp_credits, + .check_user_session = smb2_check_user_session, + .get_ksmbd_tcon = smb2_get_ksmbd_tcon, + .is_sign_req = smb2_is_sign_req, + .check_sign_req = smb3_check_sign_req, + .set_sign_rsp = smb3_set_sign_rsp, + .generate_signingkey = ksmbd_gen_smb311_signingkey, + .generate_encryptionkey = ksmbd_gen_smb311_encryptionkey, + .is_transform_hdr = smb3_is_transform_hdr, + .decrypt_req = smb3_decrypt_req, + .encrypt_resp = smb3_encrypt_resp +}; + +static struct smb_version_cmds smb2_0_server_cmds[NUMBER_OF_SMB2_COMMANDS] = { + [SMB2_NEGOTIATE_HE] = { .proc = smb2_negotiate_request, }, + [SMB2_SESSION_SETUP_HE] = { .proc = smb2_sess_setup, }, + [SMB2_TREE_CONNECT_HE] = { .proc = smb2_tree_connect,}, + [SMB2_TREE_DISCONNECT_HE] = { .proc = smb2_tree_disconnect,}, + [SMB2_LOGOFF_HE] = { .proc = smb2_session_logoff,}, + [SMB2_CREATE_HE] = { .proc = smb2_open}, + [SMB2_QUERY_INFO_HE] = { .proc = smb2_query_info}, + [SMB2_QUERY_DIRECTORY_HE] = { .proc = smb2_query_dir}, + [SMB2_CLOSE_HE] = { .proc = smb2_close}, + [SMB2_ECHO_HE] = { .proc = smb2_echo}, + [SMB2_SET_INFO_HE] = { .proc = smb2_set_info}, + [SMB2_READ_HE] = { .proc = smb2_read}, + [SMB2_WRITE_HE] = { .proc = smb2_write}, + [SMB2_FLUSH_HE] = { .proc = smb2_flush}, + [SMB2_CANCEL_HE] = { .proc = smb2_cancel}, + [SMB2_LOCK_HE] = { .proc = smb2_lock}, + 
[SMB2_IOCTL_HE] = { .proc = smb2_ioctl}, + [SMB2_OPLOCK_BREAK_HE] = { .proc = smb2_oplock_break}, + [SMB2_CHANGE_NOTIFY_HE] = { .proc = smb2_notify}, +}; + +int init_smb2_0_server(struct ksmbd_conn *conn) +{ + return -EOPNOTSUPP; +} + +/** + * init_smb2_1_server() - initialize a smb server connection with smb2.1 + * command dispatcher + * @conn: connection instance + */ +void init_smb2_1_server(struct ksmbd_conn *conn) +{ + conn->vals = &smb21_server_values; + conn->ops = &smb2_0_server_ops; + conn->cmds = smb2_0_server_cmds; + conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); + conn->max_credits = SMB2_MAX_CREDITS; + conn->signing_algorithm = SIGNING_ALG_HMAC_SHA256; + + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; +} + +/** + * init_smb3_0_server() - initialize a smb server connection with smb3.0 + * command dispatcher + * @conn: connection instance + */ +void init_smb3_0_server(struct ksmbd_conn *conn) +{ + conn->vals = &smb30_server_values; + conn->ops = &smb3_0_server_ops; + conn->cmds = smb2_0_server_cmds; + conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); + conn->max_credits = SMB2_MAX_CREDITS; + conn->signing_algorithm = SIGNING_ALG_AES_CMAC; + + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; +} + +/** + * init_smb3_02_server() - initialize a smb server connection with smb3.02 + * command dispatcher + * @conn: connection instance + */ +void init_smb3_02_server(struct ksmbd_conn *conn) +{ + conn->vals = &smb302_server_values; + conn->ops = &smb3_0_server_ops; + conn->cmds = smb2_0_server_cmds; + conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); + conn->max_credits = SMB2_MAX_CREDITS; + conn->signing_algorithm = SIGNING_ALG_AES_CMAC; + + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION && + conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; +} + +/** + * init_smb3_11_server() - initialize a smb server connection with smb3.11 + * command dispatcher + * @conn: connection instance + */ +int init_smb3_11_server(struct ksmbd_conn *conn) +{ + conn->vals = &smb311_server_values; + conn->ops = &smb3_11_server_ops; + conn->cmds = smb2_0_server_cmds; + conn->max_cmds = ARRAY_SIZE(smb2_0_server_cmds); + conn->max_credits = SMB2_MAX_CREDITS; + conn->signing_algorithm = SIGNING_ALG_AES_CMAC; + + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_LEASES) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING; + + if (conn->cipher_type) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION; + + if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) + conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL; + + INIT_LIST_HEAD(&conn->preauth_sess_table); + return 0; +} + +void init_smb2_max_read_size(unsigned int sz) +{ + smb21_server_values.max_read_size = sz; + smb30_server_values.max_read_size = sz; + smb302_server_values.max_read_size = sz; + smb311_server_values.max_read_size = sz; 
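(Editor's aside on the dispatch tables above: every dialect from 2.1 through 3.1.1 shares the single smb2_0_server_cmds[] array and only swaps conn->ops and conn->vals, so request routing reduces to an index into that table. A minimal sketch of how such a table is typically consumed; the helper name is hypothetical and the real dispatch loop lives elsewhere in ksmbd, not in this hunk:

/* hypothetical dispatcher sketch; conn->cmds and conn->max_cmds are as
 * installed by the init_smb*_server() helpers above */
static int sketch_dispatch_smb2(struct ksmbd_work *work)
{
	struct ksmbd_conn *conn = work->conn;
	u16 cmd = conn->ops->get_cmd_val(work);

	/* reject commands outside the table or without a handler */
	if (cmd >= conn->max_cmds || !conn->cmds[cmd].proc)
		return -EINVAL;

	return conn->cmds[cmd].proc(work);
}
End of aside.)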
+} + +void init_smb2_max_write_size(unsigned int sz) +{ + smb21_server_values.max_write_size = sz; + smb30_server_values.max_write_size = sz; + smb302_server_values.max_write_size = sz; + smb311_server_values.max_write_size = sz; +} + +void init_smb2_max_trans_size(unsigned int sz) +{ + smb21_server_values.max_trans_size = sz; + smb30_server_values.max_trans_size = sz; + smb302_server_values.max_trans_size = sz; + smb311_server_values.max_trans_size = sz; +} diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c new file mode 100644 index 000000000000..d329ea49fa14 --- /dev/null +++ b/fs/ksmbd/smb2pdu.c @@ -0,0 +1,8373 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#include <linux/syscalls.h> +#include <linux/namei.h> +#include <linux/statfs.h> +#include <linux/ethtool.h> +#include <linux/falloc.h> + +#include "glob.h" +#include "smb2pdu.h" +#include "smbfsctl.h" +#include "oplock.h" +#include "smbacl.h" + +#include "auth.h" +#include "asn1.h" +#include "connection.h" +#include "transport_ipc.h" +#include "transport_rdma.h" +#include "vfs.h" +#include "vfs_cache.h" +#include "misc.h" + +#include "server.h" +#include "smb_common.h" +#include "smbstatus.h" +#include "ksmbd_work.h" +#include "mgmt/user_config.h" +#include "mgmt/share_config.h" +#include "mgmt/tree_connect.h" +#include "mgmt/user_session.h" +#include "mgmt/ksmbd_ida.h" +#include "ndr.h" + +static void __wbuf(struct ksmbd_work *work, void **req, void **rsp) +{ + if (work->next_smb2_rcv_hdr_off) { + *req = ksmbd_req_buf_next(work); + *rsp = ksmbd_resp_buf_next(work); + } else { + *req = work->request_buf; + *rsp = work->response_buf; + } +} + +#define WORK_BUFFERS(w, rq, rs) __wbuf((w), (void **)&(rq), (void **)&(rs)) + +/** + * check_session_id() - check for valid session id in smb header + * @conn: connection instance + * @id: session id from smb header + * + * Return: 1 if valid session id, otherwise 0 + */ +static inline bool check_session_id(struct ksmbd_conn *conn, u64 id) +{ + struct ksmbd_session *sess; + + if (id == 0 || id == -1) + return false; + + sess = ksmbd_session_lookup_all(conn, id); + if (sess) + return true; + pr_err("Invalid user session id: %llu\n", id); + return false; +} + +struct channel *lookup_chann_list(struct ksmbd_session *sess, struct ksmbd_conn *conn) +{ + struct channel *chann; + + list_for_each_entry(chann, &sess->ksmbd_chann_list, chann_list) { + if (chann->conn == conn) + return chann; + } + + return NULL; +} + +/** + * smb2_get_ksmbd_tcon() - get tree connection information using a tree id. 
+ * @work: smb work + * + * Return: 0 if there is a tree connection matched or these are + * skippable commands, otherwise error + */ +int smb2_get_ksmbd_tcon(struct ksmbd_work *work) +{ + struct smb2_hdr *req_hdr = work->request_buf; + int tree_id; + + work->tcon = NULL; + if (work->conn->ops->get_cmd_val(work) == SMB2_TREE_CONNECT_HE || + work->conn->ops->get_cmd_val(work) == SMB2_CANCEL_HE || + work->conn->ops->get_cmd_val(work) == SMB2_LOGOFF_HE) { + ksmbd_debug(SMB, "skip to check tree connect request\n"); + return 0; + } + + if (xa_empty(&work->sess->tree_conns)) { + ksmbd_debug(SMB, "NO tree connected\n"); + return -ENOENT; + } + + tree_id = le32_to_cpu(req_hdr->Id.SyncId.TreeId); + work->tcon = ksmbd_tree_conn_lookup(work->sess, tree_id); + if (!work->tcon) { + pr_err("Invalid tid %d\n", tree_id); + return -EINVAL; + } + + return 1; +} + +/** + * smb2_set_err_rsp() - set error response code on smb response + * @work: smb work containing response buffer + */ +void smb2_set_err_rsp(struct ksmbd_work *work) +{ + struct smb2_err_rsp *err_rsp; + + if (work->next_smb2_rcv_hdr_off) + err_rsp = ksmbd_resp_buf_next(work); + else + err_rsp = work->response_buf; + + if (err_rsp->hdr.Status != STATUS_STOPPED_ON_SYMLINK) { + err_rsp->StructureSize = SMB2_ERROR_STRUCTURE_SIZE2_LE; + err_rsp->ErrorContextCount = 0; + err_rsp->Reserved = 0; + err_rsp->ByteCount = 0; + err_rsp->ErrorData[0] = 0; + inc_rfc1001_len(work->response_buf, SMB2_ERROR_STRUCTURE_SIZE2); + } +} + +/** + * is_smb2_neg_cmd() - is it smb2 negotiation command + * @work: smb work containing smb header + * + * Return: true if smb2 negotiation command, otherwise false + */ +bool is_smb2_neg_cmd(struct ksmbd_work *work) +{ + struct smb2_hdr *hdr = work->request_buf; + + /* is it SMB2 header ? */ + if (hdr->ProtocolId != SMB2_PROTO_NUMBER) + return false; + + /* make sure it is request not response message */ + if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) + return false; + + if (hdr->Command != SMB2_NEGOTIATE) + return false; + + return true; +} + +/** + * is_smb2_rsp() - is it smb2 response + * @work: smb work containing smb response buffer + * + * Return: true if smb2 response, otherwise false + */ +bool is_smb2_rsp(struct ksmbd_work *work) +{ + struct smb2_hdr *hdr = work->response_buf; + + /* is it SMB2 header ?
*/ + if (hdr->ProtocolId != SMB2_PROTO_NUMBER) + return false; + + /* make sure it is response not request message */ + if (!(hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)) + return false; + + return true; +} + +/** + * get_smb2_cmd_val() - get smb command code from smb header + * @work: smb work containing smb request buffer + * + * Return: smb2 request command value + */ +u16 get_smb2_cmd_val(struct ksmbd_work *work) +{ + struct smb2_hdr *rcv_hdr; + + if (work->next_smb2_rcv_hdr_off) + rcv_hdr = ksmbd_req_buf_next(work); + else + rcv_hdr = work->request_buf; + return le16_to_cpu(rcv_hdr->Command); +} + +/** + * set_smb2_rsp_status() - set error response code on smb2 header + * @work: smb work containing response buffer + * @err: error response code + */ +void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err) +{ + struct smb2_hdr *rsp_hdr; + + if (work->next_smb2_rcv_hdr_off) + rsp_hdr = ksmbd_resp_buf_next(work); + else + rsp_hdr = work->response_buf; + rsp_hdr->Status = err; + smb2_set_err_rsp(work); +} + +/** + * init_smb2_neg_rsp() - initialize smb2 response for negotiate command + * @work: smb work containing smb request buffer + * + * smb2 negotiate response is sent in reply of smb1 negotiate command for + * dialect auto-negotiation. + */ +int init_smb2_neg_rsp(struct ksmbd_work *work) +{ + struct smb2_hdr *rsp_hdr; + struct smb2_negotiate_rsp *rsp; + struct ksmbd_conn *conn = work->conn; + + if (conn->need_neg == false) + return -EINVAL; + if (!(conn->dialect >= SMB20_PROT_ID && + conn->dialect <= SMB311_PROT_ID)) + return -EINVAL; + + rsp_hdr = work->response_buf; + + memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); + + rsp_hdr->smb2_buf_length = + cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals)); + + rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; + rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; + rsp_hdr->CreditRequest = cpu_to_le16(2); + rsp_hdr->Command = SMB2_NEGOTIATE; + rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR); + rsp_hdr->NextCommand = 0; + rsp_hdr->MessageId = 0; + rsp_hdr->Id.SyncId.ProcessId = 0; + rsp_hdr->Id.SyncId.TreeId = 0; + rsp_hdr->SessionId = 0; + memset(rsp_hdr->Signature, 0, 16); + + rsp = work->response_buf; + + WARN_ON(ksmbd_conn_good(work)); + + rsp->StructureSize = cpu_to_le16(65); + ksmbd_debug(SMB, "conn->dialect 0x%x\n", conn->dialect); + rsp->DialectRevision = cpu_to_le16(conn->dialect); + /* Not setting conn guid rsp->ServerGUID, as it + * not used by client for identifying connection + */ + rsp->Capabilities = cpu_to_le32(conn->vals->capabilities); + /* Default Max Message Size till SMB2.0, 64K*/ + rsp->MaxTransactSize = cpu_to_le32(conn->vals->max_trans_size); + rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size); + rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size); + + rsp->SystemTime = cpu_to_le64(ksmbd_systime()); + rsp->ServerStartTime = 0; + + rsp->SecurityBufferOffset = cpu_to_le16(128); + rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH); + ksmbd_copy_gss_neg_header(((char *)(&rsp->hdr) + + sizeof(rsp->hdr.smb2_buf_length)) + + le16_to_cpu(rsp->SecurityBufferOffset)); + inc_rfc1001_len(rsp, sizeof(struct smb2_negotiate_rsp) - + sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) + + AUTH_GSS_LENGTH); + rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; + if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY) + rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; + conn->use_spnego = true; + + ksmbd_conn_set_need_negotiate(work); + return 0; +} + +static int smb2_consume_credit_charge(struct ksmbd_work *work, + 
unsigned short credit_charge) +{ + struct ksmbd_conn *conn = work->conn; + unsigned int rsp_credits = 1; + + if (!conn->total_credits) + return 0; + + if (credit_charge > 0) + rsp_credits = credit_charge; + + conn->total_credits -= rsp_credits; + return rsp_credits; +} + +/** + * smb2_set_rsp_credits() - set number of credits in response buffer + * @work: smb work containing smb response buffer + */ +int smb2_set_rsp_credits(struct ksmbd_work *work) +{ + struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); + struct smb2_hdr *hdr = ksmbd_resp_buf_next(work); + struct ksmbd_conn *conn = work->conn; + unsigned short credits_requested = le16_to_cpu(req_hdr->CreditRequest); + unsigned short credit_charge = 1, credits_granted = 0; + unsigned short aux_max, aux_credits, min_credits; + int rsp_credit_charge; + + if (hdr->Command == SMB2_CANCEL) + goto out; + + /* get default minimum credits by shifting maximum credits by 4 */ + min_credits = conn->max_credits >> 4; + + if (conn->total_credits >= conn->max_credits) { + pr_err("Total credits overflow: %d\n", conn->total_credits); + conn->total_credits = min_credits; + } + + rsp_credit_charge = + smb2_consume_credit_charge(work, le16_to_cpu(req_hdr->CreditCharge)); + if (rsp_credit_charge < 0) + return -EINVAL; + + hdr->CreditCharge = cpu_to_le16(rsp_credit_charge); + + if (credits_requested > 0) { + aux_credits = credits_requested - 1; + aux_max = 32; + if (hdr->Command == SMB2_NEGOTIATE) + aux_max = 0; + aux_credits = (aux_credits < aux_max) ? aux_credits : aux_max; + credits_granted = aux_credits + credit_charge; + + /* if the credits granted per client grow bigger than the default + * minimum credits, wrap the grant up within the limits. + */ + if ((conn->total_credits + credits_granted) > min_credits) + credits_granted = min_credits - conn->total_credits; + /* + * TODO: Need to adjust CreditRequest value according to + * current cpu load + */ + } else if (conn->total_credits == 0) { + credits_granted = 1; + } + + conn->total_credits += credits_granted; + work->credits_granted += credits_granted; + + if (!req_hdr->NextCommand) { + /* Update CreditRequest in last request */ + hdr->CreditRequest = cpu_to_le16(work->credits_granted); + } +out: + ksmbd_debug(SMB, + "credits: requested[%d] granted[%d] total_granted[%d]\n", + credits_requested, credits_granted, + conn->total_credits); + return 0; +} + +/** + * init_chained_smb2_rsp() - initialize smb2 chained response + * @work: smb work containing smb response buffer + */ +static void init_chained_smb2_rsp(struct ksmbd_work *work) +{ + struct smb2_hdr *req = ksmbd_req_buf_next(work); + struct smb2_hdr *rsp = ksmbd_resp_buf_next(work); + struct smb2_hdr *rsp_hdr; + struct smb2_hdr *rcv_hdr; + int next_hdr_offset = 0; + int len, new_len; + + /* Len of this response = updated RFC len - offset of previous cmd + * in the compound rsp + */ + + /* Storing the current local FID which may be needed by subsequent + * command in the compound request + */ + if (req->Command == SMB2_CREATE && rsp->Status == STATUS_SUCCESS) { + work->compound_fid = + le64_to_cpu(((struct smb2_create_rsp *)rsp)-> + VolatileFileId); + work->compound_pfid = + le64_to_cpu(((struct smb2_create_rsp *)rsp)-> + PersistentFileId); + work->compound_sid = le64_to_cpu(rsp->SessionId); + } + + len = get_rfc1002_len(work->response_buf) - work->next_smb2_rsp_hdr_off; + next_hdr_offset = le32_to_cpu(req->NextCommand); + + new_len = ALIGN(len, 8); + inc_rfc1001_len(work->response_buf, ((sizeof(struct smb2_hdr) - 4) + + new_len - len)); + 
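(Editor's aside on the credit accounting in smb2_set_rsp_credits() above: with the default minimum at max_credits >> 4, a grant is the requested count minus one, capped at an aux_max of 32 (0 for NEGOTIATE), plus the charge, then clamped so the running total stays under that minimum. A hedged userspace model of the arithmetic, plain C with the kernel types dropped; an illustration, not the kernel code:

#include <stdio.h>

/* mirrors the clamping above; the 'charge' variable stays 1 there */
static unsigned int model_grant(unsigned int requested, unsigned int charge,
				unsigned int total, unsigned int max_credits,
				int is_negotiate)
{
	unsigned int min_credits = max_credits >> 4;
	unsigned int aux_max = is_negotiate ? 0 : 32;
	unsigned int granted = 0;

	if (requested > 0) {
		unsigned int aux = requested - 1;

		if (aux > aux_max)
			aux = aux_max;
		granted = aux + charge;
		if (total + granted > min_credits)
			granted = min_credits - total;
	} else if (total == 0) {
		granted = 1;	/* never starve a client completely */
	}
	return granted;
}

int main(void)
{
	/* max_credits 8192 -> minimum 512; a request for 64 credits with
	 * charge 1 is capped at 32 + 1 = 33 */
	printf("granted = %u\n", model_grant(64, 1, 0, 8192, 0));
	return 0;
}
End of aside.)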
rsp->NextCommand = cpu_to_le32(new_len); + + work->next_smb2_rcv_hdr_off += next_hdr_offset; + work->next_smb2_rsp_hdr_off += new_len; + ksmbd_debug(SMB, + "Compound req new_len = %d rcv off = %d rsp off = %d\n", + new_len, work->next_smb2_rcv_hdr_off, + work->next_smb2_rsp_hdr_off); + + rsp_hdr = ksmbd_resp_buf_next(work); + rcv_hdr = ksmbd_req_buf_next(work); + + if (!(rcv_hdr->Flags & SMB2_FLAGS_RELATED_OPERATIONS)) { + ksmbd_debug(SMB, "related flag should be set\n"); + work->compound_fid = KSMBD_NO_FID; + work->compound_pfid = KSMBD_NO_FID; + } + memset((char *)rsp_hdr + 4, 0, sizeof(struct smb2_hdr) + 2); + rsp_hdr->ProtocolId = rcv_hdr->ProtocolId; + rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; + rsp_hdr->Command = rcv_hdr->Command; + + /* + * Message is response. We don't grant oplock yet. + */ + rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR | + SMB2_FLAGS_RELATED_OPERATIONS); + rsp_hdr->NextCommand = 0; + rsp_hdr->MessageId = rcv_hdr->MessageId; + rsp_hdr->Id.SyncId.ProcessId = rcv_hdr->Id.SyncId.ProcessId; + rsp_hdr->Id.SyncId.TreeId = rcv_hdr->Id.SyncId.TreeId; + rsp_hdr->SessionId = rcv_hdr->SessionId; + memcpy(rsp_hdr->Signature, rcv_hdr->Signature, 16); +} + +/** + * is_chained_smb2_message() - check for chained command + * @work: smb work containing smb request buffer + * + * Return: true if chained request, otherwise false + */ +bool is_chained_smb2_message(struct ksmbd_work *work) +{ + struct smb2_hdr *hdr = work->request_buf; + unsigned int len; + + if (hdr->ProtocolId != SMB2_PROTO_NUMBER) + return false; + + hdr = ksmbd_req_buf_next(work); + if (le32_to_cpu(hdr->NextCommand) > 0) { + ksmbd_debug(SMB, "got SMB2 chained command\n"); + init_chained_smb2_rsp(work); + return true; + } else if (work->next_smb2_rcv_hdr_off) { + /* + * This is last request in chained command, + * align response to 8 byte + */ + len = ALIGN(get_rfc1002_len(work->response_buf), 8); + len = len - get_rfc1002_len(work->response_buf); + if (len) { + ksmbd_debug(SMB, "padding len %u\n", len); + inc_rfc1001_len(work->response_buf, len); + if (work->aux_payload_sz) + work->aux_payload_sz += len; + } + } + return false; +} + +/** + * init_smb2_rsp_hdr() - initialize smb2 response + * @work: smb work containing smb request buffer + * + * Return: 0 + */ +int init_smb2_rsp_hdr(struct ksmbd_work *work) +{ + struct smb2_hdr *rsp_hdr = work->response_buf; + struct smb2_hdr *rcv_hdr = work->request_buf; + struct ksmbd_conn *conn = work->conn; + + memset(rsp_hdr, 0, sizeof(struct smb2_hdr) + 2); + rsp_hdr->smb2_buf_length = + cpu_to_be32(smb2_hdr_size_no_buflen(conn->vals)); + rsp_hdr->ProtocolId = rcv_hdr->ProtocolId; + rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; + rsp_hdr->Command = rcv_hdr->Command; + + /* + * Message is response. We don't grant oplock yet. 
+ */ + rsp_hdr->Flags = (SMB2_FLAGS_SERVER_TO_REDIR); + rsp_hdr->NextCommand = 0; + rsp_hdr->MessageId = rcv_hdr->MessageId; + rsp_hdr->Id.SyncId.ProcessId = rcv_hdr->Id.SyncId.ProcessId; + rsp_hdr->Id.SyncId.TreeId = rcv_hdr->Id.SyncId.TreeId; + rsp_hdr->SessionId = rcv_hdr->SessionId; + memcpy(rsp_hdr->Signature, rcv_hdr->Signature, 16); + + work->syncronous = true; + if (work->async_id) { + ksmbd_release_id(&conn->async_ida, work->async_id); + work->async_id = 0; + } + + return 0; +} + +/** + * smb2_allocate_rsp_buf() - allocate smb2 response buffer + * @work: smb work containing smb request buffer + * + * Return: 0 on success, otherwise -ENOMEM + */ +int smb2_allocate_rsp_buf(struct ksmbd_work *work) +{ + struct smb2_hdr *hdr = work->request_buf; + size_t small_sz = MAX_CIFS_SMALL_BUFFER_SIZE; + size_t large_sz = work->conn->vals->max_trans_size + MAX_SMB2_HDR_SIZE; + size_t sz = small_sz; + int cmd = le16_to_cpu(hdr->Command); + + if (cmd == SMB2_IOCTL_HE || cmd == SMB2_QUERY_DIRECTORY_HE) + sz = large_sz; + + if (cmd == SMB2_QUERY_INFO_HE) { + struct smb2_query_info_req *req; + + req = work->request_buf; + if (req->InfoType == SMB2_O_INFO_FILE && + (req->FileInfoClass == FILE_FULL_EA_INFORMATION || + req->FileInfoClass == FILE_ALL_INFORMATION)) + sz = large_sz; + } + + /* allocate large response buf for chained commands */ + if (le32_to_cpu(hdr->NextCommand) > 0) + sz = large_sz; + + work->response_buf = kvmalloc(sz, GFP_KERNEL | __GFP_ZERO); + if (!work->response_buf) + return -ENOMEM; + + work->response_sz = sz; + return 0; +} + +/** + * smb2_check_user_session() - check for valid session for a user + * @work: smb work containing smb request buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_check_user_session(struct ksmbd_work *work) +{ + struct smb2_hdr *req_hdr = work->request_buf; + struct ksmbd_conn *conn = work->conn; + unsigned int cmd = conn->ops->get_cmd_val(work); + unsigned long long sess_id; + + work->sess = NULL; + /* + * SMB2_ECHO, SMB2_NEGOTIATE, SMB2_SESSION_SETUP commands do not + * require a session id, so there is no need to validate user sessions for + * these commands.
+ */ + if (cmd == SMB2_ECHO_HE || cmd == SMB2_NEGOTIATE_HE || + cmd == SMB2_SESSION_SETUP_HE) + return 0; + + if (!ksmbd_conn_good(work)) + return -EINVAL; + + sess_id = le64_to_cpu(req_hdr->SessionId); + /* Check for validity of user session */ + work->sess = ksmbd_session_lookup_all(conn, sess_id); + if (work->sess) + return 1; + ksmbd_debug(SMB, "Invalid user session, Uid %llu\n", sess_id); + return -EINVAL; +} + +static void destroy_previous_session(struct ksmbd_user *user, u64 id) +{ + struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id); + struct ksmbd_user *prev_user; + + if (!prev_sess) + return; + + prev_user = prev_sess->user; + + if (!prev_user || + strcmp(user->name, prev_user->name) || + user->passkey_sz != prev_user->passkey_sz || + memcmp(user->passkey, prev_user->passkey, user->passkey_sz)) { + put_session(prev_sess); + return; + } + + put_session(prev_sess); + ksmbd_session_destroy(prev_sess); +} + +/** + * smb2_get_name() - get filename string from on the wire smb format + * @share: ksmbd_share_config pointer + * @src: source buffer + * @maxlen: maxlen of source string + * @nls_table: nls_table pointer + * + * Return: matching converted filename on success, otherwise error ptr + */ +static char * +smb2_get_name(struct ksmbd_share_config *share, const char *src, + const int maxlen, struct nls_table *local_nls) +{ + char *name, *unixname; + + name = smb_strndup_from_utf16(src, maxlen, 1, local_nls); + if (IS_ERR(name)) { + pr_err("failed to get name %ld\n", PTR_ERR(name)); + return name; + } + + /* change it to absolute unix name */ + ksmbd_conv_path_to_unix(name); + ksmbd_strip_last_slash(name); + + unixname = convert_to_unix_name(share, name); + kfree(name); + if (!unixname) { + pr_err("can not convert absolute name\n"); + return ERR_PTR(-ENOMEM); + } + + ksmbd_debug(SMB, "absolute name = %s\n", unixname); + return unixname; +} + +int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg) +{ + struct smb2_hdr *rsp_hdr; + struct ksmbd_conn *conn = work->conn; + int id; + + rsp_hdr = work->response_buf; + rsp_hdr->Flags |= SMB2_FLAGS_ASYNC_COMMAND; + + id = ksmbd_acquire_async_msg_id(&conn->async_ida); + if (id < 0) { + pr_err("Failed to alloc async message id\n"); + return id; + } + work->syncronous = false; + work->async_id = id; + rsp_hdr->Id.AsyncId = cpu_to_le64(id); + + ksmbd_debug(SMB, + "Send interim Response to inform async request id : %d\n", + work->async_id); + + work->cancel_fn = fn; + work->cancel_argv = arg; + + if (list_empty(&work->async_request_entry)) { + spin_lock(&conn->request_lock); + list_add_tail(&work->async_request_entry, &conn->async_requests); + spin_unlock(&conn->request_lock); + } + + return 0; +} + +void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status) +{ + struct smb2_hdr *rsp_hdr; + + rsp_hdr = work->response_buf; + smb2_set_err_rsp(work); + rsp_hdr->Status = status; + + work->multiRsp = 1; + ksmbd_conn_write(work); + rsp_hdr->Status = 0; + work->multiRsp = 0; +} + +static __le32 smb2_get_reparse_tag_special_file(umode_t mode) +{ + if (S_ISDIR(mode) || S_ISREG(mode)) + return 0; + + if (S_ISLNK(mode)) + return IO_REPARSE_TAG_LX_SYMLINK_LE; + else if (S_ISFIFO(mode)) + return IO_REPARSE_TAG_LX_FIFO_LE; + else if (S_ISSOCK(mode)) + return IO_REPARSE_TAG_AF_UNIX_LE; + else if (S_ISCHR(mode)) + return IO_REPARSE_TAG_LX_CHR_LE; + else if (S_ISBLK(mode)) + return IO_REPARSE_TAG_LX_BLK_LE; + + return 0; +} + +/** + * smb2_get_dos_mode() - get file mode in dos format from unix mode + * @stat: kstat 
containing file mode + * @attribute: attribute flags + * + * Return: converted dos mode + */ +static int smb2_get_dos_mode(struct kstat *stat, int attribute) +{ + int attr = 0; + + if (S_ISDIR(stat->mode)) { + attr = ATTR_DIRECTORY | + (attribute & (ATTR_HIDDEN | ATTR_SYSTEM)); + } else { + attr = (attribute & 0x00005137) | ATTR_ARCHIVE; + attr &= ~(ATTR_DIRECTORY); + if (S_ISREG(stat->mode) && (server_conf.share_fake_fscaps & + FILE_SUPPORTS_SPARSE_FILES)) + attr |= ATTR_SPARSE; + + if (smb2_get_reparse_tag_special_file(stat->mode)) + attr |= ATTR_REPARSE; + } + + return attr; +} + +static void build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt, + __le16 hash_id) +{ + pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(38); + pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1); + pneg_ctxt->Reserved = cpu_to_le32(0); + pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE); + get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE); + pneg_ctxt->HashAlgorithms = hash_id; +} + +static void build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt, + __le16 cipher_type) +{ + pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(4); + pneg_ctxt->Reserved = cpu_to_le32(0); + pneg_ctxt->CipherCount = cpu_to_le16(1); + pneg_ctxt->Ciphers[0] = cipher_type; +} + +static void build_compression_ctxt(struct smb2_compression_ctx *pneg_ctxt, + __le16 comp_algo) +{ + pneg_ctxt->ContextType = SMB2_COMPRESSION_CAPABILITIES; + pneg_ctxt->DataLength = + cpu_to_le16(sizeof(struct smb2_compression_ctx) + - sizeof(struct smb2_neg_context)); + pneg_ctxt->Reserved = cpu_to_le32(0); + pneg_ctxt->CompressionAlgorithmCount = cpu_to_le16(1); + pneg_ctxt->Reserved1 = cpu_to_le32(0); + pneg_ctxt->CompressionAlgorithms[0] = comp_algo; +} + +static void build_sign_cap_ctxt(struct smb2_signing_capabilities *pneg_ctxt, + __le16 sign_algo) +{ + pneg_ctxt->ContextType = SMB2_SIGNING_CAPABILITIES; + pneg_ctxt->DataLength = + cpu_to_le16((sizeof(struct smb2_signing_capabilities) + 2) + - sizeof(struct smb2_neg_context)); + pneg_ctxt->Reserved = cpu_to_le32(0); + pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(1); + pneg_ctxt->SigningAlgorithms[0] = sign_algo; +} + +static void build_posix_ctxt(struct smb2_posix_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_POSIX_EXTENSIONS_AVAILABLE; + pneg_ctxt->DataLength = cpu_to_le16(POSIX_CTXT_DATA_LEN); + /* SMB2_CREATE_TAG_POSIX is "0x93AD25509CB411E7B42383DE968BCD7C" */ + pneg_ctxt->Name[0] = 0x93; + pneg_ctxt->Name[1] = 0xAD; + pneg_ctxt->Name[2] = 0x25; + pneg_ctxt->Name[3] = 0x50; + pneg_ctxt->Name[4] = 0x9C; + pneg_ctxt->Name[5] = 0xB4; + pneg_ctxt->Name[6] = 0x11; + pneg_ctxt->Name[7] = 0xE7; + pneg_ctxt->Name[8] = 0xB4; + pneg_ctxt->Name[9] = 0x23; + pneg_ctxt->Name[10] = 0x83; + pneg_ctxt->Name[11] = 0xDE; + pneg_ctxt->Name[12] = 0x96; + pneg_ctxt->Name[13] = 0x8B; + pneg_ctxt->Name[14] = 0xCD; + pneg_ctxt->Name[15] = 0x7C; +} + +static void assemble_neg_contexts(struct ksmbd_conn *conn, + struct smb2_negotiate_rsp *rsp) +{ + /* +4 is to account for the RFC1001 len field */ + char *pneg_ctxt = (char *)rsp + + le32_to_cpu(rsp->NegotiateContextOffset) + 4; + int neg_ctxt_cnt = 1; + int ctxt_size; + + ksmbd_debug(SMB, + "assemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n"); + build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt, + conn->preauth_info->Preauth_HashId); + rsp->NegotiateContextCount = cpu_to_le16(neg_ctxt_cnt); + inc_rfc1001_len(rsp, 
AUTH_GSS_PADDING); + ctxt_size = sizeof(struct smb2_preauth_neg_context); + /* Round to 8 byte boundary */ + pneg_ctxt += round_up(sizeof(struct smb2_preauth_neg_context), 8); + + if (conn->cipher_type) { + ctxt_size = round_up(ctxt_size, 8); + ksmbd_debug(SMB, + "assemble SMB2_ENCRYPTION_CAPABILITIES context\n"); + build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt, + conn->cipher_type); + rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + ctxt_size += sizeof(struct smb2_encryption_neg_context) + 2; + /* Round to 8 byte boundary */ + pneg_ctxt += + round_up(sizeof(struct smb2_encryption_neg_context) + 2, + 8); + } + + if (conn->compress_algorithm) { + ctxt_size = round_up(ctxt_size, 8); + ksmbd_debug(SMB, + "assemble SMB2_COMPRESSION_CAPABILITIES context\n"); + /* Temporarily set to SMB3_COMPRESS_NONE */ + build_compression_ctxt((struct smb2_compression_ctx *)pneg_ctxt, + conn->compress_algorithm); + rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + ctxt_size += sizeof(struct smb2_compression_ctx) + 2; + /* Round to 8 byte boundary */ + pneg_ctxt += round_up(sizeof(struct smb2_compression_ctx) + 2, + 8); + } + + if (conn->posix_ext_supported) { + ctxt_size = round_up(ctxt_size, 8); + ksmbd_debug(SMB, + "assemble SMB2_POSIX_EXTENSIONS_AVAILABLE context\n"); + build_posix_ctxt((struct smb2_posix_neg_context *)pneg_ctxt); + rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + ctxt_size += sizeof(struct smb2_posix_neg_context); + /* Round to 8 byte boundary */ + pneg_ctxt += round_up(sizeof(struct smb2_posix_neg_context), 8); + } + + if (conn->signing_negotiated) { + ctxt_size = round_up(ctxt_size, 8); + ksmbd_debug(SMB, + "assemble SMB2_SIGNING_CAPABILITIES context\n"); + build_sign_cap_ctxt((struct smb2_signing_capabilities *)pneg_ctxt, + conn->signing_algorithm); + rsp->NegotiateContextCount = cpu_to_le16(++neg_ctxt_cnt); + ctxt_size += sizeof(struct smb2_signing_capabilities) + 2; + } + + inc_rfc1001_len(rsp, ctxt_size); +} + +static __le32 decode_preauth_ctxt(struct ksmbd_conn *conn, + struct smb2_preauth_neg_context *pneg_ctxt) +{ + __le32 err = STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP; + + if (pneg_ctxt->HashAlgorithms == SMB2_PREAUTH_INTEGRITY_SHA512) { + conn->preauth_info->Preauth_HashId = + SMB2_PREAUTH_INTEGRITY_SHA512; + err = STATUS_SUCCESS; + } + + return err; +} + +static void decode_encrypt_ctxt(struct ksmbd_conn *conn, + struct smb2_encryption_neg_context *pneg_ctxt, + int len_of_ctxts) +{ + int cph_cnt = le16_to_cpu(pneg_ctxt->CipherCount); + int i, cphs_size = cph_cnt * sizeof(__le16); + + conn->cipher_type = 0; + + if (sizeof(struct smb2_encryption_neg_context) + cphs_size > + len_of_ctxts) { + pr_err("Invalid cipher count(%d)\n", cph_cnt); + return; + } + + if (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION)) + return; + + for (i = 0; i < cph_cnt; i++) { + if (pneg_ctxt->Ciphers[i] == SMB2_ENCRYPTION_AES128_GCM || + pneg_ctxt->Ciphers[i] == SMB2_ENCRYPTION_AES128_CCM || + pneg_ctxt->Ciphers[i] == SMB2_ENCRYPTION_AES256_CCM || + pneg_ctxt->Ciphers[i] == SMB2_ENCRYPTION_AES256_GCM) { + ksmbd_debug(SMB, "Cipher ID = 0x%x\n", + pneg_ctxt->Ciphers[i]); + conn->cipher_type = pneg_ctxt->Ciphers[i]; + break; + } + } +} + +static void decode_compress_ctxt(struct ksmbd_conn *conn, + struct smb2_compression_ctx *pneg_ctxt) +{ + conn->compress_algorithm = SMB3_COMPRESS_NONE; +} + +static void decode_sign_cap_ctxt(struct ksmbd_conn *conn, + struct smb2_signing_capabilities *pneg_ctxt, + int len_of_ctxts) +{ + int sign_algo_cnt = 
le16_to_cpu(pneg_ctxt->SigningAlgorithmCount); + int i, sign_alos_size = sign_algo_cnt * sizeof(__le16); + + conn->signing_negotiated = false; + + if (sizeof(struct smb2_signing_capabilities) + sign_alos_size > + len_of_ctxts) { + pr_err("Invalid signing algorithm count(%d)\n", sign_algo_cnt); + return; + } + + for (i = 0; i < sign_algo_cnt; i++) { + if (pneg_ctxt->SigningAlgorithms[i] == SIGNING_ALG_HMAC_SHA256 || + pneg_ctxt->SigningAlgorithms[i] == SIGNING_ALG_AES_CMAC) { + ksmbd_debug(SMB, "Signing Algorithm ID = 0x%x\n", + pneg_ctxt->SigningAlgorithms[i]); + conn->signing_negotiated = true; + conn->signing_algorithm = + pneg_ctxt->SigningAlgorithms[i]; + break; + } + } +} + +static __le32 deassemble_neg_contexts(struct ksmbd_conn *conn, + struct smb2_negotiate_req *req) +{ + /* +4 is to account for the RFC1001 len field */ + struct smb2_neg_context *pctx = (struct smb2_neg_context *)((char *)req + 4); + int i = 0, len_of_ctxts; + int offset = le32_to_cpu(req->NegotiateContextOffset); + int neg_ctxt_cnt = le16_to_cpu(req->NegotiateContextCount); + int len_of_smb = be32_to_cpu(req->hdr.smb2_buf_length); + __le32 status = STATUS_INVALID_PARAMETER; + + ksmbd_debug(SMB, "decoding %d negotiate contexts\n", neg_ctxt_cnt); + if (len_of_smb <= offset) { + ksmbd_debug(SMB, "Invalid response: negotiate context offset\n"); + return status; + } + + len_of_ctxts = len_of_smb - offset; + + while (i++ < neg_ctxt_cnt) { + int clen; + + /* check that offset is not beyond end of SMB */ + if (len_of_ctxts == 0) + break; + + if (len_of_ctxts < sizeof(struct smb2_neg_context)) + break; + + pctx = (struct smb2_neg_context *)((char *)pctx + offset); + clen = le16_to_cpu(pctx->DataLength); + if (clen + sizeof(struct smb2_neg_context) > len_of_ctxts) + break; + + if (pctx->ContextType == SMB2_PREAUTH_INTEGRITY_CAPABILITIES) { + ksmbd_debug(SMB, + "deassemble SMB2_PREAUTH_INTEGRITY_CAPABILITIES context\n"); + if (conn->preauth_info->Preauth_HashId) + break; + + status = decode_preauth_ctxt(conn, + (struct smb2_preauth_neg_context *)pctx); + if (status != STATUS_SUCCESS) + break; + } else if (pctx->ContextType == SMB2_ENCRYPTION_CAPABILITIES) { + ksmbd_debug(SMB, + "deassemble SMB2_ENCRYPTION_CAPABILITIES context\n"); + if (conn->cipher_type) + break; + + decode_encrypt_ctxt(conn, + (struct smb2_encryption_neg_context *)pctx, + len_of_ctxts); + } else if (pctx->ContextType == SMB2_COMPRESSION_CAPABILITIES) { + ksmbd_debug(SMB, + "deassemble SMB2_COMPRESSION_CAPABILITIES context\n"); + if (conn->compress_algorithm) + break; + + decode_compress_ctxt(conn, + (struct smb2_compression_ctx *)pctx); + } else if (pctx->ContextType == SMB2_NETNAME_NEGOTIATE_CONTEXT_ID) { + ksmbd_debug(SMB, + "deassemble SMB2_NETNAME_NEGOTIATE_CONTEXT_ID context\n"); + } else if (pctx->ContextType == SMB2_POSIX_EXTENSIONS_AVAILABLE) { + ksmbd_debug(SMB, + "deassemble SMB2_POSIX_EXTENSIONS_AVAILABLE context\n"); + conn->posix_ext_supported = true; + } else if (pctx->ContextType == SMB2_SIGNING_CAPABILITIES) { + ksmbd_debug(SMB, + "deassemble SMB2_SIGNING_CAPABILITIES context\n"); + decode_sign_cap_ctxt(conn, + (struct smb2_signing_capabilities *)pctx, + len_of_ctxts); + } + + /* offsets must be 8 byte aligned */ + clen = (clen + 7) & ~0x7; + offset = clen + sizeof(struct smb2_neg_context); + len_of_ctxts -= clen + sizeof(struct smb2_neg_context); + } + return status; +} + +/** + * smb2_handle_negotiate() - handler for smb2 negotiate command + * @work: smb work containing smb request buffer + * + * Return: 0 + */ +int 
smb2_handle_negotiate(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_negotiate_req *req = work->request_buf; + struct smb2_negotiate_rsp *rsp = work->response_buf; + int rc = 0; + __le32 status; + + ksmbd_debug(SMB, "Received negotiate request\n"); + conn->need_neg = false; + if (ksmbd_conn_good(work)) { + pr_err("conn->tcp_status is already in CifsGood State\n"); + work->send_no_response = 1; + return rc; + } + + if (req->DialectCount == 0) { + pr_err("malformed packet\n"); + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + + conn->cli_cap = le32_to_cpu(req->Capabilities); + switch (conn->dialect) { + case SMB311_PROT_ID: + conn->preauth_info = + kzalloc(sizeof(struct preauth_integrity_info), + GFP_KERNEL); + if (!conn->preauth_info) { + rc = -ENOMEM; + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + goto err_out; + } + + status = deassemble_neg_contexts(conn, req); + if (status != STATUS_SUCCESS) { + pr_err("deassemble_neg_contexts error(0x%x)\n", + status); + rsp->hdr.Status = status; + rc = -EINVAL; + goto err_out; + } + + rc = init_smb3_11_server(conn); + if (rc < 0) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + goto err_out; + } + + ksmbd_gen_preauth_integrity_hash(conn, + work->request_buf, + conn->preauth_info->Preauth_HashValue); + rsp->NegotiateContextOffset = + cpu_to_le32(OFFSET_OF_NEG_CONTEXT); + assemble_neg_contexts(conn, rsp); + break; + case SMB302_PROT_ID: + init_smb3_02_server(conn); + break; + case SMB30_PROT_ID: + init_smb3_0_server(conn); + break; + case SMB21_PROT_ID: + init_smb2_1_server(conn); + break; + case SMB20_PROT_ID: + rc = init_smb2_0_server(conn); + if (rc) { + rsp->hdr.Status = STATUS_NOT_SUPPORTED; + goto err_out; + } + break; + case SMB2X_PROT_ID: + case BAD_PROT_ID: + default: + ksmbd_debug(SMB, "Server dialect :0x%x not supported\n", + conn->dialect); + rsp->hdr.Status = STATUS_NOT_SUPPORTED; + rc = -EINVAL; + goto err_out; + } + rsp->Capabilities = cpu_to_le32(conn->vals->capabilities); + + /* For stats */ + conn->connection_type = conn->dialect; + + rsp->MaxTransactSize = cpu_to_le32(conn->vals->max_trans_size); + rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size); + rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size); + + if (conn->dialect > SMB20_PROT_ID) { + memcpy(conn->ClientGUID, req->ClientGUID, + SMB2_CLIENT_GUID_SIZE); + conn->cli_sec_mode = le16_to_cpu(req->SecurityMode); + } + + rsp->StructureSize = cpu_to_le16(65); + rsp->DialectRevision = cpu_to_le16(conn->dialect); + /* Not setting conn guid rsp->ServerGUID, as it + * not used by client for identifying server + */ + memset(rsp->ServerGUID, 0, SMB2_CLIENT_GUID_SIZE); + + rsp->SystemTime = cpu_to_le64(ksmbd_systime()); + rsp->ServerStartTime = 0; + ksmbd_debug(SMB, "negotiate context offset %d, count %d\n", + le32_to_cpu(rsp->NegotiateContextOffset), + le16_to_cpu(rsp->NegotiateContextCount)); + + rsp->SecurityBufferOffset = cpu_to_le16(128); + rsp->SecurityBufferLength = cpu_to_le16(AUTH_GSS_LENGTH); + ksmbd_copy_gss_neg_header(((char *)(&rsp->hdr) + + sizeof(rsp->hdr.smb2_buf_length)) + + le16_to_cpu(rsp->SecurityBufferOffset)); + inc_rfc1001_len(rsp, sizeof(struct smb2_negotiate_rsp) - + sizeof(struct smb2_hdr) - sizeof(rsp->Buffer) + + AUTH_GSS_LENGTH); + rsp->SecurityMode = SMB2_NEGOTIATE_SIGNING_ENABLED_LE; + conn->use_spnego = true; + + if ((server_conf.signing == KSMBD_CONFIG_OPT_AUTO || + server_conf.signing == KSMBD_CONFIG_OPT_DISABLED) && + req->SecurityMode & 
SMB2_NEGOTIATE_SIGNING_REQUIRED_LE) + conn->sign = true; + else if (server_conf.signing == KSMBD_CONFIG_OPT_MANDATORY) { + server_conf.enforced_signing = true; + rsp->SecurityMode |= SMB2_NEGOTIATE_SIGNING_REQUIRED_LE; + conn->sign = true; + } + + conn->srv_sec_mode = le16_to_cpu(rsp->SecurityMode); + ksmbd_conn_set_need_negotiate(work); + +err_out: + if (rc < 0) + smb2_set_err_rsp(work); + + return rc; +} + +static int alloc_preauth_hash(struct ksmbd_session *sess, + struct ksmbd_conn *conn) +{ + if (sess->Preauth_HashValue) + return 0; + + sess->Preauth_HashValue = kmemdup(conn->preauth_info->Preauth_HashValue, + PREAUTH_HASHVALUE_SIZE, GFP_KERNEL); + if (!sess->Preauth_HashValue) + return -ENOMEM; + + return 0; +} + +static int generate_preauth_hash(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; + u8 *preauth_hash; + + if (conn->dialect != SMB311_PROT_ID) + return 0; + + if (conn->binding) { + struct preauth_session *preauth_sess; + + preauth_sess = ksmbd_preauth_session_lookup(conn, sess->id); + if (!preauth_sess) { + preauth_sess = ksmbd_preauth_session_alloc(conn, sess->id); + if (!preauth_sess) + return -ENOMEM; + } + + preauth_hash = preauth_sess->Preauth_HashValue; + } else { + if (!sess->Preauth_HashValue) + if (alloc_preauth_hash(sess, conn)) + return -ENOMEM; + preauth_hash = sess->Preauth_HashValue; + } + + ksmbd_gen_preauth_integrity_hash(conn, work->request_buf, preauth_hash); + return 0; +} + +static int decode_negotiation_token(struct ksmbd_work *work, + struct negotiate_message *negblob) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_sess_setup_req *req; + int sz; + + if (!conn->use_spnego) + return -EINVAL; + + req = work->request_buf; + sz = le16_to_cpu(req->SecurityBufferLength); + + if (ksmbd_decode_negTokenInit((char *)negblob, sz, conn)) { + if (ksmbd_decode_negTokenTarg((char *)negblob, sz, conn)) { + conn->auth_mechs |= KSMBD_AUTH_NTLMSSP; + conn->preferred_auth_mech = KSMBD_AUTH_NTLMSSP; + conn->use_spnego = false; + } + } + return 0; +} + +static int ntlm_negotiate(struct ksmbd_work *work, + struct negotiate_message *negblob) +{ + struct smb2_sess_setup_req *req = work->request_buf; + struct smb2_sess_setup_rsp *rsp = work->response_buf; + struct challenge_message *chgblob; + unsigned char *spnego_blob = NULL; + u16 spnego_blob_len; + char *neg_blob; + int sz, rc; + + ksmbd_debug(SMB, "negotiate phase\n"); + sz = le16_to_cpu(req->SecurityBufferLength); + rc = ksmbd_decode_ntlmssp_neg_blob(negblob, sz, work->sess); + if (rc) + return rc; + + sz = le16_to_cpu(rsp->SecurityBufferOffset); + chgblob = + (struct challenge_message *)((char *)&rsp->hdr.ProtocolId + sz); + memset(chgblob, 0, sizeof(struct challenge_message)); + + if (!work->conn->use_spnego) { + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + if (sz < 0) + return -ENOMEM; + + rsp->SecurityBufferLength = cpu_to_le16(sz); + return 0; + } + + sz = sizeof(struct challenge_message); + sz += (strlen(ksmbd_netbios_name()) * 2 + 1 + 4) * 6; + + neg_blob = kzalloc(sz, GFP_KERNEL); + if (!neg_blob) + return -ENOMEM; + + chgblob = (struct challenge_message *)neg_blob; + sz = ksmbd_build_ntlmssp_challenge_blob(chgblob, work->sess); + if (sz < 0) { + rc = -ENOMEM; + goto out; + } + + rc = build_spnego_ntlmssp_neg_blob(&spnego_blob, &spnego_blob_len, + neg_blob, sz); + if (rc) { + rc = -ENOMEM; + goto out; + } + + sz = le16_to_cpu(rsp->SecurityBufferOffset); + memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, 
spnego_blob_len); + rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); + +out: + kfree(spnego_blob); + kfree(neg_blob); + return rc; +} + +static struct authenticate_message *user_authblob(struct ksmbd_conn *conn, + struct smb2_sess_setup_req *req) +{ + int sz; + + if (conn->use_spnego && conn->mechToken) + return (struct authenticate_message *)conn->mechToken; + + sz = le16_to_cpu(req->SecurityBufferOffset); + return (struct authenticate_message *)((char *)&req->hdr.ProtocolId + + sz); +} + +static struct ksmbd_user *session_user(struct ksmbd_conn *conn, + struct smb2_sess_setup_req *req) +{ + struct authenticate_message *authblob; + struct ksmbd_user *user; + char *name; + int sz; + + authblob = user_authblob(conn, req); + sz = le32_to_cpu(authblob->UserName.BufferOffset); + name = smb_strndup_from_utf16((const char *)authblob + sz, + le16_to_cpu(authblob->UserName.Length), + true, + conn->local_nls); + if (IS_ERR(name)) { + pr_err("cannot allocate memory\n"); + return NULL; + } + + ksmbd_debug(SMB, "session setup request for user %s\n", name); + user = ksmbd_login_user(name); + kfree(name); + return user; +} + +static int ntlm_authenticate(struct ksmbd_work *work) +{ + struct smb2_sess_setup_req *req = work->request_buf; + struct smb2_sess_setup_rsp *rsp = work->response_buf; + struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; + struct channel *chann = NULL; + struct ksmbd_user *user; + u64 prev_id; + int sz, rc; + + ksmbd_debug(SMB, "authenticate phase\n"); + if (conn->use_spnego) { + unsigned char *spnego_blob; + u16 spnego_blob_len; + + rc = build_spnego_ntlmssp_auth_blob(&spnego_blob, + &spnego_blob_len, + 0); + if (rc) + return -ENOMEM; + + sz = le16_to_cpu(rsp->SecurityBufferOffset); + memcpy((char *)&rsp->hdr.ProtocolId + sz, spnego_blob, spnego_blob_len); + rsp->SecurityBufferLength = cpu_to_le16(spnego_blob_len); + kfree(spnego_blob); + inc_rfc1001_len(rsp, spnego_blob_len - 1); + } + + user = session_user(conn, req); + if (!user) { + ksmbd_debug(SMB, "Unknown user name or an error\n"); + return -EPERM; + } + + /* Check for previous session */ + prev_id = le64_to_cpu(req->PreviousSessionId); + if (prev_id && prev_id != sess->id) + destroy_previous_session(user, prev_id); + + if (sess->state == SMB2_SESSION_VALID) { + /* + * Reuse the session if an anonymous user tries to connect + * on reauthentication. + */ + if (ksmbd_anonymous_user(user)) { + ksmbd_free_user(user); + return 0; + } + ksmbd_free_user(sess->user); + } + + sess->user = user; + if (user_guest(sess->user)) { + if (conn->sign) { + ksmbd_debug(SMB, "Guest login not allowed when signing enabled\n"); + return -EPERM; + } + + rsp->SessionFlags = SMB2_SESSION_FLAG_IS_GUEST_LE; + } else { + struct authenticate_message *authblob; + + authblob = user_authblob(conn, req); + sz = le16_to_cpu(req->SecurityBufferLength); + rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, sess); + if (rc) { + set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD); + ksmbd_debug(SMB, "authentication failed\n"); + return -EPERM; + } + + /* + * If session state is SMB2_SESSION_VALID, we can assume + * that it is reauthentication. And the user/password + * has been verified, so return it here.
+ */ + if (sess->state == SMB2_SESSION_VALID) { + if (conn->binding) + goto binding_session; + return 0; + } + + if ((conn->sign || server_conf.enforced_signing) || + (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) + sess->sign = true; + + if (conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION && + conn->ops->generate_encryptionkey && + !(req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { + rc = conn->ops->generate_encryptionkey(sess); + if (rc) { + ksmbd_debug(SMB, + "SMB3 encryption key generation failed\n"); + return -EINVAL; + } + sess->enc = true; + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + /* + * signing is disabled if encryption is enabled + * on this session + */ + sess->sign = false; + } + } + +binding_session: + if (conn->dialect >= SMB30_PROT_ID) { + chann = lookup_chann_list(sess, conn); + if (!chann) { + chann = kmalloc(sizeof(struct channel), GFP_KERNEL); + if (!chann) + return -ENOMEM; + + chann->conn = conn; + INIT_LIST_HEAD(&chann->chann_list); + list_add(&chann->chann_list, &sess->ksmbd_chann_list); + } + } + + if (conn->ops->generate_signingkey) { + rc = conn->ops->generate_signingkey(sess, conn); + if (rc) { + ksmbd_debug(SMB, "SMB3 signing key generation failed\n"); + return -EINVAL; + } + } + + if (conn->dialect > SMB20_PROT_ID) { + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; + } + } + return 0; +} + +#ifdef CONFIG_SMB_SERVER_KERBEROS5 +static int krb5_authenticate(struct ksmbd_work *work) +{ + struct smb2_sess_setup_req *req = work->request_buf; + struct smb2_sess_setup_rsp *rsp = work->response_buf; + struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; + char *in_blob, *out_blob; + struct channel *chann = NULL; + u64 prev_sess_id; + int in_len, out_len; + int retval; + + in_blob = (char *)&req->hdr.ProtocolId + + le16_to_cpu(req->SecurityBufferOffset); + in_len = le16_to_cpu(req->SecurityBufferLength); + out_blob = (char *)&rsp->hdr.ProtocolId + + le16_to_cpu(rsp->SecurityBufferOffset); + out_len = work->response_sz - + offsetof(struct smb2_hdr, smb2_buf_length) - + le16_to_cpu(rsp->SecurityBufferOffset); + + /* Check previous session */ + prev_sess_id = le64_to_cpu(req->PreviousSessionId); + if (prev_sess_id && prev_sess_id != sess->id) + destroy_previous_session(sess->user, prev_sess_id); + + if (sess->state == SMB2_SESSION_VALID) + ksmbd_free_user(sess->user); + + retval = ksmbd_krb5_authenticate(sess, in_blob, in_len, + out_blob, &out_len); + if (retval) { + ksmbd_debug(SMB, "krb5 authentication failed\n"); + return -EINVAL; + } + rsp->SecurityBufferLength = cpu_to_le16(out_len); + inc_rfc1001_len(rsp, out_len - 1); + + if ((conn->sign || server_conf.enforced_signing) || + (req->SecurityMode & SMB2_NEGOTIATE_SIGNING_REQUIRED)) + sess->sign = true; + + if ((conn->vals->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) && + conn->ops->generate_encryptionkey) { + retval = conn->ops->generate_encryptionkey(sess); + if (retval) { + ksmbd_debug(SMB, + "SMB3 encryption key generation failed\n"); + return -EINVAL; + } + sess->enc = true; + rsp->SessionFlags = SMB2_SESSION_FLAG_ENCRYPT_DATA_LE; + sess->sign = false; + } + + if (conn->dialect >= SMB30_PROT_ID) { + chann = lookup_chann_list(sess, conn); + if (!chann) { + chann = kmalloc(sizeof(struct channel), GFP_KERNEL); + if (!chann) + return -ENOMEM; + + chann->conn = conn; + INIT_LIST_HEAD(&chann->chann_list); + list_add(&chann->chann_list, &sess->ksmbd_chann_list); + } + } + + if (conn->ops->generate_signingkey) { + 
retval = conn->ops->generate_signingkey(sess, conn); + if (retval) { + ksmbd_debug(SMB, "SMB3 signing key generation failed\n"); + return -EINVAL; + } + } + + if (conn->dialect > SMB20_PROT_ID) { + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; + } + } + return 0; +} +#else +static int krb5_authenticate(struct ksmbd_work *work) +{ + return -EOPNOTSUPP; +} +#endif + +int smb2_sess_setup(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_sess_setup_req *req = work->request_buf; + struct smb2_sess_setup_rsp *rsp = work->response_buf; + struct ksmbd_session *sess; + struct negotiate_message *negblob; + int rc = 0; + + ksmbd_debug(SMB, "Received request for session setup\n"); + + rsp->StructureSize = cpu_to_le16(9); + rsp->SessionFlags = 0; + rsp->SecurityBufferOffset = cpu_to_le16(72); + rsp->SecurityBufferLength = 0; + inc_rfc1001_len(rsp, 9); + + if (!req->hdr.SessionId) { + sess = ksmbd_smb2_session_create(); + if (!sess) { + rc = -ENOMEM; + goto out_err; + } + rsp->hdr.SessionId = cpu_to_le64(sess->id); + ksmbd_session_register(conn, sess); + } else if (conn->dialect >= SMB30_PROT_ID && + (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && + req->Flags & SMB2_SESSION_REQ_FLAG_BINDING) { + u64 sess_id = le64_to_cpu(req->hdr.SessionId); + + sess = ksmbd_session_lookup_slowpath(sess_id); + if (!sess) { + rc = -ENOENT; + goto out_err; + } + + if (conn->dialect != sess->conn->dialect) { + rc = -EINVAL; + goto out_err; + } + + if (!(req->hdr.Flags & SMB2_FLAGS_SIGNED)) { + rc = -EINVAL; + goto out_err; + } + + if (strncmp(conn->ClientGUID, sess->conn->ClientGUID, + SMB2_CLIENT_GUID_SIZE)) { + rc = -ENOENT; + goto out_err; + } + + if (sess->state == SMB2_SESSION_IN_PROGRESS) { + rc = -EACCES; + goto out_err; + } + + if (sess->state == SMB2_SESSION_EXPIRED) { + rc = -EFAULT; + goto out_err; + } + + if (ksmbd_session_lookup(conn, sess_id)) { + rc = -EACCES; + goto out_err; + } + + conn->binding = true; + } else if ((conn->dialect < SMB30_PROT_ID || + server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL) && + (req->Flags & SMB2_SESSION_REQ_FLAG_BINDING)) { + sess = NULL; + rc = -EACCES; + goto out_err; + } else { + sess = ksmbd_session_lookup(conn, + le64_to_cpu(req->hdr.SessionId)); + if (!sess) { + rc = -ENOENT; + goto out_err; + } + } + work->sess = sess; + + if (sess->state == SMB2_SESSION_EXPIRED) + sess->state = SMB2_SESSION_IN_PROGRESS; + + negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId + + le16_to_cpu(req->SecurityBufferOffset)); + + if (decode_negotiation_token(work, negblob) == 0) { + if (conn->mechToken) + negblob = (struct negotiate_message *)conn->mechToken; + } + + if (server_conf.auth_mechs & conn->auth_mechs) { + rc = generate_preauth_hash(work); + if (rc) + goto out_err; + + if (conn->preferred_auth_mech & + (KSMBD_AUTH_KRB5 | KSMBD_AUTH_MSKRB5)) { + rc = krb5_authenticate(work); + if (rc) { + rc = -EINVAL; + goto out_err; + } + + ksmbd_conn_set_good(work); + sess->state = SMB2_SESSION_VALID; + kfree(sess->Preauth_HashValue); + sess->Preauth_HashValue = NULL; + } else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) { + if (negblob->MessageType == NtLmNegotiate) { + rc = ntlm_negotiate(work, negblob); + if (rc) + goto out_err; + rsp->hdr.Status = + STATUS_MORE_PROCESSING_REQUIRED; + /* + * Note: here total size -1 is done as an + * adjustment for 0 size blob + */ + inc_rfc1001_len(rsp, le16_to_cpu(rsp->SecurityBufferLength) - 1); + + } else if 
(negblob->MessageType == NtLmAuthenticate) { + rc = ntlm_authenticate(work); + if (rc) + goto out_err; + + ksmbd_conn_set_good(work); + sess->state = SMB2_SESSION_VALID; + if (conn->binding) { + struct preauth_session *preauth_sess; + + preauth_sess = + ksmbd_preauth_session_lookup(conn, sess->id); + if (preauth_sess) { + list_del(&preauth_sess->preauth_entry); + kfree(preauth_sess); + } + } + kfree(sess->Preauth_HashValue); + sess->Preauth_HashValue = NULL; + } + } else { + /* TODO: need one more negotiation */ + pr_err("Preferred authentication mechanism not supported\n"); + rc = -EINVAL; + } + } else { + pr_err("No supported authentication mechanism\n"); + rc = -EINVAL; + } + +out_err: + if (rc == -EINVAL) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + else if (rc == -ENOENT) + rsp->hdr.Status = STATUS_USER_SESSION_DELETED; + else if (rc == -EACCES) + rsp->hdr.Status = STATUS_REQUEST_NOT_ACCEPTED; + else if (rc == -EFAULT) + rsp->hdr.Status = STATUS_NETWORK_SESSION_EXPIRED; + else if (rc == -ENOMEM) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + else if (rc) + rsp->hdr.Status = STATUS_LOGON_FAILURE; + + if (conn->use_spnego && conn->mechToken) { + kfree(conn->mechToken); + conn->mechToken = NULL; + } + + if (rc < 0 && sess) { + ksmbd_session_destroy(sess); + work->sess = NULL; + } + + return rc; +} + +/** + * smb2_tree_connect() - handler for smb2 tree connect command + * @work: smb work containing smb request buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_tree_connect(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_tree_connect_req *req = work->request_buf; + struct smb2_tree_connect_rsp *rsp = work->response_buf; + struct ksmbd_session *sess = work->sess; + char *treename = NULL, *name = NULL; + struct ksmbd_tree_conn_status status; + struct ksmbd_share_config *share; + int rc = -EINVAL; + + treename = smb_strndup_from_utf16(req->Buffer, + le16_to_cpu(req->PathLength), true, + conn->local_nls); + if (IS_ERR(treename)) { + pr_err("treename is invalid\n"); + status.ret = KSMBD_TREE_CONN_STATUS_ERROR; + goto out_err1; + } + + name = ksmbd_extract_sharename(treename); + if (IS_ERR(name)) { + status.ret = KSMBD_TREE_CONN_STATUS_ERROR; + goto out_err1; + } + + ksmbd_debug(SMB, "tree connect request for tree %s treename %s\n", + name, treename); + + status = ksmbd_tree_conn_connect(sess, name); + if (status.ret == KSMBD_TREE_CONN_STATUS_OK) + rsp->hdr.Id.SyncId.TreeId = cpu_to_le32(status.tree_conn->id); + else + goto out_err1; + + share = status.tree_conn->share_conf; + if (test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) { + ksmbd_debug(SMB, "IPC share path request\n"); + rsp->ShareType = SMB2_SHARE_TYPE_PIPE; + rsp->MaximalAccess = FILE_READ_DATA_LE | FILE_READ_EA_LE | + FILE_EXECUTE_LE | FILE_READ_ATTRIBUTES_LE | + FILE_DELETE_LE | FILE_READ_CONTROL_LE | + FILE_WRITE_DAC_LE | FILE_WRITE_OWNER_LE | + FILE_SYNCHRONIZE_LE; + } else { + rsp->ShareType = SMB2_SHARE_TYPE_DISK; + rsp->MaximalAccess = FILE_READ_DATA_LE | FILE_READ_EA_LE | + FILE_EXECUTE_LE | FILE_READ_ATTRIBUTES_LE; + if (test_tree_conn_flag(status.tree_conn, + KSMBD_TREE_CONN_FLAG_WRITABLE)) { + rsp->MaximalAccess |= FILE_WRITE_DATA_LE | + FILE_APPEND_DATA_LE | FILE_WRITE_EA_LE | + FILE_DELETE_LE | FILE_WRITE_ATTRIBUTES_LE | + FILE_DELETE_CHILD_LE | FILE_READ_CONTROL_LE | + FILE_WRITE_DAC_LE | FILE_WRITE_OWNER_LE | + FILE_SYNCHRONIZE_LE; + } + } + + status.tree_conn->maximal_access = le32_to_cpu(rsp->MaximalAccess); + if (conn->posix_ext_supported) + 
status.tree_conn->posix_extensions = true; + +out_err1: + rsp->StructureSize = cpu_to_le16(16); + rsp->Capabilities = 0; + rsp->Reserved = 0; + /* default manual caching */ + rsp->ShareFlags = SMB2_SHAREFLAG_MANUAL_CACHING; + inc_rfc1001_len(rsp, 16); + + if (!IS_ERR(treename)) + kfree(treename); + if (!IS_ERR(name)) + kfree(name); + + switch (status.ret) { + case KSMBD_TREE_CONN_STATUS_OK: + rsp->hdr.Status = STATUS_SUCCESS; + rc = 0; + break; + case KSMBD_TREE_CONN_STATUS_NO_SHARE: + rsp->hdr.Status = STATUS_BAD_NETWORK_PATH; + break; + case -ENOMEM: + case KSMBD_TREE_CONN_STATUS_NOMEM: + rsp->hdr.Status = STATUS_NO_MEMORY; + break; + case KSMBD_TREE_CONN_STATUS_ERROR: + case KSMBD_TREE_CONN_STATUS_TOO_MANY_CONNS: + case KSMBD_TREE_CONN_STATUS_TOO_MANY_SESSIONS: + rsp->hdr.Status = STATUS_ACCESS_DENIED; + break; + case -EINVAL: + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + break; + default: + rsp->hdr.Status = STATUS_ACCESS_DENIED; + } + + return rc; +} + +/** + * smb2_create_open_flags() - convert smb open flags to unix open flags + * @file_present: is file already present + * @access: file access flags + * @disposition: file disposition flags + * @may_flags: set with MAY_ flags + * + * Return: file open flags + */ +static int smb2_create_open_flags(bool file_present, __le32 access, + __le32 disposition, + int *may_flags) +{ + int oflags = O_NONBLOCK | O_LARGEFILE; + + if (access & FILE_READ_DESIRED_ACCESS_LE && + access & FILE_WRITE_DESIRE_ACCESS_LE) { + oflags |= O_RDWR; + *may_flags = MAY_OPEN | MAY_READ | MAY_WRITE; + } else if (access & FILE_WRITE_DESIRE_ACCESS_LE) { + oflags |= O_WRONLY; + *may_flags = MAY_OPEN | MAY_WRITE; + } else { + oflags |= O_RDONLY; + *may_flags = MAY_OPEN | MAY_READ; + } + + if (access == FILE_READ_ATTRIBUTES_LE) + oflags |= O_PATH; + + if (file_present) { + switch (disposition & FILE_CREATE_MASK_LE) { + case FILE_OPEN_LE: + case FILE_CREATE_LE: + break; + case FILE_SUPERSEDE_LE: + case FILE_OVERWRITE_LE: + case FILE_OVERWRITE_IF_LE: + oflags |= O_TRUNC; + break; + default: + break; + } + } else { + switch (disposition & FILE_CREATE_MASK_LE) { + case FILE_SUPERSEDE_LE: + case FILE_CREATE_LE: + case FILE_OPEN_IF_LE: + case FILE_OVERWRITE_IF_LE: + oflags |= O_CREAT; + break; + case FILE_OPEN_LE: + case FILE_OVERWRITE_LE: + oflags &= ~O_CREAT; + break; + default: + break; + } + } + + return oflags; +} + +/** + * smb2_tree_disconnect() - handler for smb tree connect request + * @work: smb work containing request buffer + * + * Return: 0 + */ +int smb2_tree_disconnect(struct ksmbd_work *work) +{ + struct smb2_tree_disconnect_rsp *rsp = work->response_buf; + struct ksmbd_session *sess = work->sess; + struct ksmbd_tree_connect *tcon = work->tcon; + + rsp->StructureSize = cpu_to_le16(4); + inc_rfc1001_len(rsp, 4); + + ksmbd_debug(SMB, "request\n"); + + if (!tcon) { + struct smb2_tree_disconnect_req *req = work->request_buf; + + ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + smb2_set_err_rsp(work); + return 0; + } + + ksmbd_close_tree_conn_fds(work); + ksmbd_tree_conn_disconnect(sess, tcon); + return 0; +} + +/** + * smb2_session_logoff() - handler for session log off request + * @work: smb work containing request buffer + * + * Return: 0 + */ +int smb2_session_logoff(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_logoff_rsp *rsp = work->response_buf; + struct ksmbd_session *sess = work->sess; + + rsp->StructureSize = cpu_to_le16(4); + inc_rfc1001_len(rsp, 4); 
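(Editor's aside on smb2_create_open_flags() above: the access mask picks the O_RDWR/O_WRONLY/O_RDONLY axis, while the create disposition decides O_TRUNC and O_CREAT depending on whether the file already exists. Two illustrative calls in a hypothetical harness; constants and semantics per the mapping above:

	int may_flags;

	/* existing file + read/write access + FILE_OVERWRITE_IF:
	 * yields O_NONBLOCK | O_LARGEFILE | O_RDWR | O_TRUNC,
	 * and may_flags = MAY_OPEN | MAY_READ | MAY_WRITE */
	int f1 = smb2_create_open_flags(true,
					FILE_READ_DESIRED_ACCESS_LE |
					FILE_WRITE_DESIRE_ACCESS_LE,
					FILE_OVERWRITE_IF_LE, &may_flags);

	/* missing file + attributes-only access + FILE_OPEN:
	 * O_CREAT is explicitly cleared, so the later open reports -ENOENT
	 * instead of creating the file; a bare FILE_READ_ATTRIBUTES access
	 * mask also adds O_PATH */
	int f2 = smb2_create_open_flags(false, FILE_READ_ATTRIBUTES_LE,
					FILE_OPEN_LE, &may_flags);
End of aside.)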
+ + ksmbd_debug(SMB, "request\n"); + + /* Got a valid session, set connection state */ + WARN_ON(sess->conn != conn); + + /* setting CifsExiting here may race with start_tcp_sess */ + ksmbd_conn_set_need_reconnect(work); + ksmbd_close_session_fds(work); + ksmbd_conn_wait_idle(conn); + + if (ksmbd_tree_conn_session_logoff(sess)) { + struct smb2_logoff_req *req = work->request_buf; + + ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + smb2_set_err_rsp(work); + return 0; + } + + ksmbd_destroy_file_table(&sess->file_table); + sess->state = SMB2_SESSION_EXPIRED; + + ksmbd_free_user(sess->user); + sess->user = NULL; + + /* let start_tcp_sess free connection info now */ + ksmbd_conn_set_need_negotiate(work); + return 0; +} + +/** + * create_smb2_pipe() - create IPC pipe + * @work: smb work containing request buffer + * + * Return: 0 on success, otherwise error + */ +static noinline int create_smb2_pipe(struct ksmbd_work *work) +{ + struct smb2_create_rsp *rsp = work->response_buf; + struct smb2_create_req *req = work->request_buf; + int id; + int err; + char *name; + + name = smb_strndup_from_utf16(req->Buffer, le16_to_cpu(req->NameLength), + 1, work->conn->local_nls); + if (IS_ERR(name)) { + rsp->hdr.Status = STATUS_NO_MEMORY; + err = PTR_ERR(name); + goto out; + } + + id = ksmbd_session_rpc_open(work->sess, name); + if (id < 0) { + pr_err("Unable to open RPC pipe: %d\n", id); + err = id; + goto out; + } + + rsp->hdr.Status = STATUS_SUCCESS; + rsp->StructureSize = cpu_to_le16(89); + rsp->OplockLevel = SMB2_OPLOCK_LEVEL_NONE; + rsp->Reserved = 0; + rsp->CreateAction = cpu_to_le32(FILE_OPENED); + + rsp->CreationTime = cpu_to_le64(0); + rsp->LastAccessTime = cpu_to_le64(0); + rsp->ChangeTime = cpu_to_le64(0); + rsp->AllocationSize = cpu_to_le64(0); + rsp->EndofFile = cpu_to_le64(0); + rsp->FileAttributes = ATTR_NORMAL_LE; + rsp->Reserved2 = 0; + rsp->VolatileFileId = cpu_to_le64(id); + rsp->PersistentFileId = 0; + rsp->CreateContextsOffset = 0; + rsp->CreateContextsLength = 0; + + inc_rfc1001_len(rsp, 88); /* StructureSize - 1*/ + kfree(name); + return 0; + +out: + switch (err) { + case -EINVAL: + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + break; + case -ENOSPC: + case -ENOMEM: + rsp->hdr.Status = STATUS_NO_MEMORY; + break; + } + + if (!IS_ERR(name)) + kfree(name); + + smb2_set_err_rsp(work); + return err; +} + +/** + * smb2_set_ea() - handler for setting extended attributes using set + * info command + * @eabuf: set info command buffer + * @path: dentry path for get ea + * + * Return: 0 on success, otherwise error + */ +static int smb2_set_ea(struct smb2_ea_info *eabuf, struct path *path) +{ + struct user_namespace *user_ns = mnt_user_ns(path->mnt); + char *attr_name = NULL, *value; + int rc = 0; + int next = 0; + + attr_name = kmalloc(XATTR_NAME_MAX + 1, GFP_KERNEL); + if (!attr_name) + return -ENOMEM; + + do { + if (!eabuf->EaNameLength) + goto next; + + ksmbd_debug(SMB, + "name : <%s>, name_len : %u, value_len : %u, next : %u\n", + eabuf->name, eabuf->EaNameLength, + le16_to_cpu(eabuf->EaValueLength), + le32_to_cpu(eabuf->NextEntryOffset)); + + if (eabuf->EaNameLength > + (XATTR_NAME_MAX - XATTR_USER_PREFIX_LEN)) { + rc = -EINVAL; + break; + } + + memcpy(attr_name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); + memcpy(&attr_name[XATTR_USER_PREFIX_LEN], eabuf->name, + eabuf->EaNameLength); + attr_name[XATTR_USER_PREFIX_LEN + eabuf->EaNameLength] = '\0'; + value = (char *)&eabuf->name + eabuf->EaNameLength + 1; + + if 
(!eabuf->EaValueLength) { + rc = ksmbd_vfs_casexattr_len(user_ns, + path->dentry, + attr_name, + XATTR_USER_PREFIX_LEN + + eabuf->EaNameLength); + + /* delete the EA only when it exists */ + if (rc > 0) { + rc = ksmbd_vfs_remove_xattr(user_ns, + path->dentry, + attr_name); + + if (rc < 0) { + ksmbd_debug(SMB, + "remove xattr failed(%d)\n", + rc); + break; + } + } + + /* if the EA doesn't exist, just do nothing. */ + rc = 0; + } else { + rc = ksmbd_vfs_setxattr(user_ns, + path->dentry, attr_name, value, + le16_to_cpu(eabuf->EaValueLength), 0); + if (rc < 0) { + ksmbd_debug(SMB, + "ksmbd_vfs_setxattr failed(%d)\n", + rc); + break; + } + } + +next: + next = le32_to_cpu(eabuf->NextEntryOffset); + eabuf = (struct smb2_ea_info *)((char *)eabuf + next); + } while (next != 0); + + kfree(attr_name); + return rc; +} + +static noinline int smb2_set_stream_name_xattr(struct path *path, + struct ksmbd_file *fp, + char *stream_name, int s_type) +{ + struct user_namespace *user_ns = mnt_user_ns(path->mnt); + size_t xattr_stream_size; + char *xattr_stream_name; + int rc; + + rc = ksmbd_vfs_xattr_stream_name(stream_name, + &xattr_stream_name, + &xattr_stream_size, + s_type); + if (rc) + return rc; + + fp->stream.name = xattr_stream_name; + fp->stream.size = xattr_stream_size; + + /* Check if there is stream prefix in xattr space */ + rc = ksmbd_vfs_casexattr_len(user_ns, + path->dentry, + xattr_stream_name, + xattr_stream_size); + if (rc >= 0) + return 0; + + if (fp->cdoption == FILE_OPEN_LE) { + ksmbd_debug(SMB, "XATTR stream name lookup failed: %d\n", rc); + return -EBADF; + } + + rc = ksmbd_vfs_setxattr(user_ns, path->dentry, + xattr_stream_name, NULL, 0, 0); + if (rc < 0) + pr_err("Failed to store XATTR stream name :%d\n", rc); + return 0; +} + +static int smb2_remove_smb_xattrs(struct path *path) +{ + struct user_namespace *user_ns = mnt_user_ns(path->mnt); + char *name, *xattr_list = NULL; + ssize_t xattr_list_len; + int err = 0; + + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); + if (xattr_list_len < 0) { + goto out; + } else if (!xattr_list_len) { + ksmbd_debug(SMB, "empty xattr in the file\n"); + goto out; + } + + for (name = xattr_list; name - xattr_list < xattr_list_len; + name += strlen(name) + 1) { + ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); + + if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + strncmp(&name[XATTR_USER_PREFIX_LEN], DOS_ATTRIBUTE_PREFIX, + DOS_ATTRIBUTE_PREFIX_LEN) && + strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, STREAM_PREFIX_LEN)) + continue; + + err = ksmbd_vfs_remove_xattr(user_ns, path->dentry, name); + if (err) + ksmbd_debug(SMB, "remove xattr failed : %s\n", name); + } +out: + kvfree(xattr_list); + return err; +} + +static int smb2_create_truncate(struct path *path) +{ + int rc = vfs_truncate(path, 0); + + if (rc) { + pr_err("vfs_truncate failed, rc %d\n", rc); + return rc; + } + + rc = smb2_remove_smb_xattrs(path); + if (rc == -EOPNOTSUPP) + rc = 0; + if (rc) + ksmbd_debug(SMB, + "smb2_remove_smb_xattrs failed, rc %d\n", + rc); + return rc; +} + +static void smb2_new_xattrs(struct ksmbd_tree_connect *tcon, struct path *path, + struct ksmbd_file *fp) +{ + struct xattr_dos_attrib da = {0}; + int rc; + + if (!test_share_config_flag(tcon->share_conf, + KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) + return; + + da.version = 4; + da.attr = le32_to_cpu(fp->f_ci->m_fattr); + da.itime = da.create_time = fp->create_time; + da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | + XATTR_DOSINFO_ITIME; + + rc = 
ksmbd_vfs_set_dos_attrib_xattr(mnt_user_ns(path->mnt), + path->dentry, &da); + if (rc) + ksmbd_debug(SMB, "failed to store file attribute into xattr\n"); +} + +static void smb2_update_xattrs(struct ksmbd_tree_connect *tcon, + struct path *path, struct ksmbd_file *fp) +{ + struct xattr_dos_attrib da; + int rc; + + fp->f_ci->m_fattr &= ~(ATTR_HIDDEN_LE | ATTR_SYSTEM_LE); + + /* get FileAttributes from XATTR_NAME_DOS_ATTRIBUTE */ + if (!test_share_config_flag(tcon->share_conf, + KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) + return; + + rc = ksmbd_vfs_get_dos_attrib_xattr(mnt_user_ns(path->mnt), + path->dentry, &da); + if (rc > 0) { + fp->f_ci->m_fattr = cpu_to_le32(da.attr); + fp->create_time = da.create_time; + fp->itime = da.itime; + } +} + +static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, + int open_flags, umode_t posix_mode, bool is_dir) +{ + struct ksmbd_tree_connect *tcon = work->tcon; + struct ksmbd_share_config *share = tcon->share_conf; + umode_t mode; + int rc; + + if (!(open_flags & O_CREAT)) + return -EBADF; + + ksmbd_debug(SMB, "file does not exist, so creating\n"); + if (is_dir == true) { + ksmbd_debug(SMB, "creating directory\n"); + + mode = share_config_directory_mode(share, posix_mode); + rc = ksmbd_vfs_mkdir(work, name, mode); + if (rc) + return rc; + } else { + ksmbd_debug(SMB, "creating regular file\n"); + + mode = share_config_create_mode(share, posix_mode); + rc = ksmbd_vfs_create(work, name, mode); + if (rc) + return rc; + } + + rc = ksmbd_vfs_kern_path(name, 0, path, 0); + if (rc) { + pr_err("cannot get linux path (%s), err = %d\n", + name, rc); + return rc; + } + return 0; +} + +static int smb2_create_sd_buffer(struct ksmbd_work *work, + struct smb2_create_req *req, + struct path *path) +{ + struct create_context *context; + struct create_sd_buf_req *sd_buf; + + if (!req->CreateContextsOffset) + return -ENOENT; + + /* Parse SD BUFFER create contexts */ + context = smb2_find_context_vals(req, SMB2_CREATE_SD_BUFFER); + if (!context) + return -ENOENT; + else if (IS_ERR(context)) + return PTR_ERR(context); + + ksmbd_debug(SMB, + "Set ACLs using SMB2_CREATE_SD_BUFFER context\n"); + sd_buf = (struct create_sd_buf_req *)context; + return set_info_sec(work->conn, work->tcon, path, &sd_buf->ntsd, + le32_to_cpu(sd_buf->ccontext.DataLength), true); +} + +static void ksmbd_acls_fattr(struct smb_fattr *fattr, struct inode *inode) +{ + fattr->cf_uid = inode->i_uid; + fattr->cf_gid = inode->i_gid; + fattr->cf_mode = inode->i_mode; + fattr->cf_acls = NULL; + fattr->cf_dacls = NULL; + + if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { + fattr->cf_acls = get_acl(inode, ACL_TYPE_ACCESS); + if (S_ISDIR(inode->i_mode)) + fattr->cf_dacls = get_acl(inode, ACL_TYPE_DEFAULT); + } +} + +/** + * smb2_open() - handler for smb file open request + * @work: smb work containing request buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_open(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess = work->sess; + struct ksmbd_tree_connect *tcon = work->tcon; + struct smb2_create_req *req; + struct smb2_create_rsp *rsp, *rsp_org; + struct path path; + struct ksmbd_share_config *share = tcon->share_conf; + struct ksmbd_file *fp = NULL; + struct file *filp = NULL; + struct user_namespace *user_ns = NULL; + struct kstat stat; + struct create_context *context; + struct lease_ctx_info *lc = NULL; + struct create_ea_buf_req *ea_buf = NULL; + struct oplock_info *opinfo; + __le32 *next_ptr = NULL; + int req_op_level = 0, open_flags = 0, 
may_flags = 0, file_info = 0; + int rc = 0, len = 0; + int contxt_cnt = 0, query_disk_id = 0; + int maximal_access_ctxt = 0, posix_ctxt = 0; + int s_type = 0; + int next_off = 0; + char *name = NULL; + char *stream_name = NULL; + bool file_present = false, created = false, already_permitted = false; + int share_ret, need_truncate = 0; + u64 time; + umode_t posix_mode = 0; + __le32 daccess, maximal_access = 0; + + rsp_org = work->response_buf; + WORK_BUFFERS(work, req, rsp); + + if (req->hdr.NextCommand && !work->next_smb2_rcv_hdr_off && + (req->hdr.Flags & SMB2_FLAGS_RELATED_OPERATIONS)) { + ksmbd_debug(SMB, "invalid flag in chained command\n"); + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + smb2_set_err_rsp(work); + return -EINVAL; + } + + if (test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) { + ksmbd_debug(SMB, "IPC pipe create request\n"); + return create_smb2_pipe(work); + } + + if (req->NameLength) { + if ((req->CreateOptions & FILE_DIRECTORY_FILE_LE) && + *(char *)req->Buffer == '\\') { + pr_err("directory name with a leading slash is not allowed\n"); + rc = -EINVAL; + goto err_out1; + } + + name = smb2_get_name(share, + req->Buffer, + le16_to_cpu(req->NameLength), + work->conn->local_nls); + if (IS_ERR(name)) { + rc = PTR_ERR(name); + if (rc != -ENOMEM) + rc = -ENOENT; + name = NULL; + goto err_out1; + } + + ksmbd_debug(SMB, "converted name = %s\n", name); + if (strchr(name, ':')) { + if (!test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_STREAMS)) { + rc = -EBADF; + goto err_out1; + } + rc = parse_stream_name(name, &stream_name, &s_type); + if (rc < 0) + goto err_out1; + } + + rc = ksmbd_validate_filename(name); + if (rc < 0) + goto err_out1; + + if (ksmbd_share_veto_filename(share, name)) { + rc = -ENOENT; + ksmbd_debug(SMB, "Reject open(), vetoed file: %s\n", + name); + goto err_out1; + } + } else { + len = strlen(share->path); + ksmbd_debug(SMB, "share path len %d\n", len); + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) { + rsp->hdr.Status = STATUS_NO_MEMORY; + rc = -ENOMEM; + goto err_out1; + } + + memcpy(name, share->path, len); + *(name + len) = '\0'; + } + + req_op_level = req->RequestedOplockLevel; + if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) + lc = parse_lease_state(req); + + if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE_LE)) { + pr_err("Invalid impersonation level : 0x%x\n", + le32_to_cpu(req->ImpersonationLevel)); + rc = -EIO; + rsp->hdr.Status = STATUS_BAD_IMPERSONATION_LEVEL; + goto err_out1; + } + + if (req->CreateOptions && !(req->CreateOptions & CREATE_OPTIONS_MASK)) { + pr_err("Invalid create options : 0x%x\n", + le32_to_cpu(req->CreateOptions)); + rc = -EINVAL; + goto err_out1; + } else { + if (req->CreateOptions & FILE_SEQUENTIAL_ONLY_LE && + req->CreateOptions & FILE_RANDOM_ACCESS_LE) + req->CreateOptions &= ~(FILE_SEQUENTIAL_ONLY_LE); + + if (req->CreateOptions & + (FILE_OPEN_BY_FILE_ID_LE | CREATE_TREE_CONNECTION | + FILE_RESERVE_OPFILTER_LE)) { + rc = -EOPNOTSUPP; + goto err_out1; + } + + if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { + if (req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE) { + rc = -EINVAL; + goto err_out1; + } else if (req->CreateOptions & FILE_NO_COMPRESSION_LE) { + req->CreateOptions &= ~(FILE_NO_COMPRESSION_LE); + } + } + } + + if (le32_to_cpu(req->CreateDisposition) > + le32_to_cpu(FILE_OVERWRITE_IF_LE)) { + pr_err("Invalid create disposition : 0x%x\n", + le32_to_cpu(req->CreateDisposition)); + rc = -EINVAL; + goto err_out1; + } + + if (!(req->DesiredAccess & DESIRED_ACCESS_MASK)) { 
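/* Editorial note, not part of the patch: DESIRED_ACCESS_MASK is the
 * union of every access bit the server recognizes, so a request that
 * sets none of them could be granted nothing and is rejected with
 * -EACCES below.
 */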
pr_err("Invalid desired access : 0x%x\n", + le32_to_cpu(req->DesiredAccess)); + rc = -EACCES; + goto err_out1; + } + + if (req->FileAttributes && !(req->FileAttributes & ATTR_MASK_LE)) { + pr_err("Invalid file attribute : 0x%x\n", + le32_to_cpu(req->FileAttributes)); + rc = -EINVAL; + goto err_out1; + } + + if (req->CreateContextsOffset) { + /* Parse non-durable handle create contexts */ + context = smb2_find_context_vals(req, SMB2_CREATE_EA_BUFFER); + if (IS_ERR(context)) { + rc = PTR_ERR(context); + goto err_out1; + } else if (context) { + ea_buf = (struct create_ea_buf_req *)context; + if (req->CreateOptions & FILE_NO_EA_KNOWLEDGE_LE) { + rsp->hdr.Status = STATUS_ACCESS_DENIED; + rc = -EACCES; + goto err_out1; + } + } + + context = smb2_find_context_vals(req, + SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST); + if (IS_ERR(context)) { + rc = PTR_ERR(context); + goto err_out1; + } else if (context) { + ksmbd_debug(SMB, + "get query maximal access context\n"); + maximal_access_ctxt = 1; + } + + context = smb2_find_context_vals(req, + SMB2_CREATE_TIMEWARP_REQUEST); + if (IS_ERR(context)) { + rc = PTR_ERR(context); + goto err_out1; + } else if (context) { + ksmbd_debug(SMB, "get timewarp context\n"); + rc = -EBADF; + goto err_out1; + } + + if (tcon->posix_extensions) { + context = smb2_find_context_vals(req, + SMB2_CREATE_TAG_POSIX); + if (IS_ERR(context)) { + rc = PTR_ERR(context); + goto err_out1; + } else if (context) { + struct create_posix *posix = + (struct create_posix *)context; + ksmbd_debug(SMB, "get posix context\n"); + + posix_mode = le32_to_cpu(posix->Mode); + posix_ctxt = 1; + } + } + } + + if (ksmbd_override_fsids(work)) { + rc = -ENOMEM; + goto err_out1; + } + + if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) { + /* + * On delete request, instead of following up, need to + * look the current entity + */ + rc = ksmbd_vfs_kern_path(name, 0, &path, 1); + if (!rc) { + /* + * If file exists with under flags, return access + * denied error. 
+ */ + if (req->CreateDisposition == FILE_OVERWRITE_IF_LE || + req->CreateDisposition == FILE_OPEN_IF_LE) { + rc = -EACCES; + path_put(&path); + goto err_out; + } + + if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, + "User does not have write permission\n"); + rc = -EACCES; + path_put(&path); + goto err_out; + } + } + } else { + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) { + /* + * Use LOOKUP_FOLLOW to follow the path of + * symlink in path buildup + */ + rc = ksmbd_vfs_kern_path(name, LOOKUP_FOLLOW, &path, 1); + if (rc) { /* Case for broken link ?*/ + rc = ksmbd_vfs_kern_path(name, 0, &path, 1); + } + } else { + rc = ksmbd_vfs_kern_path(name, 0, &path, 1); + if (!rc && d_is_symlink(path.dentry)) { + rc = -EACCES; + path_put(&path); + goto err_out; + } + } + } + + if (rc) { + if (rc == -EACCES) { + ksmbd_debug(SMB, + "User does not have right permission\n"); + goto err_out; + } + ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n", + name, rc); + rc = 0; + } else { + file_present = true; + user_ns = mnt_user_ns(path.mnt); + generic_fillattr(user_ns, d_inode(path.dentry), &stat); + } + if (stream_name) { + if (req->CreateOptions & FILE_DIRECTORY_FILE_LE) { + if (s_type == DATA_STREAM) { + rc = -EIO; + rsp->hdr.Status = STATUS_NOT_A_DIRECTORY; + } + } else { + if (S_ISDIR(stat.mode) && s_type == DATA_STREAM) { + rc = -EIO; + rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY; + } + } + + if (req->CreateOptions & FILE_DIRECTORY_FILE_LE && + req->FileAttributes & ATTR_NORMAL_LE) { + rsp->hdr.Status = STATUS_NOT_A_DIRECTORY; + rc = -EIO; + } + + if (rc < 0) + goto err_out; + } + + if (file_present && req->CreateOptions & FILE_NON_DIRECTORY_FILE_LE && + S_ISDIR(stat.mode) && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { + ksmbd_debug(SMB, "open() argument is a directory: %s, %x\n", + name, req->CreateOptions); + rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY; + rc = -EIO; + goto err_out; + } + + if (file_present && (req->CreateOptions & FILE_DIRECTORY_FILE_LE) && + !(req->CreateDisposition == FILE_CREATE_LE) && + !S_ISDIR(stat.mode)) { + rsp->hdr.Status = STATUS_NOT_A_DIRECTORY; + rc = -EIO; + goto err_out; + } + + if (!stream_name && file_present && + req->CreateDisposition == FILE_CREATE_LE) { + rc = -EEXIST; + goto err_out; + } + + daccess = smb_map_generic_desired_access(req->DesiredAccess); + + if (file_present && !(req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { + rc = smb_check_perm_dacl(conn, &path, &daccess, + sess->user->uid); + if (rc) + goto err_out; + } + + if (daccess & FILE_MAXIMAL_ACCESS_LE) { + if (!file_present) { + daccess = cpu_to_le32(GENERIC_ALL_FLAGS); + } else { + rc = ksmbd_vfs_query_maximal_access(user_ns, + path.dentry, + &daccess); + if (rc) + goto err_out; + already_permitted = true; + } + maximal_access = daccess; + } + + open_flags = smb2_create_open_flags(file_present, daccess, + req->CreateDisposition, + &may_flags); + + if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + if (open_flags & O_CREAT) { + ksmbd_debug(SMB, + "User does not have write permission\n"); + rc = -EACCES; + goto err_out; + } + } + + /*create file if not present */ + if (!file_present) { + rc = smb2_creat(work, &path, name, open_flags, posix_mode, + req->CreateOptions & FILE_DIRECTORY_FILE_LE); + if (rc) { + if (rc == -ENOENT) { + rc = -EIO; + rsp->hdr.Status = STATUS_OBJECT_PATH_NOT_FOUND; + } + goto err_out; + } + + created = true; + user_ns = mnt_user_ns(path.mnt); + if (ea_buf) { + rc = 
smb2_set_ea(&ea_buf->ea, &path); + if (rc == -EOPNOTSUPP) + rc = 0; + else if (rc) + goto err_out; + } + } else if (!already_permitted) { + /* FILE_READ_ATTRIBUTE is allowed without inode_permission, + * because execute(search) permission on a parent directory, + * is already granted. + */ + if (daccess & ~(FILE_READ_ATTRIBUTES_LE | FILE_READ_CONTROL_LE)) { + rc = inode_permission(user_ns, + d_inode(path.dentry), + may_flags); + if (rc) + goto err_out; + + if ((daccess & FILE_DELETE_LE) || + (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE)) { + rc = ksmbd_vfs_may_delete(user_ns, + path.dentry); + if (rc) + goto err_out; + } + } + } + + rc = ksmbd_query_inode_status(d_inode(path.dentry->d_parent)); + if (rc == KSMBD_INODE_STATUS_PENDING_DELETE) { + rc = -EBUSY; + goto err_out; + } + + rc = 0; + filp = dentry_open(&path, open_flags, current_cred()); + if (IS_ERR(filp)) { + rc = PTR_ERR(filp); + pr_err("dentry open for dir failed, rc %d\n", rc); + goto err_out; + } + + if (file_present) { + if (!(open_flags & O_TRUNC)) + file_info = FILE_OPENED; + else + file_info = FILE_OVERWRITTEN; + + if ((req->CreateDisposition & FILE_CREATE_MASK_LE) == + FILE_SUPERSEDE_LE) + file_info = FILE_SUPERSEDED; + } else if (open_flags & O_CREAT) { + file_info = FILE_CREATED; + } + + ksmbd_vfs_set_fadvise(filp, req->CreateOptions); + + /* Obtain Volatile-ID */ + fp = ksmbd_open_fd(work, filp); + if (IS_ERR(fp)) { + fput(filp); + rc = PTR_ERR(fp); + fp = NULL; + goto err_out; + } + + /* Get Persistent-ID */ + ksmbd_open_durable_fd(fp); + if (!has_file_id(fp->persistent_id)) { + rc = -ENOMEM; + goto err_out; + } + + fp->filename = name; + fp->cdoption = req->CreateDisposition; + fp->daccess = daccess; + fp->saccess = req->ShareAccess; + fp->coption = req->CreateOptions; + + /* Set default windows and posix acls if creating new file */ + if (created) { + int posix_acl_rc; + struct inode *inode = d_inode(path.dentry); + + posix_acl_rc = ksmbd_vfs_inherit_posix_acl(user_ns, + inode, + d_inode(path.dentry->d_parent)); + if (posix_acl_rc) + ksmbd_debug(SMB, "inherit posix acl failed : %d\n", posix_acl_rc); + + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_ACL_XATTR)) { + rc = smb_inherit_dacl(conn, &path, sess->user->uid, + sess->user->gid); + } + + if (rc) { + rc = smb2_create_sd_buffer(work, req, &path); + if (rc) { + if (posix_acl_rc) + ksmbd_vfs_set_init_posix_acl(user_ns, + inode); + + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_ACL_XATTR)) { + struct smb_fattr fattr; + struct smb_ntsd *pntsd; + int pntsd_size, ace_num = 0; + + ksmbd_acls_fattr(&fattr, inode); + if (fattr.cf_acls) + ace_num = fattr.cf_acls->a_count; + if (fattr.cf_dacls) + ace_num += fattr.cf_dacls->a_count; + + pntsd = kmalloc(sizeof(struct smb_ntsd) + + sizeof(struct smb_sid) * 3 + + sizeof(struct smb_acl) + + sizeof(struct smb_ace) * ace_num * 2, + GFP_KERNEL); + if (!pntsd) + goto err_out; + + rc = build_sec_desc(user_ns, + pntsd, NULL, + OWNER_SECINFO | + GROUP_SECINFO | + DACL_SECINFO, + &pntsd_size, &fattr); + posix_acl_release(fattr.cf_acls); + posix_acl_release(fattr.cf_dacls); + + rc = ksmbd_vfs_set_sd_xattr(conn, + user_ns, + path.dentry, + pntsd, + pntsd_size); + kfree(pntsd); + if (rc) + pr_err("failed to store ntacl in xattr : %d\n", + rc); + } + } + } + rc = 0; + } + + if (stream_name) { + rc = smb2_set_stream_name_xattr(&path, + fp, + stream_name, + s_type); + if (rc) + goto err_out; + file_info = FILE_CREATED; + } + + fp->attrib_only = !(req->DesiredAccess & ~(FILE_READ_ATTRIBUTES_LE 
| + FILE_WRITE_ATTRIBUTES_LE | FILE_SYNCHRONIZE_LE)); + if (!S_ISDIR(file_inode(filp)->i_mode) && open_flags & O_TRUNC && + !fp->attrib_only && !stream_name) { + smb_break_all_oplock(work, fp); + need_truncate = 1; + } + + /* fp should be searchable through ksmbd_inode.m_fp_list + * after daccess, saccess, attrib_only, and stream are + * initialized. + */ + write_lock(&fp->f_ci->m_lock); + list_add(&fp->node, &fp->f_ci->m_fp_list); + write_unlock(&fp->f_ci->m_lock); + + rc = ksmbd_vfs_getattr(&path, &stat); + if (rc) { + generic_fillattr(user_ns, d_inode(path.dentry), &stat); + rc = 0; + } + + /* Check delete pending among previous fp before oplock break */ + if (ksmbd_inode_pending_delete(fp)) { + rc = -EBUSY; + goto err_out; + } + + share_ret = ksmbd_smb_check_shared_mode(fp->filp, fp); + if (!test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_OPLOCKS) || + (req_op_level == SMB2_OPLOCK_LEVEL_LEASE && + !(conn->vals->capabilities & SMB2_GLOBAL_CAP_LEASING))) { + if (share_ret < 0 && !S_ISDIR(file_inode(fp->filp)->i_mode)) { + rc = share_ret; + goto err_out; + } + } else { + if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) { + req_op_level = smb2_map_lease_to_oplock(lc->req_state); + ksmbd_debug(SMB, + "lease req for(%s) req oplock state 0x%x, lease state 0x%x\n", + name, req_op_level, lc->req_state); + rc = find_same_lease_key(sess, fp->f_ci, lc); + if (rc) + goto err_out; + } else if (open_flags == O_RDONLY && + (req_op_level == SMB2_OPLOCK_LEVEL_BATCH || + req_op_level == SMB2_OPLOCK_LEVEL_EXCLUSIVE)) + req_op_level = SMB2_OPLOCK_LEVEL_II; + + rc = smb_grant_oplock(work, req_op_level, + fp->persistent_id, fp, + le32_to_cpu(req->hdr.Id.SyncId.TreeId), + lc, share_ret); + if (rc < 0) + goto err_out; + } + + if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) + ksmbd_fd_set_delete_on_close(fp, file_info); + + if (need_truncate) { + rc = smb2_create_truncate(&path); + if (rc) + goto err_out; + } + + if (req->CreateContextsOffset) { + struct create_alloc_size_req *az_req; + + az_req = (struct create_alloc_size_req *)smb2_find_context_vals(req, + SMB2_CREATE_ALLOCATION_SIZE); + if (IS_ERR(az_req)) { + rc = PTR_ERR(az_req); + goto err_out; + } else if (az_req) { + loff_t alloc_size = le64_to_cpu(az_req->AllocationSize); + int err; + + ksmbd_debug(SMB, + "request smb2 create allocate size : %llu\n", + alloc_size); + smb_break_all_levII_oplock(work, fp, 1); + err = vfs_fallocate(fp->filp, FALLOC_FL_KEEP_SIZE, 0, + alloc_size); + if (err < 0) + ksmbd_debug(SMB, + "vfs_fallocate is failed : %d\n", + err); + } + + context = smb2_find_context_vals(req, SMB2_CREATE_QUERY_ON_DISK_ID); + if (IS_ERR(context)) { + rc = PTR_ERR(context); + goto err_out; + } else if (context) { + ksmbd_debug(SMB, "get query on disk id context\n"); + query_disk_id = 1; + } + } + + if (stat.result_mask & STATX_BTIME) + fp->create_time = ksmbd_UnixTimeToNT(stat.btime); + else + fp->create_time = ksmbd_UnixTimeToNT(stat.ctime); + if (req->FileAttributes || fp->f_ci->m_fattr == 0) + fp->f_ci->m_fattr = + cpu_to_le32(smb2_get_dos_mode(&stat, le32_to_cpu(req->FileAttributes))); + + if (!created) + smb2_update_xattrs(tcon, &path, fp); + else + smb2_new_xattrs(tcon, &path, fp); + + memcpy(fp->client_guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE); + + generic_fillattr(user_ns, file_inode(fp->filp), + &stat); + + rsp->StructureSize = cpu_to_le16(89); + rcu_read_lock(); + opinfo = rcu_dereference(fp->f_opinfo); + rsp->OplockLevel = opinfo != NULL ? 
opinfo->level : 0; + rcu_read_unlock(); + rsp->Reserved = 0; + rsp->CreateAction = cpu_to_le32(file_info); + rsp->CreationTime = cpu_to_le64(fp->create_time); + time = ksmbd_UnixTimeToNT(stat.atime); + rsp->LastAccessTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(stat.mtime); + rsp->LastWriteTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(stat.ctime); + rsp->ChangeTime = cpu_to_le64(time); + rsp->AllocationSize = S_ISDIR(stat.mode) ? 0 : + cpu_to_le64(stat.blocks << 9); + rsp->EndofFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + rsp->FileAttributes = fp->f_ci->m_fattr; + + rsp->Reserved2 = 0; + + rsp->PersistentFileId = cpu_to_le64(fp->persistent_id); + rsp->VolatileFileId = cpu_to_le64(fp->volatile_id); + + rsp->CreateContextsOffset = 0; + rsp->CreateContextsLength = 0; + inc_rfc1001_len(rsp_org, 88); /* StructureSize - 1*/ + + /* If lease is request send lease context response */ + if (opinfo && opinfo->is_lease) { + struct create_context *lease_ccontext; + + ksmbd_debug(SMB, "lease granted on(%s) lease state 0x%x\n", + name, opinfo->o_lease->state); + rsp->OplockLevel = SMB2_OPLOCK_LEVEL_LEASE; + + lease_ccontext = (struct create_context *)rsp->Buffer; + contxt_cnt++; + create_lease_buf(rsp->Buffer, opinfo->o_lease); + le32_add_cpu(&rsp->CreateContextsLength, + conn->vals->create_lease_size); + inc_rfc1001_len(rsp_org, conn->vals->create_lease_size); + next_ptr = &lease_ccontext->Next; + next_off = conn->vals->create_lease_size; + } + + if (maximal_access_ctxt) { + struct create_context *mxac_ccontext; + + if (maximal_access == 0) + ksmbd_vfs_query_maximal_access(user_ns, + path.dentry, + &maximal_access); + mxac_ccontext = (struct create_context *)(rsp->Buffer + + le32_to_cpu(rsp->CreateContextsLength)); + contxt_cnt++; + create_mxac_rsp_buf(rsp->Buffer + + le32_to_cpu(rsp->CreateContextsLength), + le32_to_cpu(maximal_access)); + le32_add_cpu(&rsp->CreateContextsLength, + conn->vals->create_mxac_size); + inc_rfc1001_len(rsp_org, conn->vals->create_mxac_size); + if (next_ptr) + *next_ptr = cpu_to_le32(next_off); + next_ptr = &mxac_ccontext->Next; + next_off = conn->vals->create_mxac_size; + } + + if (query_disk_id) { + struct create_context *disk_id_ccontext; + + disk_id_ccontext = (struct create_context *)(rsp->Buffer + + le32_to_cpu(rsp->CreateContextsLength)); + contxt_cnt++; + create_disk_id_rsp_buf(rsp->Buffer + + le32_to_cpu(rsp->CreateContextsLength), + stat.ino, tcon->id); + le32_add_cpu(&rsp->CreateContextsLength, + conn->vals->create_disk_id_size); + inc_rfc1001_len(rsp_org, conn->vals->create_disk_id_size); + if (next_ptr) + *next_ptr = cpu_to_le32(next_off); + next_ptr = &disk_id_ccontext->Next; + next_off = conn->vals->create_disk_id_size; + } + + if (posix_ctxt) { + contxt_cnt++; + create_posix_rsp_buf(rsp->Buffer + + le32_to_cpu(rsp->CreateContextsLength), + fp); + le32_add_cpu(&rsp->CreateContextsLength, + conn->vals->create_posix_size); + inc_rfc1001_len(rsp_org, conn->vals->create_posix_size); + if (next_ptr) + *next_ptr = cpu_to_le32(next_off); + } + + if (contxt_cnt > 0) { + rsp->CreateContextsOffset = + cpu_to_le32(offsetof(struct smb2_create_rsp, Buffer) + - 4); + } + +err_out: + if (file_present || created) + path_put(&path); + ksmbd_revert_fsids(work); +err_out1: + if (rc) { + if (rc == -EINVAL) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + else if (rc == -EOPNOTSUPP) + rsp->hdr.Status = STATUS_NOT_SUPPORTED; + else if (rc == -EACCES || rc == -ESTALE) + rsp->hdr.Status = STATUS_ACCESS_DENIED; + else if (rc == -ENOENT) + rsp->hdr.Status = 
STATUS_OBJECT_NAME_INVALID; + else if (rc == -EPERM) + rsp->hdr.Status = STATUS_SHARING_VIOLATION; + else if (rc == -EBUSY) + rsp->hdr.Status = STATUS_DELETE_PENDING; + else if (rc == -EBADF) + rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND; + else if (rc == -ENOEXEC) + rsp->hdr.Status = STATUS_DUPLICATE_OBJECTID; + else if (rc == -ENXIO) + rsp->hdr.Status = STATUS_NO_SUCH_DEVICE; + else if (rc == -EEXIST) + rsp->hdr.Status = STATUS_OBJECT_NAME_COLLISION; + else if (rc == -EMFILE) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + if (!rsp->hdr.Status) + rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; + + if (!fp || !fp->filename) + kfree(name); + if (fp) + ksmbd_fd_put(work, fp); + smb2_set_err_rsp(work); + ksmbd_debug(SMB, "Error response: %x\n", rsp->hdr.Status); + } + + kfree(lc); + + return 0; +} + +static int readdir_info_level_struct_sz(int info_level) +{ + switch (info_level) { + case FILE_FULL_DIRECTORY_INFORMATION: + return sizeof(struct file_full_directory_info); + case FILE_BOTH_DIRECTORY_INFORMATION: + return sizeof(struct file_both_directory_info); + case FILE_DIRECTORY_INFORMATION: + return sizeof(struct file_directory_info); + case FILE_NAMES_INFORMATION: + return sizeof(struct file_names_info); + case FILEID_FULL_DIRECTORY_INFORMATION: + return sizeof(struct file_id_full_dir_info); + case FILEID_BOTH_DIRECTORY_INFORMATION: + return sizeof(struct file_id_both_directory_info); + case SMB_FIND_FILE_POSIX_INFO: + return sizeof(struct smb2_posix_info); + default: + return -EOPNOTSUPP; + } +} + +static int dentry_name(struct ksmbd_dir_info *d_info, int info_level) +{ + switch (info_level) { + case FILE_FULL_DIRECTORY_INFORMATION: + { + struct file_full_directory_info *ffdinfo; + + ffdinfo = (struct file_full_directory_info *)d_info->rptr; + d_info->rptr += le32_to_cpu(ffdinfo->NextEntryOffset); + d_info->name = ffdinfo->FileName; + d_info->name_len = le32_to_cpu(ffdinfo->FileNameLength); + return 0; + } + case FILE_BOTH_DIRECTORY_INFORMATION: + { + struct file_both_directory_info *fbdinfo; + + fbdinfo = (struct file_both_directory_info *)d_info->rptr; + d_info->rptr += le32_to_cpu(fbdinfo->NextEntryOffset); + d_info->name = fbdinfo->FileName; + d_info->name_len = le32_to_cpu(fbdinfo->FileNameLength); + return 0; + } + case FILE_DIRECTORY_INFORMATION: + { + struct file_directory_info *fdinfo; + + fdinfo = (struct file_directory_info *)d_info->rptr; + d_info->rptr += le32_to_cpu(fdinfo->NextEntryOffset); + d_info->name = fdinfo->FileName; + d_info->name_len = le32_to_cpu(fdinfo->FileNameLength); + return 0; + } + case FILE_NAMES_INFORMATION: + { + struct file_names_info *fninfo; + + fninfo = (struct file_names_info *)d_info->rptr; + d_info->rptr += le32_to_cpu(fninfo->NextEntryOffset); + d_info->name = fninfo->FileName; + d_info->name_len = le32_to_cpu(fninfo->FileNameLength); + return 0; + } + case FILEID_FULL_DIRECTORY_INFORMATION: + { + struct file_id_full_dir_info *dinfo; + + dinfo = (struct file_id_full_dir_info *)d_info->rptr; + d_info->rptr += le32_to_cpu(dinfo->NextEntryOffset); + d_info->name = dinfo->FileName; + d_info->name_len = le32_to_cpu(dinfo->FileNameLength); + return 0; + } + case FILEID_BOTH_DIRECTORY_INFORMATION: + { + struct file_id_both_directory_info *fibdinfo; + + fibdinfo = (struct file_id_both_directory_info *)d_info->rptr; + d_info->rptr += le32_to_cpu(fibdinfo->NextEntryOffset); + d_info->name = fibdinfo->FileName; + d_info->name_len = le32_to_cpu(fibdinfo->FileNameLength); + return 0; + } + case SMB_FIND_FILE_POSIX_INFO: + { + struct smb2_posix_info 
*posix_info; + + posix_info = (struct smb2_posix_info *)d_info->rptr; + d_info->rptr += le32_to_cpu(posix_info->NextEntryOffset); + d_info->name = posix_info->name; + d_info->name_len = le32_to_cpu(posix_info->name_len); + return 0; + } + default: + return -EINVAL; + } +} + +/** + * smb2_populate_readdir_entry() - encode directory entry in smb2 response + * buffer + * @conn: connection instance + * @info_level: smb information level + * @d_info: structure included variables for query dir + * @user_ns: user namespace + * @ksmbd_kstat: ksmbd wrapper of dirent stat information + * + * if directory has many entries, find first can't read it fully. + * find next might be called multiple times to read remaining dir entries + * + * Return: 0 on success, otherwise error + */ +static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, + struct ksmbd_dir_info *d_info, + struct user_namespace *user_ns, + struct ksmbd_kstat *ksmbd_kstat) +{ + int next_entry_offset = 0; + char *conv_name; + int conv_len; + void *kstat; + int struct_sz, rc = 0; + + conv_name = ksmbd_convert_dir_info_name(d_info, + conn->local_nls, + &conv_len); + if (!conv_name) + return -ENOMEM; + + /* Somehow the name has only terminating NULL bytes */ + if (conv_len < 0) { + rc = -EINVAL; + goto free_conv_name; + } + + struct_sz = readdir_info_level_struct_sz(info_level); + next_entry_offset = ALIGN(struct_sz - 1 + conv_len, + KSMBD_DIR_INFO_ALIGNMENT); + + if (next_entry_offset > d_info->out_buf_len) { + d_info->out_buf_len = 0; + rc = -ENOSPC; + goto free_conv_name; + } + + kstat = d_info->wptr; + if (info_level != FILE_NAMES_INFORMATION) + kstat = ksmbd_vfs_init_kstat(&d_info->wptr, ksmbd_kstat); + + switch (info_level) { + case FILE_FULL_DIRECTORY_INFORMATION: + { + struct file_full_directory_info *ffdinfo; + + ffdinfo = (struct file_full_directory_info *)kstat; + ffdinfo->FileNameLength = cpu_to_le32(conv_len); + ffdinfo->EaSize = + smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode); + if (ffdinfo->EaSize) + ffdinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE; + if (d_info->hide_dot_file && d_info->name[0] == '.') + ffdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE; + memcpy(ffdinfo->FileName, conv_name, conv_len); + ffdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILE_BOTH_DIRECTORY_INFORMATION: + { + struct file_both_directory_info *fbdinfo; + + fbdinfo = (struct file_both_directory_info *)kstat; + fbdinfo->FileNameLength = cpu_to_le32(conv_len); + fbdinfo->EaSize = + smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode); + if (fbdinfo->EaSize) + fbdinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE; + fbdinfo->ShortNameLength = 0; + fbdinfo->Reserved = 0; + if (d_info->hide_dot_file && d_info->name[0] == '.') + fbdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE; + memcpy(fbdinfo->FileName, conv_name, conv_len); + fbdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILE_DIRECTORY_INFORMATION: + { + struct file_directory_info *fdinfo; + + fdinfo = (struct file_directory_info *)kstat; + fdinfo->FileNameLength = cpu_to_le32(conv_len); + if (d_info->hide_dot_file && d_info->name[0] == '.') + fdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE; + memcpy(fdinfo->FileName, conv_name, conv_len); + fdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILE_NAMES_INFORMATION: + { + struct file_names_info *fninfo; + + fninfo = (struct file_names_info *)kstat; + fninfo->FileNameLength = cpu_to_le32(conv_len); + memcpy(fninfo->FileName, 
conv_name, conv_len); + fninfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILEID_FULL_DIRECTORY_INFORMATION: + { + struct file_id_full_dir_info *dinfo; + + dinfo = (struct file_id_full_dir_info *)kstat; + dinfo->FileNameLength = cpu_to_le32(conv_len); + dinfo->EaSize = + smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode); + if (dinfo->EaSize) + dinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE; + dinfo->Reserved = 0; + dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); + if (d_info->hide_dot_file && d_info->name[0] == '.') + dinfo->ExtFileAttributes |= ATTR_HIDDEN_LE; + memcpy(dinfo->FileName, conv_name, conv_len); + dinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILEID_BOTH_DIRECTORY_INFORMATION: + { + struct file_id_both_directory_info *fibdinfo; + + fibdinfo = (struct file_id_both_directory_info *)kstat; + fibdinfo->FileNameLength = cpu_to_le32(conv_len); + fibdinfo->EaSize = + smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode); + if (fibdinfo->EaSize) + fibdinfo->ExtFileAttributes = ATTR_REPARSE_POINT_LE; + fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); + fibdinfo->ShortNameLength = 0; + fibdinfo->Reserved = 0; + fibdinfo->Reserved2 = cpu_to_le16(0); + if (d_info->hide_dot_file && d_info->name[0] == '.') + fibdinfo->ExtFileAttributes |= ATTR_HIDDEN_LE; + memcpy(fibdinfo->FileName, conv_name, conv_len); + fibdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case SMB_FIND_FILE_POSIX_INFO: + { + struct smb2_posix_info *posix_info; + u64 time; + + posix_info = (struct smb2_posix_info *)kstat; + posix_info->Ignored = 0; + posix_info->CreationTime = cpu_to_le64(ksmbd_kstat->create_time); + time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime); + posix_info->ChangeTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->atime); + posix_info->LastAccessTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->mtime); + posix_info->LastWriteTime = cpu_to_le64(time); + posix_info->EndOfFile = cpu_to_le64(ksmbd_kstat->kstat->size); + posix_info->AllocationSize = cpu_to_le64(ksmbd_kstat->kstat->blocks << 9); + posix_info->DeviceId = cpu_to_le32(ksmbd_kstat->kstat->rdev); + posix_info->HardLinks = cpu_to_le32(ksmbd_kstat->kstat->nlink); + posix_info->Mode = cpu_to_le32(ksmbd_kstat->kstat->mode); + posix_info->Inode = cpu_to_le64(ksmbd_kstat->kstat->ino); + posix_info->DosAttributes = + S_ISDIR(ksmbd_kstat->kstat->mode) ? 
ATTR_DIRECTORY_LE : ATTR_ARCHIVE_LE; + if (d_info->hide_dot_file && d_info->name[0] == '.') + posix_info->DosAttributes |= ATTR_HIDDEN_LE; + id_to_sid(from_kuid(user_ns, ksmbd_kstat->kstat->uid), + SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]); + id_to_sid(from_kgid(user_ns, ksmbd_kstat->kstat->gid), + SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]); + memcpy(posix_info->name, conv_name, conv_len); + posix_info->name_len = cpu_to_le32(conv_len); + posix_info->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + + } /* switch (info_level) */ + + d_info->last_entry_offset = d_info->data_count; + d_info->data_count += next_entry_offset; + d_info->out_buf_len -= next_entry_offset; + d_info->wptr += next_entry_offset; + + ksmbd_debug(SMB, + "info_level : %d, buf_len :%d, next_offset : %d, data_count : %d\n", + info_level, d_info->out_buf_len, + next_entry_offset, d_info->data_count); + +free_conv_name: + kfree(conv_name); + return rc; +} + +struct smb2_query_dir_private { + struct ksmbd_work *work; + char *search_pattern; + struct ksmbd_file *dir_fp; + + struct ksmbd_dir_info *d_info; + int info_level; +}; + +static void lock_dir(struct ksmbd_file *dir_fp) +{ + struct dentry *dir = dir_fp->filp->f_path.dentry; + + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); +} + +static void unlock_dir(struct ksmbd_file *dir_fp) +{ + struct dentry *dir = dir_fp->filp->f_path.dentry; + + inode_unlock(d_inode(dir)); +} + +static int process_query_dir_entries(struct smb2_query_dir_private *priv) +{ + struct user_namespace *user_ns = file_mnt_user_ns(priv->dir_fp->filp); + struct kstat kstat; + struct ksmbd_kstat ksmbd_kstat; + int rc; + int i; + + for (i = 0; i < priv->d_info->num_entry; i++) { + struct dentry *dent; + + if (dentry_name(priv->d_info, priv->info_level)) + return -EINVAL; + + lock_dir(priv->dir_fp); + dent = lookup_one_len(priv->d_info->name, + priv->dir_fp->filp->f_path.dentry, + priv->d_info->name_len); + unlock_dir(priv->dir_fp); + + if (IS_ERR(dent)) { + ksmbd_debug(SMB, "Cannot lookup `%s' [%ld]\n", + priv->d_info->name, + PTR_ERR(dent)); + continue; + } + if (unlikely(d_is_negative(dent))) { + dput(dent); + ksmbd_debug(SMB, "Negative dentry `%s'\n", + priv->d_info->name); + continue; + } + + ksmbd_kstat.kstat = &kstat; + if (priv->info_level != FILE_NAMES_INFORMATION) + ksmbd_vfs_fill_dentry_attrs(priv->work, + user_ns, + dent, + &ksmbd_kstat); + + rc = smb2_populate_readdir_entry(priv->work->conn, + priv->info_level, + priv->d_info, + user_ns, + &ksmbd_kstat); + dput(dent); + if (rc) + return rc; + } + return 0; +} + +static int reserve_populate_dentry(struct ksmbd_dir_info *d_info, + int info_level) +{ + int struct_sz; + int conv_len; + int next_entry_offset; + + struct_sz = readdir_info_level_struct_sz(info_level); + if (struct_sz == -EOPNOTSUPP) + return -EOPNOTSUPP; + + conv_len = (d_info->name_len + 1) * 2; + next_entry_offset = ALIGN(struct_sz - 1 + conv_len, + KSMBD_DIR_INFO_ALIGNMENT); + + if (next_entry_offset > d_info->out_buf_len) { + d_info->out_buf_len = 0; + return -ENOSPC; + } + + switch (info_level) { + case FILE_FULL_DIRECTORY_INFORMATION: + { + struct file_full_directory_info *ffdinfo; + + ffdinfo = (struct file_full_directory_info *)d_info->wptr; + memcpy(ffdinfo->FileName, d_info->name, d_info->name_len); + ffdinfo->FileName[d_info->name_len] = 0x00; + ffdinfo->FileNameLength = cpu_to_le32(d_info->name_len); + ffdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILE_BOTH_DIRECTORY_INFORMATION: + { + struct 
file_both_directory_info *fbdinfo; + + fbdinfo = (struct file_both_directory_info *)d_info->wptr; + memcpy(fbdinfo->FileName, d_info->name, d_info->name_len); + fbdinfo->FileName[d_info->name_len] = 0x00; + fbdinfo->FileNameLength = cpu_to_le32(d_info->name_len); + fbdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILE_DIRECTORY_INFORMATION: + { + struct file_directory_info *fdinfo; + + fdinfo = (struct file_directory_info *)d_info->wptr; + memcpy(fdinfo->FileName, d_info->name, d_info->name_len); + fdinfo->FileName[d_info->name_len] = 0x00; + fdinfo->FileNameLength = cpu_to_le32(d_info->name_len); + fdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILE_NAMES_INFORMATION: + { + struct file_names_info *fninfo; + + fninfo = (struct file_names_info *)d_info->wptr; + memcpy(fninfo->FileName, d_info->name, d_info->name_len); + fninfo->FileName[d_info->name_len] = 0x00; + fninfo->FileNameLength = cpu_to_le32(d_info->name_len); + fninfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILEID_FULL_DIRECTORY_INFORMATION: + { + struct file_id_full_dir_info *dinfo; + + dinfo = (struct file_id_full_dir_info *)d_info->wptr; + memcpy(dinfo->FileName, d_info->name, d_info->name_len); + dinfo->FileName[d_info->name_len] = 0x00; + dinfo->FileNameLength = cpu_to_le32(d_info->name_len); + dinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case FILEID_BOTH_DIRECTORY_INFORMATION: + { + struct file_id_both_directory_info *fibdinfo; + + fibdinfo = (struct file_id_both_directory_info *)d_info->wptr; + memcpy(fibdinfo->FileName, d_info->name, d_info->name_len); + fibdinfo->FileName[d_info->name_len] = 0x00; + fibdinfo->FileNameLength = cpu_to_le32(d_info->name_len); + fibdinfo->NextEntryOffset = cpu_to_le32(next_entry_offset); + break; + } + case SMB_FIND_FILE_POSIX_INFO: + { + struct smb2_posix_info *posix_info; + + posix_info = (struct smb2_posix_info *)d_info->wptr; + memcpy(posix_info->name, d_info->name, d_info->name_len); + posix_info->name[d_info->name_len] = 0x00; + posix_info->name_len = cpu_to_le32(d_info->name_len); + posix_info->NextEntryOffset = + cpu_to_le32(next_entry_offset); + break; + } + } /* switch (info_level) */ + + d_info->num_entry++; + d_info->out_buf_len -= next_entry_offset; + d_info->wptr += next_entry_offset; + return 0; +} + +static int __query_dir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ksmbd_readdir_data *buf; + struct smb2_query_dir_private *priv; + struct ksmbd_dir_info *d_info; + int rc; + + buf = container_of(ctx, struct ksmbd_readdir_data, ctx); + priv = buf->private; + d_info = priv->d_info; + + /* dot and dotdot entries are already reserved */ + if (!strcmp(".", name) || !strcmp("..", name)) + return 0; + if (ksmbd_share_veto_filename(priv->work->tcon->share_conf, name)) + return 0; + if (!match_pattern(name, namlen, priv->search_pattern)) + return 0; + + d_info->name = name; + d_info->name_len = namlen; + rc = reserve_populate_dentry(d_info, priv->info_level); + if (rc) + return rc; + if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) { + d_info->out_buf_len = 0; + return 0; + } + return 0; +} + +static void restart_ctx(struct dir_context *ctx) +{ + ctx->pos = 0; +} + +static int verify_info_level(int info_level) +{ + switch (info_level) { + case FILE_FULL_DIRECTORY_INFORMATION: + case FILE_BOTH_DIRECTORY_INFORMATION: + case FILE_DIRECTORY_INFORMATION: + case FILE_NAMES_INFORMATION: + case 
FILEID_FULL_DIRECTORY_INFORMATION: + case FILEID_BOTH_DIRECTORY_INFORMATION: + case SMB_FIND_FILE_POSIX_INFO: + break; + default: + return -EOPNOTSUPP; + } + + return 0; +} + +int smb2_query_dir(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_query_directory_req *req; + struct smb2_query_directory_rsp *rsp, *rsp_org; + struct ksmbd_share_config *share = work->tcon->share_conf; + struct ksmbd_file *dir_fp = NULL; + struct ksmbd_dir_info d_info; + int rc = 0; + char *srch_ptr = NULL; + unsigned char srch_flag; + int buffer_sz; + struct smb2_query_dir_private query_dir_private = {NULL, }; + + rsp_org = work->response_buf; + WORK_BUFFERS(work, req, rsp); + + if (ksmbd_override_fsids(work)) { + rsp->hdr.Status = STATUS_NO_MEMORY; + smb2_set_err_rsp(work); + return -ENOMEM; + } + + rc = verify_info_level(req->FileInformationClass); + if (rc) { + rc = -EFAULT; + goto err_out2; + } + + dir_fp = ksmbd_lookup_fd_slow(work, + le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId)); + if (!dir_fp) { + rc = -EBADF; + goto err_out2; + } + + if (!(dir_fp->daccess & FILE_LIST_DIRECTORY_LE) || + inode_permission(file_mnt_user_ns(dir_fp->filp), + file_inode(dir_fp->filp), + MAY_READ | MAY_EXEC)) { + pr_err("no right to enumerate directory (%pd)\n", + dir_fp->filp->f_path.dentry); + rc = -EACCES; + goto err_out2; + } + + if (!S_ISDIR(file_inode(dir_fp->filp)->i_mode)) { + pr_err("can't do query dir for a file\n"); + rc = -EINVAL; + goto err_out2; + } + + srch_flag = req->Flags; + srch_ptr = smb_strndup_from_utf16(req->Buffer, + le16_to_cpu(req->FileNameLength), 1, + conn->local_nls); + if (IS_ERR(srch_ptr)) { + ksmbd_debug(SMB, "Search Pattern not found\n"); + rc = -EINVAL; + goto err_out2; + } else { + ksmbd_debug(SMB, "Search pattern is %s\n", srch_ptr); + } + + ksmbd_debug(SMB, "Directory name is %s\n", dir_fp->filename); + + if (srch_flag & SMB2_REOPEN || srch_flag & SMB2_RESTART_SCANS) { + ksmbd_debug(SMB, "Restart directory scan\n"); + generic_file_llseek(dir_fp->filp, 0, SEEK_SET); + restart_ctx(&dir_fp->readdir_data.ctx); + } + + memset(&d_info, 0, sizeof(struct ksmbd_dir_info)); + d_info.wptr = (char *)rsp->Buffer; + d_info.rptr = (char *)rsp->Buffer; + d_info.out_buf_len = (work->response_sz - (get_rfc1002_len(rsp_org) + 4)); + d_info.out_buf_len = min_t(int, d_info.out_buf_len, le32_to_cpu(req->OutputBufferLength)) - + sizeof(struct smb2_query_directory_rsp); + d_info.flags = srch_flag; + + /* + * reserve dot and dotdot entries in head of buffer + * in first response + */ + rc = ksmbd_populate_dot_dotdot_entries(work, req->FileInformationClass, + dir_fp, &d_info, srch_ptr, + smb2_populate_readdir_entry); + if (rc == -ENOSPC) + rc = 0; + else if (rc) + goto err_out; + + if (test_share_config_flag(share, KSMBD_SHARE_FLAG_HIDE_DOT_FILES)) + d_info.hide_dot_file = true; + + buffer_sz = d_info.out_buf_len; + d_info.rptr = d_info.wptr; + query_dir_private.work = work; + query_dir_private.search_pattern = srch_ptr; + query_dir_private.dir_fp = dir_fp; + query_dir_private.d_info = &d_info; + query_dir_private.info_level = req->FileInformationClass; + dir_fp->readdir_data.private = &query_dir_private; + set_ctx_actor(&dir_fp->readdir_data.ctx, __query_dir); + + rc = iterate_dir(dir_fp->filp, &dir_fp->readdir_data.ctx); + if (rc == 0) + restart_ctx(&dir_fp->readdir_data.ctx); + if (rc == -ENOSPC) + rc = 0; + if (rc) + goto err_out; + + d_info.wptr = d_info.rptr; + d_info.out_buf_len = buffer_sz; + rc = process_query_dir_entries(&query_dir_private); + if (rc) 
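/* Editorial note, not part of the patch: smb2_query_dir() works in two
 * passes - iterate_dir() with the __query_dir() actor only filters
 * names against the search pattern and reserves response space via
 * reserve_populate_dentry(), after which process_query_dir_entries()
 * re-walks the reserved names with lookup_one_len() to fill in the
 * stat-derived fields.
 */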
+ goto err_out; + + if (!d_info.data_count && d_info.out_buf_len >= 0) { + if (srch_flag & SMB2_RETURN_SINGLE_ENTRY && !is_asterisk(srch_ptr)) { + rsp->hdr.Status = STATUS_NO_SUCH_FILE; + } else { + dir_fp->dot_dotdot[0] = dir_fp->dot_dotdot[1] = 0; + rsp->hdr.Status = STATUS_NO_MORE_FILES; + } + rsp->StructureSize = cpu_to_le16(9); + rsp->OutputBufferOffset = cpu_to_le16(0); + rsp->OutputBufferLength = cpu_to_le32(0); + rsp->Buffer[0] = 0; + inc_rfc1001_len(rsp_org, 9); + } else { + ((struct file_directory_info *) + ((char *)rsp->Buffer + d_info.last_entry_offset)) + ->NextEntryOffset = 0; + + rsp->StructureSize = cpu_to_le16(9); + rsp->OutputBufferOffset = cpu_to_le16(72); + rsp->OutputBufferLength = cpu_to_le32(d_info.data_count); + inc_rfc1001_len(rsp_org, 8 + d_info.data_count); + } + + kfree(srch_ptr); + ksmbd_fd_put(work, dir_fp); + ksmbd_revert_fsids(work); + return 0; + +err_out: + pr_err("error while processing smb2 query dir rc = %d\n", rc); + kfree(srch_ptr); + +err_out2: + if (rc == -EINVAL) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + else if (rc == -EACCES) + rsp->hdr.Status = STATUS_ACCESS_DENIED; + else if (rc == -ENOENT) + rsp->hdr.Status = STATUS_NO_SUCH_FILE; + else if (rc == -EBADF) + rsp->hdr.Status = STATUS_FILE_CLOSED; + else if (rc == -ENOMEM) + rsp->hdr.Status = STATUS_NO_MEMORY; + else if (rc == -EFAULT) + rsp->hdr.Status = STATUS_INVALID_INFO_CLASS; + if (!rsp->hdr.Status) + rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; + + smb2_set_err_rsp(work); + ksmbd_fd_put(work, dir_fp); + ksmbd_revert_fsids(work); + return 0; +} + +/** + * buffer_check_err() - helper function to check buffer errors + * @reqOutputBufferLength: max buffer length expected in command response + * @rsp: query info response buffer contains output buffer length + * @infoclass_size: query info class response buffer size + * + * Return: 0 on success, otherwise error + */ +static int buffer_check_err(int reqOutputBufferLength, + struct smb2_query_info_rsp *rsp, int infoclass_size) +{ + if (reqOutputBufferLength < le32_to_cpu(rsp->OutputBufferLength)) { + if (reqOutputBufferLength < infoclass_size) { + pr_err("Invalid Buffer Size Requested\n"); + rsp->hdr.Status = STATUS_INFO_LENGTH_MISMATCH; + rsp->hdr.smb2_buf_length = cpu_to_be32(sizeof(struct smb2_hdr) - 4); + return -EINVAL; + } + + ksmbd_debug(SMB, "Buffer Overflow\n"); + rsp->hdr.Status = STATUS_BUFFER_OVERFLOW; + rsp->hdr.smb2_buf_length = cpu_to_be32(sizeof(struct smb2_hdr) - 4 + + reqOutputBufferLength); + rsp->OutputBufferLength = cpu_to_le32(reqOutputBufferLength); + } + return 0; +} + +static void get_standard_info_pipe(struct smb2_query_info_rsp *rsp) +{ + struct smb2_file_standard_info *sinfo; + + sinfo = (struct smb2_file_standard_info *)rsp->Buffer; + + sinfo->AllocationSize = cpu_to_le64(4096); + sinfo->EndOfFile = cpu_to_le64(0); + sinfo->NumberOfLinks = cpu_to_le32(1); + sinfo->DeletePending = 1; + sinfo->Directory = 0; + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_standard_info)); + inc_rfc1001_len(rsp, sizeof(struct smb2_file_standard_info)); +} + +static void get_internal_info_pipe(struct smb2_query_info_rsp *rsp, u64 num) +{ + struct smb2_file_internal_info *file_info; + + file_info = (struct smb2_file_internal_info *)rsp->Buffer; + + /* any unique number */ + file_info->IndexNumber = cpu_to_le64(num | (1ULL << 63)); + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_internal_info)); + inc_rfc1001_len(rsp, sizeof(struct smb2_file_internal_info)); +} + +static int 
smb2_get_info_file_pipe(struct ksmbd_session *sess, + struct smb2_query_info_req *req, + struct smb2_query_info_rsp *rsp) +{ + u64 id; + int rc; + + /* + * Windows can sometime send query file info request on + * pipe without opening it, checking error condition here + */ + id = le64_to_cpu(req->VolatileFileId); + if (!ksmbd_session_rpc_method(sess, id)) + return -ENOENT; + + ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n", + req->FileInfoClass, le64_to_cpu(req->VolatileFileId)); + + switch (req->FileInfoClass) { + case FILE_STANDARD_INFORMATION: + get_standard_info_pipe(rsp); + rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), + rsp, FILE_STANDARD_INFORMATION_SIZE); + break; + case FILE_INTERNAL_INFORMATION: + get_internal_info_pipe(rsp, id); + rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), + rsp, FILE_INTERNAL_INFORMATION_SIZE); + break; + default: + ksmbd_debug(SMB, "smb2_info_file_pipe for %u not supported\n", + req->FileInfoClass); + rc = -EOPNOTSUPP; + } + return rc; +} + +/** + * smb2_get_ea() - handler for smb2 get extended attribute command + * @work: smb work containing query info command buffer + * @fp: ksmbd_file pointer + * @req: get extended attribute request + * @rsp: response buffer pointer + * @rsp_org: base response buffer pointer in case of chained response + * + * Return: 0 on success, otherwise error + */ +static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, + struct smb2_query_info_req *req, + struct smb2_query_info_rsp *rsp, void *rsp_org) +{ + struct smb2_ea_info *eainfo, *prev_eainfo; + char *name, *ptr, *xattr_list = NULL, *buf; + int rc, name_len, value_len, xattr_list_len, idx; + ssize_t buf_free_len, alignment_bytes, next_offset, rsp_data_cnt = 0; + struct smb2_ea_info_req *ea_req = NULL; + struct path *path; + struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + + if (!(fp->daccess & FILE_READ_EA_LE)) { + pr_err("Not permitted to read ext attr : 0x%x\n", + fp->daccess); + return -EACCES; + } + + path = &fp->filp->f_path; + /* single EA entry is requested with given user.* name */ + if (req->InputBufferLength) { + ea_req = (struct smb2_ea_info_req *)req->Buffer; + } else { + /* need to send all EAs, if no specific EA is requested*/ + if (le32_to_cpu(req->Flags) & SL_RETURN_SINGLE_ENTRY) + ksmbd_debug(SMB, + "All EAs are requested but need to send single EA entry in rsp flags 0x%x\n", + le32_to_cpu(req->Flags)); + } + + buf_free_len = work->response_sz - + (get_rfc1002_len(rsp_org) + 4) - + sizeof(struct smb2_query_info_rsp); + + if (le32_to_cpu(req->OutputBufferLength) < buf_free_len) + buf_free_len = le32_to_cpu(req->OutputBufferLength); + + rc = ksmbd_vfs_listxattr(path->dentry, &xattr_list); + if (rc < 0) { + rsp->hdr.Status = STATUS_INVALID_HANDLE; + goto out; + } else if (!rc) { /* there is no EA in the file */ + ksmbd_debug(SMB, "no ea data in the file\n"); + goto done; + } + xattr_list_len = rc; + + ptr = (char *)rsp->Buffer; + eainfo = (struct smb2_ea_info *)ptr; + prev_eainfo = eainfo; + idx = 0; + + while (idx < xattr_list_len) { + name = xattr_list + idx; + name_len = strlen(name); + + ksmbd_debug(SMB, "%s, len %d\n", name, name_len); + idx += name_len + 1; + + /* + * CIFS does not support EA other than user.* namespace, + * still keep the framework generic, to list other attrs + * in future. 
+ */ + if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + continue; + + if (!strncmp(&name[XATTR_USER_PREFIX_LEN], STREAM_PREFIX, + STREAM_PREFIX_LEN)) + continue; + + if (req->InputBufferLength && + strncmp(&name[XATTR_USER_PREFIX_LEN], ea_req->name, + ea_req->EaNameLength)) + continue; + + if (!strncmp(&name[XATTR_USER_PREFIX_LEN], + DOS_ATTRIBUTE_PREFIX, DOS_ATTRIBUTE_PREFIX_LEN)) + continue; + + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + name_len -= XATTR_USER_PREFIX_LEN; + + ptr = (char *)(&eainfo->name + name_len + 1); + buf_free_len -= (offsetof(struct smb2_ea_info, name) + + name_len + 1); + /* bail out if the xattr can't fit in buf_free_len */ + value_len = ksmbd_vfs_getxattr(user_ns, path->dentry, + name, &buf); + if (value_len <= 0) { + rc = -ENOENT; + rsp->hdr.Status = STATUS_INVALID_HANDLE; + goto out; + } + + buf_free_len -= value_len; + if (buf_free_len < 0) { + kfree(buf); + break; + } + + memcpy(ptr, buf, value_len); + kfree(buf); + + ptr += value_len; + eainfo->Flags = 0; + eainfo->EaNameLength = name_len; + + if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + memcpy(eainfo->name, &name[XATTR_USER_PREFIX_LEN], + name_len); + else + memcpy(eainfo->name, name, name_len); + + eainfo->name[name_len] = '\0'; + eainfo->EaValueLength = cpu_to_le16(value_len); + next_offset = offsetof(struct smb2_ea_info, name) + + name_len + 1 + value_len; + + /* align next xattr entry at 4 byte boundary */ + alignment_bytes = ((next_offset + 3) & ~3) - next_offset; + if (alignment_bytes) { + memset(ptr, '\0', alignment_bytes); + ptr += alignment_bytes; + next_offset += alignment_bytes; + buf_free_len -= alignment_bytes; + } + eainfo->NextEntryOffset = cpu_to_le32(next_offset); + prev_eainfo = eainfo; + eainfo = (struct smb2_ea_info *)ptr; + rsp_data_cnt += next_offset; + + if (req->InputBufferLength) { + ksmbd_debug(SMB, "single entry requested\n"); + break; + } + } + + /* no more ea entries */ + prev_eainfo->NextEntryOffset = 0; +done: + rc = 0; + if (rsp_data_cnt == 0) + rsp->hdr.Status = STATUS_NO_EAS_ON_FILE; + rsp->OutputBufferLength = cpu_to_le32(rsp_data_cnt); + inc_rfc1001_len(rsp_org, rsp_data_cnt); +out: + kvfree(xattr_list); + return rc; +} + +static void get_file_access_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb2_file_access_info *file_info; + + file_info = (struct smb2_file_access_info *)rsp->Buffer; + file_info->AccessFlags = fp->daccess; + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_access_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_access_info)); +} + +static int get_file_basic_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb2_file_all_info *basic_info; + struct kstat stat; + u64 time; + + if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) { + pr_err("no right to read the attributes : 0x%x\n", + fp->daccess); + return -EACCES; + } + + basic_info = (struct smb2_file_all_info *)rsp->Buffer; + generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + &stat); + basic_info->CreationTime = cpu_to_le64(fp->create_time); + time = ksmbd_UnixTimeToNT(stat.atime); + basic_info->LastAccessTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(stat.mtime); + basic_info->LastWriteTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(stat.ctime); + basic_info->ChangeTime = cpu_to_le64(time); + basic_info->Attributes = fp->f_ci->m_fattr; + basic_info->Pad1 = 0; + rsp->OutputBufferLength = + cpu_to_le32(offsetof(struct 
smb2_file_all_info, AllocationSize)); + inc_rfc1001_len(rsp_org, offsetof(struct smb2_file_all_info, + AllocationSize)); + return 0; +} + +static unsigned long long get_allocation_size(struct inode *inode, + struct kstat *stat) +{ + unsigned long long alloc_size = 0; + + if (!S_ISDIR(stat->mode)) { + if ((inode->i_blocks << 9) <= stat->size) + alloc_size = stat->size; + else + alloc_size = inode->i_blocks << 9; + } + + return alloc_size; +} + +static void get_file_standard_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb2_file_standard_info *sinfo; + unsigned int delete_pending; + struct inode *inode; + struct kstat stat; + + inode = file_inode(fp->filp); + generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + + sinfo = (struct smb2_file_standard_info *)rsp->Buffer; + delete_pending = ksmbd_inode_pending_delete(fp); + + sinfo->AllocationSize = cpu_to_le64(get_allocation_size(inode, &stat)); + sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending); + sinfo->DeletePending = delete_pending; + sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0; + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_standard_info)); + inc_rfc1001_len(rsp_org, + sizeof(struct smb2_file_standard_info)); +} + +static void get_file_alignment_info(struct smb2_query_info_rsp *rsp, + void *rsp_org) +{ + struct smb2_file_alignment_info *file_info; + + file_info = (struct smb2_file_alignment_info *)rsp->Buffer; + file_info->AlignmentRequirement = 0; + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_alignment_info)); + inc_rfc1001_len(rsp_org, + sizeof(struct smb2_file_alignment_info)); +} + +static int get_file_all_info(struct ksmbd_work *work, + struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, + void *rsp_org) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_file_all_info *file_info; + unsigned int delete_pending; + struct inode *inode; + struct kstat stat; + int conv_len; + char *filename; + u64 time; + + if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) { + ksmbd_debug(SMB, "no right to read the attributes : 0x%x\n", + fp->daccess); + return -EACCES; + } + + filename = convert_to_nt_pathname(fp->filename, + work->tcon->share_conf->path); + if (!filename) + return -ENOMEM; + + inode = file_inode(fp->filp); + generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + + ksmbd_debug(SMB, "filename = %s\n", filename); + delete_pending = ksmbd_inode_pending_delete(fp); + file_info = (struct smb2_file_all_info *)rsp->Buffer; + + file_info->CreationTime = cpu_to_le64(fp->create_time); + time = ksmbd_UnixTimeToNT(stat.atime); + file_info->LastAccessTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(stat.mtime); + file_info->LastWriteTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(stat.ctime); + file_info->ChangeTime = cpu_to_le64(time); + file_info->Attributes = fp->f_ci->m_fattr; + file_info->Pad1 = 0; + file_info->AllocationSize = + cpu_to_le64(get_allocation_size(inode, &stat)); + file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size); + file_info->NumberOfLinks = + cpu_to_le32(get_nlink(&stat) - delete_pending); + file_info->DeletePending = delete_pending; + file_info->Directory = S_ISDIR(stat.mode) ? 
1 : 0; + file_info->Pad2 = 0; + file_info->IndexNumber = cpu_to_le64(stat.ino); + file_info->EASize = 0; + file_info->AccessFlags = fp->daccess; + file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + file_info->Mode = fp->coption; + file_info->AlignmentRequirement = 0; + conv_len = smbConvertToUTF16((__le16 *)file_info->FileName, filename, + PATH_MAX, conn->local_nls, 0); + conv_len *= 2; + file_info->FileNameLength = cpu_to_le32(conv_len); + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_all_info) + conv_len - 1); + kfree(filename); + inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength)); + return 0; +} + +static void get_file_alternate_info(struct ksmbd_work *work, + struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, + void *rsp_org) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_file_alt_name_info *file_info; + struct dentry *dentry = fp->filp->f_path.dentry; + int conv_len; + + spin_lock(&dentry->d_lock); + file_info = (struct smb2_file_alt_name_info *)rsp->Buffer; + conv_len = ksmbd_extract_shortname(conn, + dentry->d_name.name, + file_info->FileName); + spin_unlock(&dentry->d_lock); + file_info->FileNameLength = cpu_to_le32(conv_len); + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_alt_name_info) + conv_len); + inc_rfc1001_len(rsp_org, le32_to_cpu(rsp->OutputBufferLength)); +} + +static void get_file_stream_info(struct ksmbd_work *work, + struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, + void *rsp_org) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_file_stream_info *file_info; + char *stream_name, *xattr_list = NULL, *stream_buf; + struct kstat stat; + struct path *path = &fp->filp->f_path; + ssize_t xattr_list_len; + int nbytes = 0, streamlen, stream_name_len, next, idx = 0; + + generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + &stat); + file_info = (struct smb2_file_stream_info *)rsp->Buffer; + + xattr_list_len = ksmbd_vfs_listxattr(path->dentry, &xattr_list); + if (xattr_list_len < 0) { + goto out; + } else if (!xattr_list_len) { + ksmbd_debug(SMB, "empty xattr in the file\n"); + goto out; + } + + while (idx < xattr_list_len) { + stream_name = xattr_list + idx; + streamlen = strlen(stream_name); + idx += streamlen + 1; + + ksmbd_debug(SMB, "%s, len %d\n", stream_name, streamlen); + + if (strncmp(&stream_name[XATTR_USER_PREFIX_LEN], + STREAM_PREFIX, STREAM_PREFIX_LEN)) + continue; + + stream_name_len = streamlen - (XATTR_USER_PREFIX_LEN + + STREAM_PREFIX_LEN); + streamlen = stream_name_len; + + /* plus : size */ + streamlen += 1; + stream_buf = kmalloc(streamlen + 1, GFP_KERNEL); + if (!stream_buf) + break; + + streamlen = snprintf(stream_buf, streamlen + 1, + ":%s", &stream_name[XATTR_NAME_STREAM_LEN]); + + file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes]; + streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, + stream_buf, streamlen, + conn->local_nls, 0); + streamlen *= 2; + kfree(stream_buf); + file_info->StreamNameLength = cpu_to_le32(streamlen); + file_info->StreamSize = cpu_to_le64(stream_name_len); + file_info->StreamAllocationSize = cpu_to_le64(stream_name_len); + + next = sizeof(struct smb2_file_stream_info) + streamlen; + nbytes += next; + file_info->NextEntryOffset = cpu_to_le32(next); + } + + if (nbytes) { + file_info = (struct smb2_file_stream_info *) + &rsp->Buffer[nbytes]; + streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, + "::$DATA", 7, conn->local_nls, 0); + streamlen *= 2; + file_info->StreamNameLength = 
cpu_to_le32(streamlen); + file_info->StreamSize = S_ISDIR(stat.mode) ? 0 : + cpu_to_le64(stat.size); + file_info->StreamAllocationSize = S_ISDIR(stat.mode) ? 0 : + cpu_to_le64(stat.size); + nbytes += sizeof(struct smb2_file_stream_info) + streamlen; + } + + /* last entry offset should be 0 */ + file_info->NextEntryOffset = 0; +out: + kvfree(xattr_list); + + rsp->OutputBufferLength = cpu_to_le32(nbytes); + inc_rfc1001_len(rsp_org, nbytes); +} + +static void get_file_internal_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb2_file_internal_info *file_info; + struct kstat stat; + + generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + &stat); + file_info = (struct smb2_file_internal_info *)rsp->Buffer; + file_info->IndexNumber = cpu_to_le64(stat.ino); + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_internal_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_internal_info)); +} + +static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb2_file_ntwrk_info *file_info; + struct inode *inode; + struct kstat stat; + u64 time; + + if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) { + pr_err("no right to read the attributes : 0x%x\n", + fp->daccess); + return -EACCES; + } + + file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer; + + inode = file_inode(fp->filp); + generic_fillattr(file_mnt_user_ns(fp->filp), inode, &stat); + + file_info->CreationTime = cpu_to_le64(fp->create_time); + time = ksmbd_UnixTimeToNT(stat.atime); + file_info->LastAccessTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(stat.mtime); + file_info->LastWriteTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(stat.ctime); + file_info->ChangeTime = cpu_to_le64(time); + file_info->Attributes = fp->f_ci->m_fattr; + file_info->AllocationSize = + cpu_to_le64(get_allocation_size(inode, &stat)); + file_info->EndOfFile = S_ISDIR(stat.mode) ? 
0 : cpu_to_le64(stat.size); + file_info->Reserved = cpu_to_le32(0); + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_ntwrk_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ntwrk_info)); + return 0; +} + +static void get_file_ea_info(struct smb2_query_info_rsp *rsp, void *rsp_org) +{ + struct smb2_file_ea_info *file_info; + + file_info = (struct smb2_file_ea_info *)rsp->Buffer; + file_info->EASize = 0; + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_ea_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_ea_info)); +} + +static void get_file_position_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb2_file_pos_info *file_info; + + file_info = (struct smb2_file_pos_info *)rsp->Buffer; + file_info->CurrentByteOffset = cpu_to_le64(fp->filp->f_pos); + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_pos_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_pos_info)); +} + +static void get_file_mode_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb2_file_mode_info *file_info; + + file_info = (struct smb2_file_mode_info *)rsp->Buffer; + file_info->Mode = fp->coption & FILE_MODE_INFO_MASK; + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_mode_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_mode_info)); +} + +static void get_file_compression_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb2_file_comp_info *file_info; + struct kstat stat; + + generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), + &stat); + + file_info = (struct smb2_file_comp_info *)rsp->Buffer; + file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9); + file_info->CompressionFormat = COMPRESSION_FORMAT_NONE; + file_info->CompressionUnitShift = 0; + file_info->ChunkShift = 0; + file_info->ClusterShift = 0; + memset(&file_info->Reserved[0], 0, 3); + + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_comp_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_comp_info)); +} + +static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb2_file_attr_tag_info *file_info; + + if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) { + pr_err("no right to read the attributes : 0x%x\n", + fp->daccess); + return -EACCES; + } + + file_info = (struct smb2_file_attr_tag_info *)rsp->Buffer; + file_info->FileAttributes = fp->f_ci->m_fattr; + file_info->ReparseTag = 0; + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb2_file_attr_tag_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_attr_tag_info)); + return 0; +} + +static int find_file_posix_info(struct smb2_query_info_rsp *rsp, + struct ksmbd_file *fp, void *rsp_org) +{ + struct smb311_posix_qinfo *file_info; + struct inode *inode = file_inode(fp->filp); + u64 time; + + file_info = (struct smb311_posix_qinfo *)rsp->Buffer; + file_info->CreationTime = cpu_to_le64(fp->create_time); + time = ksmbd_UnixTimeToNT(inode->i_atime); + file_info->LastAccessTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(inode->i_mtime); + file_info->LastWriteTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(inode->i_ctime); + file_info->ChangeTime = cpu_to_le64(time); + file_info->DosAttributes = fp->f_ci->m_fattr; + file_info->Inode = cpu_to_le64(inode->i_ino); + file_info->EndOfFile = cpu_to_le64(inode->i_size); + file_info->AllocationSize = 
cpu_to_le64(inode->i_blocks << 9); + file_info->HardLinks = cpu_to_le32(inode->i_nlink); + file_info->Mode = cpu_to_le32(inode->i_mode); + file_info->DeviceId = cpu_to_le32(inode->i_rdev); + rsp->OutputBufferLength = + cpu_to_le32(sizeof(struct smb311_posix_qinfo)); + inc_rfc1001_len(rsp_org, sizeof(struct smb311_posix_qinfo)); + return 0; +} + +static int smb2_get_info_file(struct ksmbd_work *work, + struct smb2_query_info_req *req, + struct smb2_query_info_rsp *rsp, void *rsp_org) +{ + struct ksmbd_file *fp; + int fileinfoclass = 0; + int rc = 0; + int file_infoclass_size; + unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; + + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_PIPE)) { + /* smb2 info file called for pipe */ + return smb2_get_info_file_pipe(work->sess, req, rsp); + } + + if (work->next_smb2_rcv_hdr_off) { + if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + ksmbd_debug(SMB, "Compound request set FID = %llu\n", + work->compound_fid); + id = work->compound_fid; + pid = work->compound_pfid; + } + } + + if (!has_file_id(id)) { + id = le64_to_cpu(req->VolatileFileId); + pid = le64_to_cpu(req->PersistentFileId); + } + + fp = ksmbd_lookup_fd_slow(work, id, pid); + if (!fp) + return -ENOENT; + + fileinfoclass = req->FileInfoClass; + + switch (fileinfoclass) { + case FILE_ACCESS_INFORMATION: + get_file_access_info(rsp, fp, rsp_org); + file_infoclass_size = FILE_ACCESS_INFORMATION_SIZE; + break; + + case FILE_BASIC_INFORMATION: + rc = get_file_basic_info(rsp, fp, rsp_org); + file_infoclass_size = FILE_BASIC_INFORMATION_SIZE; + break; + + case FILE_STANDARD_INFORMATION: + get_file_standard_info(rsp, fp, rsp_org); + file_infoclass_size = FILE_STANDARD_INFORMATION_SIZE; + break; + + case FILE_ALIGNMENT_INFORMATION: + get_file_alignment_info(rsp, rsp_org); + file_infoclass_size = FILE_ALIGNMENT_INFORMATION_SIZE; + break; + + case FILE_ALL_INFORMATION: + rc = get_file_all_info(work, rsp, fp, rsp_org); + file_infoclass_size = FILE_ALL_INFORMATION_SIZE; + break; + + case FILE_ALTERNATE_NAME_INFORMATION: + get_file_alternate_info(work, rsp, fp, rsp_org); + file_infoclass_size = FILE_ALTERNATE_NAME_INFORMATION_SIZE; + break; + + case FILE_STREAM_INFORMATION: + get_file_stream_info(work, rsp, fp, rsp_org); + file_infoclass_size = FILE_STREAM_INFORMATION_SIZE; + break; + + case FILE_INTERNAL_INFORMATION: + get_file_internal_info(rsp, fp, rsp_org); + file_infoclass_size = FILE_INTERNAL_INFORMATION_SIZE; + break; + + case FILE_NETWORK_OPEN_INFORMATION: + rc = get_file_network_open_info(rsp, fp, rsp_org); + file_infoclass_size = FILE_NETWORK_OPEN_INFORMATION_SIZE; + break; + + case FILE_EA_INFORMATION: + get_file_ea_info(rsp, rsp_org); + file_infoclass_size = FILE_EA_INFORMATION_SIZE; + break; + + case FILE_FULL_EA_INFORMATION: + rc = smb2_get_ea(work, fp, req, rsp, rsp_org); + file_infoclass_size = FILE_FULL_EA_INFORMATION_SIZE; + break; + + case FILE_POSITION_INFORMATION: + get_file_position_info(rsp, fp, rsp_org); + file_infoclass_size = FILE_POSITION_INFORMATION_SIZE; + break; + + case FILE_MODE_INFORMATION: + get_file_mode_info(rsp, fp, rsp_org); + file_infoclass_size = FILE_MODE_INFORMATION_SIZE; + break; + + case FILE_COMPRESSION_INFORMATION: + get_file_compression_info(rsp, fp, rsp_org); + file_infoclass_size = FILE_COMPRESSION_INFORMATION_SIZE; + break; + + case FILE_ATTRIBUTE_TAG_INFORMATION: + rc = get_file_attribute_tag_info(rsp, fp, rsp_org); + file_infoclass_size = FILE_ATTRIBUTE_TAG_INFORMATION_SIZE; + break; + case SMB_FIND_FILE_POSIX_INFO: + if 
(!work->tcon->posix_extensions) { + pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); + rc = -EOPNOTSUPP; + } else { + rc = find_file_posix_info(rsp, fp, rsp_org); + file_infoclass_size = sizeof(struct smb311_posix_qinfo); + } + break; + default: + ksmbd_debug(SMB, "fileinfoclass %d not supported yet\n", + fileinfoclass); + rc = -EOPNOTSUPP; + } + if (!rc) + rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), + rsp, + file_infoclass_size); + ksmbd_fd_put(work, fp); + return rc; +} + +static int smb2_get_info_filesystem(struct ksmbd_work *work, + struct smb2_query_info_req *req, + struct smb2_query_info_rsp *rsp, void *rsp_org) +{ + struct ksmbd_session *sess = work->sess; + struct ksmbd_conn *conn = sess->conn; + struct ksmbd_share_config *share = work->tcon->share_conf; + int fsinfoclass = 0; + struct kstatfs stfs; + struct path path; + int rc = 0, len; + int fs_infoclass_size = 0; + int lookup_flags = 0; + + if (test_share_config_flag(share, KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) + lookup_flags = LOOKUP_FOLLOW; + + rc = ksmbd_vfs_kern_path(share->path, lookup_flags, &path, 0); + if (rc) { + pr_err("cannot create vfs path\n"); + return -EIO; + } + + rc = vfs_statfs(&path, &stfs); + if (rc) { + pr_err("cannot do stat of path %s\n", share->path); + path_put(&path); + return -EIO; + } + + fsinfoclass = req->FileInfoClass; + + switch (fsinfoclass) { + case FS_DEVICE_INFORMATION: + { + struct filesystem_device_info *info; + + info = (struct filesystem_device_info *)rsp->Buffer; + + info->DeviceType = cpu_to_le32(stfs.f_type); + info->DeviceCharacteristics = cpu_to_le32(0x00000020); + rsp->OutputBufferLength = cpu_to_le32(8); + inc_rfc1001_len(rsp_org, 8); + fs_infoclass_size = FS_DEVICE_INFORMATION_SIZE; + break; + } + case FS_ATTRIBUTE_INFORMATION: + { + struct filesystem_attribute_info *info; + size_t sz; + + info = (struct filesystem_attribute_info *)rsp->Buffer; + info->Attributes = cpu_to_le32(FILE_SUPPORTS_OBJECT_IDS | + FILE_PERSISTENT_ACLS | + FILE_UNICODE_ON_DISK | + FILE_CASE_PRESERVED_NAMES | + FILE_CASE_SENSITIVE_SEARCH | + FILE_SUPPORTS_BLOCK_REFCOUNTING); + + info->Attributes |= cpu_to_le32(server_conf.share_fake_fscaps); + + info->MaxPathNameComponentLength = cpu_to_le32(stfs.f_namelen); + len = smbConvertToUTF16((__le16 *)info->FileSystemName, + "NTFS", PATH_MAX, conn->local_nls, 0); + len = len * 2; + info->FileSystemNameLen = cpu_to_le32(len); + sz = sizeof(struct filesystem_attribute_info) - 2 + len; + rsp->OutputBufferLength = cpu_to_le32(sz); + inc_rfc1001_len(rsp_org, sz); + fs_infoclass_size = FS_ATTRIBUTE_INFORMATION_SIZE; + break; + } + case FS_VOLUME_INFORMATION: + { + struct filesystem_vol_info *info; + size_t sz; + + info = (struct filesystem_vol_info *)(rsp->Buffer); + info->VolumeCreationTime = 0; + /* Taking dummy value of serial number*/ + info->SerialNumber = cpu_to_le32(0xbc3ac512); + len = smbConvertToUTF16((__le16 *)info->VolumeLabel, + share->name, PATH_MAX, + conn->local_nls, 0); + len = len * 2; + info->VolumeLabelSize = cpu_to_le32(len); + info->Reserved = 0; + sz = sizeof(struct filesystem_vol_info) - 2 + len; + rsp->OutputBufferLength = cpu_to_le32(sz); + inc_rfc1001_len(rsp_org, sz); + fs_infoclass_size = FS_VOLUME_INFORMATION_SIZE; + break; + } + case FS_SIZE_INFORMATION: + { + struct filesystem_info *info; + + info = (struct filesystem_info *)(rsp->Buffer); + info->TotalAllocationUnits = cpu_to_le64(stfs.f_blocks); + info->FreeAllocationUnits = cpu_to_le64(stfs.f_bfree); + info->SectorsPerAllocationUnit = cpu_to_le32(1); + 
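/* one sector per allocation unit: the unit counts map straight onto statfs f_blocks/f_bfree in f_bsize-byte units */ +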
info->BytesPerSector = cpu_to_le32(stfs.f_bsize); + rsp->OutputBufferLength = cpu_to_le32(24); + inc_rfc1001_len(rsp_org, 24); + fs_infoclass_size = FS_SIZE_INFORMATION_SIZE; + break; + } + case FS_FULL_SIZE_INFORMATION: + { + struct smb2_fs_full_size_info *info; + + info = (struct smb2_fs_full_size_info *)(rsp->Buffer); + info->TotalAllocationUnits = cpu_to_le64(stfs.f_blocks); + info->CallerAvailableAllocationUnits = + cpu_to_le64(stfs.f_bavail); + info->ActualAvailableAllocationUnits = + cpu_to_le64(stfs.f_bfree); + info->SectorsPerAllocationUnit = cpu_to_le32(1); + info->BytesPerSector = cpu_to_le32(stfs.f_bsize); + rsp->OutputBufferLength = cpu_to_le32(32); + inc_rfc1001_len(rsp_org, 32); + fs_infoclass_size = FS_FULL_SIZE_INFORMATION_SIZE; + break; + } + case FS_OBJECT_ID_INFORMATION: + { + struct object_id_info *info; + + info = (struct object_id_info *)(rsp->Buffer); + + if (!user_guest(sess->user)) + memcpy(info->objid, user_passkey(sess->user), 16); + else + memset(info->objid, 0, 16); + + info->extended_info.magic = cpu_to_le32(EXTENDED_INFO_MAGIC); + info->extended_info.version = cpu_to_le32(1); + info->extended_info.release = cpu_to_le32(1); + info->extended_info.rel_date = 0; + memcpy(info->extended_info.version_string, "1.1.0", strlen("1.1.0")); + rsp->OutputBufferLength = cpu_to_le32(64); + inc_rfc1001_len(rsp_org, 64); + fs_infoclass_size = FS_OBJECT_ID_INFORMATION_SIZE; + break; + } + case FS_SECTOR_SIZE_INFORMATION: + { + struct smb3_fs_ss_info *info; + + info = (struct smb3_fs_ss_info *)(rsp->Buffer); + + info->LogicalBytesPerSector = cpu_to_le32(stfs.f_bsize); + info->PhysicalBytesPerSectorForAtomicity = + cpu_to_le32(stfs.f_bsize); + info->PhysicalBytesPerSectorForPerf = cpu_to_le32(stfs.f_bsize); + info->FSEffPhysicalBytesPerSectorForAtomicity = + cpu_to_le32(stfs.f_bsize); + info->Flags = cpu_to_le32(SSINFO_FLAGS_ALIGNED_DEVICE | + SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE); + info->ByteOffsetForSectorAlignment = 0; + info->ByteOffsetForPartitionAlignment = 0; + rsp->OutputBufferLength = cpu_to_le32(28); + inc_rfc1001_len(rsp_org, 28); + fs_infoclass_size = FS_SECTOR_SIZE_INFORMATION_SIZE; + break; + } + case FS_CONTROL_INFORMATION: + { + /* + * TODO : The current implementation is based on + * test result with win7(NTFS) server. 
It needs to be + modified to get valid quota values + * from the Linux kernel + */ + struct smb2_fs_control_info *info; + + info = (struct smb2_fs_control_info *)(rsp->Buffer); + info->FreeSpaceStartFiltering = 0; + info->FreeSpaceThreshold = 0; + info->FreeSpaceStopFiltering = 0; + info->DefaultQuotaThreshold = cpu_to_le64(SMB2_NO_FID); + info->DefaultQuotaLimit = cpu_to_le64(SMB2_NO_FID); + info->Padding = 0; + rsp->OutputBufferLength = cpu_to_le32(48); + inc_rfc1001_len(rsp_org, 48); + fs_infoclass_size = FS_CONTROL_INFORMATION_SIZE; + break; + } + case FS_POSIX_INFORMATION: + { + struct filesystem_posix_info *info; + + if (!work->tcon->posix_extensions) { + pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); + rc = -EOPNOTSUPP; + } else { + info = (struct filesystem_posix_info *)(rsp->Buffer); + info->OptimalTransferSize = cpu_to_le32(stfs.f_bsize); + info->BlockSize = cpu_to_le32(stfs.f_bsize); + info->TotalBlocks = cpu_to_le64(stfs.f_blocks); + info->BlocksAvail = cpu_to_le64(stfs.f_bfree); + info->UserBlocksAvail = cpu_to_le64(stfs.f_bavail); + info->TotalFileNodes = cpu_to_le64(stfs.f_files); + info->FreeFileNodes = cpu_to_le64(stfs.f_ffree); + rsp->OutputBufferLength = cpu_to_le32(56); + inc_rfc1001_len(rsp_org, 56); + fs_infoclass_size = FS_POSIX_INFORMATION_SIZE; + } + break; + } + default: + path_put(&path); + return -EOPNOTSUPP; + } + rc = buffer_check_err(le32_to_cpu(req->OutputBufferLength), + rsp, + fs_infoclass_size); + path_put(&path); + return rc; +} + +static int smb2_get_info_sec(struct ksmbd_work *work, + struct smb2_query_info_req *req, + struct smb2_query_info_rsp *rsp, void *rsp_org) +{ + struct ksmbd_file *fp; + struct user_namespace *user_ns; + struct smb_ntsd *pntsd = (struct smb_ntsd *)rsp->Buffer, *ppntsd = NULL; + struct smb_fattr fattr = {{0}}; + struct inode *inode; + __u32 secdesclen; + unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; + int addition_info = le32_to_cpu(req->AdditionalInformation); + int rc; + + if (addition_info & ~(OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | + PROTECTED_DACL_SECINFO | + UNPROTECTED_DACL_SECINFO)) { + pr_err("Unsupported addition info: 0x%x\n", + addition_info); + + pntsd->revision = cpu_to_le16(1); + pntsd->type = cpu_to_le16(SELF_RELATIVE | DACL_PROTECTED); + pntsd->osidoffset = 0; + pntsd->gsidoffset = 0; + pntsd->sacloffset = 0; + pntsd->dacloffset = 0; + + secdesclen = sizeof(struct smb_ntsd); + rsp->OutputBufferLength = cpu_to_le32(secdesclen); + inc_rfc1001_len(rsp_org, secdesclen); + + return 0; + } + + if (work->next_smb2_rcv_hdr_off) { + if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + ksmbd_debug(SMB, "Compound request set FID = %llu\n", + work->compound_fid); + id = work->compound_fid; + pid = work->compound_pfid; + } + } + + if (!has_file_id(id)) { + id = le64_to_cpu(req->VolatileFileId); + pid = le64_to_cpu(req->PersistentFileId); + } + + fp = ksmbd_lookup_fd_slow(work, id, pid); + if (!fp) + return -ENOENT; + + user_ns = file_mnt_user_ns(fp->filp); + inode = file_inode(fp->filp); + ksmbd_acls_fattr(&fattr, inode); + + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_ACL_XATTR)) + ksmbd_vfs_get_sd_xattr(work->conn, user_ns, + fp->filp->f_path.dentry, &ppntsd); + + rc = build_sec_desc(user_ns, pntsd, ppntsd, addition_info, + &secdesclen, &fattr); + posix_acl_release(fattr.cf_acls); + posix_acl_release(fattr.cf_dacls); + kfree(ppntsd); + ksmbd_fd_put(work, fp); + if (rc) + return rc; + + rsp->OutputBufferLength = cpu_to_le32(secdesclen); + 
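/* grow the NetBIOS length to cover the security descriptor payload built above */ +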
inc_rfc1001_len(rsp_org, secdesclen); + return 0; +} + +/** + * smb2_query_info() - handler for smb2 query info command + * @work: smb work containing query info request buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_query_info(struct ksmbd_work *work) +{ + struct smb2_query_info_req *req; + struct smb2_query_info_rsp *rsp, *rsp_org; + int rc = 0; + + rsp_org = work->response_buf; + WORK_BUFFERS(work, req, rsp); + + ksmbd_debug(SMB, "GOT query info request\n"); + + switch (req->InfoType) { + case SMB2_O_INFO_FILE: + ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n"); + rc = smb2_get_info_file(work, req, rsp, (void *)rsp_org); + break; + case SMB2_O_INFO_FILESYSTEM: + ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILESYSTEM\n"); + rc = smb2_get_info_filesystem(work, req, rsp, (void *)rsp_org); + break; + case SMB2_O_INFO_SECURITY: + ksmbd_debug(SMB, "GOT SMB2_O_INFO_SECURITY\n"); + rc = smb2_get_info_sec(work, req, rsp, (void *)rsp_org); + break; + default: + ksmbd_debug(SMB, "InfoType %d not supported yet\n", + req->InfoType); + rc = -EOPNOTSUPP; + } + + if (rc < 0) { + if (rc == -EACCES) + rsp->hdr.Status = STATUS_ACCESS_DENIED; + else if (rc == -ENOENT) + rsp->hdr.Status = STATUS_FILE_CLOSED; + else if (rc == -EIO) + rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; + else if (rc == -EOPNOTSUPP || rsp->hdr.Status == 0) + rsp->hdr.Status = STATUS_INVALID_INFO_CLASS; + smb2_set_err_rsp(work); + + ksmbd_debug(SMB, "error while processing smb2 query rc = %d\n", + rc); + return rc; + } + rsp->StructureSize = cpu_to_le16(9); + rsp->OutputBufferOffset = cpu_to_le16(72); + inc_rfc1001_len(rsp_org, 8); + return 0; +} + +/** + * smb2_close_pipe() - handler for closing IPC pipe + * @work: smb work containing close request buffer + * + * Return: 0 + */ +static noinline int smb2_close_pipe(struct ksmbd_work *work) +{ + u64 id; + struct smb2_close_req *req = work->request_buf; + struct smb2_close_rsp *rsp = work->response_buf; + + id = le64_to_cpu(req->VolatileFileId); + ksmbd_session_rpc_close(work->sess, id); + + rsp->StructureSize = cpu_to_le16(60); + rsp->Flags = 0; + rsp->Reserved = 0; + rsp->CreationTime = 0; + rsp->LastAccessTime = 0; + rsp->LastWriteTime = 0; + rsp->ChangeTime = 0; + rsp->AllocationSize = 0; + rsp->EndOfFile = 0; + rsp->Attributes = 0; + inc_rfc1001_len(rsp, 60); + return 0; +} + +/** + * smb2_close() - handler for smb2 close file command + * @work: smb work containing close request buffer + * + * Return: 0 + */ +int smb2_close(struct ksmbd_work *work) +{ + u64 volatile_id = KSMBD_NO_FID; + u64 sess_id; + struct smb2_close_req *req; + struct smb2_close_rsp *rsp; + struct smb2_close_rsp *rsp_org; + struct ksmbd_conn *conn = work->conn; + struct ksmbd_file *fp; + struct inode *inode; + u64 time; + int err = 0; + + rsp_org = work->response_buf; + WORK_BUFFERS(work, req, rsp); + + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_PIPE)) { + ksmbd_debug(SMB, "IPC pipe close request\n"); + return smb2_close_pipe(work); + } + + sess_id = le64_to_cpu(req->hdr.SessionId); + if (req->hdr.Flags & SMB2_FLAGS_RELATED_OPERATIONS) + sess_id = work->compound_sid; + + work->compound_sid = 0; + if (check_session_id(conn, sess_id)) { + work->compound_sid = sess_id; + } else { + rsp->hdr.Status = STATUS_USER_SESSION_DELETED; + if (req->hdr.Flags & SMB2_FLAGS_RELATED_OPERATIONS) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + err = -EBADF; + goto out; + } + + if (work->next_smb2_rcv_hdr_off && + !has_file_id(le64_to_cpu(req->VolatileFileId))) { + if 
(!has_file_id(work->compound_fid)) { + /* file already closed, return FILE_CLOSED */ + ksmbd_debug(SMB, "file already closed\n"); + rsp->hdr.Status = STATUS_FILE_CLOSED; + err = -EBADF; + goto out; + } else { + ksmbd_debug(SMB, + "Compound request set FID = %llu:%llu\n", + work->compound_fid, + work->compound_pfid); + volatile_id = work->compound_fid; + + /* file closed, stored id is not valid anymore */ + work->compound_fid = KSMBD_NO_FID; + work->compound_pfid = KSMBD_NO_FID; + } + } else { + volatile_id = le64_to_cpu(req->VolatileFileId); + } + ksmbd_debug(SMB, "volatile_id = %llu\n", volatile_id); + + rsp->StructureSize = cpu_to_le16(60); + rsp->Reserved = 0; + + if (req->Flags == SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB) { + fp = ksmbd_lookup_fd_fast(work, volatile_id); + if (!fp) { + err = -ENOENT; + goto out; + } + + inode = file_inode(fp->filp); + rsp->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB; + rsp->AllocationSize = S_ISDIR(inode->i_mode) ? 0 : + cpu_to_le64(inode->i_blocks << 9); + rsp->EndOfFile = cpu_to_le64(inode->i_size); + rsp->Attributes = fp->f_ci->m_fattr; + rsp->CreationTime = cpu_to_le64(fp->create_time); + time = ksmbd_UnixTimeToNT(inode->i_atime); + rsp->LastAccessTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(inode->i_mtime); + rsp->LastWriteTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(inode->i_ctime); + rsp->ChangeTime = cpu_to_le64(time); + ksmbd_fd_put(work, fp); + } else { + rsp->Flags = 0; + rsp->AllocationSize = 0; + rsp->EndOfFile = 0; + rsp->Attributes = 0; + rsp->CreationTime = 0; + rsp->LastAccessTime = 0; + rsp->LastWriteTime = 0; + rsp->ChangeTime = 0; + } + + err = ksmbd_close_fd(work, volatile_id); +out: + if (err) { + if (rsp->hdr.Status == 0) + rsp->hdr.Status = STATUS_FILE_CLOSED; + smb2_set_err_rsp(work); + } else { + inc_rfc1001_len(rsp_org, 60); + } + + return 0; +} + +/** + * smb2_echo() - handler for smb2 echo(ping) command + * @work: smb work containing echo request buffer + * + * Return: 0 + */ +int smb2_echo(struct ksmbd_work *work) +{ + struct smb2_echo_rsp *rsp = work->response_buf; + + rsp->StructureSize = cpu_to_le16(4); + rsp->Reserved = 0; + inc_rfc1001_len(rsp, 4); + return 0; +} + +static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp, + struct smb2_file_rename_info *file_info, + struct nls_table *local_nls) +{ + struct ksmbd_share_config *share = fp->tcon->share_conf; + char *new_name = NULL, *abs_oldname = NULL, *old_name = NULL; + char *pathname = NULL; + struct path path; + bool file_present = true; + int rc; + + ksmbd_debug(SMB, "setting FILE_RENAME_INFO\n"); + pathname = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathname) + return -ENOMEM; + + abs_oldname = d_path(&fp->filp->f_path, pathname, PATH_MAX); + if (IS_ERR(abs_oldname)) { + rc = -EINVAL; + goto out; + } + old_name = strrchr(abs_oldname, '/'); + if (old_name && old_name[1] != '\0') { + old_name++; + } else { + ksmbd_debug(SMB, "can't get last component in path %s\n", + abs_oldname); + rc = -ENOENT; + goto out; + } + + new_name = smb2_get_name(share, + file_info->FileName, + le32_to_cpu(file_info->FileNameLength), + local_nls); + if (IS_ERR(new_name)) { + rc = PTR_ERR(new_name); + goto out; + } + + if (strchr(new_name, ':')) { + int s_type; + char *xattr_stream_name, *stream_name = NULL; + size_t xattr_stream_size; + int len; + + rc = parse_stream_name(new_name, &stream_name, &s_type); + if (rc < 0) + goto out; + + len = strlen(new_name); + if (new_name[len - 1] != '/') { + pr_err("base filename is not allowed in rename\n"); + rc = -ESHARE; + goto out; + } 
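+ /* renaming to a :stream target only records the stream name as a user xattr on the file; nothing is renamed on disk */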
+ + rc = ksmbd_vfs_xattr_stream_name(stream_name, + &xattr_stream_name, + &xattr_stream_size, + s_type); + if (rc) + goto out; + + rc = ksmbd_vfs_setxattr(file_mnt_user_ns(fp->filp), + fp->filp->f_path.dentry, + xattr_stream_name, + NULL, 0, 0); + if (rc < 0) { + pr_err("failed to store stream name in xattr: %d\n", + rc); + rc = -EINVAL; + goto out; + } + + goto out; + } + + ksmbd_debug(SMB, "new name %s\n", new_name); + rc = ksmbd_vfs_kern_path(new_name, 0, &path, 1); + if (rc) + file_present = false; + else + path_put(&path); + + if (ksmbd_share_veto_filename(share, new_name)) { + rc = -ENOENT; + ksmbd_debug(SMB, "Can't rename vetoed file: %s\n", new_name); + goto out; + } + + if (file_info->ReplaceIfExists) { + if (file_present) { + rc = ksmbd_vfs_remove_file(work, new_name); + if (rc) { + if (rc != -ENOTEMPTY) + rc = -EINVAL; + ksmbd_debug(SMB, "cannot delete %s, rc %d\n", + new_name, rc); + goto out; + } + } + } else { + if (file_present && + strncmp(old_name, path.dentry->d_name.name, strlen(old_name))) { + rc = -EEXIST; + ksmbd_debug(SMB, + "cannot rename already existing file\n"); + goto out; + } + } + + rc = ksmbd_vfs_fp_rename(work, fp, new_name); +out: + kfree(pathname); + if (!IS_ERR(new_name)) + kfree(new_name); + return rc; +} + +static int smb2_create_link(struct ksmbd_work *work, + struct ksmbd_share_config *share, + struct smb2_file_link_info *file_info, + struct file *filp, + struct nls_table *local_nls) +{ + char *link_name = NULL, *target_name = NULL, *pathname = NULL; + struct path path; + bool file_present = true; + int rc; + + ksmbd_debug(SMB, "setting FILE_LINK_INFORMATION\n"); + pathname = kmalloc(PATH_MAX, GFP_KERNEL); + if (!pathname) + return -ENOMEM; + + link_name = smb2_get_name(share, + file_info->FileName, + le32_to_cpu(file_info->FileNameLength), + local_nls); + if (IS_ERR(link_name) || S_ISDIR(file_inode(filp)->i_mode)) { + rc = -EINVAL; + goto out; + } + + ksmbd_debug(SMB, "link name is %s\n", link_name); + target_name = d_path(&filp->f_path, pathname, PATH_MAX); + if (IS_ERR(target_name)) { + rc = -EINVAL; + goto out; + } + + ksmbd_debug(SMB, "target name is %s\n", target_name); + rc = ksmbd_vfs_kern_path(link_name, 0, &path, 0); + if (rc) + file_present = false; + else + path_put(&path); + + if (file_info->ReplaceIfExists) { + if (file_present) { + rc = ksmbd_vfs_remove_file(work, link_name); + if (rc) { + rc = -EINVAL; + ksmbd_debug(SMB, "cannot delete %s\n", + link_name); + goto out; + } + } + } else { + if (file_present) { + rc = -EEXIST; + ksmbd_debug(SMB, "link already exists\n"); + goto out; + } + } + + rc = ksmbd_vfs_link(work, target_name, link_name); + if (rc) + rc = -EINVAL; +out: + if (!IS_ERR(link_name)) + kfree(link_name); + kfree(pathname); + return rc; +} + +static int set_file_basic_info(struct ksmbd_file *fp, char *buf, + struct ksmbd_share_config *share) +{ + struct smb2_file_all_info *file_info; + struct iattr attrs; + struct iattr temp_attrs; + struct file *filp; + struct inode *inode; + struct user_namespace *user_ns; + int rc; + + if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE)) + return -EACCES; + + file_info = (struct smb2_file_all_info *)buf; + attrs.ia_valid = 0; + filp = fp->filp; + inode = file_inode(filp); + user_ns = file_mnt_user_ns(filp); + + if (file_info->CreationTime) + fp->create_time = le64_to_cpu(file_info->CreationTime); + + if (file_info->LastAccessTime) { + attrs.ia_atime = ksmbd_NTtimeToUnix(file_info->LastAccessTime); + attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); + } + + if (file_info->ChangeTime) { + 
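/* stash the requested ctime; the HACK below re-applies it so a zero ChangeTime does not clobber the current value */ +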
temp_attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); + attrs.ia_ctime = temp_attrs.ia_ctime; + attrs.ia_valid |= ATTR_CTIME; + } else { + temp_attrs.ia_ctime = inode->i_ctime; + } + + if (file_info->LastWriteTime) { + attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); + attrs.ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); + } + + if (file_info->Attributes) { + if (!S_ISDIR(inode->i_mode) && + file_info->Attributes & ATTR_DIRECTORY_LE) { + pr_err("can't change a file to a directory\n"); + return -EINVAL; + } + + if (!(S_ISDIR(inode->i_mode) && file_info->Attributes == ATTR_NORMAL_LE)) + fp->f_ci->m_fattr = file_info->Attributes | + (fp->f_ci->m_fattr & ATTR_DIRECTORY_LE); + } + + if (test_share_config_flag(share, KSMBD_SHARE_FLAG_STORE_DOS_ATTRS) && + (file_info->CreationTime || file_info->Attributes)) { + struct xattr_dos_attrib da = {0}; + + da.version = 4; + da.itime = fp->itime; + da.create_time = fp->create_time; + da.attr = le32_to_cpu(fp->f_ci->m_fattr); + da.flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME | + XATTR_DOSINFO_ITIME; + + rc = ksmbd_vfs_set_dos_attrib_xattr(user_ns, + filp->f_path.dentry, &da); + if (rc) + ksmbd_debug(SMB, + "failed to restore file attribute in EA\n"); + rc = 0; + } + + /* + * HACK : set ctime here to avoid ctime being changed + * when file_info->ChangeTime is zero. + */ + attrs.ia_ctime = temp_attrs.ia_ctime; + attrs.ia_valid |= ATTR_CTIME; + + if (attrs.ia_valid) { + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = d_inode(dentry); + + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EACCES; + + rc = setattr_prepare(user_ns, dentry, &attrs); + if (rc) + return -EINVAL; + + inode_lock(inode); + setattr_copy(user_ns, inode, &attrs); + attrs.ia_valid &= ~ATTR_CTIME; + rc = notify_change(user_ns, dentry, &attrs, NULL); + inode_unlock(inode); + } + return 0; +} + +static int set_file_allocation_info(struct ksmbd_work *work, + struct ksmbd_file *fp, char *buf) +{ + /* + * TODO : This works correctly only when 'store dos attributes' + * is not set to yes. Implement logic that works + * properly with any smb.conf option + */ + + struct smb2_file_alloc_info *file_alloc_info; + loff_t alloc_blks; + struct inode *inode; + int rc; + + if (!(fp->daccess & FILE_WRITE_DATA_LE)) + return -EACCES; + + file_alloc_info = (struct smb2_file_alloc_info *)buf; + alloc_blks = (le64_to_cpu(file_alloc_info->AllocationSize) + 511) >> 9; + inode = file_inode(fp->filp); + + if (alloc_blks > inode->i_blocks) { + smb_break_all_levII_oplock(work, fp, 1); + rc = vfs_fallocate(fp->filp, FALLOC_FL_KEEP_SIZE, 0, + alloc_blks * 512); + if (rc && rc != -EOPNOTSUPP) { + pr_err("vfs_fallocate failed : %d\n", rc); + return rc; + } + } else if (alloc_blks < inode->i_blocks) { + loff_t size; + + /* + * The allocation size could be smaller than the original one, + * which means allocated blocks in the file should be + * deallocated. Use truncate to cut them out, but since + * truncate also updates the inode size with the truncate + * offset, restore the inode size from the backed-up value. + */ + size = i_size_read(inode); + rc = ksmbd_vfs_truncate(work, NULL, fp, alloc_blks * 512); + if (rc) { + pr_err("truncate failed! 
filename : %s, err %d\n", + fp->filename, rc); + return rc; + } + if (size < alloc_blks * 512) + i_size_write(inode, size); + } + return 0; +} + +static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, + char *buf) +{ + struct smb2_file_eof_info *file_eof_info; + loff_t newsize; + struct inode *inode; + int rc; + + if (!(fp->daccess & FILE_WRITE_DATA_LE)) + return -EACCES; + + file_eof_info = (struct smb2_file_eof_info *)buf; + newsize = le64_to_cpu(file_eof_info->EndOfFile); + inode = file_inode(fp->filp); + + /* + * If FILE_END_OF_FILE_INFORMATION of set_info_file is called + * on a shared FAT32 device, truncation can take so long that + * the Windows client hits a network error, because filesystems + * like FAT32 fill the truncated range with zeros. + */ + if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) { + ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n", + fp->filename, newsize); + rc = ksmbd_vfs_truncate(work, NULL, fp, newsize); + if (rc) { + ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n", + fp->filename, rc); + if (rc != -EAGAIN) + rc = -EBADF; + return rc; + } + } + return 0; +} + +static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, + char *buf) +{ + struct ksmbd_file *parent_fp; + struct dentry *parent; + struct dentry *dentry = fp->filp->f_path.dentry; + int ret; + + if (!(fp->daccess & FILE_DELETE_LE)) { + pr_err("no right to delete : 0x%x\n", fp->daccess); + return -EACCES; + } + + if (ksmbd_stream_fd(fp)) + goto next; + + parent = dget_parent(dentry); + ret = ksmbd_vfs_lock_parent(parent, dentry); + if (ret) { + dput(parent); + return ret; + } + + parent_fp = ksmbd_lookup_fd_inode(d_inode(parent)); + inode_unlock(d_inode(parent)); + dput(parent); + + if (parent_fp) { + if (parent_fp->daccess & FILE_DELETE_LE) { + pr_err("parent dir is opened with delete access\n"); + return -ESHARE; + } + } +next: + return smb2_rename(work, fp, + (struct smb2_file_rename_info *)buf, + work->sess->conn->local_nls); +} + +static int set_file_disposition_info(struct ksmbd_file *fp, char *buf) +{ + struct smb2_file_disposition_info *file_info; + struct inode *inode; + + if (!(fp->daccess & FILE_DELETE_LE)) { + pr_err("no right to delete : 0x%x\n", fp->daccess); + return -EACCES; + } + + inode = file_inode(fp->filp); + file_info = (struct smb2_file_disposition_info *)buf; + if (file_info->DeletePending) { + if (S_ISDIR(inode->i_mode) && + ksmbd_vfs_empty_dir(fp) == -ENOTEMPTY) + return -EBUSY; + ksmbd_set_inode_pending_delete(fp); + } else { + ksmbd_clear_inode_pending_delete(fp); + } + return 0; +} + +static int set_file_position_info(struct ksmbd_file *fp, char *buf) +{ + struct smb2_file_pos_info *file_info; + loff_t current_byte_offset; + unsigned long sector_size; + struct inode *inode; + + inode = file_inode(fp->filp); + file_info = (struct smb2_file_pos_info *)buf; + current_byte_offset = le64_to_cpu(file_info->CurrentByteOffset); + sector_size = inode->i_sb->s_blocksize; + + if (current_byte_offset < 0 || + (fp->coption == FILE_NO_INTERMEDIATE_BUFFERING_LE && + current_byte_offset & (sector_size - 1))) { + pr_err("CurrentByteOffset is not valid : %llu\n", + current_byte_offset); + return -EINVAL; + } + + fp->filp->f_pos = current_byte_offset; + return 0; +} + +static int set_file_mode_info(struct ksmbd_file *fp, char *buf) +{ + struct smb2_file_mode_info *file_info; + __le32 mode; + + file_info = (struct smb2_file_mode_info *)buf; + mode = file_info->Mode; + + if ((mode & 
~FILE_MODE_INFO_MASK) || + (mode & FILE_SYNCHRONOUS_IO_ALERT_LE && + mode & FILE_SYNCHRONOUS_IO_NONALERT_LE)) { + pr_err("Mode is not valid : 0x%x\n", le32_to_cpu(mode)); + return -EINVAL; + } + + /* + * TODO : need to implement consideration for + * FILE_SYNCHRONOUS_IO_ALERT and FILE_SYNCHRONOUS_IO_NONALERT + */ + ksmbd_vfs_set_fadvise(fp->filp, mode); + fp->coption = mode; + return 0; +} + +/** + * smb2_set_info_file() - handler for smb2 set info command + * @work: smb work containing set info command buffer + * @fp: ksmbd_file pointer + * @info_class: smb2 set info class + * @buf: set info request buffer + * @share: ksmbd_share_config pointer + * + * Return: 0 on success, otherwise error + * TODO: implement error handling for STATUS_INFO_LENGTH_MISMATCH + */ +static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, + int info_class, char *buf, + struct ksmbd_share_config *share) +{ + switch (info_class) { + case FILE_BASIC_INFORMATION: + return set_file_basic_info(fp, buf, share); + + case FILE_ALLOCATION_INFORMATION: + return set_file_allocation_info(work, fp, buf); + + case FILE_END_OF_FILE_INFORMATION: + return set_end_of_file_info(work, fp, buf); + + case FILE_RENAME_INFORMATION: + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, + "User does not have write permission\n"); + return -EACCES; + } + return set_rename_info(work, fp, buf); + + case FILE_LINK_INFORMATION: + return smb2_create_link(work, work->tcon->share_conf, + (struct smb2_file_link_info *)buf, fp->filp, + work->sess->conn->local_nls); + + case FILE_DISPOSITION_INFORMATION: + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, + "User does not have write permission\n"); + return -EACCES; + } + return set_file_disposition_info(fp, buf); + + case FILE_FULL_EA_INFORMATION: + { + if (!(fp->daccess & FILE_WRITE_EA_LE)) { + pr_err("Not permitted to write ext attr: 0x%x\n", + fp->daccess); + return -EACCES; + } + + return smb2_set_ea((struct smb2_ea_info *)buf, + &fp->filp->f_path); + } + + case FILE_POSITION_INFORMATION: + return set_file_position_info(fp, buf); + + case FILE_MODE_INFORMATION: + return set_file_mode_info(fp, buf); + } + + pr_err("Unimplemented FileInfoClass: %d\n", info_class); + return -EOPNOTSUPP; +} + +static int smb2_set_info_sec(struct ksmbd_file *fp, int addition_info, + char *buffer, int buf_len) +{ + struct smb_ntsd *pntsd = (struct smb_ntsd *)buffer; + + fp->saccess |= FILE_SHARE_DELETE_LE; + + return set_info_sec(fp->conn, fp->tcon, &fp->filp->f_path, pntsd, + buf_len, false); +} + +/** + * smb2_set_info() - handler for smb2 set info command + * @work: smb work containing set info request buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_set_info(struct ksmbd_work *work) +{ + struct smb2_set_info_req *req; + struct smb2_set_info_rsp *rsp, *rsp_org; + struct ksmbd_file *fp; + int rc = 0; + unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID; + + ksmbd_debug(SMB, "Received set info request\n"); + + rsp_org = work->response_buf; + if (work->next_smb2_rcv_hdr_off) { + req = ksmbd_req_buf_next(work); + rsp = ksmbd_resp_buf_next(work); + if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + ksmbd_debug(SMB, "Compound request set FID = %llu\n", + work->compound_fid); + id = work->compound_fid; + pid = work->compound_pfid; + } + } else { + req = work->request_buf; + rsp = work->response_buf; + } + + if (!has_file_id(id)) { + id = le64_to_cpu(req->VolatileFileId); + pid = 
le64_to_cpu(req->PersistentFileId); + } + + fp = ksmbd_lookup_fd_slow(work, id, pid); + if (!fp) { + ksmbd_debug(SMB, "Invalid id for set info: %u\n", id); + rc = -ENOENT; + goto err_out; + } + + switch (req->InfoType) { + case SMB2_O_INFO_FILE: + ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n"); + rc = smb2_set_info_file(work, fp, req->FileInfoClass, + req->Buffer, work->tcon->share_conf); + break; + case SMB2_O_INFO_SECURITY: + ksmbd_debug(SMB, "GOT SMB2_O_INFO_SECURITY\n"); + if (ksmbd_override_fsids(work)) { + rc = -ENOMEM; + goto err_out; + } + rc = smb2_set_info_sec(fp, + le32_to_cpu(req->AdditionalInformation), + req->Buffer, + le32_to_cpu(req->BufferLength)); + ksmbd_revert_fsids(work); + break; + default: + rc = -EOPNOTSUPP; + } + + if (rc < 0) + goto err_out; + + rsp->StructureSize = cpu_to_le16(2); + inc_rfc1001_len(rsp_org, 2); + ksmbd_fd_put(work, fp); + return 0; + +err_out: + if (rc == -EACCES || rc == -EPERM) + rsp->hdr.Status = STATUS_ACCESS_DENIED; + else if (rc == -EINVAL) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + else if (rc == -ESHARE) + rsp->hdr.Status = STATUS_SHARING_VIOLATION; + else if (rc == -ENOENT) + rsp->hdr.Status = STATUS_OBJECT_NAME_INVALID; + else if (rc == -EBUSY || rc == -ENOTEMPTY) + rsp->hdr.Status = STATUS_DIRECTORY_NOT_EMPTY; + else if (rc == -EAGAIN) + rsp->hdr.Status = STATUS_FILE_LOCK_CONFLICT; + else if (rc == -EBADF || rc == -ESTALE) + rsp->hdr.Status = STATUS_INVALID_HANDLE; + else if (rc == -EEXIST) + rsp->hdr.Status = STATUS_OBJECT_NAME_COLLISION; + else if (rsp->hdr.Status == 0 || rc == -EOPNOTSUPP) + rsp->hdr.Status = STATUS_INVALID_INFO_CLASS; + smb2_set_err_rsp(work); + ksmbd_fd_put(work, fp); + ksmbd_debug(SMB, "error while processing smb2 set info rc = %d\n", rc); + return rc; +} + +/** + * smb2_read_pipe() - handler for smb2 read from IPC pipe + * @work: smb work containing read IPC pipe command buffer + * + * Return: 0 on success, otherwise error + */ +static noinline int smb2_read_pipe(struct ksmbd_work *work) +{ + int nbytes = 0, err; + u64 id; + struct ksmbd_rpc_command *rpc_resp; + struct smb2_read_req *req = work->request_buf; + struct smb2_read_rsp *rsp = work->response_buf; + + id = le64_to_cpu(req->VolatileFileId); + + inc_rfc1001_len(rsp, 16); + rpc_resp = ksmbd_rpc_read(work->sess, id); + if (rpc_resp) { + if (rpc_resp->flags != KSMBD_RPC_OK) { + err = -EINVAL; + goto out; + } + + work->aux_payload_buf = + kvmalloc(rpc_resp->payload_sz, GFP_KERNEL | __GFP_ZERO); + if (!work->aux_payload_buf) { + err = -ENOMEM; + goto out; + } + + memcpy(work->aux_payload_buf, rpc_resp->payload, + rpc_resp->payload_sz); + + nbytes = rpc_resp->payload_sz; + work->resp_hdr_sz = get_rfc1002_len(rsp) + 4; + work->aux_payload_sz = nbytes; + kvfree(rpc_resp); + } + + rsp->StructureSize = cpu_to_le16(17); + rsp->DataOffset = 80; + rsp->Reserved = 0; + rsp->DataLength = cpu_to_le32(nbytes); + rsp->DataRemaining = 0; + rsp->Reserved2 = 0; + inc_rfc1001_len(rsp, nbytes); + return 0; + +out: + rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; + smb2_set_err_rsp(work); + kvfree(rpc_resp); + return err; +} + +static ssize_t smb2_read_rdma_channel(struct ksmbd_work *work, + struct smb2_read_req *req, void *data_buf, + size_t length) +{ + struct smb2_buffer_desc_v1 *desc = + (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; + int err; + + if (work->conn->dialect == SMB30_PROT_ID && + req->Channel != SMB2_CHANNEL_RDMA_V1) + return -EINVAL; + + if (req->ReadChannelInfoOffset == 0 || + le16_to_cpu(req->ReadChannelInfoLength) < sizeof(*desc)) + return -EINVAL; + + 
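/* RDMA read: the server RDMA-writes the file data straight into the client buffer described by (token, offset, length) */ +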
work->need_invalidate_rkey = + (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); + work->remote_key = le32_to_cpu(desc->token); + + err = ksmbd_conn_rdma_write(work->conn, data_buf, length, + le32_to_cpu(desc->token), + le64_to_cpu(desc->offset), + le32_to_cpu(desc->length)); + if (err) + return err; + + return length; +} + +/** + * smb2_read() - handler for smb2 read from file + * @work: smb work containing read command buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_read(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_read_req *req; + struct smb2_read_rsp *rsp, *rsp_org; + struct ksmbd_file *fp; + loff_t offset; + size_t length, mincount; + ssize_t nbytes = 0, remain_bytes = 0; + int err = 0; + + rsp_org = work->response_buf; + WORK_BUFFERS(work, req, rsp); + + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_PIPE)) { + ksmbd_debug(SMB, "IPC pipe read request\n"); + return smb2_read_pipe(work); + } + + fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId)); + if (!fp) { + err = -ENOENT; + goto out; + } + + if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_READ_ATTRIBUTES_LE))) { + pr_err("Not permitted to read : 0x%x\n", fp->daccess); + err = -EACCES; + goto out; + } + + offset = le64_to_cpu(req->Offset); + length = le32_to_cpu(req->Length); + mincount = le32_to_cpu(req->MinimumCount); + + if (length > conn->vals->max_read_size) { + ksmbd_debug(SMB, "limiting read size to max size(%u)\n", + conn->vals->max_read_size); + err = -EINVAL; + goto out; + } + + ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", + fp->filp->f_path.dentry, offset, length); + + work->aux_payload_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); + if (!work->aux_payload_buf) { + err = -ENOMEM; + goto out; + } + + nbytes = ksmbd_vfs_read(work, fp, length, &offset); + if (nbytes < 0) { + err = nbytes; + goto out; + } + + if ((nbytes == 0 && length != 0) || nbytes < mincount) { + kvfree(work->aux_payload_buf); + work->aux_payload_buf = NULL; + rsp->hdr.Status = STATUS_END_OF_FILE; + smb2_set_err_rsp(work); + ksmbd_fd_put(work, fp); + return 0; + } + + ksmbd_debug(SMB, "nbytes %zu, offset %lld mincount %zu\n", + nbytes, offset, mincount); + + if (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE || + req->Channel == SMB2_CHANNEL_RDMA_V1) { + /* write data to the client using rdma channel */ + remain_bytes = smb2_read_rdma_channel(work, req, + work->aux_payload_buf, + nbytes); + kvfree(work->aux_payload_buf); + work->aux_payload_buf = NULL; + + nbytes = 0; + if (remain_bytes < 0) { + err = (int)remain_bytes; + goto out; + } + } + + rsp->StructureSize = cpu_to_le16(17); + rsp->DataOffset = 80; + rsp->Reserved = 0; + rsp->DataLength = cpu_to_le32(nbytes); + rsp->DataRemaining = cpu_to_le32(remain_bytes); + rsp->Reserved2 = 0; + inc_rfc1001_len(rsp_org, 16); + work->resp_hdr_sz = get_rfc1002_len(rsp_org) + 4; + work->aux_payload_sz = nbytes; + inc_rfc1001_len(rsp_org, nbytes); + ksmbd_fd_put(work, fp); + return 0; + +out: + if (err) { + if (err == -EISDIR) + rsp->hdr.Status = STATUS_INVALID_DEVICE_REQUEST; + else if (err == -EAGAIN) + rsp->hdr.Status = STATUS_FILE_LOCK_CONFLICT; + else if (err == -ENOENT) + rsp->hdr.Status = STATUS_FILE_CLOSED; + else if (err == -EACCES) + rsp->hdr.Status = STATUS_ACCESS_DENIED; + else if (err == -ESHARE) + rsp->hdr.Status = STATUS_SHARING_VIOLATION; + else if (err == -EINVAL) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + else + rsp->hdr.Status = 
STATUS_INVALID_HANDLE; + + smb2_set_err_rsp(work); + } + ksmbd_fd_put(work, fp); + return err; +} + +/** + * smb2_write_pipe() - handler for smb2 write on IPC pipe + * @work: smb work containing write IPC pipe command buffer + * + * Return: 0 on success, otherwise error + */ +static noinline int smb2_write_pipe(struct ksmbd_work *work) +{ + struct smb2_write_req *req = work->request_buf; + struct smb2_write_rsp *rsp = work->response_buf; + struct ksmbd_rpc_command *rpc_resp; + u64 id = 0; + int err = 0, ret = 0; + char *data_buf; + size_t length; + + length = le32_to_cpu(req->Length); + id = le64_to_cpu(req->VolatileFileId); + + if (le16_to_cpu(req->DataOffset) == + (offsetof(struct smb2_write_req, Buffer) - 4)) { + data_buf = (char *)&req->Buffer[0]; + } else { + if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) || + (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(req)); + err = -EINVAL; + goto out; + } + + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); + } + + rpc_resp = ksmbd_rpc_write(work->sess, id, data_buf, length); + if (rpc_resp) { + if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) { + rsp->hdr.Status = STATUS_NOT_SUPPORTED; + kvfree(rpc_resp); + smb2_set_err_rsp(work); + return -EOPNOTSUPP; + } + if (rpc_resp->flags != KSMBD_RPC_OK) { + rsp->hdr.Status = STATUS_INVALID_HANDLE; + smb2_set_err_rsp(work); + kvfree(rpc_resp); + return ret; + } + kvfree(rpc_resp); + } + + rsp->StructureSize = cpu_to_le16(17); + rsp->DataOffset = 0; + rsp->Reserved = 0; + rsp->DataLength = cpu_to_le32(length); + rsp->DataRemaining = 0; + rsp->Reserved2 = 0; + inc_rfc1001_len(rsp, 16); + return 0; +out: + if (err) { + rsp->hdr.Status = STATUS_INVALID_HANDLE; + smb2_set_err_rsp(work); + } + + return err; +} + +static ssize_t smb2_write_rdma_channel(struct ksmbd_work *work, + struct smb2_write_req *req, + struct ksmbd_file *fp, + loff_t offset, size_t length, bool sync) +{ + struct smb2_buffer_desc_v1 *desc; + char *data_buf; + int ret; + ssize_t nbytes; + + desc = (struct smb2_buffer_desc_v1 *)&req->Buffer[0]; + + if (work->conn->dialect == SMB30_PROT_ID && + req->Channel != SMB2_CHANNEL_RDMA_V1) + return -EINVAL; + + if (req->Length != 0 || req->DataOffset != 0) + return -EINVAL; + + if (req->WriteChannelInfoOffset == 0 || + le16_to_cpu(req->WriteChannelInfoLength) < sizeof(*desc)) + return -EINVAL; + + work->need_invalidate_rkey = + (req->Channel == SMB2_CHANNEL_RDMA_V1_INVALIDATE); + work->remote_key = le32_to_cpu(desc->token); + + data_buf = kvmalloc(length, GFP_KERNEL | __GFP_ZERO); + if (!data_buf) + return -ENOMEM; + + ret = ksmbd_conn_rdma_read(work->conn, data_buf, length, + le32_to_cpu(desc->token), + le64_to_cpu(desc->offset), + le32_to_cpu(desc->length)); + if (ret < 0) { + kvfree(data_buf); + return ret; + } + + ret = ksmbd_vfs_write(work, fp, data_buf, length, &offset, sync, &nbytes); + kvfree(data_buf); + if (ret < 0) + return ret; + + return nbytes; +} + +/** + * smb2_write() - handler for smb2 write to file + * @work: smb work containing write command buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_write(struct ksmbd_work *work) +{ + struct smb2_write_req *req; + struct smb2_write_rsp *rsp, *rsp_org; + struct ksmbd_file *fp = NULL; + loff_t offset; + size_t length; + ssize_t nbytes; + char *data_buf; + bool writethrough = false; + int err = 0; + + rsp_org = work->response_buf; + WORK_BUFFERS(work, 
req, rsp); + + if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_PIPE)) { + ksmbd_debug(SMB, "IPC pipe write request\n"); + return smb2_write_pipe(work); + } + + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, "User does not have write permission\n"); + err = -EACCES; + goto out; + } + + fp = ksmbd_lookup_fd_slow(work, le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId)); + if (!fp) { + err = -ENOENT; + goto out; + } + + if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_READ_ATTRIBUTES_LE))) { + pr_err("Not permitted to write : 0x%x\n", fp->daccess); + err = -EACCES; + goto out; + } + + offset = le64_to_cpu(req->Offset); + length = le32_to_cpu(req->Length); + + if (length > work->conn->vals->max_write_size) { + ksmbd_debug(SMB, "limiting write size to max size(%u)\n", + work->conn->vals->max_write_size); + err = -EINVAL; + goto out; + } + + if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) + writethrough = true; + + if (req->Channel != SMB2_CHANNEL_RDMA_V1 && + req->Channel != SMB2_CHANNEL_RDMA_V1_INVALIDATE) { + if (le16_to_cpu(req->DataOffset) == + (offsetof(struct smb2_write_req, Buffer) - 4)) { + data_buf = (char *)&req->Buffer[0]; + } else { + if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) || + (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) { + pr_err("invalid write data offset %u, smb_len %u\n", + le16_to_cpu(req->DataOffset), + get_rfc1002_len(req)); + err = -EINVAL; + goto out; + } + + data_buf = (char *)(((char *)&req->hdr.ProtocolId) + + le16_to_cpu(req->DataOffset)); + } + + ksmbd_debug(SMB, "flags %u\n", le32_to_cpu(req->Flags)); + if (le32_to_cpu(req->Flags) & SMB2_WRITEFLAG_WRITE_THROUGH) + writethrough = true; + + ksmbd_debug(SMB, "filename %pd, offset %lld, len %zu\n", + fp->filp->f_path.dentry, offset, length); + err = ksmbd_vfs_write(work, fp, data_buf, length, &offset, + writethrough, &nbytes); + if (err < 0) + goto out; + } else { + /* read data from the client using rdma channel, and + * write the data. 
+ */ + nbytes = smb2_write_rdma_channel(work, req, fp, offset, + le32_to_cpu(req->RemainingBytes), + writethrough); + if (nbytes < 0) { + err = (int)nbytes; + goto out; + } + } + + rsp->StructureSize = cpu_to_le16(17); + rsp->DataOffset = 0; + rsp->Reserved = 0; + rsp->DataLength = cpu_to_le32(nbytes); + rsp->DataRemaining = 0; + rsp->Reserved2 = 0; + inc_rfc1001_len(rsp_org, 16); + ksmbd_fd_put(work, fp); + return 0; + +out: + if (err == -EAGAIN) + rsp->hdr.Status = STATUS_FILE_LOCK_CONFLICT; + else if (err == -ENOSPC || err == -EFBIG) + rsp->hdr.Status = STATUS_DISK_FULL; + else if (err == -ENOENT) + rsp->hdr.Status = STATUS_FILE_CLOSED; + else if (err == -EACCES) + rsp->hdr.Status = STATUS_ACCESS_DENIED; + else if (err == -ESHARE) + rsp->hdr.Status = STATUS_SHARING_VIOLATION; + else if (err == -EINVAL) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + else + rsp->hdr.Status = STATUS_INVALID_HANDLE; + + smb2_set_err_rsp(work); + ksmbd_fd_put(work, fp); + return err; +} + +/** + * smb2_flush() - handler for smb2 flush file - fsync + * @work: smb work containing flush command buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_flush(struct ksmbd_work *work) +{ + struct smb2_flush_req *req; + struct smb2_flush_rsp *rsp, *rsp_org; + int err; + + rsp_org = work->response_buf; + WORK_BUFFERS(work, req, rsp); + + ksmbd_debug(SMB, "SMB2_FLUSH called for fid %llu\n", + le64_to_cpu(req->VolatileFileId)); + + err = ksmbd_vfs_fsync(work, + le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId)); + if (err) + goto out; + + rsp->StructureSize = cpu_to_le16(4); + rsp->Reserved = 0; + inc_rfc1001_len(rsp_org, 4); + return 0; + +out: + if (err) { + rsp->hdr.Status = STATUS_INVALID_HANDLE; + smb2_set_err_rsp(work); + } + + return err; +} + +/** + * smb2_cancel() - handler for smb2 cancel command + * @work: smb work containing cancel command buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_cancel(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_hdr *hdr = work->request_buf; + struct smb2_hdr *chdr; + struct ksmbd_work *cancel_work = NULL; + int canceled = 0; + struct list_head *command_list; + + ksmbd_debug(SMB, "smb2 cancel called on mid %llu, async flags 0x%x\n", + hdr->MessageId, hdr->Flags); + + if (hdr->Flags & SMB2_FLAGS_ASYNC_COMMAND) { + command_list = &conn->async_requests; + + spin_lock(&conn->request_lock); + list_for_each_entry(cancel_work, command_list, + async_request_entry) { + chdr = cancel_work->request_buf; + + if (cancel_work->async_id != + le64_to_cpu(hdr->Id.AsyncId)) + continue; + + ksmbd_debug(SMB, + "smb2 with AsyncId %llu cancelled command = 0x%x\n", + le64_to_cpu(hdr->Id.AsyncId), + le16_to_cpu(chdr->Command)); + canceled = 1; + break; + } + spin_unlock(&conn->request_lock); + } else { + command_list = &conn->requests; + + spin_lock(&conn->request_lock); + list_for_each_entry(cancel_work, command_list, request_entry) { + chdr = cancel_work->request_buf; + + if (chdr->MessageId != hdr->MessageId || + cancel_work == work) + continue; + + ksmbd_debug(SMB, + "smb2 with mid %llu cancelled command = 0x%x\n", + le64_to_cpu(hdr->MessageId), + le16_to_cpu(chdr->Command)); + canceled = 1; + break; + } + spin_unlock(&conn->request_lock); + } + + if (canceled) { + cancel_work->state = KSMBD_WORK_CANCELLED; + if (cancel_work->cancel_fn) + cancel_work->cancel_fn(cancel_work->cancel_argv); + } + + /* For SMB2_CANCEL command itself send no response*/ + work->send_no_response = 1; + return 0; +} + +struct file_lock 
*smb_flock_init(struct file *f) +{ + struct file_lock *fl; + + fl = locks_alloc_lock(); + if (!fl) + goto out; + + locks_init_lock(fl); + + fl->fl_owner = f; + fl->fl_pid = current->tgid; + fl->fl_file = f; + fl->fl_flags = FL_POSIX; + fl->fl_ops = NULL; + fl->fl_lmops = NULL; + +out: + return fl; +} + +static int smb2_set_flock_flags(struct file_lock *flock, int flags) +{ + int cmd = -EINVAL; + + /* Checking for wrong flag combination during lock request*/ + switch (flags) { + case SMB2_LOCKFLAG_SHARED: + ksmbd_debug(SMB, "received shared request\n"); + cmd = F_SETLKW; + flock->fl_type = F_RDLCK; + flock->fl_flags |= FL_SLEEP; + break; + case SMB2_LOCKFLAG_EXCLUSIVE: + ksmbd_debug(SMB, "received exclusive request\n"); + cmd = F_SETLKW; + flock->fl_type = F_WRLCK; + flock->fl_flags |= FL_SLEEP; + break; + case SMB2_LOCKFLAG_SHARED | SMB2_LOCKFLAG_FAIL_IMMEDIATELY: + ksmbd_debug(SMB, + "received shared & fail immediately request\n"); + cmd = F_SETLK; + flock->fl_type = F_RDLCK; + break; + case SMB2_LOCKFLAG_EXCLUSIVE | SMB2_LOCKFLAG_FAIL_IMMEDIATELY: + ksmbd_debug(SMB, + "received exclusive & fail immediately request\n"); + cmd = F_SETLK; + flock->fl_type = F_WRLCK; + break; + case SMB2_LOCKFLAG_UNLOCK: + ksmbd_debug(SMB, "received unlock request\n"); + flock->fl_type = F_UNLCK; + cmd = 0; + break; + } + + return cmd; +} + +static struct ksmbd_lock *smb2_lock_init(struct file_lock *flock, + unsigned int cmd, int flags, + struct list_head *lock_list) +{ + struct ksmbd_lock *lock; + + lock = kzalloc(sizeof(struct ksmbd_lock), GFP_KERNEL); + if (!lock) + return NULL; + + lock->cmd = cmd; + lock->fl = flock; + lock->start = flock->fl_start; + lock->end = flock->fl_end; + lock->flags = flags; + if (lock->start == lock->end) + lock->zero_len = 1; + INIT_LIST_HEAD(&lock->clist); + INIT_LIST_HEAD(&lock->flist); + INIT_LIST_HEAD(&lock->llist); + list_add_tail(&lock->llist, lock_list); + + return lock; +} + +static void smb2_remove_blocked_lock(void **argv) +{ + struct file_lock *flock = (struct file_lock *)argv[0]; + + ksmbd_vfs_posix_lock_unblock(flock); + wake_up(&flock->fl_wait); +} + +static inline bool lock_defer_pending(struct file_lock *fl) +{ + /* check pending lock waiters */ + return waitqueue_active(&fl->fl_wait); +} + +/** + * smb2_lock() - handler for smb2 file lock command + * @work: smb work containing lock command buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_lock(struct ksmbd_work *work) +{ + struct smb2_lock_req *req = work->request_buf; + struct smb2_lock_rsp *rsp = work->response_buf; + struct smb2_lock_element *lock_ele; + struct ksmbd_file *fp = NULL; + struct file_lock *flock = NULL; + struct file *filp = NULL; + int lock_count; + int flags = 0; + int cmd = 0; + int err = -EIO, i, rc = 0; + u64 lock_start, lock_length; + struct ksmbd_lock *smb_lock = NULL, *cmp_lock, *tmp, *tmp2; + struct ksmbd_conn *conn; + int nolock = 0; + LIST_HEAD(lock_list); + LIST_HEAD(rollback_list); + int prior_lock = 0; + + ksmbd_debug(SMB, "Received lock request\n"); + fp = ksmbd_lookup_fd_slow(work, + le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId)); + if (!fp) { + ksmbd_debug(SMB, "Invalid file id for lock : %llu\n", + le64_to_cpu(req->VolatileFileId)); + err = -ENOENT; + goto out2; + } + + filp = fp->filp; + lock_count = le16_to_cpu(req->LockCount); + lock_ele = req->locks; + + ksmbd_debug(SMB, "lock count is %d\n", lock_count); + if (!lock_count) { + err = -EINVAL; + goto out2; + } + + for (i = 0; i < lock_count; i++) { + flags = 
le32_to_cpu(lock_ele[i].Flags); + + flock = smb_flock_init(filp); + if (!flock) + goto out; + + cmd = smb2_set_flock_flags(flock, flags); + + lock_start = le64_to_cpu(lock_ele[i].Offset); + lock_length = le64_to_cpu(lock_ele[i].Length); + if (lock_start > U64_MAX - lock_length) { + pr_err("Invalid lock range requested\n"); + rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + goto out; + } + + if (lock_start > OFFSET_MAX) + flock->fl_start = OFFSET_MAX; + else + flock->fl_start = lock_start; + + lock_length = le64_to_cpu(lock_ele[i].Length); + if (lock_length > OFFSET_MAX - flock->fl_start) + lock_length = OFFSET_MAX - flock->fl_start; + + flock->fl_end = flock->fl_start + lock_length; + + if (flock->fl_end < flock->fl_start) { + ksmbd_debug(SMB, + "the end offset(%llx) is smaller than the start offset(%llx)\n", + flock->fl_end, flock->fl_start); + rsp->hdr.Status = STATUS_INVALID_LOCK_RANGE; + goto out; + } + + /* Check conflict locks in one request */ + list_for_each_entry(cmp_lock, &lock_list, llist) { + if (cmp_lock->fl->fl_start <= flock->fl_start && + cmp_lock->fl->fl_end >= flock->fl_end) { + if (cmp_lock->fl->fl_type != F_UNLCK && + flock->fl_type != F_UNLCK) { + pr_err("conflict two locks in one request\n"); + err = -EINVAL; + goto out; + } + } + } + + smb_lock = smb2_lock_init(flock, cmd, flags, &lock_list); + if (!smb_lock) { + err = -EINVAL; + goto out; + } + } + + list_for_each_entry_safe(smb_lock, tmp, &lock_list, llist) { + if (smb_lock->cmd < 0) { + err = -EINVAL; + goto out; + } + + if (!(smb_lock->flags & SMB2_LOCKFLAG_MASK)) { + err = -EINVAL; + goto out; + } + + if ((prior_lock & (SMB2_LOCKFLAG_EXCLUSIVE | SMB2_LOCKFLAG_SHARED) && + smb_lock->flags & SMB2_LOCKFLAG_UNLOCK) || + (prior_lock == SMB2_LOCKFLAG_UNLOCK && + !(smb_lock->flags & SMB2_LOCKFLAG_UNLOCK))) { + err = -EINVAL; + goto out; + } + + prior_lock = smb_lock->flags; + + if (!(smb_lock->flags & SMB2_LOCKFLAG_UNLOCK) && + !(smb_lock->flags & SMB2_LOCKFLAG_FAIL_IMMEDIATELY)) + goto no_check_cl; + + nolock = 1; + /* check locks in connection list */ + read_lock(&conn_list_lock); + list_for_each_entry(conn, &conn_list, conns_list) { + spin_lock(&conn->llist_lock); + list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) { + if (file_inode(cmp_lock->fl->fl_file) != + file_inode(smb_lock->fl->fl_file)) + continue; + + if (smb_lock->fl->fl_type == F_UNLCK) { + if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file && + cmp_lock->start == smb_lock->start && + cmp_lock->end == smb_lock->end && + !lock_defer_pending(cmp_lock->fl)) { + nolock = 0; + list_del(&cmp_lock->flist); + list_del(&cmp_lock->clist); + spin_unlock(&conn->llist_lock); + read_unlock(&conn_list_lock); + + locks_free_lock(cmp_lock->fl); + kfree(cmp_lock); + goto out_check_cl; + } + continue; + } + + if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file) { + if (smb_lock->flags & SMB2_LOCKFLAG_SHARED) + continue; + } else { + if (cmp_lock->flags & SMB2_LOCKFLAG_SHARED) + continue; + } + + /* check zero byte lock range */ + if (cmp_lock->zero_len && !smb_lock->zero_len && + cmp_lock->start > smb_lock->start && + cmp_lock->start < smb_lock->end) { + spin_unlock(&conn->llist_lock); + read_unlock(&conn_list_lock); + pr_err("previous lock conflict with zero byte lock range\n"); + goto out; + } + + if (smb_lock->zero_len && !cmp_lock->zero_len && + smb_lock->start > cmp_lock->start && + smb_lock->start < cmp_lock->end) { + spin_unlock(&conn->llist_lock); + read_unlock(&conn_list_lock); + pr_err("current lock conflict with zero byte lock range\n"); + goto out; 
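
The clamping above converts the client's unsigned 64-bit (Offset, Length) pair into the kernel's signed loff_t range: a pair whose sum would wrap is rejected outright, while a well-formed range that merely extends past the representable maximum is silently clipped. A minimal userspace restatement of the same rules, with INT64_MAX standing in for the kernel's OFFSET_MAX (an illustration, not code from this patch):

	#include <stdbool.h>
	#include <stdint.h>

	#define OFFSET_MAX INT64_MAX	/* stand-in for the kernel's loff_t limit */

	/* Illustrative only: mirrors the overflow/clamp rules of smb2_lock(). */
	static bool smb2_lock_range_to_posix(uint64_t start, uint64_t len,
					     int64_t *fl_start, int64_t *fl_end)
	{
		if (start > UINT64_MAX - len)
			return false;		/* wraps: STATUS_INVALID_LOCK_RANGE */

		*fl_start = start > OFFSET_MAX ? OFFSET_MAX : (int64_t)start;
		if (len > (uint64_t)(OFFSET_MAX - *fl_start))
			len = OFFSET_MAX - *fl_start;	/* clip, don't fail */
		*fl_end = *fl_start + (int64_t)len;
		return true;
	}

The asymmetry is deliberate: wrap-around indicates a malformed request, whereas an oversized but well-formed range is simply truncated to what the VFS can represent.
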
+ } + + if (((cmp_lock->start <= smb_lock->start && + cmp_lock->end > smb_lock->start) || + (cmp_lock->start < smb_lock->end && + cmp_lock->end >= smb_lock->end)) && + !cmp_lock->zero_len && !smb_lock->zero_len) { + spin_unlock(&conn->llist_lock); + read_unlock(&conn_list_lock); + pr_err("Not allow lock operation on exclusive lock range\n"); + goto out; + } + } + spin_unlock(&conn->llist_lock); + } + read_unlock(&conn_list_lock); +out_check_cl: + if (smb_lock->fl->fl_type == F_UNLCK && nolock) { + pr_err("Try to unlock nolocked range\n"); + rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED; + goto out; + } + +no_check_cl: + if (smb_lock->zero_len) { + err = 0; + goto skip; + } + + flock = smb_lock->fl; + list_del(&smb_lock->llist); +retry: + rc = vfs_lock_file(filp, smb_lock->cmd, flock, NULL); +skip: + if (flags & SMB2_LOCKFLAG_UNLOCK) { + if (!rc) { + ksmbd_debug(SMB, "File unlocked\n"); + } else if (rc == -ENOENT) { + rsp->hdr.Status = STATUS_NOT_LOCKED; + goto out; + } + locks_free_lock(flock); + kfree(smb_lock); + } else { + if (rc == FILE_LOCK_DEFERRED) { + void **argv; + + ksmbd_debug(SMB, + "would have to wait for getting lock\n"); + spin_lock(&work->conn->llist_lock); + list_add_tail(&smb_lock->clist, + &work->conn->lock_list); + spin_unlock(&work->conn->llist_lock); + list_add(&smb_lock->llist, &rollback_list); + + argv = kmalloc(sizeof(void *), GFP_KERNEL); + if (!argv) { + err = -ENOMEM; + goto out; + } + argv[0] = flock; + + rc = setup_async_work(work, + smb2_remove_blocked_lock, + argv); + if (rc) { + err = -ENOMEM; + goto out; + } + spin_lock(&fp->f_lock); + list_add(&work->fp_entry, &fp->blocked_works); + spin_unlock(&fp->f_lock); + + smb2_send_interim_resp(work, STATUS_PENDING); + + ksmbd_vfs_posix_lock_wait(flock); + + if (work->state != KSMBD_WORK_ACTIVE) { + list_del(&smb_lock->llist); + spin_lock(&work->conn->llist_lock); + list_del(&smb_lock->clist); + spin_unlock(&work->conn->llist_lock); + locks_free_lock(flock); + + if (work->state == KSMBD_WORK_CANCELLED) { + spin_lock(&fp->f_lock); + list_del(&work->fp_entry); + spin_unlock(&fp->f_lock); + rsp->hdr.Status = + STATUS_CANCELLED; + kfree(smb_lock); + smb2_send_interim_resp(work, + STATUS_CANCELLED); + work->send_no_response = 1; + goto out; + } + init_smb2_rsp_hdr(work); + smb2_set_err_rsp(work); + rsp->hdr.Status = + STATUS_RANGE_NOT_LOCKED; + kfree(smb_lock); + goto out2; + } + + list_del(&smb_lock->llist); + spin_lock(&work->conn->llist_lock); + list_del(&smb_lock->clist); + spin_unlock(&work->conn->llist_lock); + + spin_lock(&fp->f_lock); + list_del(&work->fp_entry); + spin_unlock(&fp->f_lock); + goto retry; + } else if (!rc) { + spin_lock(&work->conn->llist_lock); + list_add_tail(&smb_lock->clist, + &work->conn->lock_list); + list_add_tail(&smb_lock->flist, + &fp->lock_list); + spin_unlock(&work->conn->llist_lock); + list_add(&smb_lock->llist, &rollback_list); + ksmbd_debug(SMB, "successful in taking lock\n"); + } else { + goto out; + } + } + } + + if (atomic_read(&fp->f_ci->op_count) > 1) + smb_break_all_oplock(work, fp); + + rsp->StructureSize = cpu_to_le16(4); + ksmbd_debug(SMB, "successful in taking lock\n"); + rsp->hdr.Status = STATUS_SUCCESS; + rsp->Reserved = 0; + inc_rfc1001_len(rsp, 4); + ksmbd_fd_put(work, fp); + return 0; + +out: + list_for_each_entry_safe(smb_lock, tmp, &lock_list, llist) { + locks_free_lock(smb_lock->fl); + list_del(&smb_lock->llist); + kfree(smb_lock); + } + + list_for_each_entry_safe(smb_lock, tmp, &rollback_list, llist) { + struct file_lock *rlock = NULL; + + rlock = 
smb_flock_init(filp); + rlock->fl_type = F_UNLCK; + rlock->fl_start = smb_lock->start; + rlock->fl_end = smb_lock->end; + + rc = vfs_lock_file(filp, 0, rlock, NULL); + if (rc) + pr_err("rollback unlock fail : %d\n", rc); + + list_del(&smb_lock->llist); + spin_lock(&work->conn->llist_lock); + if (!list_empty(&smb_lock->flist)) + list_del(&smb_lock->flist); + list_del(&smb_lock->clist); + spin_unlock(&work->conn->llist_lock); + + locks_free_lock(smb_lock->fl); + locks_free_lock(rlock); + kfree(smb_lock); + } +out2: + ksmbd_debug(SMB, "failed in taking lock(flags : %x), err : %d\n", flags, err); + + if (!rsp->hdr.Status) { + if (err == -EINVAL) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + else if (err == -ENOMEM) + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + else if (err == -ENOENT) + rsp->hdr.Status = STATUS_FILE_CLOSED; + else + rsp->hdr.Status = STATUS_LOCK_NOT_GRANTED; + } + + smb2_set_err_rsp(work); + ksmbd_fd_put(work, fp); + return err; +} + +static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, + struct smb2_ioctl_rsp *rsp) +{ + struct copychunk_ioctl_req *ci_req; + struct copychunk_ioctl_rsp *ci_rsp; + struct ksmbd_file *src_fp = NULL, *dst_fp = NULL; + struct srv_copychunk *chunks; + unsigned int i, chunk_count, chunk_count_written = 0; + unsigned int chunk_size_written = 0; + loff_t total_size_written = 0; + int ret, cnt_code; + + cnt_code = le32_to_cpu(req->CntCode); + ci_req = (struct copychunk_ioctl_req *)&req->Buffer[0]; + ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0]; + + rsp->VolatileFileId = req->VolatileFileId; + rsp->PersistentFileId = req->PersistentFileId; + ci_rsp->ChunksWritten = + cpu_to_le32(ksmbd_server_side_copy_max_chunk_count()); + ci_rsp->ChunkBytesWritten = + cpu_to_le32(ksmbd_server_side_copy_max_chunk_size()); + ci_rsp->TotalBytesWritten = + cpu_to_le32(ksmbd_server_side_copy_max_total_size()); + + chunks = (struct srv_copychunk *)&ci_req->Chunks[0]; + chunk_count = le32_to_cpu(ci_req->ChunkCount); + total_size_written = 0; + + /* verify the SRV_COPYCHUNK_COPY packet */ + if (chunk_count > ksmbd_server_side_copy_max_chunk_count() || + le32_to_cpu(req->InputCount) < + offsetof(struct copychunk_ioctl_req, Chunks) + + chunk_count * sizeof(struct srv_copychunk)) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + return -EINVAL; + } + + for (i = 0; i < chunk_count; i++) { + if (le32_to_cpu(chunks[i].Length) == 0 || + le32_to_cpu(chunks[i].Length) > ksmbd_server_side_copy_max_chunk_size()) + break; + total_size_written += le32_to_cpu(chunks[i].Length); + } + + if (i < chunk_count || + total_size_written > ksmbd_server_side_copy_max_total_size()) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + return -EINVAL; + } + + src_fp = ksmbd_lookup_foreign_fd(work, + le64_to_cpu(ci_req->ResumeKey[0])); + dst_fp = ksmbd_lookup_fd_slow(work, + le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId)); + ret = -EINVAL; + if (!src_fp || + src_fp->persistent_id != le64_to_cpu(ci_req->ResumeKey[1])) { + rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND; + goto out; + } + + if (!dst_fp) { + rsp->hdr.Status = STATUS_FILE_CLOSED; + goto out; + } + + /* + * FILE_READ_DATA should only be included in + * the FSCTL_COPYCHUNK case + */ + if (cnt_code == FSCTL_COPYCHUNK && + !(dst_fp->daccess & (FILE_READ_DATA_LE | FILE_GENERIC_READ_LE))) { + rsp->hdr.Status = STATUS_ACCESS_DENIED; + goto out; + } + + ret = ksmbd_vfs_copy_file_ranges(work, src_fp, dst_fp, + chunks, chunk_count, + &chunk_count_written, + &chunk_size_written, + 
&total_size_written); + if (ret < 0) { + if (ret == -EACCES) + rsp->hdr.Status = STATUS_ACCESS_DENIED; + if (ret == -EAGAIN) + rsp->hdr.Status = STATUS_FILE_LOCK_CONFLICT; + else if (ret == -EBADF) + rsp->hdr.Status = STATUS_INVALID_HANDLE; + else if (ret == -EFBIG || ret == -ENOSPC) + rsp->hdr.Status = STATUS_DISK_FULL; + else if (ret == -EINVAL) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + else if (ret == -EISDIR) + rsp->hdr.Status = STATUS_FILE_IS_A_DIRECTORY; + else if (ret == -E2BIG) + rsp->hdr.Status = STATUS_INVALID_VIEW_SIZE; + else + rsp->hdr.Status = STATUS_UNEXPECTED_IO_ERROR; + } + + ci_rsp->ChunksWritten = cpu_to_le32(chunk_count_written); + ci_rsp->ChunkBytesWritten = cpu_to_le32(chunk_size_written); + ci_rsp->TotalBytesWritten = cpu_to_le32(total_size_written); +out: + ksmbd_fd_put(work, src_fp); + ksmbd_fd_put(work, dst_fp); + return ret; +} + +static __be32 idev_ipv4_address(struct in_device *idev) +{ + __be32 addr = 0; + + struct in_ifaddr *ifa; + + rcu_read_lock(); + in_dev_for_each_ifa_rcu(ifa, idev) { + if (ifa->ifa_flags & IFA_F_SECONDARY) + continue; + + addr = ifa->ifa_address; + break; + } + rcu_read_unlock(); + return addr; +} + +static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, + struct smb2_ioctl_req *req, + struct smb2_ioctl_rsp *rsp) +{ + struct network_interface_info_ioctl_rsp *nii_rsp = NULL; + int nbytes = 0; + struct net_device *netdev; + struct sockaddr_storage_rsp *sockaddr_storage; + unsigned int flags; + unsigned long long speed; + struct sockaddr_in6 *csin6 = (struct sockaddr_in6 *)&conn->peer_addr; + + rtnl_lock(); + for_each_netdev(&init_net, netdev) { + if (netdev->type == ARPHRD_LOOPBACK) + continue; + + flags = dev_get_flags(netdev); + if (!(flags & IFF_RUNNING)) + continue; + + nii_rsp = (struct network_interface_info_ioctl_rsp *) + &rsp->Buffer[nbytes]; + nii_rsp->IfIndex = cpu_to_le32(netdev->ifindex); + + nii_rsp->Capability = 0; + if (ksmbd_rdma_capable_netdev(netdev)) + nii_rsp->Capability |= cpu_to_le32(RDMA_CAPABLE); + + nii_rsp->Next = cpu_to_le32(152); + nii_rsp->Reserved = 0; + + if (netdev->ethtool_ops->get_link_ksettings) { + struct ethtool_link_ksettings cmd; + + netdev->ethtool_ops->get_link_ksettings(netdev, &cmd); + speed = cmd.base.speed; + } else { + pr_err("%s %s\n", netdev->name, + "speed is unknown, defaulting to 1Gb/sec"); + speed = SPEED_1000; + } + + speed *= 1000000; + nii_rsp->LinkSpeed = cpu_to_le64(speed); + + sockaddr_storage = (struct sockaddr_storage_rsp *) + nii_rsp->SockAddr_Storage; + memset(sockaddr_storage, 0, 128); + + if (conn->peer_addr.ss_family == PF_INET || + ipv6_addr_v4mapped(&csin6->sin6_addr)) { + struct in_device *idev; + + sockaddr_storage->Family = cpu_to_le16(INTERNETWORK); + sockaddr_storage->addr4.Port = 0; + + idev = __in_dev_get_rtnl(netdev); + if (!idev) + continue; + sockaddr_storage->addr4.IPv4address = + idev_ipv4_address(idev); + } else { + struct inet6_dev *idev6; + struct inet6_ifaddr *ifa; + __u8 *ipv6_addr = sockaddr_storage->addr6.IPv6address; + + sockaddr_storage->Family = cpu_to_le16(INTERNETWORKV6); + sockaddr_storage->addr6.Port = 0; + sockaddr_storage->addr6.FlowInfo = 0; + + idev6 = __in6_dev_get(netdev); + if (!idev6) + continue; + + list_for_each_entry(ifa, &idev6->addr_list, if_list) { + if (ifa->flags & (IFA_F_TENTATIVE | + IFA_F_DEPRECATED)) + continue; + memcpy(ipv6_addr, ifa->addr.s6_addr, 16); + break; + } + sockaddr_storage->addr6.ScopeId = 0; + } + + nbytes += sizeof(struct network_interface_info_ioctl_rsp); + } + rtnl_unlock(); + + /* zero if this 
is last one */ + if (nii_rsp) + nii_rsp->Next = 0; + + if (!nbytes) { + rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL; + return -EINVAL; + } + + rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID); + rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); + return nbytes; +} + +static int fsctl_validate_negotiate_info(struct ksmbd_conn *conn, + struct validate_negotiate_info_req *neg_req, + struct validate_negotiate_info_rsp *neg_rsp) +{ + int ret = 0; + int dialect; + + dialect = ksmbd_lookup_dialect_by_id(neg_req->Dialects, + neg_req->DialectCount); + if (dialect == BAD_PROT_ID || dialect != conn->dialect) { + ret = -EINVAL; + goto err_out; + } + + if (strncmp(neg_req->Guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE)) { + ret = -EINVAL; + goto err_out; + } + + if (le16_to_cpu(neg_req->SecurityMode) != conn->cli_sec_mode) { + ret = -EINVAL; + goto err_out; + } + + if (le32_to_cpu(neg_req->Capabilities) != conn->cli_cap) { + ret = -EINVAL; + goto err_out; + } + + neg_rsp->Capabilities = cpu_to_le32(conn->vals->capabilities); + memset(neg_rsp->Guid, 0, SMB2_CLIENT_GUID_SIZE); + neg_rsp->SecurityMode = cpu_to_le16(conn->srv_sec_mode); + neg_rsp->Dialect = cpu_to_le16(conn->dialect); +err_out: + return ret; +} + +static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id, + struct file_allocated_range_buffer *qar_req, + struct file_allocated_range_buffer *qar_rsp, + int in_count, int *out_count) +{ + struct ksmbd_file *fp; + loff_t start, length; + int ret = 0; + + *out_count = 0; + if (in_count == 0) + return -EINVAL; + + fp = ksmbd_lookup_fd_fast(work, id); + if (!fp) + return -ENOENT; + + start = le64_to_cpu(qar_req->file_offset); + length = le64_to_cpu(qar_req->length); + + ret = ksmbd_vfs_fqar_lseek(fp, start, length, + qar_rsp, in_count, out_count); + if (ret && ret != -E2BIG) + *out_count = 0; + + ksmbd_fd_put(work, fp); + return ret; +} + +static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id, + int out_buf_len, struct smb2_ioctl_req *req, + struct smb2_ioctl_rsp *rsp) +{ + struct ksmbd_rpc_command *rpc_resp; + char *data_buf = (char *)&req->Buffer[0]; + int nbytes = 0; + + rpc_resp = ksmbd_rpc_ioctl(work->sess, id, data_buf, + le32_to_cpu(req->InputCount)); + if (rpc_resp) { + if (rpc_resp->flags == KSMBD_RPC_SOME_NOT_MAPPED) { + /* + * set STATUS_SOME_NOT_MAPPED response + * for unknown domain sid. 
+ */ + rsp->hdr.Status = STATUS_SOME_NOT_MAPPED; + } else if (rpc_resp->flags == KSMBD_RPC_ENOTIMPLEMENTED) { + rsp->hdr.Status = STATUS_NOT_SUPPORTED; + goto out; + } else if (rpc_resp->flags != KSMBD_RPC_OK) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + goto out; + } + + nbytes = rpc_resp->payload_sz; + if (rpc_resp->payload_sz > out_buf_len) { + rsp->hdr.Status = STATUS_BUFFER_OVERFLOW; + nbytes = out_buf_len; + } + + if (!rpc_resp->payload_sz) { + rsp->hdr.Status = + STATUS_UNEXPECTED_IO_ERROR; + goto out; + } + + memcpy((char *)rsp->Buffer, rpc_resp->payload, nbytes); + } +out: + kvfree(rpc_resp); + return nbytes; +} + +static inline int fsctl_set_sparse(struct ksmbd_work *work, u64 id, + struct file_sparse *sparse) +{ + struct ksmbd_file *fp; + struct user_namespace *user_ns; + int ret = 0; + __le32 old_fattr; + + fp = ksmbd_lookup_fd_fast(work, id); + if (!fp) + return -ENOENT; + user_ns = file_mnt_user_ns(fp->filp); + + old_fattr = fp->f_ci->m_fattr; + if (sparse->SetSparse) + fp->f_ci->m_fattr |= ATTR_SPARSE_FILE_LE; + else + fp->f_ci->m_fattr &= ~ATTR_SPARSE_FILE_LE; + + if (fp->f_ci->m_fattr != old_fattr && + test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) { + struct xattr_dos_attrib da; + + ret = ksmbd_vfs_get_dos_attrib_xattr(user_ns, + fp->filp->f_path.dentry, &da); + if (ret <= 0) + goto out; + + da.attr = le32_to_cpu(fp->f_ci->m_fattr); + ret = ksmbd_vfs_set_dos_attrib_xattr(user_ns, + fp->filp->f_path.dentry, &da); + if (ret) + fp->f_ci->m_fattr = old_fattr; + } + +out: + ksmbd_fd_put(work, fp); + return ret; +} + +static int fsctl_request_resume_key(struct ksmbd_work *work, + struct smb2_ioctl_req *req, + struct resume_key_ioctl_rsp *key_rsp) +{ + struct ksmbd_file *fp; + + fp = ksmbd_lookup_fd_slow(work, + le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId)); + if (!fp) + return -ENOENT; + + memset(key_rsp, 0, sizeof(*key_rsp)); + key_rsp->ResumeKey[0] = req->VolatileFileId; + key_rsp->ResumeKey[1] = req->PersistentFileId; + ksmbd_fd_put(work, fp); + + return 0; +} + +/** + * smb2_ioctl() - handler for smb2 ioctl command + * @work: smb work containing ioctl command buffer + * + * Return: 0 on success, otherwise error + */ +int smb2_ioctl(struct ksmbd_work *work) +{ + struct smb2_ioctl_req *req; + struct smb2_ioctl_rsp *rsp, *rsp_org; + int cnt_code, nbytes = 0; + int out_buf_len; + u64 id = KSMBD_NO_FID; + struct ksmbd_conn *conn = work->conn; + int ret = 0; + + rsp_org = work->response_buf; + if (work->next_smb2_rcv_hdr_off) { + req = ksmbd_req_buf_next(work); + rsp = ksmbd_resp_buf_next(work); + if (!has_file_id(le64_to_cpu(req->VolatileFileId))) { + ksmbd_debug(SMB, "Compound request set FID = %llu\n", + work->compound_fid); + id = work->compound_fid; + } + } else { + req = work->request_buf; + rsp = work->response_buf; + } + + if (!has_file_id(id)) + id = le64_to_cpu(req->VolatileFileId); + + if (req->Flags != cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL)) { + rsp->hdr.Status = STATUS_NOT_SUPPORTED; + goto out; + } + + cnt_code = le32_to_cpu(req->CntCode); + out_buf_len = le32_to_cpu(req->MaxOutputResponse); + out_buf_len = min(KSMBD_IPC_MAX_PAYLOAD, out_buf_len); + + switch (cnt_code) { + case FSCTL_DFS_GET_REFERRALS: + case FSCTL_DFS_GET_REFERRALS_EX: + /* Not support DFS yet */ + rsp->hdr.Status = STATUS_FS_DRIVER_REQUIRED; + goto out; + case FSCTL_CREATE_OR_GET_OBJECT_ID: + { + struct file_object_buf_type1_ioctl_rsp *obj_buf; + + nbytes = sizeof(struct file_object_buf_type1_ioctl_rsp); + obj_buf = (struct 
file_object_buf_type1_ioctl_rsp *) + &rsp->Buffer[0]; + + /* + * TODO: This is dummy implementation to pass smbtorture + * Need to check correct response later + */ + memset(obj_buf->ObjectId, 0x0, 16); + memset(obj_buf->BirthVolumeId, 0x0, 16); + memset(obj_buf->BirthObjectId, 0x0, 16); + memset(obj_buf->DomainId, 0x0, 16); + + break; + } + case FSCTL_PIPE_TRANSCEIVE: + nbytes = fsctl_pipe_transceive(work, id, out_buf_len, req, rsp); + break; + case FSCTL_VALIDATE_NEGOTIATE_INFO: + if (conn->dialect < SMB30_PROT_ID) { + ret = -EOPNOTSUPP; + goto out; + } + + ret = fsctl_validate_negotiate_info(conn, + (struct validate_negotiate_info_req *)&req->Buffer[0], + (struct validate_negotiate_info_rsp *)&rsp->Buffer[0]); + if (ret < 0) + goto out; + + nbytes = sizeof(struct validate_negotiate_info_rsp); + rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID); + rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); + break; + case FSCTL_QUERY_NETWORK_INTERFACE_INFO: + nbytes = fsctl_query_iface_info_ioctl(conn, req, rsp); + if (nbytes < 0) + goto out; + break; + case FSCTL_REQUEST_RESUME_KEY: + if (out_buf_len < sizeof(struct resume_key_ioctl_rsp)) { + ret = -EINVAL; + goto out; + } + + ret = fsctl_request_resume_key(work, req, + (struct resume_key_ioctl_rsp *)&rsp->Buffer[0]); + if (ret < 0) + goto out; + rsp->PersistentFileId = req->PersistentFileId; + rsp->VolatileFileId = req->VolatileFileId; + nbytes = sizeof(struct resume_key_ioctl_rsp); + break; + case FSCTL_COPYCHUNK: + case FSCTL_COPYCHUNK_WRITE: + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, + "User does not have write permission\n"); + ret = -EACCES; + goto out; + } + + if (out_buf_len < sizeof(struct copychunk_ioctl_rsp)) { + ret = -EINVAL; + goto out; + } + + nbytes = sizeof(struct copychunk_ioctl_rsp); + fsctl_copychunk(work, req, rsp); + break; + case FSCTL_SET_SPARSE: + ret = fsctl_set_sparse(work, id, + (struct file_sparse *)&req->Buffer[0]); + if (ret < 0) + goto out; + break; + case FSCTL_SET_ZERO_DATA: + { + struct file_zero_data_information *zero_data; + struct ksmbd_file *fp; + loff_t off, len; + + if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { + ksmbd_debug(SMB, + "User does not have write permission\n"); + ret = -EACCES; + goto out; + } + + zero_data = + (struct file_zero_data_information *)&req->Buffer[0]; + + fp = ksmbd_lookup_fd_fast(work, id); + if (!fp) { + ret = -ENOENT; + goto out; + } + + off = le64_to_cpu(zero_data->FileOffset); + len = le64_to_cpu(zero_data->BeyondFinalZero) - off; + + ret = ksmbd_vfs_zero_data(work, fp, off, len); + ksmbd_fd_put(work, fp); + if (ret < 0) + goto out; + break; + } + case FSCTL_QUERY_ALLOCATED_RANGES: + ret = fsctl_query_allocated_ranges(work, id, + (struct file_allocated_range_buffer *)&req->Buffer[0], + (struct file_allocated_range_buffer *)&rsp->Buffer[0], + out_buf_len / + sizeof(struct file_allocated_range_buffer), &nbytes); + if (ret == -E2BIG) { + rsp->hdr.Status = STATUS_BUFFER_OVERFLOW; + } else if (ret < 0) { + nbytes = 0; + goto out; + } + + nbytes *= sizeof(struct file_allocated_range_buffer); + break; + case FSCTL_GET_REPARSE_POINT: + { + struct reparse_data_buffer *reparse_ptr; + struct ksmbd_file *fp; + + reparse_ptr = (struct reparse_data_buffer *)&rsp->Buffer[0]; + fp = ksmbd_lookup_fd_fast(work, id); + if (!fp) { + pr_err("not found fp!!\n"); + ret = -ENOENT; + goto out; + } + + reparse_ptr->ReparseTag = + smb2_get_reparse_tag_special_file(file_inode(fp->filp)->i_mode); + reparse_ptr->ReparseDataLength = 0; + 
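
The FSCTL_GET_REPARSE_POINT case above answers with a tag derived purely from the inode's mode bits and an empty data buffer. The exact mapping lives in smb2_get_reparse_tag_special_file(), which is outside this hunk; the sketch below shows one plausible shape of such a mapping, using tag values from MS-FSCC (the choice of IO_REPARSE_TAG_NFS for special files is an assumption made here for illustration):

	#include <stdint.h>
	#include <sys/stat.h>

	#define IO_REPARSE_TAG_SYMLINK	0xA000000Cu	/* MS-FSCC */
	#define IO_REPARSE_TAG_NFS	0x80000014u	/* MS-FSCC: NFS special files */

	/* Illustrative mapping from POSIX mode bits to a reparse tag. */
	static uint32_t reparse_tag_from_mode(mode_t mode)
	{
		if (S_ISLNK(mode))
			return IO_REPARSE_TAG_SYMLINK;
		if (S_ISFIFO(mode) || S_ISSOCK(mode) ||
		    S_ISCHR(mode) || S_ISBLK(mode))
			return IO_REPARSE_TAG_NFS;
		return 0;	/* regular file/dir: not a reparse point */
	}
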
ksmbd_fd_put(work, fp); + nbytes = sizeof(struct reparse_data_buffer); + break; + } + case FSCTL_DUPLICATE_EXTENTS_TO_FILE: + { + struct ksmbd_file *fp_in, *fp_out = NULL; + struct duplicate_extents_to_file *dup_ext; + loff_t src_off, dst_off, length, cloned; + + dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0]; + + fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle, + dup_ext->PersistentFileHandle); + if (!fp_in) { + pr_err("not found file handle in duplicate extent to file\n"); + ret = -ENOENT; + goto out; + } + + fp_out = ksmbd_lookup_fd_fast(work, id); + if (!fp_out) { + pr_err("not found fp\n"); + ret = -ENOENT; + goto dup_ext_out; + } + + src_off = le64_to_cpu(dup_ext->SourceFileOffset); + dst_off = le64_to_cpu(dup_ext->TargetFileOffset); + length = le64_to_cpu(dup_ext->ByteCount); + cloned = vfs_clone_file_range(fp_in->filp, src_off, fp_out->filp, + dst_off, length, 0); + if (cloned == -EXDEV || cloned == -EOPNOTSUPP) { + ret = -EOPNOTSUPP; + goto dup_ext_out; + } else if (cloned != length) { + cloned = vfs_copy_file_range(fp_in->filp, src_off, + fp_out->filp, dst_off, length, 0); + if (cloned != length) { + if (cloned < 0) + ret = cloned; + else + ret = -EINVAL; + } + } + +dup_ext_out: + ksmbd_fd_put(work, fp_in); + ksmbd_fd_put(work, fp_out); + if (ret < 0) + goto out; + break; + } + default: + ksmbd_debug(SMB, "not implemented yet ioctl command 0x%x\n", + cnt_code); + ret = -EOPNOTSUPP; + goto out; + } + + rsp->CntCode = cpu_to_le32(cnt_code); + rsp->InputCount = cpu_to_le32(0); + rsp->InputOffset = cpu_to_le32(112); + rsp->OutputOffset = cpu_to_le32(112); + rsp->OutputCount = cpu_to_le32(nbytes); + rsp->StructureSize = cpu_to_le16(49); + rsp->Reserved = cpu_to_le16(0); + rsp->Flags = cpu_to_le32(0); + rsp->Reserved2 = cpu_to_le32(0); + inc_rfc1001_len(rsp_org, 48 + nbytes); + + return 0; + +out: + if (ret == -EACCES) + rsp->hdr.Status = STATUS_ACCESS_DENIED; + else if (ret == -ENOENT) + rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND; + else if (ret == -EOPNOTSUPP) + rsp->hdr.Status = STATUS_NOT_SUPPORTED; + else if (ret < 0 || rsp->hdr.Status == 0) + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + smb2_set_err_rsp(work); + return 0; +} + +/** + * smb20_oplock_break_ack() - handler for smb2.0 oplock break command + * @work: smb work containing oplock break command buffer + * + * Return: 0 + */ +static void smb20_oplock_break_ack(struct ksmbd_work *work) +{ + struct smb2_oplock_break *req = work->request_buf; + struct smb2_oplock_break *rsp = work->response_buf; + struct ksmbd_file *fp; + struct oplock_info *opinfo = NULL; + __le32 err = 0; + int ret = 0; + u64 volatile_id, persistent_id; + char req_oplevel = 0, rsp_oplevel = 0; + unsigned int oplock_change_type; + + volatile_id = le64_to_cpu(req->VolatileFid); + persistent_id = le64_to_cpu(req->PersistentFid); + req_oplevel = req->OplockLevel; + ksmbd_debug(OPLOCK, "v_id %llu, p_id %llu request oplock level %d\n", + volatile_id, persistent_id, req_oplevel); + + fp = ksmbd_lookup_fd_slow(work, volatile_id, persistent_id); + if (!fp) { + rsp->hdr.Status = STATUS_FILE_CLOSED; + smb2_set_err_rsp(work); + return; + } + + opinfo = opinfo_get(fp); + if (!opinfo) { + pr_err("unexpected null oplock_info\n"); + rsp->hdr.Status = STATUS_INVALID_OPLOCK_PROTOCOL; + smb2_set_err_rsp(work); + ksmbd_fd_put(work, fp); + return; + } + + if (opinfo->level == SMB2_OPLOCK_LEVEL_NONE) { + rsp->hdr.Status = STATUS_INVALID_OPLOCK_PROTOCOL; + goto err_out; + } + + if (opinfo->op_state == OPLOCK_STATE_NONE) { + ksmbd_debug(SMB, 
"unexpected oplock state 0x%x\n", opinfo->op_state); + rsp->hdr.Status = STATUS_UNSUCCESSFUL; + goto err_out; + } + + if ((opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE || + opinfo->level == SMB2_OPLOCK_LEVEL_BATCH) && + (req_oplevel != SMB2_OPLOCK_LEVEL_II && + req_oplevel != SMB2_OPLOCK_LEVEL_NONE)) { + err = STATUS_INVALID_OPLOCK_PROTOCOL; + oplock_change_type = OPLOCK_WRITE_TO_NONE; + } else if (opinfo->level == SMB2_OPLOCK_LEVEL_II && + req_oplevel != SMB2_OPLOCK_LEVEL_NONE) { + err = STATUS_INVALID_OPLOCK_PROTOCOL; + oplock_change_type = OPLOCK_READ_TO_NONE; + } else if (req_oplevel == SMB2_OPLOCK_LEVEL_II || + req_oplevel == SMB2_OPLOCK_LEVEL_NONE) { + err = STATUS_INVALID_DEVICE_STATE; + if ((opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE || + opinfo->level == SMB2_OPLOCK_LEVEL_BATCH) && + req_oplevel == SMB2_OPLOCK_LEVEL_II) { + oplock_change_type = OPLOCK_WRITE_TO_READ; + } else if ((opinfo->level == SMB2_OPLOCK_LEVEL_EXCLUSIVE || + opinfo->level == SMB2_OPLOCK_LEVEL_BATCH) && + req_oplevel == SMB2_OPLOCK_LEVEL_NONE) { + oplock_change_type = OPLOCK_WRITE_TO_NONE; + } else if (opinfo->level == SMB2_OPLOCK_LEVEL_II && + req_oplevel == SMB2_OPLOCK_LEVEL_NONE) { + oplock_change_type = OPLOCK_READ_TO_NONE; + } else { + oplock_change_type = 0; + } + } else { + oplock_change_type = 0; + } + + switch (oplock_change_type) { + case OPLOCK_WRITE_TO_READ: + ret = opinfo_write_to_read(opinfo); + rsp_oplevel = SMB2_OPLOCK_LEVEL_II; + break; + case OPLOCK_WRITE_TO_NONE: + ret = opinfo_write_to_none(opinfo); + rsp_oplevel = SMB2_OPLOCK_LEVEL_NONE; + break; + case OPLOCK_READ_TO_NONE: + ret = opinfo_read_to_none(opinfo); + rsp_oplevel = SMB2_OPLOCK_LEVEL_NONE; + break; + default: + pr_err("unknown oplock change 0x%x -> 0x%x\n", + opinfo->level, rsp_oplevel); + } + + if (ret < 0) { + rsp->hdr.Status = err; + goto err_out; + } + + opinfo_put(opinfo); + ksmbd_fd_put(work, fp); + opinfo->op_state = OPLOCK_STATE_NONE; + wake_up_interruptible_all(&opinfo->oplock_q); + + rsp->StructureSize = cpu_to_le16(24); + rsp->OplockLevel = rsp_oplevel; + rsp->Reserved = 0; + rsp->Reserved2 = 0; + rsp->VolatileFid = cpu_to_le64(volatile_id); + rsp->PersistentFid = cpu_to_le64(persistent_id); + inc_rfc1001_len(rsp, 24); + return; + +err_out: + opinfo->op_state = OPLOCK_STATE_NONE; + wake_up_interruptible_all(&opinfo->oplock_q); + + opinfo_put(opinfo); + ksmbd_fd_put(work, fp); + smb2_set_err_rsp(work); +} + +static int check_lease_state(struct lease *lease, __le32 req_state) +{ + if ((lease->new_state == + (SMB2_LEASE_READ_CACHING_LE | SMB2_LEASE_HANDLE_CACHING_LE)) && + !(req_state & SMB2_LEASE_WRITE_CACHING_LE)) { + lease->new_state = req_state; + return 0; + } + + if (lease->new_state == req_state) + return 0; + + return 1; +} + +/** + * smb21_lease_break_ack() - handler for smb2.1 lease break command + * @work: smb work containing lease break command buffer + * + * Return: 0 + */ +static void smb21_lease_break_ack(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_lease_ack *req = work->request_buf; + struct smb2_lease_ack *rsp = work->response_buf; + struct oplock_info *opinfo; + __le32 err = 0; + int ret = 0; + unsigned int lease_change_type; + __le32 lease_state; + struct lease *lease; + + ksmbd_debug(OPLOCK, "smb21 lease break, lease state(0x%x)\n", + le32_to_cpu(req->LeaseState)); + opinfo = lookup_lease_in_table(conn, req->LeaseKey); + if (!opinfo) { + ksmbd_debug(OPLOCK, "file not opened\n"); + smb2_set_err_rsp(work); + rsp->hdr.Status = STATUS_UNSUCCESSFUL; + return; + } + 
lease = opinfo->o_lease; + + if (opinfo->op_state == OPLOCK_STATE_NONE) { + pr_err("unexpected lease break state 0x%x\n", + opinfo->op_state); + rsp->hdr.Status = STATUS_UNSUCCESSFUL; + goto err_out; + } + + if (check_lease_state(lease, req->LeaseState)) { + rsp->hdr.Status = STATUS_REQUEST_NOT_ACCEPTED; + ksmbd_debug(OPLOCK, + "req lease state: 0x%x, expected state: 0x%x\n", + req->LeaseState, lease->new_state); + goto err_out; + } + + if (!atomic_read(&opinfo->breaking_cnt)) { + rsp->hdr.Status = STATUS_UNSUCCESSFUL; + goto err_out; + } + + /* check for bad lease state */ + if (req->LeaseState & + (~(SMB2_LEASE_READ_CACHING_LE | SMB2_LEASE_HANDLE_CACHING_LE))) { + err = STATUS_INVALID_OPLOCK_PROTOCOL; + if (lease->state & SMB2_LEASE_WRITE_CACHING_LE) + lease_change_type = OPLOCK_WRITE_TO_NONE; + else + lease_change_type = OPLOCK_READ_TO_NONE; + ksmbd_debug(OPLOCK, "handle bad lease state 0x%x -> 0x%x\n", + le32_to_cpu(lease->state), + le32_to_cpu(req->LeaseState)); + } else if (lease->state == SMB2_LEASE_READ_CACHING_LE && + req->LeaseState != SMB2_LEASE_NONE_LE) { + err = STATUS_INVALID_OPLOCK_PROTOCOL; + lease_change_type = OPLOCK_READ_TO_NONE; + ksmbd_debug(OPLOCK, "handle bad lease state 0x%x -> 0x%x\n", + le32_to_cpu(lease->state), + le32_to_cpu(req->LeaseState)); + } else { + /* valid lease state changes */ + err = STATUS_INVALID_DEVICE_STATE; + if (req->LeaseState == SMB2_LEASE_NONE_LE) { + if (lease->state & SMB2_LEASE_WRITE_CACHING_LE) + lease_change_type = OPLOCK_WRITE_TO_NONE; + else + lease_change_type = OPLOCK_READ_TO_NONE; + } else if (req->LeaseState & SMB2_LEASE_READ_CACHING_LE) { + if (lease->state & SMB2_LEASE_WRITE_CACHING_LE) + lease_change_type = OPLOCK_WRITE_TO_READ; + else + lease_change_type = OPLOCK_READ_HANDLE_TO_READ; + } else { + lease_change_type = 0; + } + } + + switch (lease_change_type) { + case OPLOCK_WRITE_TO_READ: + ret = opinfo_write_to_read(opinfo); + break; + case OPLOCK_READ_HANDLE_TO_READ: + ret = opinfo_read_handle_to_read(opinfo); + break; + case OPLOCK_WRITE_TO_NONE: + ret = opinfo_write_to_none(opinfo); + break; + case OPLOCK_READ_TO_NONE: + ret = opinfo_read_to_none(opinfo); + break; + default: + ksmbd_debug(OPLOCK, "unknown lease change 0x%x -> 0x%x\n", + le32_to_cpu(lease->state), + le32_to_cpu(req->LeaseState)); + } + + lease_state = lease->state; + opinfo->op_state = OPLOCK_STATE_NONE; + wake_up_interruptible_all(&opinfo->oplock_q); + atomic_dec(&opinfo->breaking_cnt); + wake_up_interruptible_all(&opinfo->oplock_brk); + opinfo_put(opinfo); + + if (ret < 0) { + rsp->hdr.Status = err; + goto err_out; + } + + rsp->StructureSize = cpu_to_le16(36); + rsp->Reserved = 0; + rsp->Flags = 0; + memcpy(rsp->LeaseKey, req->LeaseKey, 16); + rsp->LeaseState = lease_state; + rsp->LeaseDuration = 0; + inc_rfc1001_len(rsp, 36); + return; + +err_out: + opinfo->op_state = OPLOCK_STATE_NONE; + wake_up_interruptible_all(&opinfo->oplock_q); + atomic_dec(&opinfo->breaking_cnt); + wake_up_interruptible_all(&opinfo->oplock_brk); + + opinfo_put(opinfo); + smb2_set_err_rsp(work); +} + +/** + * smb2_oplock_break() - dispatcher for smb2.0 and 2.1 oplock/lease break + * @work: smb work containing oplock/lease break command buffer + * + * Return: 0 + */ +int smb2_oplock_break(struct ksmbd_work *work) +{ + struct smb2_oplock_break *req = work->request_buf; + struct smb2_oplock_break *rsp = work->response_buf; + + switch (le16_to_cpu(req->StructureSize)) { + case OP_BREAK_STRUCT_SIZE_20: + smb20_oplock_break_ack(work); + break; + case OP_BREAK_STRUCT_SIZE_21: + 
smb21_lease_break_ack(work);
+		break;
+	default:
+		ksmbd_debug(OPLOCK, "invalid break cmd %d\n",
+			    le16_to_cpu(req->StructureSize));
+		rsp->hdr.Status = STATUS_INVALID_PARAMETER;
+		smb2_set_err_rsp(work);
+	}
+
+	return 0;
+}
+
+/**
+ * smb2_notify() - handler for smb2 notify request
+ * @work:   smb work containing notify command buffer
+ *
+ * Return:	0
+ */
+int smb2_notify(struct ksmbd_work *work)
+{
+	struct smb2_notify_req *req;
+	struct smb2_notify_rsp *rsp;
+
+	WORK_BUFFERS(work, req, rsp);
+
+	if (work->next_smb2_rcv_hdr_off && req->hdr.NextCommand) {
+		rsp->hdr.Status = STATUS_INTERNAL_ERROR;
+		smb2_set_err_rsp(work);
+		return 0;
+	}
+
+	smb2_set_err_rsp(work);
+	rsp->hdr.Status = STATUS_NOT_IMPLEMENTED;
+	return 0;
+}
+
+/**
+ * smb2_is_sign_req() - handler for checking packet signing status
+ * @work:	smb work containing notify command buffer
+ * @command:	SMB2 command id
+ *
+ * Return:	true if packet is signed, false otherwise
+ */
+bool smb2_is_sign_req(struct ksmbd_work *work, unsigned int command)
+{
+	struct smb2_hdr *rcv_hdr2 = work->request_buf;
+
+	if ((rcv_hdr2->Flags & SMB2_FLAGS_SIGNED) &&
+	    command != SMB2_NEGOTIATE_HE &&
+	    command != SMB2_SESSION_SETUP_HE &&
+	    command != SMB2_OPLOCK_BREAK_HE)
+		return true;
+
+	return false;
+}
+
+/**
+ * smb2_check_sign_req() - handler for req packet sign processing
+ * @work:   smb work containing notify command buffer
+ *
+ * Return:	1 on success, 0 otherwise
+ */
+int smb2_check_sign_req(struct ksmbd_work *work)
+{
+	struct smb2_hdr *hdr, *hdr_org;
+	char signature_req[SMB2_SIGNATURE_SIZE];
+	char signature[SMB2_HMACSHA256_SIZE];
+	struct kvec iov[1];
+	size_t len;
+
+	hdr_org = hdr = work->request_buf;
+	if (work->next_smb2_rcv_hdr_off)
+		hdr = ksmbd_req_buf_next(work);
+
+	if (!hdr->NextCommand && !work->next_smb2_rcv_hdr_off)
+		len = be32_to_cpu(hdr_org->smb2_buf_length);
+	else if (hdr->NextCommand)
+		len = le32_to_cpu(hdr->NextCommand);
+	else
+		len = be32_to_cpu(hdr_org->smb2_buf_length) -
+		      work->next_smb2_rcv_hdr_off;
+
+	memcpy(signature_req, hdr->Signature, SMB2_SIGNATURE_SIZE);
+	memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
+
+	iov[0].iov_base = (char *)&hdr->ProtocolId;
+	iov[0].iov_len = len;
+
+	if (ksmbd_sign_smb2_pdu(work->conn, work->sess->sess_key, iov, 1,
+				signature))
+		return 0;
+
+	if (memcmp(signature, signature_req, SMB2_SIGNATURE_SIZE)) {
+		pr_err("bad smb2 signature\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+/**
+ * smb2_set_sign_rsp() - handler for rsp packet sign processing
+ * @work:   smb work containing notify command buffer
+ *
+ */
+void smb2_set_sign_rsp(struct ksmbd_work *work)
+{
+	struct smb2_hdr *hdr, *hdr_org;
+	struct smb2_hdr *req_hdr;
+	char signature[SMB2_HMACSHA256_SIZE];
+	struct kvec iov[2];
+	size_t len;
+	int n_vec = 1;
+
+	hdr_org = hdr = work->response_buf;
+	if (work->next_smb2_rsp_hdr_off)
+		hdr = ksmbd_resp_buf_next(work);
+
+	req_hdr = ksmbd_req_buf_next(work);
+
+	if (!work->next_smb2_rsp_hdr_off) {
+		len = get_rfc1002_len(hdr_org);
+		if (req_hdr->NextCommand)
+			len = ALIGN(len, 8);
+	} else {
+		len = get_rfc1002_len(hdr_org) - work->next_smb2_rsp_hdr_off;
+		len = ALIGN(len, 8);
+	}
+
+	if (req_hdr->NextCommand)
+		hdr->NextCommand = cpu_to_le32(len);
+
+	hdr->Flags |= SMB2_FLAGS_SIGNED;
+	memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE);
+
+	iov[0].iov_base = (char *)&hdr->ProtocolId;
+	iov[0].iov_len = len;
+
+	if (work->aux_payload_sz) {
+		iov[0].iov_len -= work->aux_payload_sz;
+
+		iov[1].iov_base = work->aux_payload_buf;
+		iov[1].iov_len = work->aux_payload_sz;
+		n_vec++;
+	}
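
Both signing paths follow the same zero-then-sign pattern visible above: clear the 16-byte Signature field, MAC the PDU from ProtocolId onward (plus any auxiliary payload vector), then copy the leading 16 bytes of the MAC back in. Schematically, for the SMB2 HMAC-SHA256 case (hmac_sha256() is a stand-in prototype for whichever MAC implementation is linked in; SMB2 keeps only 16 of the 32 output bytes):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	#define SMB2_SIG_SIZE	16
	#define SMB2_SIG_OFFSET	48	/* Signature offset from ProtocolId */

	/* Stand-in prototype; any HMAC-SHA256 implementation fits. */
	void hmac_sha256(const uint8_t *key, size_t key_len,
			 const uint8_t *msg, size_t msg_len, uint8_t out[32]);

	/* pdu points at ProtocolId, i.e. past the 4-byte NetBIOS length. */
	static void smb2_sign_pdu(uint8_t *pdu, size_t len,
				  const uint8_t *key, size_t key_len)
	{
		uint8_t mac[32];

		memset(pdu + SMB2_SIG_OFFSET, 0, SMB2_SIG_SIZE);
		hmac_sha256(key, key_len, pdu, len, mac);
		memcpy(pdu + SMB2_SIG_OFFSET, mac, SMB2_SIG_SIZE);
	}
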
+ + if (!ksmbd_sign_smb2_pdu(work->conn, work->sess->sess_key, iov, n_vec, + signature)) + memcpy(hdr->Signature, signature, SMB2_SIGNATURE_SIZE); +} + +/** + * smb3_check_sign_req() - handler for req packet sign processing + * @work: smb work containing notify command buffer + * + * Return: 1 on success, 0 otherwise + */ +int smb3_check_sign_req(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + char *signing_key; + struct smb2_hdr *hdr, *hdr_org; + struct channel *chann; + char signature_req[SMB2_SIGNATURE_SIZE]; + char signature[SMB2_CMACAES_SIZE]; + struct kvec iov[1]; + size_t len; + + hdr_org = hdr = work->request_buf; + if (work->next_smb2_rcv_hdr_off) + hdr = ksmbd_req_buf_next(work); + + if (!hdr->NextCommand && !work->next_smb2_rcv_hdr_off) + len = be32_to_cpu(hdr_org->smb2_buf_length); + else if (hdr->NextCommand) + len = le32_to_cpu(hdr->NextCommand); + else + len = be32_to_cpu(hdr_org->smb2_buf_length) - + work->next_smb2_rcv_hdr_off; + + if (le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { + signing_key = work->sess->smb3signingkey; + } else { + chann = lookup_chann_list(work->sess, conn); + if (!chann) + return 0; + signing_key = chann->smb3signingkey; + } + + if (!signing_key) { + pr_err("SMB3 signing key is not generated\n"); + return 0; + } + + memcpy(signature_req, hdr->Signature, SMB2_SIGNATURE_SIZE); + memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE); + iov[0].iov_base = (char *)&hdr->ProtocolId; + iov[0].iov_len = len; + + if (ksmbd_sign_smb3_pdu(conn, signing_key, iov, 1, signature)) + return 0; + + if (memcmp(signature, signature_req, SMB2_SIGNATURE_SIZE)) { + pr_err("bad smb2 signature\n"); + return 0; + } + + return 1; +} + +/** + * smb3_set_sign_rsp() - handler for rsp packet sign processing + * @work: smb work containing notify command buffer + * + */ +void smb3_set_sign_rsp(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_hdr *req_hdr; + struct smb2_hdr *hdr, *hdr_org; + struct channel *chann; + char signature[SMB2_CMACAES_SIZE]; + struct kvec iov[2]; + int n_vec = 1; + size_t len; + char *signing_key; + + hdr_org = hdr = work->response_buf; + if (work->next_smb2_rsp_hdr_off) + hdr = ksmbd_resp_buf_next(work); + + req_hdr = ksmbd_req_buf_next(work); + + if (!work->next_smb2_rsp_hdr_off) { + len = get_rfc1002_len(hdr_org); + if (req_hdr->NextCommand) + len = ALIGN(len, 8); + } else { + len = get_rfc1002_len(hdr_org) - work->next_smb2_rsp_hdr_off; + len = ALIGN(len, 8); + } + + if (conn->binding == false && + le16_to_cpu(hdr->Command) == SMB2_SESSION_SETUP_HE) { + signing_key = work->sess->smb3signingkey; + } else { + chann = lookup_chann_list(work->sess, work->conn); + if (!chann) + return; + signing_key = chann->smb3signingkey; + } + + if (!signing_key) + return; + + if (req_hdr->NextCommand) + hdr->NextCommand = cpu_to_le32(len); + + hdr->Flags |= SMB2_FLAGS_SIGNED; + memset(hdr->Signature, 0, SMB2_SIGNATURE_SIZE); + iov[0].iov_base = (char *)&hdr->ProtocolId; + iov[0].iov_len = len; + if (work->aux_payload_sz) { + iov[0].iov_len -= work->aux_payload_sz; + iov[1].iov_base = work->aux_payload_buf; + iov[1].iov_len = work->aux_payload_sz; + n_vec++; + } + + if (!ksmbd_sign_smb3_pdu(conn, signing_key, iov, n_vec, signature)) + memcpy(hdr->Signature, signature, SMB2_SIGNATURE_SIZE); +} + +/** + * smb3_preauth_hash_rsp() - handler for computing preauth hash on response + * @work: smb work containing response buffer + * + */ +void smb3_preauth_hash_rsp(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = 
work->conn; + struct ksmbd_session *sess = work->sess; + struct smb2_hdr *req, *rsp; + + if (conn->dialect != SMB311_PROT_ID) + return; + + WORK_BUFFERS(work, req, rsp); + + if (le16_to_cpu(req->Command) == SMB2_NEGOTIATE_HE) + ksmbd_gen_preauth_integrity_hash(conn, (char *)rsp, + conn->preauth_info->Preauth_HashValue); + + if (le16_to_cpu(rsp->Command) == SMB2_SESSION_SETUP_HE && sess) { + __u8 *hash_value; + + if (conn->binding) { + struct preauth_session *preauth_sess; + + preauth_sess = ksmbd_preauth_session_lookup(conn, sess->id); + if (!preauth_sess) + return; + hash_value = preauth_sess->Preauth_HashValue; + } else { + hash_value = sess->Preauth_HashValue; + if (!hash_value) + return; + } + ksmbd_gen_preauth_integrity_hash(conn, (char *)rsp, + hash_value); + } +} + +static void fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, char *old_buf, + __le16 cipher_type) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)old_buf; + unsigned int orig_len = get_rfc1002_len(old_buf); + + memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr)); + tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; + tr_hdr->OriginalMessageSize = cpu_to_le32(orig_len); + tr_hdr->Flags = cpu_to_le16(0x01); + if (cipher_type == SMB2_ENCRYPTION_AES128_GCM || + cipher_type == SMB2_ENCRYPTION_AES256_GCM) + get_random_bytes(&tr_hdr->Nonce, SMB3_AES_GCM_NONCE); + else + get_random_bytes(&tr_hdr->Nonce, SMB3_AES_CCM_NONCE); + memcpy(&tr_hdr->SessionId, &hdr->SessionId, 8); + inc_rfc1001_len(tr_hdr, sizeof(struct smb2_transform_hdr) - 4); + inc_rfc1001_len(tr_hdr, orig_len); +} + +int smb3_encrypt_resp(struct ksmbd_work *work) +{ + char *buf = work->response_buf; + struct smb2_transform_hdr *tr_hdr; + struct kvec iov[3]; + int rc = -ENOMEM; + int buf_size = 0, rq_nvec = 2 + (work->aux_payload_sz ? 
1 : 0); + + if (ARRAY_SIZE(iov) < rq_nvec) + return -ENOMEM; + + tr_hdr = kzalloc(sizeof(struct smb2_transform_hdr), GFP_KERNEL); + if (!tr_hdr) + return rc; + + /* fill transform header */ + fill_transform_hdr(tr_hdr, buf, work->conn->cipher_type); + + iov[0].iov_base = tr_hdr; + iov[0].iov_len = sizeof(struct smb2_transform_hdr); + buf_size += iov[0].iov_len - 4; + + iov[1].iov_base = buf + 4; + iov[1].iov_len = get_rfc1002_len(buf); + if (work->aux_payload_sz) { + iov[1].iov_len = work->resp_hdr_sz - 4; + + iov[2].iov_base = work->aux_payload_buf; + iov[2].iov_len = work->aux_payload_sz; + buf_size += iov[2].iov_len; + } + buf_size += iov[1].iov_len; + work->resp_hdr_sz = iov[1].iov_len; + + rc = ksmbd_crypt_message(work->conn, iov, rq_nvec, 1); + if (rc) + return rc; + + memmove(buf, iov[1].iov_base, iov[1].iov_len); + tr_hdr->smb2_buf_length = cpu_to_be32(buf_size); + work->tr_buf = tr_hdr; + + return rc; +} + +bool smb3_is_transform_hdr(void *buf) +{ + struct smb2_transform_hdr *trhdr = buf; + + return trhdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM; +} + +int smb3_decrypt_req(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct ksmbd_session *sess; + char *buf = work->request_buf; + struct smb2_hdr *hdr; + unsigned int pdu_length = get_rfc1002_len(buf); + struct kvec iov[2]; + unsigned int buf_data_size = pdu_length + 4 - + sizeof(struct smb2_transform_hdr); + struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; + unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); + int rc = 0; + + sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId)); + if (!sess) { + pr_err("invalid session id(%llx) in transform header\n", + le64_to_cpu(tr_hdr->SessionId)); + return -ECONNABORTED; + } + + if (pdu_length + 4 < + sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_hdr)) { + pr_err("Transform message is too small (%u)\n", + pdu_length); + return -ECONNABORTED; + } + + if (pdu_length + 4 < orig_len + sizeof(struct smb2_transform_hdr)) { + pr_err("Transform message is broken\n"); + return -ECONNABORTED; + } + + iov[0].iov_base = buf; + iov[0].iov_len = sizeof(struct smb2_transform_hdr); + iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr); + iov[1].iov_len = buf_data_size; + rc = ksmbd_crypt_message(conn, iov, 2, 0); + if (rc) + return rc; + + memmove(buf + 4, iov[1].iov_base, buf_data_size); + hdr = (struct smb2_hdr *)buf; + hdr->smb2_buf_length = cpu_to_be32(buf_data_size); + + return rc; +} + +bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + struct smb2_hdr *rsp = work->response_buf; + + if (conn->dialect < SMB30_PROT_ID) + return false; + + if (work->next_smb2_rcv_hdr_off) + rsp = ksmbd_resp_buf_next(work); + + if (le16_to_cpu(rsp->Command) == SMB2_SESSION_SETUP_HE && + rsp->Status == STATUS_SUCCESS) + return true; + return false; +} diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h new file mode 100644 index 000000000000..bcec845b03f3 --- /dev/null +++ b/fs/ksmbd/smb2pdu.h @@ -0,0 +1,1698 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#ifndef _SMB2PDU_H +#define _SMB2PDU_H + +#include "ntlmssp.h" +#include "smbacl.h" + +/* + * Note that, due to trying to use names similar to the protocol specifications, + * there are many mixed case field names in the structures below. 
Although
+ * this does not match typical Linux kernel style, it is necessary to be
+ * able to match against the protocol specification.
+ *
+ * SMB2 commands
+ * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses
+ * (ie no useful data other than the SMB error code itself) and are marked such.
+ * Knowing this helps avoid response buffer allocations and copy in some cases.
+ */
+
+/* List of commands in host endian */
+#define SMB2_NEGOTIATE_HE	0x0000
+#define SMB2_SESSION_SETUP_HE	0x0001
+#define SMB2_LOGOFF_HE		0x0002 /* trivial request/resp */
+#define SMB2_TREE_CONNECT_HE	0x0003
+#define SMB2_TREE_DISCONNECT_HE	0x0004 /* trivial req/resp */
+#define SMB2_CREATE_HE		0x0005
+#define SMB2_CLOSE_HE		0x0006
+#define SMB2_FLUSH_HE		0x0007 /* trivial resp */
+#define SMB2_READ_HE		0x0008
+#define SMB2_WRITE_HE		0x0009
+#define SMB2_LOCK_HE		0x000A
+#define SMB2_IOCTL_HE		0x000B
+#define SMB2_CANCEL_HE		0x000C
+#define SMB2_ECHO_HE		0x000D
+#define SMB2_QUERY_DIRECTORY_HE	0x000E
+#define SMB2_CHANGE_NOTIFY_HE	0x000F
+#define SMB2_QUERY_INFO_HE	0x0010
+#define SMB2_SET_INFO_HE	0x0011
+#define SMB2_OPLOCK_BREAK_HE	0x0012
+
+/* The same list in little endian */
+#define SMB2_NEGOTIATE		cpu_to_le16(SMB2_NEGOTIATE_HE)
+#define SMB2_SESSION_SETUP	cpu_to_le16(SMB2_SESSION_SETUP_HE)
+#define SMB2_LOGOFF		cpu_to_le16(SMB2_LOGOFF_HE)
+#define SMB2_TREE_CONNECT	cpu_to_le16(SMB2_TREE_CONNECT_HE)
+#define SMB2_TREE_DISCONNECT	cpu_to_le16(SMB2_TREE_DISCONNECT_HE)
+#define SMB2_CREATE		cpu_to_le16(SMB2_CREATE_HE)
+#define SMB2_CLOSE		cpu_to_le16(SMB2_CLOSE_HE)
+#define SMB2_FLUSH		cpu_to_le16(SMB2_FLUSH_HE)
+#define SMB2_READ		cpu_to_le16(SMB2_READ_HE)
+#define SMB2_WRITE		cpu_to_le16(SMB2_WRITE_HE)
+#define SMB2_LOCK		cpu_to_le16(SMB2_LOCK_HE)
+#define SMB2_IOCTL		cpu_to_le16(SMB2_IOCTL_HE)
+#define SMB2_CANCEL		cpu_to_le16(SMB2_CANCEL_HE)
+#define SMB2_ECHO		cpu_to_le16(SMB2_ECHO_HE)
+#define SMB2_QUERY_DIRECTORY	cpu_to_le16(SMB2_QUERY_DIRECTORY_HE)
+#define SMB2_CHANGE_NOTIFY	cpu_to_le16(SMB2_CHANGE_NOTIFY_HE)
+#define SMB2_QUERY_INFO		cpu_to_le16(SMB2_QUERY_INFO_HE)
+#define SMB2_SET_INFO		cpu_to_le16(SMB2_SET_INFO_HE)
+#define SMB2_OPLOCK_BREAK	cpu_to_le16(SMB2_OPLOCK_BREAK_HE)
+
+/* Create Action Flags */
+#define FILE_SUPERSEDED		0x00000000
+#define FILE_OPENED		0x00000001
+#define FILE_CREATED		0x00000002
+#define FILE_OVERWRITTEN	0x00000003
+
+/*
+ * Size of the session key (crypto key encrypted with the password)
+ */
+#define SMB2_NTLMV2_SESSKEY_SIZE	16
+#define SMB2_SIGNATURE_SIZE		16
+#define SMB2_HMACSHA256_SIZE		32
+#define SMB2_CMACAES_SIZE		16
+#define SMB3_GCM128_CRYPTKEY_SIZE	16
+#define SMB3_GCM256_CRYPTKEY_SIZE	32
+
+/*
+ * Size of the smb3 encryption/decryption keys
+ */
+#define SMB3_ENC_DEC_KEY_SIZE		32
+
+/*
+ * Size of the smb3 signing key
+ */
+#define SMB3_SIGN_KEY_SIZE		16
+
+#define CIFS_CLIENT_CHALLENGE_SIZE	8
+#define SMB_SERVER_CHALLENGE_SIZE	8
+
+/* SMB2 Max Credits */
+#define SMB2_MAX_CREDITS		8192
+
+#define SMB2_CLIENT_GUID_SIZE		16
+#define SMB2_CREATE_GUID_SIZE		16
+
+/* Maximum buffer size value we can send with 1 credit */
+#define SMB2_MAX_BUFFER_SIZE		65536
+
+#define NUMBER_OF_SMB2_COMMANDS		0x0013
+
+/* BB FIXME - analyze following length BB */
+#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
+
+#define SMB2_PROTO_NUMBER	cpu_to_le32(0x424d53fe) /* 'B''M''S' */
+#define SMB2_TRANSFORM_PROTO_NUM	cpu_to_le32(0x424d53fd)
+
+#define SMB21_DEFAULT_IOSIZE	(1024 * 1024)
+#define SMB3_DEFAULT_IOSIZE	(4 * 1024 * 1024)
+#define SMB3_DEFAULT_TRANS_SIZE	(1024 *
1024) + +/* + * SMB2 Header Definition + * + * "MBZ" : Must be Zero + * "BB" : BugBug, Something to check/review/analyze later + * "PDU" : "Protocol Data Unit" (ie a network "frame") + * + */ + +#define __SMB2_HEADER_STRUCTURE_SIZE 64 +#define SMB2_HEADER_STRUCTURE_SIZE \ + cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE) + +struct smb2_hdr { + __be32 smb2_buf_length; /* big endian on wire */ + /* + * length is only two or three bytes - with + * one or two byte type preceding it that MBZ + */ + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ + __le16 StructureSize; /* 64 */ + __le16 CreditCharge; /* MBZ */ + __le32 Status; /* Error from server */ + __le16 Command; + __le16 CreditRequest; /* CreditResponse */ + __le32 Flags; + __le32 NextCommand; + __le64 MessageId; + union { + struct { + __le32 ProcessId; + __le32 TreeId; + } __packed SyncId; + __le64 AsyncId; + } __packed Id; + __le64 SessionId; + __u8 Signature[16]; +} __packed; + +struct smb2_pdu { + struct smb2_hdr hdr; + __le16 StructureSize2; /* size of wct area (varies, request specific) */ +} __packed; + +#define SMB3_AES_CCM_NONCE 11 +#define SMB3_AES_GCM_NONCE 12 + +struct smb2_transform_hdr { + __be32 smb2_buf_length; /* big endian on wire */ + /* + * length is only two or three bytes - with + * one or two byte type preceding it that MBZ + */ + __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ + __u8 Signature[16]; + __u8 Nonce[16]; + __le32 OriginalMessageSize; + __u16 Reserved1; + __le16 Flags; /* EncryptionAlgorithm */ + __le64 SessionId; +} __packed; + +/* + * SMB2 flag definitions + */ +#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) +#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) +#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) +#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) +#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) +#define SMB2_FLAGS_REPLAY_OPERATIONS cpu_to_le32(0x20000000) + +/* + * Definitions for SMB2 Protocol Data Units (network frames) + * + * See MS-SMB2.PDF specification for protocol details. + * The Naming convention is the lower case version of the SMB2 + * command code name for the struct. Note that structures must be packed. + * + */ + +#define SMB2_ERROR_STRUCTURE_SIZE2 9 +#define SMB2_ERROR_STRUCTURE_SIZE2_LE cpu_to_le16(SMB2_ERROR_STRUCTURE_SIZE2) + +struct smb2_err_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; + __u8 ErrorContextCount; + __u8 Reserved; + __le32 ByteCount; /* even if zero, at least one byte follows */ + __u8 ErrorData[1]; /* variable length */ +} __packed; + +struct smb2_negotiate_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 DialectCount; + __le16 SecurityMode; + __le16 Reserved; /* MBZ */ + __le32 Capabilities; + __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; + /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ + __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ + __le16 NegotiateContextCount; /* SMB3.1.1 only. 
MBZ earlier */ + __le16 Reserved2; + __le16 Dialects[1]; /* One dialect (vers=) at a time for now */ +} __packed; + +/* SecurityMode flags */ +#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE cpu_to_le16(0x0001) +#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 +#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002) +/* Capabilities flags */ +#define SMB2_GLOBAL_CAP_DFS 0x00000001 +#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_LARGE_MTU 0x00000004 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ +/* Internal types */ +#define SMB2_NT_FIND 0x00100000 +#define SMB2_LARGE_FILES 0x00200000 + +#define SMB311_SALT_SIZE 32 +/* Hash Algorithm Types */ +#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) + +#define PREAUTH_HASHVALUE_SIZE 64 + +struct preauth_integrity_info { + /* PreAuth integrity Hash ID */ + __le16 Preauth_HashId; + /* PreAuth integrity Hash Value */ + __u8 Preauth_HashValue[PREAUTH_HASHVALUE_SIZE]; +}; + +/* offset is sizeof smb2_negotiate_rsp - 4 but rounded up to 8 bytes. */ +#ifdef CONFIG_SMB_SERVER_KERBEROS5 +/* sizeof(struct smb2_negotiate_rsp) - 4 = + * header(64) + response(64) + GSS_LENGTH(96) + GSS_PADDING(0) + */ +#define OFFSET_OF_NEG_CONTEXT 0xe0 +#else +/* sizeof(struct smb2_negotiate_rsp) - 4 = + * header(64) + response(64) + GSS_LENGTH(74) + GSS_PADDING(6) + */ +#define OFFSET_OF_NEG_CONTEXT 0xd0 +#endif + +#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) +#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) +#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) +#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) +#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) +#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) + +struct smb2_neg_context { + __le16 ContextType; + __le16 DataLength; + __le32 Reserved; + /* Followed by array of data */ +} __packed; + +struct smb2_preauth_neg_context { + __le16 ContextType; /* 1 */ + __le16 DataLength; + __le32 Reserved; + __le16 HashAlgorithmCount; /* 1 */ + __le16 SaltLength; + __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ + __u8 Salt[SMB311_SALT_SIZE]; +} __packed; + +/* Encryption Algorithms Ciphers */ +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) +#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) +#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) + +struct smb2_encryption_neg_context { + __le16 ContextType; /* 2 */ + __le16 DataLength; + __le32 Reserved; + /* CipherCount usually 2, but can be 3 when AES256-GCM enabled */ + __le16 CipherCount; /* AES-128-GCM and AES-128-CCM by default */ + __le16 Ciphers[]; +} __packed; + +#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000) +#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001) +#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002) +#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) + +struct smb2_compression_ctx { + __le16 ContextType; /* 3 */ + __le16 DataLength; + __le32 Reserved; + __le16 CompressionAlgorithmCount; + __u16 Padding; + __le32 Reserved1; + __le16 CompressionAlgorithms[]; +} __packed; + +#define POSIX_CTXT_DATA_LEN 16 +struct smb2_posix_neg_context { + __le16 ContextType; /* 0x100 */ + __le16 DataLength; + __le32 Reserved; + __u8 Name[16]; /* POSIX
ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */ +} __packed; + +struct smb2_netname_neg_context { + __le16 ContextType; /* 5 */ + __le16 DataLength; + __le32 Reserved; + __le16 NetName[]; /* hostname of target converted to UCS-2 */ +} __packed; + +/* Signing algorithms */ +#define SIGNING_ALG_HMAC_SHA256 cpu_to_le16(0) +#define SIGNING_ALG_AES_CMAC cpu_to_le16(1) +#define SIGNING_ALG_AES_GMAC cpu_to_le16(2) + +struct smb2_signing_capabilities { + __le16 ContextType; /* 8 */ + __le16 DataLength; + __le32 Reserved; + __le16 SigningAlgorithmCount; + __le16 SigningAlgorithms[]; +} __packed; + +struct smb2_negotiate_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 65 */ + __le16 SecurityMode; + __le16 DialectRevision; + __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ + __u8 ServerGUID[16]; + __le32 Capabilities; + __le32 MaxTransactSize; + __le32 MaxReadSize; + __le32 MaxWriteSize; + __le64 SystemTime; /* MBZ */ + __le64 ServerStartTime; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + +/* Flags */ +#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 +#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 + +#define SMB2_SESSION_EXPIRED (0) +#define SMB2_SESSION_IN_PROGRESS BIT(0) +#define SMB2_SESSION_VALID BIT(1) + +struct smb2_sess_setup_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 25 */ + __u8 Flags; + __u8 SecurityMode; + __le32 Capabilities; + __le32 Channel; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le64 PreviousSessionId; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + +/* Flags/Reserved for SMB3.1.1 */ +#define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001 + +/* Currently defined SessionFlags */ +#define SMB2_SESSION_FLAG_IS_GUEST_LE cpu_to_le16(0x0001) +#define SMB2_SESSION_FLAG_IS_NULL_LE cpu_to_le16(0x0002) +#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004) +struct smb2_sess_setup_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 SessionFlags; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + +struct smb2_logoff_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_logoff_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_tree_connect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 Reserved; /* Flags in SMB3.1.1 */ + __le16 PathOffset; + __le16 PathLength; + __u8 Buffer[1]; /* variable length */ +} __packed; + +struct smb2_tree_connect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 16 */ + __u8 ShareType; /* see below */ + __u8 Reserved; + __le32 ShareFlags; /* see below */ + __le32 Capabilities; /* see below */ + __le32 MaximalAccess; +} __packed; + +/* Possible ShareType values */ +#define SMB2_SHARE_TYPE_DISK 0x01 +#define SMB2_SHARE_TYPE_PIPE 0x02 +#define SMB2_SHARE_TYPE_PRINT 0x03 + +/* + * Possible ShareFlags - exactly one and only one of the first 4 caching flags + * must be set (any of the remaining, SHI1005, flags may be set individually + * or in combination).
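+ * For example, (ShareFlags & 0x30) isolates the caching mode (manual 0x00, + * auto 0x10, VDO 0x20, none 0x30), while the SHI1005 bits occupy other bit + * positions and may be OR-ed in freely.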
+ */ +#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000 +#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010 +#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020 +#define SMB2_SHAREFLAG_NO_CACHING 0x00000030 +#define SHI1005_FLAGS_DFS 0x00000001 +#define SHI1005_FLAGS_DFS_ROOT 0x00000002 +#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 +#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 +#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 +#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 +#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 +#define SHI1005_FLAGS_ENABLE_HASH 0x00002000 + +/* Possible share capabilities */ +#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) + +struct smb2_tree_disconnect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_tree_disconnect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +#define ATTR_READONLY_LE cpu_to_le32(ATTR_READONLY) +#define ATTR_HIDDEN_LE cpu_to_le32(ATTR_HIDDEN) +#define ATTR_SYSTEM_LE cpu_to_le32(ATTR_SYSTEM) +#define ATTR_DIRECTORY_LE cpu_to_le32(ATTR_DIRECTORY) +#define ATTR_ARCHIVE_LE cpu_to_le32(ATTR_ARCHIVE) +#define ATTR_NORMAL_LE cpu_to_le32(ATTR_NORMAL) +#define ATTR_TEMPORARY_LE cpu_to_le32(ATTR_TEMPORARY) +#define ATTR_SPARSE_FILE_LE cpu_to_le32(ATTR_SPARSE) +#define ATTR_REPARSE_POINT_LE cpu_to_le32(ATTR_REPARSE) +#define ATTR_COMPRESSED_LE cpu_to_le32(ATTR_COMPRESSED) +#define ATTR_OFFLINE_LE cpu_to_le32(ATTR_OFFLINE) +#define ATTR_NOT_CONTENT_INDEXED_LE cpu_to_le32(ATTR_NOT_CONTENT_INDEXED) +#define ATTR_ENCRYPTED_LE cpu_to_le32(ATTR_ENCRYPTED) +#define ATTR_INTEGRITY_STREAML_LE cpu_to_le32(0x00008000) +#define ATTR_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000) +#define ATTR_MASK_LE cpu_to_le32(0x00007FB7) + +/* Oplock levels */ +#define SMB2_OPLOCK_LEVEL_NONE 0x00 +#define SMB2_OPLOCK_LEVEL_II 0x01 +#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 +#define SMB2_OPLOCK_LEVEL_BATCH 0x09 +#define SMB2_OPLOCK_LEVEL_LEASE 0xFF +/* Non-spec internal type */ +#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99 + +/* Desired Access Flags */ +#define FILE_READ_DATA_LE cpu_to_le32(0x00000001) +#define FILE_LIST_DIRECTORY_LE cpu_to_le32(0x00000001) +#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002) +#define FILE_ADD_FILE_LE cpu_to_le32(0x00000002) +#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004) +#define FILE_ADD_SUBDIRECTORY_LE cpu_to_le32(0x00000004) +#define FILE_READ_EA_LE cpu_to_le32(0x00000008) +#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010) +#define FILE_EXECUTE_LE cpu_to_le32(0x00000020) +#define FILE_TRAVERSE_LE cpu_to_le32(0x00000020) +#define FILE_DELETE_CHILD_LE cpu_to_le32(0x00000040) +#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080) +#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100) +#define FILE_DELETE_LE cpu_to_le32(0x00010000) +#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000) +#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000) +#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000) +#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000) +#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000) +#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000) +#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000) +#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000) +#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000) +#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000) +#define DESIRED_ACCESS_MASK cpu_to_le32(0xF21F01FF) + +/* ShareAccess Flags */ +#define FILE_SHARE_READ_LE 
cpu_to_le32(0x00000001) +#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002) +#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004) +#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007) + +/* CreateDisposition Flags */ +#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000) +#define FILE_OPEN_LE cpu_to_le32(0x00000001) +#define FILE_CREATE_LE cpu_to_le32(0x00000002) +#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003) +#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004) +#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005) +#define FILE_CREATE_MASK_LE cpu_to_le32(0x00000007) + +#define FILE_READ_DESIRED_ACCESS_LE (FILE_READ_DATA_LE | \ + FILE_READ_EA_LE | \ + FILE_GENERIC_READ_LE) +#define FILE_WRITE_DESIRE_ACCESS_LE (FILE_WRITE_DATA_LE | \ + FILE_APPEND_DATA_LE | \ + FILE_WRITE_EA_LE | \ + FILE_WRITE_ATTRIBUTES_LE | \ + FILE_GENERIC_WRITE_LE) + +/* Impersonation Levels */ +#define IL_ANONYMOUS_LE cpu_to_le32(0x00000000) +#define IL_IDENTIFICATION_LE cpu_to_le32(0x00000001) +#define IL_IMPERSONATION_LE cpu_to_le32(0x00000002) +#define IL_DELEGATE_LE cpu_to_le32(0x00000003) + +/* Create Context Values */ +#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */ +#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" +#define SMB2_CREATE_ALLOCATION_SIZE "AlSi" +#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" +#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" +#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" +#define SMB2_CREATE_REQUEST_LEASE "RqLs" +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" +#define SMB2_CREATE_APP_INSTANCE_ID "\x45\xBC\xA6\x6A\xEF\xA7\xF7\x4A\x90\x08\xFA\x46\x2E\x14\x4D\x74" + #define SMB2_CREATE_APP_INSTANCE_VERSION "\xB9\x82\xD0\xB7\x3B\x56\x07\x4F\xA0\x7B\x52\x4A\x81\x16\xA0\x10" +#define SVHDX_OPEN_DEVICE_CONTEXT 0x83CE6F1AD851E0986E34401CC9BCFCE9 +#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" + +struct smb2_create_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u8 SecurityFlags; + __u8 RequestedOplockLevel; + __le32 ImpersonationLevel; + __le64 SmbCreateFlags; + __le64 Reserved; + __le32 DesiredAccess; + __le32 FileAttributes; + __le32 ShareAccess; + __le32 CreateDisposition; + __le32 CreateOptions; + __le16 NameOffset; + __le16 NameLength; + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[0]; +} __packed; + +struct smb2_create_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 89 */ + __u8 OplockLevel; + __u8 Reserved; + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; + __le64 EndofFile; + __le32 FileAttributes; + __le32 Reserved2; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[1]; +} __packed; + +struct create_context { + __le32 Next; + __le16 NameOffset; + __le16 NameLength; + __le16 Reserved; + __le16 DataOffset; + __le32 DataLength; + __u8 Buffer[0]; +} __packed; + +struct create_durable_req_v2 { + struct create_context ccontext; + __u8 Name[8]; + __le32 Timeout; + __le32 Flags; + __u8 Reserved[8]; + __u8 CreateGuid[16]; +} __packed; + +struct create_durable_reconn_req { + struct create_context ccontext; + __u8 Name[8]; + union { + __u8 Reserved[16]; + struct { + __le64 PersistentFileId; + __le64 VolatileFileId; + } 
Fid; + } Data; +} __packed; + +struct create_durable_reconn_v2_req { + struct create_context ccontext; + __u8 Name[8]; + struct { + __le64 PersistentFileId; + __le64 VolatileFileId; + } Fid; + __u8 CreateGuid[16]; + __le32 Flags; +} __packed; + +struct create_app_inst_id { + struct create_context ccontext; + __u8 Name[8]; + __u8 Reserved[8]; + __u8 AppInstanceId[16]; +} __packed; + +struct create_app_inst_id_vers { + struct create_context ccontext; + __u8 Name[8]; + __u8 Reserved[2]; + __u8 Padding[4]; + __le64 AppInstanceVersionHigh; + __le64 AppInstanceVersionLow; +} __packed; + +struct create_mxac_req { + struct create_context ccontext; + __u8 Name[8]; + __le64 Timestamp; +} __packed; + +struct create_alloc_size_req { + struct create_context ccontext; + __u8 Name[8]; + __le64 AllocationSize; +} __packed; + +struct create_posix { + struct create_context ccontext; + __u8 Name[16]; + __le32 Mode; + __u32 Reserved; +} __packed; + +struct create_durable_rsp { + struct create_context ccontext; + __u8 Name[8]; + union { + __u8 Reserved[8]; + __u64 data; + } Data; +} __packed; + +struct create_durable_v2_rsp { + struct create_context ccontext; + __u8 Name[8]; + __le32 Timeout; + __le32 Flags; +} __packed; + +struct create_mxac_rsp { + struct create_context ccontext; + __u8 Name[8]; + __le32 QueryStatus; + __le32 MaximalAccess; +} __packed; + +struct create_disk_id_rsp { + struct create_context ccontext; + __u8 Name[8]; + __le64 DiskFileId; + __le64 VolumeId; + __u8 Reserved[16]; +} __packed; + +/* equivalent of the contents of SMB3.1.1 POSIX open context response */ +struct create_posix_rsp { + struct create_context ccontext; + __u8 Name[16]; + __le32 nlink; + __le32 reparse_tag; + __le32 mode; + u8 SidBuffer[40]; +} __packed; + +#define SMB2_LEASE_NONE_LE cpu_to_le32(0x00) +#define SMB2_LEASE_READ_CACHING_LE cpu_to_le32(0x01) +#define SMB2_LEASE_HANDLE_CACHING_LE cpu_to_le32(0x02) +#define SMB2_LEASE_WRITE_CACHING_LE cpu_to_le32(0x04) + +#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS_LE cpu_to_le32(0x02) + +struct lease_context { + __le64 LeaseKeyLow; + __le64 LeaseKeyHigh; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; +} __packed; + +struct lease_context_v2 { + __le64 LeaseKeyLow; + __le64 LeaseKeyHigh; + __le32 LeaseState; + __le32 LeaseFlags; + __le64 LeaseDuration; + __le64 ParentLeaseKeyLow; + __le64 ParentLeaseKeyHigh; + __le16 Epoch; + __le16 Reserved; +} __packed; + +struct create_lease { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context lcontext; +} __packed; + +struct create_lease_v2 { + struct create_context ccontext; + __u8 Name[8]; + struct lease_context_v2 lcontext; + __u8 Pad[4]; +} __packed; + +/* Currently defined values for close flags */ +#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) +struct smb2_close_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Flags; + __le32 Reserved; + __le64 PersistentFileId; + __le64 VolatileFileId; +} __packed; + +struct smb2_close_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* 60 */ + __le16 Flags; + __le32 Reserved; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 EndOfFile; + __le32 Attributes; +} __packed; + +struct smb2_flush_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Reserved1; + __le32 Reserved2; + __le64 PersistentFileId; + __le64 VolatileFileId; +} __packed; + +struct smb2_flush_rsp { + 
struct smb2_hdr hdr; + __le16 StructureSize; + __le16 Reserved; +} __packed; + +struct smb2_buffer_desc_v1 { + __le64 offset; + __le32 token; + __le32 length; +} __packed; + +#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) +#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) + +struct smb2_read_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __u8 Padding; /* offset from start of SMB2 header to place read */ + __u8 Reserved; + __le32 Length; + __le64 Offset; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 MinimumCount; + __le32 Channel; /* Reserved MBZ */ + __le32 RemainingBytes; + __le16 ReadChannelInfoOffset; /* Reserved MBZ */ + __le16 ReadChannelInfoLength; /* Reserved MBZ */ + __u8 Buffer[1]; +} __packed; + +struct smb2_read_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __u32 Reserved2; + __u8 Buffer[1]; +} __packed; + +/* For write request Flags field below the following flag is defined: */ +#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 + +struct smb2_write_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __le16 DataOffset; /* offset from start of SMB2 header to write data */ + __le32 Length; + __le64 Offset; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 Channel; /* Reserved MBZ */ + __le32 RemainingBytes; + __le16 WriteChannelInfoOffset; /* Reserved MBZ */ + __le16 WriteChannelInfoLength; /* Reserved MBZ */ + __le32 Flags; + __u8 Buffer[1]; +} __packed; + +struct smb2_write_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __u32 Reserved2; + __u8 Buffer[1]; +} __packed; + +#define SMB2_0_IOCTL_IS_FSCTL 0x00000001 + +struct duplicate_extents_to_file { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ +} __packed; + +struct smb2_ioctl_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __le16 Reserved; /* MBZ */ + __le32 CntCode; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 InputOffset; /* Reserved MBZ */ + __le32 InputCount; + __le32 MaxInputResponse; + __le32 OutputOffset; + __le32 OutputCount; + __le32 MaxOutputResponse; + __le32 Flags; + __le32 Reserved2; + __u8 Buffer[1]; +} __packed; + +struct smb2_ioctl_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __le16 Reserved; /* MBZ */ + __le32 CntCode; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 InputOffset; /* Reserved MBZ */ + __le32 InputCount; + __le32 OutputOffset; + __le32 OutputCount; + __le32 Flags; + __le32 Reserved2; + __u8 Buffer[1]; +} __packed; + +struct validate_negotiate_info_req { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 DialectCount; + __le16 Dialects[1]; /* dialect (someday maybe list) client asked for */ +} __packed; + +struct validate_negotiate_info_rsp { + __le32 Capabilities; + __u8 Guid[SMB2_CLIENT_GUID_SIZE]; + __le16 SecurityMode; + __le16 Dialect; /* Dialect in use for the connection */ +} __packed; + +struct smb_sockaddr_in { + __be16 Port; + __be32 IPv4address; + __u8 Reserved[8]; +} __packed; + +struct
smb_sockaddr_in6 { + __be16 Port; + __be32 FlowInfo; + __u8 IPv6address[16]; + __be32 ScopeId; +} __packed; + +#define INTERNETWORK 0x0002 +#define INTERNETWORKV6 0x0017 + +struct sockaddr_storage_rsp { + __le16 Family; + union { + struct smb_sockaddr_in addr4; + struct smb_sockaddr_in6 addr6; + }; +} __packed; + +#define RSS_CAPABLE 0x00000001 +#define RDMA_CAPABLE 0x00000002 + +struct network_interface_info_ioctl_rsp { + __le32 Next; /* next interface. zero if this is last one */ + __le32 IfIndex; + __le32 Capability; /* RSS or RDMA Capable */ + __le32 Reserved; + __le64 LinkSpeed; + char SockAddr_Storage[128]; +} __packed; + +struct file_object_buf_type1_ioctl_rsp { + __u8 ObjectId[16]; + __u8 BirthVolumeId[16]; + __u8 BirthObjectId[16]; + __u8 DomainId[16]; +} __packed; + +struct resume_key_ioctl_rsp { + __le64 ResumeKey[3]; + __le32 ContextLength; + __u8 Context[4]; /* ignored, Windows sets to 4 bytes of zero */ +} __packed; + +struct copychunk_ioctl_req { + __le64 ResumeKey[3]; + __le32 ChunkCount; + __le32 Reserved; + __u8 Chunks[1]; /* array of srv_copychunk */ +} __packed; + +struct srv_copychunk { + __le64 SourceOffset; + __le64 TargetOffset; + __le32 Length; + __le32 Reserved; +} __packed; + +struct copychunk_ioctl_rsp { + __le32 ChunksWritten; + __le32 ChunkBytesWritten; + __le32 TotalBytesWritten; +} __packed; + +struct file_sparse { + __u8 SetSparse; +} __packed; + +struct file_zero_data_information { + __le64 FileOffset; + __le64 BeyondFinalZero; +} __packed; + +struct file_allocated_range_buffer { + __le64 file_offset; + __le64 length; +} __packed; + +struct reparse_data_buffer { + __le32 ReparseTag; + __le16 ReparseDataLength; + __u16 Reserved; + __u8 DataBuffer[]; /* Variable Length */ +} __packed; + +/* Completion Filter flags for Notify */ +#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 +#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 +#define FILE_NOTIFY_CHANGE_NAME 0x00000003 +#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 +#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 +#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 +#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 +#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 +#define FILE_NOTIFY_CHANGE_EA 0x00000080 +#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 +#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 +#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 +#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 + +/* Flags */ +#define SMB2_WATCH_TREE 0x0001 + +struct smb2_notify_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 32 */ + __le16 Flags; + __le32 OutputBufferLength; + __le64 PersistentFileId; + __le64 VolatileFileId; + __u32 CompletionFileter; + __u32 Reserved; +} __packed; + +struct smb2_notify_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +/* SMB2 Notify Action Flags */ +#define FILE_ACTION_ADDED 0x00000001 +#define FILE_ACTION_REMOVED 0x00000002 +#define FILE_ACTION_MODIFIED 0x00000003 +#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004 +#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005 +#define FILE_ACTION_ADDED_STREAM 0x00000006 +#define FILE_ACTION_REMOVED_STREAM 0x00000007 +#define FILE_ACTION_MODIFIED_STREAM 0x00000008 +#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009 + +#define SMB2_LOCKFLAG_SHARED 0x0001 +#define SMB2_LOCKFLAG_EXCLUSIVE 0x0002 +#define SMB2_LOCKFLAG_UNLOCK 0x0004 +#define SMB2_LOCKFLAG_FAIL_IMMEDIATELY 0x0010 +#define SMB2_LOCKFLAG_MASK 0x0007 + +struct 
smb2_lock_element { + __le64 Offset; + __le64 Length; + __le32 Flags; + __le32 Reserved; +} __packed; + +struct smb2_lock_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 48 */ + __le16 LockCount; + __le32 Reserved; + __le64 PersistentFileId; + __le64 VolatileFileId; + /* Followed by at least one */ + struct smb2_lock_element locks[1]; +} __packed; + +struct smb2_lock_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_echo_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __u16 Reserved; +} __packed; + +struct smb2_echo_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __u16 Reserved; +} __packed; + +/* search (query_directory) Flags field */ +#define SMB2_RESTART_SCANS 0x01 +#define SMB2_RETURN_SINGLE_ENTRY 0x02 +#define SMB2_INDEX_SPECIFIED 0x04 +#define SMB2_REOPEN 0x10 + +struct smb2_query_directory_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 FileInformationClass; + __u8 Flags; + __le32 FileIndex; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le16 FileNameOffset; + __le16 FileNameLength; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +struct smb2_query_directory_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +/* Possible InfoType values */ +#define SMB2_O_INFO_FILE 0x01 +#define SMB2_O_INFO_FILESYSTEM 0x02 +#define SMB2_O_INFO_SECURITY 0x03 +#define SMB2_O_INFO_QUOTA 0x04 + +/* Security info type additionalinfo flags. See MS-SMB2 (2.2.37) or MS-DTYP */ +#define OWNER_SECINFO 0x00000001 +#define GROUP_SECINFO 0x00000002 +#define DACL_SECINFO 0x00000004 +#define SACL_SECINFO 0x00000008 +#define LABEL_SECINFO 0x00000010 +#define ATTRIBUTE_SECINFO 0x00000020 +#define SCOPE_SECINFO 0x00000040 +#define BACKUP_SECINFO 0x00010000 +#define UNPROTECTED_SACL_SECINFO 0x10000000 +#define UNPROTECTED_DACL_SECINFO 0x20000000 +#define PROTECTED_SACL_SECINFO 0x40000000 +#define PROTECTED_DACL_SECINFO 0x80000000 + +struct smb2_query_info_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 41 */ + __u8 InfoType; + __u8 FileInfoClass; + __le32 OutputBufferLength; + __le16 InputBufferOffset; + __u16 Reserved; + __le32 InputBufferLength; + __le32 AdditionalInformation; + __le32 Flags; + __le64 PersistentFileId; + __le64 VolatileFileId; + __u8 Buffer[1]; +} __packed; + +struct smb2_query_info_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; +} __packed; + +struct smb2_set_info_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 33 */ + __u8 InfoType; + __u8 FileInfoClass; + __le32 BufferLength; + __le16 BufferOffset; + __u16 Reserved; + __le32 AdditionalInformation; + __le64 PersistentFileId; + __le64 VolatileFileId; + __u8 Buffer[1]; +} __packed; + +struct smb2_set_info_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 2 */ +} __packed; + +/* FILE Info response size */ +#define FILE_DIRECTORY_INFORMATION_SIZE 1 +#define FILE_FULL_DIRECTORY_INFORMATION_SIZE 2 +#define FILE_BOTH_DIRECTORY_INFORMATION_SIZE 3 +#define FILE_BASIC_INFORMATION_SIZE 40 +#define FILE_STANDARD_INFORMATION_SIZE 24 +#define FILE_INTERNAL_INFORMATION_SIZE 8 +#define FILE_EA_INFORMATION_SIZE 4 +#define FILE_ACCESS_INFORMATION_SIZE 4 +#define FILE_NAME_INFORMATION_SIZE 9 +#define FILE_RENAME_INFORMATION_SIZE 10 +#define 
FILE_LINK_INFORMATION_SIZE 11 +#define FILE_NAMES_INFORMATION_SIZE 12 +#define FILE_DISPOSITION_INFORMATION_SIZE 13 +#define FILE_POSITION_INFORMATION_SIZE 14 +#define FILE_FULL_EA_INFORMATION_SIZE 15 +#define FILE_MODE_INFORMATION_SIZE 4 +#define FILE_ALIGNMENT_INFORMATION_SIZE 4 +#define FILE_ALL_INFORMATION_SIZE 104 +#define FILE_ALLOCATION_INFORMATION_SIZE 19 +#define FILE_END_OF_FILE_INFORMATION_SIZE 20 +#define FILE_ALTERNATE_NAME_INFORMATION_SIZE 8 +#define FILE_STREAM_INFORMATION_SIZE 32 +#define FILE_PIPE_INFORMATION_SIZE 23 +#define FILE_PIPE_LOCAL_INFORMATION_SIZE 24 +#define FILE_PIPE_REMOTE_INFORMATION_SIZE 25 +#define FILE_MAILSLOT_QUERY_INFORMATION_SIZE 26 +#define FILE_MAILSLOT_SET_INFORMATION_SIZE 27 +#define FILE_COMPRESSION_INFORMATION_SIZE 16 +#define FILE_OBJECT_ID_INFORMATION_SIZE 29 +/* Number 30 not defined in documents */ +#define FILE_MOVE_CLUSTER_INFORMATION_SIZE 31 +#define FILE_QUOTA_INFORMATION_SIZE 32 +#define FILE_REPARSE_POINT_INFORMATION_SIZE 33 +#define FILE_NETWORK_OPEN_INFORMATION_SIZE 56 +#define FILE_ATTRIBUTE_TAG_INFORMATION_SIZE 8 + +/* FS Info response size */ +#define FS_DEVICE_INFORMATION_SIZE 8 +#define FS_ATTRIBUTE_INFORMATION_SIZE 16 +#define FS_VOLUME_INFORMATION_SIZE 24 +#define FS_SIZE_INFORMATION_SIZE 24 +#define FS_FULL_SIZE_INFORMATION_SIZE 32 +#define FS_SECTOR_SIZE_INFORMATION_SIZE 28 +#define FS_OBJECT_ID_INFORMATION_SIZE 64 +#define FS_CONTROL_INFORMATION_SIZE 48 +#define FS_POSIX_INFORMATION_SIZE 56 + +/* FS_ATTRIBUTE_File_System_Name */ +#define FS_TYPE_SUPPORT_SIZE 44 +struct fs_type_info { + char *fs_name; + long magic_number; +} __packed; + +struct smb2_oplock_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __u8 OplockLevel; + __u8 Reserved; + __le32 Reserved2; + __le64 PersistentFid; + __le64 VolatileFid; +} __packed; + +#define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) + +struct smb2_lease_break { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 44 */ + __le16 Epoch; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 CurrentLeaseState; + __le32 NewLeaseState; + __le32 BreakReason; + __le32 AccessMaskHint; + __le32 ShareMaskHint; +} __packed; + +struct smb2_lease_ack { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 Reserved; + __le32 Flags; + __u8 LeaseKey[16]; + __le32 LeaseState; + __le64 LeaseDuration; +} __packed; + +/* + * PDU infolevel structure definitions + * BB consider moving to a different header + */ + +/* File System Information Classes */ +#define FS_VOLUME_INFORMATION 1 /* Query */ +#define FS_LABEL_INFORMATION 2 /* Set */ +#define FS_SIZE_INFORMATION 3 /* Query */ +#define FS_DEVICE_INFORMATION 4 /* Query */ +#define FS_ATTRIBUTE_INFORMATION 5 /* Query */ +#define FS_CONTROL_INFORMATION 6 /* Query, Set */ +#define FS_FULL_SIZE_INFORMATION 7 /* Query */ +#define FS_OBJECT_ID_INFORMATION 8 /* Query, Set */ +#define FS_DRIVER_PATH_INFORMATION 9 /* Query */ +#define FS_SECTOR_SIZE_INFORMATION 11 /* SMB3 or later. Query */ +#define FS_POSIX_INFORMATION 100 /* SMB3.1.1 POSIX. 
Query */ + +struct smb2_fs_full_size_info { + __le64 TotalAllocationUnits; + __le64 CallerAvailableAllocationUnits; + __le64 ActualAvailableAllocationUnits; + __le32 SectorsPerAllocationUnit; + __le32 BytesPerSector; +} __packed; + +#define SSINFO_FLAGS_ALIGNED_DEVICE 0x00000001 +#define SSINFO_FLAGS_PARTITION_ALIGNED_ON_DEVICE 0x00000002 +#define SSINFO_FLAGS_NO_SEEK_PENALTY 0x00000004 +#define SSINFO_FLAGS_TRIM_ENABLED 0x00000008 + +/* sector size info struct */ +struct smb3_fs_ss_info { + __le32 LogicalBytesPerSector; + __le32 PhysicalBytesPerSectorForAtomicity; + __le32 PhysicalBytesPerSectorForPerf; + __le32 FSEffPhysicalBytesPerSectorForAtomicity; + __le32 Flags; + __le32 ByteOffsetForSectorAlignment; + __le32 ByteOffsetForPartitionAlignment; +} __packed; + +/* File System Control Information */ +struct smb2_fs_control_info { + __le64 FreeSpaceStartFiltering; + __le64 FreeSpaceThreshold; + __le64 FreeSpaceStopFiltering; + __le64 DefaultQuotaThreshold; + __le64 DefaultQuotaLimit; + __le32 FileSystemControlFlags; + __le32 Padding; +} __packed; + +/* partial list of QUERY INFO levels */ +#define FILE_DIRECTORY_INFORMATION 1 +#define FILE_FULL_DIRECTORY_INFORMATION 2 +#define FILE_BOTH_DIRECTORY_INFORMATION 3 +#define FILE_BASIC_INFORMATION 4 +#define FILE_STANDARD_INFORMATION 5 +#define FILE_INTERNAL_INFORMATION 6 +#define FILE_EA_INFORMATION 7 +#define FILE_ACCESS_INFORMATION 8 +#define FILE_NAME_INFORMATION 9 +#define FILE_RENAME_INFORMATION 10 +#define FILE_LINK_INFORMATION 11 +#define FILE_NAMES_INFORMATION 12 +#define FILE_DISPOSITION_INFORMATION 13 +#define FILE_POSITION_INFORMATION 14 +#define FILE_FULL_EA_INFORMATION 15 +#define FILE_MODE_INFORMATION 16 +#define FILE_ALIGNMENT_INFORMATION 17 +#define FILE_ALL_INFORMATION 18 +#define FILE_ALLOCATION_INFORMATION 19 +#define FILE_END_OF_FILE_INFORMATION 20 +#define FILE_ALTERNATE_NAME_INFORMATION 21 +#define FILE_STREAM_INFORMATION 22 +#define FILE_PIPE_INFORMATION 23 +#define FILE_PIPE_LOCAL_INFORMATION 24 +#define FILE_PIPE_REMOTE_INFORMATION 25 +#define FILE_MAILSLOT_QUERY_INFORMATION 26 +#define FILE_MAILSLOT_SET_INFORMATION 27 +#define FILE_COMPRESSION_INFORMATION 28 +#define FILE_OBJECT_ID_INFORMATION 29 +/* Number 30 not defined in documents */ +#define FILE_MOVE_CLUSTER_INFORMATION 31 +#define FILE_QUOTA_INFORMATION 32 +#define FILE_REPARSE_POINT_INFORMATION 33 +#define FILE_NETWORK_OPEN_INFORMATION 34 +#define FILE_ATTRIBUTE_TAG_INFORMATION 35 +#define FILE_TRACKING_INFORMATION 36 +#define FILEID_BOTH_DIRECTORY_INFORMATION 37 +#define FILEID_FULL_DIRECTORY_INFORMATION 38 +#define FILE_VALID_DATA_LENGTH_INFORMATION 39 +#define FILE_SHORT_NAME_INFORMATION 40 +#define FILE_SFIO_RESERVE_INFORMATION 44 +#define FILE_SFIO_VOLUME_INFORMATION 45 +#define FILE_HARD_LINK_INFORMATION 46 +#define FILE_NORMALIZED_NAME_INFORMATION 48 +#define FILEID_GLOBAL_TX_DIRECTORY_INFORMATION 50 +#define FILE_STANDARD_LINK_INFORMATION 54 + +#define OP_BREAK_STRUCT_SIZE_20 24 +#define OP_BREAK_STRUCT_SIZE_21 36 + +struct smb2_file_access_info { + __le32 AccessFlags; +} __packed; + +struct smb2_file_alignment_info { + __le32 AlignmentRequirement; +} __packed; + +struct smb2_file_internal_info { + __le64 IndexNumber; +} __packed; /* level 6 Query */ + +struct smb2_file_rename_info { /* encoding of request for level 10 */ + __u8 ReplaceIfExists; /* 1 = replace existing target with new */ + /* 0 = fail if target already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) 
*/ + __le32 FileNameLength; + char FileName[0]; /* New name to be assigned */ +} __packed; /* level 10 Set */ + +struct smb2_file_link_info { /* encoding of request for level 11 */ + __u8 ReplaceIfExists; /* 1 = replace existing link with new */ + /* 0 = fail if link already exists */ + __u8 Reserved[7]; + __u64 RootDirectory; /* MBZ for network operations (why says spec?) */ + __le32 FileNameLength; + char FileName[0]; /* Name to be assigned to new link */ +} __packed; /* level 11 Set */ + +/* + * This level 18, although its struct shares a name with a cifs one, is + * different from cifs level 0x107. Level 0x107 has an extra u64 between + * AccessFlags and CurrentByteOffset. + */ +struct smb2_file_all_info { /* data block encoding of response to level 18 */ + __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad1; /* End of FILE_BASIC_INFO equivalent */ + __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad2; /* End of FILE_STANDARD_INFO equivalent */ + __le64 IndexNumber; + __le32 EASize; + __le32 AccessFlags; + __le64 CurrentByteOffset; + __le32 Mode; + __le32 AlignmentRequirement; + __le32 FileNameLength; + char FileName[1]; +} __packed; /* level 18 Query */ + +struct smb2_file_alt_name_info { + __le32 FileNameLength; + char FileName[0]; +} __packed; + +struct smb2_file_stream_info { + __le32 NextEntryOffset; + __le32 StreamNameLength; + __le64 StreamSize; + __le64 StreamAllocationSize; + char StreamName[0]; +} __packed; + +struct smb2_file_eof_info { /* encoding of request for level 20 */ + __le64 EndOfFile; /* new end of file value */ +} __packed; /* level 20 Set */ + +struct smb2_file_ntwrk_info { + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; + __le64 EndOfFile; + __le32 Attributes; + __le32 Reserved; +} __packed; + +struct smb2_file_standard_info { + __le64 AllocationSize; + __le64 EndOfFile; + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __le16 Reserved; +} __packed; /* level 5 Query */ + +struct smb2_file_ea_info { + __le32 EASize; +} __packed; + +struct smb2_file_alloc_info { + __le64 AllocationSize; +} __packed; + +struct smb2_file_disposition_info { + __u8 DeletePending; +} __packed; + +struct smb2_file_pos_info { + __le64 CurrentByteOffset; +} __packed; + +#define FILE_MODE_INFO_MASK cpu_to_le32(0x0000103e) + +struct smb2_file_mode_info { + __le32 Mode; +} __packed; + +#define COMPRESSION_FORMAT_NONE 0x0000 +#define COMPRESSION_FORMAT_LZNT1 0x0002 + +struct smb2_file_comp_info { + __le64 CompressedFileSize; + __le16 CompressionFormat; + __u8 CompressionUnitShift; + __u8 ChunkShift; + __u8 ClusterShift; + __u8 Reserved[3]; +} __packed; + +struct smb2_file_attr_tag_info { + __le32 FileAttributes; + __le32 ReparseTag; +} __packed; + +#define SL_RESTART_SCAN 0x00000001 +#define SL_RETURN_SINGLE_ENTRY 0x00000002 +#define SL_INDEX_SPECIFIED 0x00000004 + +struct smb2_ea_info_req { + __le32 NextEntryOffset; + __u8 EaNameLength; + char name[1]; +} __packed; /* level 15 Query */ + +struct smb2_ea_info { + __le32 NextEntryOffset; + __u8 Flags; + __u8 EaNameLength; + __le16 EaValueLength; + char name[1]; + /* optionally followed by value */ +} __packed; /* level 15 Query */ + +struct
create_ea_buf_req { + struct create_context ccontext; + __u8 Name[8]; + struct smb2_ea_info ea; +} __packed; + +struct create_sd_buf_req { + struct create_context ccontext; + __u8 Name[8]; + struct smb_ntsd ntsd; +} __packed; + +/* Find File infolevels */ +#define SMB_FIND_FILE_POSIX_INFO 0x064 + +/* Level 100 query info */ +struct smb311_posix_qinfo { + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 DosAttributes; + __le64 Inode; + __le32 DeviceId; + __le32 Zero; + /* beginning of POSIX Create Context Response */ + __le32 HardLinks; + __le32 ReparseTag; + __le32 Mode; + u8 Sids[]; + /* + * var sized owner SID + * var sized group SID + * le32 filenamelength + * u8 filename[] + */ +} __packed; + +struct smb2_posix_info { + __le32 NextEntryOffset; + __u32 Ignored; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 DosAttributes; + __le64 Inode; + __le32 DeviceId; + __le32 Zero; + /* beginning of POSIX Create Context Response */ + __le32 HardLinks; + __le32 ReparseTag; + __le32 Mode; + u8 SidBuffer[40]; + __le32 name_len; + u8 name[1]; + /* + * var sized owner SID + * var sized group SID + * le32 filenamelength + * u8 filename[] + */ +} __packed; + +/* functions */ +int init_smb2_0_server(struct ksmbd_conn *conn); +void init_smb2_1_server(struct ksmbd_conn *conn); +void init_smb3_0_server(struct ksmbd_conn *conn); +void init_smb3_02_server(struct ksmbd_conn *conn); +int init_smb3_11_server(struct ksmbd_conn *conn); + +void init_smb2_max_read_size(unsigned int sz); +void init_smb2_max_write_size(unsigned int sz); +void init_smb2_max_trans_size(unsigned int sz); + +bool is_smb2_neg_cmd(struct ksmbd_work *work); +bool is_smb2_rsp(struct ksmbd_work *work); + +u16 get_smb2_cmd_val(struct ksmbd_work *work); +void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err); +int init_smb2_rsp_hdr(struct ksmbd_work *work); +int smb2_allocate_rsp_buf(struct ksmbd_work *work); +bool is_chained_smb2_message(struct ksmbd_work *work); +int init_smb2_neg_rsp(struct ksmbd_work *work); +void smb2_set_err_rsp(struct ksmbd_work *work); +int smb2_check_user_session(struct ksmbd_work *work); +int smb2_get_ksmbd_tcon(struct ksmbd_work *work); +bool smb2_is_sign_req(struct ksmbd_work *work, unsigned int command); +int smb2_check_sign_req(struct ksmbd_work *work); +void smb2_set_sign_rsp(struct ksmbd_work *work); +int smb3_check_sign_req(struct ksmbd_work *work); +void smb3_set_sign_rsp(struct ksmbd_work *work); +int find_matching_smb2_dialect(int start_index, __le16 *cli_dialects, + __le16 dialects_count); +struct file_lock *smb_flock_init(struct file *f); +int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), + void **arg); +void smb2_send_interim_resp(struct ksmbd_work *work, __le32 status); +struct channel *lookup_chann_list(struct ksmbd_session *sess, + struct ksmbd_conn *conn); +void smb3_preauth_hash_rsp(struct ksmbd_work *work); +bool smb3_is_transform_hdr(void *buf); +int smb3_decrypt_req(struct ksmbd_work *work); +int smb3_encrypt_resp(struct ksmbd_work *work); +bool smb3_11_final_sess_setup_resp(struct ksmbd_work *work); +int smb2_set_rsp_credits(struct ksmbd_work *work); + +/* smb2 misc functions */ +int ksmbd_smb2_check_message(struct ksmbd_work *work); + +/* smb2 command handlers */ +int smb2_handle_negotiate(struct ksmbd_work *work); +int smb2_negotiate_request(struct ksmbd_work *work); +int 
smb2_sess_setup(struct ksmbd_work *work); +int smb2_tree_connect(struct ksmbd_work *work); +int smb2_tree_disconnect(struct ksmbd_work *work); +int smb2_session_logoff(struct ksmbd_work *work); +int smb2_open(struct ksmbd_work *work); +int smb2_query_info(struct ksmbd_work *work); +int smb2_query_dir(struct ksmbd_work *work); +int smb2_close(struct ksmbd_work *work); +int smb2_echo(struct ksmbd_work *work); +int smb2_set_info(struct ksmbd_work *work); +int smb2_read(struct ksmbd_work *work); +int smb2_write(struct ksmbd_work *work); +int smb2_flush(struct ksmbd_work *work); +int smb2_cancel(struct ksmbd_work *work); +int smb2_lock(struct ksmbd_work *work); +int smb2_ioctl(struct ksmbd_work *work); +int smb2_oplock_break(struct ksmbd_work *work); +int smb2_notify(struct ksmbd_work *ksmbd_work); + +#endif /* _SMB2PDU_H */ diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c new file mode 100644 index 000000000000..b108b918ec84 --- /dev/null +++ b/fs/ksmbd/smb_common.c @@ -0,0 +1,674 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + * Copyright (C) 2018 Namjae Jeon <linkinjeon@kernel.org> + */ + +#include "smb_common.h" +#include "server.h" +#include "misc.h" +#include "smbstatus.h" +#include "connection.h" +#include "ksmbd_work.h" +#include "mgmt/user_session.h" +#include "mgmt/user_config.h" +#include "mgmt/tree_connect.h" +#include "mgmt/share_config.h" + +/*for shortname implementation */ +static const char basechars[43] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%"; +#define MANGLE_BASE (sizeof(basechars) / sizeof(char) - 1) +#define MAGIC_CHAR '~' +#define PERIOD '.' +#define mangle(V) ((char)(basechars[(V) % MANGLE_BASE])) +#define KSMBD_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr)) + +struct smb_protocol { + int index; + char *name; + char *prot; + __u16 prot_id; +}; + +static struct smb_protocol smb1_protos[] = { + { + SMB21_PROT, + "\2SMB 2.1", + "SMB2_10", + SMB21_PROT_ID + }, + { + SMB2X_PROT, + "\2SMB 2.???", + "SMB2_22", + SMB2X_PROT_ID + }, +}; + +static struct smb_protocol smb2_protos[] = { + { + SMB21_PROT, + "\2SMB 2.1", + "SMB2_10", + SMB21_PROT_ID + }, + { + SMB30_PROT, + "\2SMB 3.0", + "SMB3_00", + SMB30_PROT_ID + }, + { + SMB302_PROT, + "\2SMB 3.02", + "SMB3_02", + SMB302_PROT_ID + }, + { + SMB311_PROT, + "\2SMB 3.1.1", + "SMB3_11", + SMB311_PROT_ID + }, +}; + +unsigned int ksmbd_server_side_copy_max_chunk_count(void) +{ + return 256; +} + +unsigned int ksmbd_server_side_copy_max_chunk_size(void) +{ + return (2U << 30) - 1; +} + +unsigned int ksmbd_server_side_copy_max_total_size(void) +{ + return (2U << 30) - 1; +} + +inline int ksmbd_min_protocol(void) +{ + return SMB2_PROT; +} + +inline int ksmbd_max_protocol(void) +{ + return SMB311_PROT; +} + +int ksmbd_lookup_protocol_idx(char *str) +{ + int offt = ARRAY_SIZE(smb1_protos) - 1; + int len = strlen(str); + + while (offt >= 0) { + if (!strncmp(str, smb1_protos[offt].prot, len)) { + ksmbd_debug(SMB, "selected %s dialect idx = %d\n", + smb1_protos[offt].prot, offt); + return smb1_protos[offt].index; + } + offt--; + } + + offt = ARRAY_SIZE(smb2_protos) - 1; + while (offt >= 0) { + if (!strncmp(str, smb2_protos[offt].prot, len)) { + ksmbd_debug(SMB, "selected %s dialect idx = %d\n", + smb2_protos[offt].prot, offt); + return smb2_protos[offt].index; + } + offt--; + } + return -1; +} + +/** + * ksmbd_verify_smb_message() - check for valid smb2 request header + * @work: smb work + * + * check for valid smb signature and packet direction(request/response) 
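+ * Only SMB2 PDUs are actually validated here; buffers carrying any other + * protocol id are passed through and return 0.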
+ * + * Return: 0 on success, otherwise 1 + */ +int ksmbd_verify_smb_message(struct ksmbd_work *work) +{ + struct smb2_hdr *smb2_hdr = work->request_buf; + + if (smb2_hdr->ProtocolId == SMB2_PROTO_NUMBER) + return ksmbd_smb2_check_message(work); + + return 0; +} + +/** + * ksmbd_smb_request() - check for valid smb request type + * @conn: connection instance + * + * Return: true on success, otherwise false + */ +bool ksmbd_smb_request(struct ksmbd_conn *conn) +{ + int type = *(char *)conn->request_buf; + + switch (type) { + case RFC1002_SESSION_MESSAGE: + /* Regular SMB request */ + return true; + case RFC1002_SESSION_KEEP_ALIVE: + ksmbd_debug(SMB, "RFC 1002 session keep alive\n"); + break; + default: + ksmbd_debug(SMB, "RFC 1002 unknown request type 0x%x\n", type); + } + + return false; +} + +static bool supported_protocol(int idx) +{ + if (idx == SMB2X_PROT && + (server_conf.min_protocol >= SMB21_PROT || + server_conf.max_protocol <= SMB311_PROT)) + return true; + + return (server_conf.min_protocol <= idx && + idx <= server_conf.max_protocol); +} + +static char *next_dialect(char *dialect, int *next_off) +{ + dialect = dialect + *next_off; + *next_off = strlen(dialect); + return dialect; +} + +static int ksmbd_lookup_dialect_by_name(char *cli_dialects, __le16 byte_count) +{ + int i, seq_num, bcount, next; + char *dialect; + + for (i = ARRAY_SIZE(smb1_protos) - 1; i >= 0; i--) { + seq_num = 0; + next = 0; + dialect = cli_dialects; + bcount = le16_to_cpu(byte_count); + do { + dialect = next_dialect(dialect, &next); + ksmbd_debug(SMB, "client requested dialect %s\n", + dialect); + if (!strcmp(dialect, smb1_protos[i].name)) { + if (supported_protocol(smb1_protos[i].index)) { + ksmbd_debug(SMB, + "selected %s dialect\n", + smb1_protos[i].name); + if (smb1_protos[i].index == SMB1_PROT) + return seq_num; + return smb1_protos[i].prot_id; + } + } + seq_num++; + bcount -= (++next); + } while (bcount > 0); + } + + return BAD_PROT_ID; +} + +int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count) +{ + int i; + int count; + + for (i = ARRAY_SIZE(smb2_protos) - 1; i >= 0; i--) { + count = le16_to_cpu(dialects_count); + while (--count >= 0) { + ksmbd_debug(SMB, "client requested dialect 0x%x\n", + le16_to_cpu(cli_dialects[count])); + if (le16_to_cpu(cli_dialects[count]) != + smb2_protos[i].prot_id) + continue; + + if (supported_protocol(smb2_protos[i].index)) { + ksmbd_debug(SMB, "selected %s dialect\n", + smb2_protos[i].name); + return smb2_protos[i].prot_id; + } + } + } + + return BAD_PROT_ID; +} + +static int ksmbd_negotiate_smb_dialect(void *buf) +{ + __le32 proto; + + proto = ((struct smb2_hdr *)buf)->ProtocolId; + if (proto == SMB2_PROTO_NUMBER) { + struct smb2_negotiate_req *req; + + req = (struct smb2_negotiate_req *)buf; + return ksmbd_lookup_dialect_by_id(req->Dialects, + req->DialectCount); + } + + proto = *(__le32 *)((struct smb_hdr *)buf)->Protocol; + if (proto == SMB1_PROTO_NUMBER) { + struct smb_negotiate_req *req; + + req = (struct smb_negotiate_req *)buf; + return ksmbd_lookup_dialect_by_name(req->DialectsArray, + req->ByteCount); + } + + return BAD_PROT_ID; +} + +#define SMB_COM_NEGOTIATE 0x72 +int ksmbd_init_smb_server(struct ksmbd_work *work) +{ + struct ksmbd_conn *conn = work->conn; + + if (conn->need_neg == false) + return 0; + + init_smb3_11_server(conn); + + if (conn->ops->get_cmd_val(work) != SMB_COM_NEGOTIATE) + conn->need_neg = false; + return 0; +} + +bool ksmbd_pdu_size_has_room(unsigned int pdu) +{ + return (pdu >= KSMBD_MIN_SUPPORTED_HEADER_SIZE - 4); 
+} + +int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, + struct ksmbd_file *dir, + struct ksmbd_dir_info *d_info, + char *search_pattern, + int (*fn)(struct ksmbd_conn *, int, + struct ksmbd_dir_info *, + struct user_namespace *, + struct ksmbd_kstat *)) +{ + int i, rc = 0; + struct ksmbd_conn *conn = work->conn; + struct user_namespace *user_ns = file_mnt_user_ns(dir->filp); + + for (i = 0; i < 2; i++) { + struct kstat kstat; + struct ksmbd_kstat ksmbd_kstat; + + if (!dir->dot_dotdot[i]) { /* fill dot entry info */ + if (i == 0) { + d_info->name = "."; + d_info->name_len = 1; + } else { + d_info->name = ".."; + d_info->name_len = 2; + } + + if (!match_pattern(d_info->name, d_info->name_len, + search_pattern)) { + dir->dot_dotdot[i] = 1; + continue; + } + + ksmbd_kstat.kstat = &kstat; + ksmbd_vfs_fill_dentry_attrs(work, + user_ns, + dir->filp->f_path.dentry->d_parent, + &ksmbd_kstat); + rc = fn(conn, info_level, d_info, + user_ns, &ksmbd_kstat); + if (rc) + break; + if (d_info->out_buf_len <= 0) + break; + + dir->dot_dotdot[i] = 1; + if (d_info->flags & SMB2_RETURN_SINGLE_ENTRY) { + d_info->out_buf_len = 0; + break; + } + } + } + + return rc; +} + +/** + * ksmbd_extract_shortname() - get shortname from long filename + * @conn: connection instance + * @longname: source long filename + * @shortname: destination short filename + * + * Return: shortname length or 0 when source long name is '.' or '..' + * TODO: Though this function conforms to the restrictions of the 8.3 + * filename spec, the result differs from the one Windows 7 produces; + * this needs checking. + */ +int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname, + char *shortname) +{ + const char *p; + char base[9], extension[4]; + char out[13] = {0}; + int baselen = 0; + int extlen = 0, len = 0; + unsigned int csum = 0; + const unsigned char *ptr; + bool dot_present = true; + + p = longname; + if ((*p == '.') || (!(strcmp(p, "..")))) { + /* no mangling required */ + return 0; + } + + p = strrchr(longname, '.'); + if (p == longname) { /* name starts with a dot */ + strscpy(extension, "___", sizeof(extension)); + } else { + if (p) { + p++; + while (*p && extlen < 3) { + if (*p != '.') + extension[extlen++] = toupper(*p); + p++; + } + extension[extlen] = '\0'; + } else { + dot_present = false; + } + } + + p = longname; + if (*p == '.') { + p++; + longname++; + } + while (*p && (baselen < 5)) { + if (*p != '.') + base[baselen++] = toupper(*p); + p++; + } + + base[baselen] = MAGIC_CHAR; + memcpy(out, base, baselen + 1); + + ptr = longname; + len = strlen(longname); + for (; len > 0; len--, ptr++) + csum += *ptr; + + csum = csum % (MANGLE_BASE * MANGLE_BASE); + out[baselen + 1] = mangle(csum / MANGLE_BASE); + out[baselen + 2] = mangle(csum); + out[baselen + 3] = PERIOD; + + if (dot_present) + memcpy(&out[baselen + 4], extension, 4); + else + out[baselen + 4] = '\0'; + smbConvertToUTF16((__le16 *)shortname, out, PATH_MAX, + conn->local_nls, 0); + len = strlen(out) * 2; + return len; +} + +static int __smb2_negotiate(struct ksmbd_conn *conn) +{ + return (conn->dialect >= SMB20_PROT_ID && + conn->dialect <= SMB311_PROT_ID); +} + +static int smb_handle_negotiate(struct ksmbd_work *work) +{ + struct smb_negotiate_rsp *neg_rsp = work->response_buf; + + ksmbd_debug(SMB, "Unsupported SMB protocol\n"); + neg_rsp->hdr.Status.CifsError = STATUS_INVALID_LOGON_TYPE; + return -EINVAL; +} + +int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command) +{ + struct ksmbd_conn *conn = work->conn; + int ret; + +
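/* + * ksmbd_negotiate_smb_dialect() probes the buffer for an SMB2 or SMB1 + * NEGOTIATE payload; the branches below then either answer the SMB2 + * negotiate, upgrade an SMB1 multi-protocol negotiate to SMB2, or + * reject the dialect as unsupported. + */ +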
conn->dialect = ksmbd_negotiate_smb_dialect(work->request_buf); + ksmbd_debug(SMB, "conn->dialect 0x%x\n", conn->dialect); + + if (command == SMB2_NEGOTIATE_HE) { + struct smb2_hdr *smb2_hdr = work->request_buf; + + if (smb2_hdr->ProtocolId != SMB2_PROTO_NUMBER) { + ksmbd_debug(SMB, "Downgrade to SMB1 negotiation\n"); + command = SMB_COM_NEGOTIATE; + } + } + + if (command == SMB2_NEGOTIATE_HE) { + ret = smb2_handle_negotiate(work); + init_smb2_neg_rsp(work); + return ret; + } + + if (command == SMB_COM_NEGOTIATE) { + if (__smb2_negotiate(conn)) { + conn->need_neg = true; + init_smb3_11_server(conn); + init_smb2_neg_rsp(work); + ksmbd_debug(SMB, "Upgrade to SMB2 negotiation\n"); + return 0; + } + return smb_handle_negotiate(work); + } + + pr_err("Unknown SMB negotiation command: %u\n", command); + return -EINVAL; +} + +enum SHARED_MODE_ERRORS { + SHARE_DELETE_ERROR, + SHARE_READ_ERROR, + SHARE_WRITE_ERROR, + FILE_READ_ERROR, + FILE_WRITE_ERROR, + FILE_DELETE_ERROR, +}; + +static const char * const shared_mode_errors[] = { + "Current access mode does not permit SHARE_DELETE", + "Current access mode does not permit SHARE_READ", + "Current access mode does not permit SHARE_WRITE", + "Desired access mode does not permit FILE_READ", + "Desired access mode does not permit FILE_WRITE", + "Desired access mode does not permit FILE_DELETE", +}; + +static void smb_shared_mode_error(int error, struct ksmbd_file *prev_fp, + struct ksmbd_file *curr_fp) +{ + ksmbd_debug(SMB, "%s\n", shared_mode_errors[error]); + ksmbd_debug(SMB, "Current mode: 0x%x Desired mode: 0x%x\n", + prev_fp->saccess, curr_fp->daccess); +} + +int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp) +{ + int rc = 0; + struct ksmbd_file *prev_fp; + + /* + * Lookup fp in master fp list, and check desired access and + * shared mode between previous open and current open. + */ + read_lock(&curr_fp->f_ci->m_lock); + list_for_each_entry(prev_fp, &curr_fp->f_ci->m_fp_list, node) { + if (file_inode(filp) != file_inode(prev_fp->filp)) + continue; + + if (filp == prev_fp->filp) + continue; + + if (ksmbd_stream_fd(prev_fp) && ksmbd_stream_fd(curr_fp)) + if (strcmp(prev_fp->stream.name, curr_fp->stream.name)) + continue; + + if (prev_fp->attrib_only != curr_fp->attrib_only) + continue; + + if (!(prev_fp->saccess & FILE_SHARE_DELETE_LE) && + curr_fp->daccess & FILE_DELETE_LE) { + smb_shared_mode_error(SHARE_DELETE_ERROR, + prev_fp, + curr_fp); + rc = -EPERM; + break; + } + + /* + * Only check FILE_SHARE_DELETE if stream opened and + * normal file opened. 
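+ * That is, when the previous open is a stream open and the current one is + * not, the remaining share-mode checks below are skipped for that open.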
+ */ + if (ksmbd_stream_fd(prev_fp) && !ksmbd_stream_fd(curr_fp)) + continue; + + if (!(prev_fp->saccess & FILE_SHARE_READ_LE) && + curr_fp->daccess & (FILE_EXECUTE_LE | FILE_READ_DATA_LE)) { + smb_shared_mode_error(SHARE_READ_ERROR, + prev_fp, + curr_fp); + rc = -EPERM; + break; + } + + if (!(prev_fp->saccess & FILE_SHARE_WRITE_LE) && + curr_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE)) { + smb_shared_mode_error(SHARE_WRITE_ERROR, + prev_fp, + curr_fp); + rc = -EPERM; + break; + } + + if (prev_fp->daccess & (FILE_EXECUTE_LE | FILE_READ_DATA_LE) && + !(curr_fp->saccess & FILE_SHARE_READ_LE)) { + smb_shared_mode_error(FILE_READ_ERROR, + prev_fp, + curr_fp); + rc = -EPERM; + break; + } + + if (prev_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE) && + !(curr_fp->saccess & FILE_SHARE_WRITE_LE)) { + smb_shared_mode_error(FILE_WRITE_ERROR, + prev_fp, + curr_fp); + rc = -EPERM; + break; + } + + if (prev_fp->daccess & FILE_DELETE_LE && + !(curr_fp->saccess & FILE_SHARE_DELETE_LE)) { + smb_shared_mode_error(FILE_DELETE_ERROR, + prev_fp, + curr_fp); + rc = -EPERM; + break; + } + } + read_unlock(&curr_fp->f_ci->m_lock); + + return rc; +} + +bool is_asterisk(char *p) +{ + return p && p[0] == '*'; +} + +int ksmbd_override_fsids(struct ksmbd_work *work) +{ + struct ksmbd_session *sess = work->sess; + struct ksmbd_share_config *share = work->tcon->share_conf; + struct cred *cred; + struct group_info *gi; + unsigned int uid; + unsigned int gid; + + uid = user_uid(sess->user); + gid = user_gid(sess->user); + if (share->force_uid != KSMBD_SHARE_INVALID_UID) + uid = share->force_uid; + if (share->force_gid != KSMBD_SHARE_INVALID_GID) + gid = share->force_gid; + + cred = prepare_kernel_cred(NULL); + if (!cred) + return -ENOMEM; + + cred->fsuid = make_kuid(current_user_ns(), uid); + cred->fsgid = make_kgid(current_user_ns(), gid); + + gi = groups_alloc(0); + if (!gi) { + abort_creds(cred); + return -ENOMEM; + } + set_groups(cred, gi); + put_group_info(gi); + + if (!uid_eq(cred->fsuid, GLOBAL_ROOT_UID)) + cred->cap_effective = cap_drop_fs_set(cred->cap_effective); + + WARN_ON(work->saved_cred); + work->saved_cred = override_creds(cred); + if (!work->saved_cred) { + abort_creds(cred); + return -EINVAL; + } + return 0; +} + +void ksmbd_revert_fsids(struct ksmbd_work *work) +{ + const struct cred *cred; + + WARN_ON(!work->saved_cred); + + cred = current_cred(); + revert_creds(work->saved_cred); + put_cred(cred); + work->saved_cred = NULL; +} + +__le32 smb_map_generic_desired_access(__le32 daccess) +{ + if (daccess & FILE_GENERIC_READ_LE) { + daccess |= cpu_to_le32(GENERIC_READ_FLAGS); + daccess &= ~FILE_GENERIC_READ_LE; + } + + if (daccess & FILE_GENERIC_WRITE_LE) { + daccess |= cpu_to_le32(GENERIC_WRITE_FLAGS); + daccess &= ~FILE_GENERIC_WRITE_LE; + } + + if (daccess & FILE_GENERIC_EXECUTE_LE) { + daccess |= cpu_to_le32(GENERIC_EXECUTE_FLAGS); + daccess &= ~FILE_GENERIC_EXECUTE_LE; + } + + if (daccess & FILE_GENERIC_ALL_LE) { + daccess |= cpu_to_le32(GENERIC_ALL_FLAGS); + daccess &= ~FILE_GENERIC_ALL_LE; + } + + return daccess; +} diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h new file mode 100644 index 000000000000..eb667d85558e --- /dev/null +++ b/fs/ksmbd/smb_common.h @@ -0,0 +1,542 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. 
+ */ + +#ifndef __SMB_COMMON_H__ +#define __SMB_COMMON_H__ + +#include <linux/kernel.h> + +#include "glob.h" +#include "nterr.h" +#include "smb2pdu.h" + +/* ksmbd's Specific ERRNO */ +#define ESHARE 50000 + +#define SMB1_PROT 0 +#define SMB2_PROT 1 +#define SMB21_PROT 2 +/* multi-protocol negotiate request */ +#define SMB2X_PROT 3 +#define SMB30_PROT 4 +#define SMB302_PROT 5 +#define SMB311_PROT 6 +#define BAD_PROT 0xFFFF + +#define SMB1_VERSION_STRING "1.0" +#define SMB20_VERSION_STRING "2.0" +#define SMB21_VERSION_STRING "2.1" +#define SMB30_VERSION_STRING "3.0" +#define SMB302_VERSION_STRING "3.02" +#define SMB311_VERSION_STRING "3.1.1" + +/* Dialects */ +#define SMB10_PROT_ID 0x00 +#define SMB20_PROT_ID 0x0202 +#define SMB21_PROT_ID 0x0210 +/* multi-protocol negotiate request */ +#define SMB2X_PROT_ID 0x02FF +#define SMB30_PROT_ID 0x0300 +#define SMB302_PROT_ID 0x0302 +#define SMB311_PROT_ID 0x0311 +#define BAD_PROT_ID 0xFFFF + +#define SMB_ECHO_INTERVAL (60 * HZ) + +#define CIFS_DEFAULT_IOSIZE (64 * 1024) +#define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ + +/* RFC 1002 session packet types */ +#define RFC1002_SESSION_MESSAGE 0x00 +#define RFC1002_SESSION_REQUEST 0x81 +#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82 +#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83 +#define RFC1002_RETARGET_SESSION_RESPONSE 0x84 +#define RFC1002_SESSION_KEEP_ALIVE 0x85 + +/* Responses when opening a file. */ +#define F_SUPERSEDED 0 +#define F_OPENED 1 +#define F_CREATED 2 +#define F_OVERWRITTEN 3 + +/* + * File Attribute flags + */ +#define ATTR_READONLY 0x0001 +#define ATTR_HIDDEN 0x0002 +#define ATTR_SYSTEM 0x0004 +#define ATTR_VOLUME 0x0008 +#define ATTR_DIRECTORY 0x0010 +#define ATTR_ARCHIVE 0x0020 +#define ATTR_DEVICE 0x0040 +#define ATTR_NORMAL 0x0080 +#define ATTR_TEMPORARY 0x0100 +#define ATTR_SPARSE 0x0200 +#define ATTR_REPARSE 0x0400 +#define ATTR_COMPRESSED 0x0800 +#define ATTR_OFFLINE 0x1000 +#define ATTR_NOT_CONTENT_INDEXED 0x2000 +#define ATTR_ENCRYPTED 0x4000 +#define ATTR_POSIX_SEMANTICS 0x01000000 +#define ATTR_BACKUP_SEMANTICS 0x02000000 +#define ATTR_DELETE_ON_CLOSE 0x04000000 +#define ATTR_SEQUENTIAL_SCAN 0x08000000 +#define ATTR_RANDOM_ACCESS 0x10000000 +#define ATTR_NO_BUFFERING 0x20000000 +#define ATTR_WRITE_THROUGH 0x80000000 + +#define ATTR_READONLY_LE cpu_to_le32(ATTR_READONLY) +#define ATTR_HIDDEN_LE cpu_to_le32(ATTR_HIDDEN) +#define ATTR_SYSTEM_LE cpu_to_le32(ATTR_SYSTEM) +#define ATTR_DIRECTORY_LE cpu_to_le32(ATTR_DIRECTORY) +#define ATTR_ARCHIVE_LE cpu_to_le32(ATTR_ARCHIVE) +#define ATTR_NORMAL_LE cpu_to_le32(ATTR_NORMAL) +#define ATTR_TEMPORARY_LE cpu_to_le32(ATTR_TEMPORARY) +#define ATTR_SPARSE_FILE_LE cpu_to_le32(ATTR_SPARSE) +#define ATTR_REPARSE_POINT_LE cpu_to_le32(ATTR_REPARSE) +#define ATTR_COMPRESSED_LE cpu_to_le32(ATTR_COMPRESSED) +#define ATTR_OFFLINE_LE cpu_to_le32(ATTR_OFFLINE) +#define ATTR_NOT_CONTENT_INDEXED_LE cpu_to_le32(ATTR_NOT_CONTENT_INDEXED) +#define ATTR_ENCRYPTED_LE cpu_to_le32(ATTR_ENCRYPTED) +#define ATTR_INTEGRITY_STREAML_LE cpu_to_le32(0x00008000) +#define ATTR_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000) +#define ATTR_MASK_LE cpu_to_le32(0x00007FB7) + +/* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */ +#define FILE_SUPPORTS_SPARSE_VDL 0x10000000 /* faster nonsparse extend */ +#define FILE_SUPPORTS_BLOCK_REFCOUNTING 0x08000000 /* allow ioctl dup extents */ +#define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000 +#define FILE_SUPPORTS_USN_JOURNAL 0x02000000 +#define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000 +#define 
FILE_SUPPORTS_EXTENDED_ATTRIBUTES 0x00800000 +#define FILE_SUPPORTS_HARD_LINKS 0x00400000 +#define FILE_SUPPORTS_TRANSACTIONS 0x00200000 +#define FILE_SEQUENTIAL_WRITE_ONCE 0x00100000 +#define FILE_READ_ONLY_VOLUME 0x00080000 +#define FILE_NAMED_STREAMS 0x00040000 +#define FILE_SUPPORTS_ENCRYPTION 0x00020000 +#define FILE_SUPPORTS_OBJECT_IDS 0x00010000 +#define FILE_VOLUME_IS_COMPRESSED 0x00008000 +#define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100 +#define FILE_SUPPORTS_REPARSE_POINTS 0x00000080 +#define FILE_SUPPORTS_SPARSE_FILES 0x00000040 +#define FILE_VOLUME_QUOTAS 0x00000020 +#define FILE_FILE_COMPRESSION 0x00000010 +#define FILE_PERSISTENT_ACLS 0x00000008 +#define FILE_UNICODE_ON_DISK 0x00000004 +#define FILE_CASE_PRESERVED_NAMES 0x00000002 +#define FILE_CASE_SENSITIVE_SEARCH 0x00000001 + +#define FILE_READ_DATA 0x00000001 /* Data can be read from the file */ +#define FILE_WRITE_DATA 0x00000002 /* Data can be written to the file */ +#define FILE_APPEND_DATA 0x00000004 /* Data can be appended to the file */ +#define FILE_READ_EA 0x00000008 /* Extended attributes associated */ +/* with the file can be read */ +#define FILE_WRITE_EA 0x00000010 /* Extended attributes associated */ +/* with the file can be written */ +#define FILE_EXECUTE 0x00000020 /*Data can be read into memory from */ +/* the file using system paging I/O */ +#define FILE_DELETE_CHILD 0x00000040 +#define FILE_READ_ATTRIBUTES 0x00000080 /* Attributes associated with the */ +/* file can be read */ +#define FILE_WRITE_ATTRIBUTES 0x00000100 /* Attributes associated with the */ +/* file can be written */ +#define DELETE 0x00010000 /* The file can be deleted */ +#define READ_CONTROL 0x00020000 /* The access control list and */ +/* ownership associated with the */ +/* file can be read */ +#define WRITE_DAC 0x00040000 /* The access control list and */ +/* ownership associated with the */ +/* file can be written. */ +#define WRITE_OWNER 0x00080000 /* Ownership information associated */ +/* with the file can be written */ +#define SYNCHRONIZE 0x00100000 /* The file handle can waited on to */ +/* synchronize with the completion */ +/* of an input/output request */ +#define GENERIC_ALL 0x10000000 +#define GENERIC_EXECUTE 0x20000000 +#define GENERIC_WRITE 0x40000000 +#define GENERIC_READ 0x80000000 +/* In summary - Relevant file */ +/* access flags from CIFS are */ +/* file_read_data, file_write_data */ +/* file_execute, file_read_attributes*/ +/* write_dac, and delete. 
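+ * Note that the generic bits (GENERIC_READ/WRITE/EXECUTE/ALL) are not
+ * checked directly; smb_map_generic_desired_access() expands them into
+ * the specific *_FLAGS combinations defined below before ACEs are
+ * evaluated.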
*/ + +#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA | FILE_READ_ATTRIBUTES) +#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES) +#define FILE_EXEC_RIGHTS (FILE_EXECUTE) + +#define SET_FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \ + | FILE_READ_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) +#define SET_FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_WRITE_EA \ + | FILE_DELETE_CHILD \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) +#define SET_FILE_EXEC_RIGHTS (FILE_READ_EA | FILE_WRITE_EA | FILE_EXECUTE \ + | FILE_READ_ATTRIBUTES \ + | FILE_WRITE_ATTRIBUTES \ + | DELETE | READ_CONTROL | WRITE_DAC \ + | WRITE_OWNER | SYNCHRONIZE) + +#define SET_MINIMUM_RIGHTS (FILE_READ_EA | FILE_READ_ATTRIBUTES \ + | READ_CONTROL | SYNCHRONIZE) + +/* generic flags for file open */ +#define GENERIC_READ_FLAGS (READ_CONTROL | FILE_READ_DATA | \ + FILE_READ_ATTRIBUTES | \ + FILE_READ_EA | SYNCHRONIZE) + +#define GENERIC_WRITE_FLAGS (READ_CONTROL | FILE_WRITE_DATA | \ + FILE_WRITE_ATTRIBUTES | FILE_WRITE_EA | \ + FILE_APPEND_DATA | SYNCHRONIZE) + +#define GENERIC_EXECUTE_FLAGS (READ_CONTROL | FILE_EXECUTE | \ + FILE_READ_ATTRIBUTES | SYNCHRONIZE) + +#define GENERIC_ALL_FLAGS (DELETE | READ_CONTROL | WRITE_DAC | \ + WRITE_OWNER | SYNCHRONIZE | FILE_READ_DATA | \ + FILE_WRITE_DATA | FILE_APPEND_DATA | \ + FILE_READ_EA | FILE_WRITE_EA | \ + FILE_EXECUTE | FILE_DELETE_CHILD | \ + FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES) + +#define SMB1_PROTO_NUMBER cpu_to_le32(0x424d53ff) + +#define SMB1_CLIENT_GUID_SIZE (16) +struct smb_hdr { + __be32 smb_buf_length; + __u8 Protocol[4]; + __u8 Command; + union { + struct { + __u8 ErrorClass; + __u8 Reserved; + __le16 Error; + } __packed DosError; + __le32 CifsError; + } __packed Status; + __u8 Flags; + __le16 Flags2; /* note: le */ + __le16 PidHigh; + union { + struct { + __le32 SequenceNumber; /* le */ + __u32 Reserved; /* zero */ + } __packed Sequence; + __u8 SecuritySignature[8]; /* le */ + } __packed Signature; + __u8 pad[2]; + __le16 Tid; + __le16 Pid; + __le16 Uid; + __le16 Mid; + __u8 WordCount; +} __packed; + +struct smb_negotiate_req { + struct smb_hdr hdr; /* wct = 0 */ + __le16 ByteCount; + unsigned char DialectsArray[1]; +} __packed; + +struct smb_negotiate_rsp { + struct smb_hdr hdr; /* wct = 17 */ + __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */ + __u8 SecurityMode; + __le16 MaxMpxCount; + __le16 MaxNumberVcs; + __le32 MaxBufferSize; + __le32 MaxRawSize; + __le32 SessionKey; + __le32 Capabilities; /* see below */ + __le32 SystemTimeLow; + __le32 SystemTimeHigh; + __le16 ServerTimeZone; + __u8 EncryptionKeyLength; + __le16 ByteCount; + union { + unsigned char EncryptionKey[8]; /* cap extended security off */ + /* followed by Domain name - if extended security is off */ + /* followed by 16 bytes of server GUID */ + /* then security blob if cap_extended_security negotiated */ + struct { + unsigned char GUID[SMB1_CLIENT_GUID_SIZE]; + unsigned char SecurityBlob[1]; + } __packed extended_response; + } __packed u; +} __packed; + +struct filesystem_attribute_info { + __le32 Attributes; + __le32 MaxPathNameComponentLength; + __le32 FileSystemNameLen; + __le16 FileSystemName[1]; /* do not have to save this - get subset? 
*/ +} __packed; + +struct filesystem_device_info { + __le32 DeviceType; + __le32 DeviceCharacteristics; +} __packed; /* device info level 0x104 */ + +struct filesystem_vol_info { + __le64 VolumeCreationTime; + __le32 SerialNumber; + __le32 VolumeLabelSize; + __le16 Reserved; + __le16 VolumeLabel[1]; +} __packed; + +struct filesystem_info { + __le64 TotalAllocationUnits; + __le64 FreeAllocationUnits; + __le32 SectorsPerAllocationUnit; + __le32 BytesPerSector; +} __packed; /* size info, level 0x103 */ + +#define EXTENDED_INFO_MAGIC 0x43667364 /* Cfsd */ +#define STRING_LENGTH 28 + +struct fs_extended_info { + __le32 magic; + __le32 version; + __le32 release; + __u64 rel_date; + char version_string[STRING_LENGTH]; +} __packed; + +struct object_id_info { + char objid[16]; + struct fs_extended_info extended_info; +} __packed; + +struct file_directory_info { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + char FileName[1]; +} __packed; /* level 0x101 FF resp data */ + +struct file_names_info { + __le32 NextEntryOffset; + __u32 FileIndex; + __le32 FileNameLength; + char FileName[1]; +} __packed; /* level 0xc FF resp data */ + +struct file_full_directory_info { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; + char FileName[1]; +} __packed; /* level 0x102 FF resp */ + +struct file_both_directory_info { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* length of the xattrs */ + __u8 ShortNameLength; + __u8 Reserved; + __u8 ShortName[24]; + char FileName[1]; +} __packed; /* level 0x104 FFrsp data */ + +struct file_id_both_directory_info { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* length of the xattrs */ + __u8 ShortNameLength; + __u8 Reserved; + __u8 ShortName[24]; + __le16 Reserved2; + __le64 UniqueId; + char FileName[1]; +} __packed; + +struct file_id_full_dir_info { + __le32 NextEntryOffset; + __u32 FileIndex; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 EndOfFile; + __le64 AllocationSize; + __le32 ExtFileAttributes; + __le32 FileNameLength; + __le32 EaSize; /* EA size */ + __le32 Reserved; + __le64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ + char FileName[1]; +} __packed; /* level 0x105 FF rsp data */ + +struct smb_version_values { + char *version_string; + __u16 protocol_id; + __le16 lock_cmd; + __u32 capabilities; + __u32 max_read_size; + __u32 max_write_size; + __u32 max_trans_size; + __u32 large_lock_type; + __u32 exclusive_lock_type; + __u32 shared_lock_type; + __u32 unlock_lock_type; + size_t header_size; + size_t max_header_size; + size_t read_rsp_size; + unsigned int cap_unix; + unsigned int cap_nt_find; + unsigned int cap_large_files; + __u16 signing_enabled; + __u16 signing_required; + size_t 
create_lease_size; + size_t create_durable_size; + size_t create_durable_v2_size; + size_t create_mxac_size; + size_t create_disk_id_size; + size_t create_posix_size; +}; + +struct filesystem_posix_info { + /* For undefined recommended transfer size return -1 in that field */ + __le32 OptimalTransferSize; /* bsize on some os, iosize on other os */ + __le32 BlockSize; + /* The next three fields are in terms of the block size. + * (above). If block size is unknown, 4096 would be a + * reasonable block size for a server to report. + * Note that returning the blocks/blocksavail removes need + * to make a second call (to QFSInfo level 0x103 to get this info. + * UserBlockAvail is typically less than or equal to BlocksAvail, + * if no distinction is made return the same value in each + */ + __le64 TotalBlocks; + __le64 BlocksAvail; /* bfree */ + __le64 UserBlocksAvail; /* bavail */ + /* For undefined Node fields or FSID return -1 */ + __le64 TotalFileNodes; + __le64 FreeFileNodes; + __le64 FileSysIdentifier; /* fsid */ + /* NB Namelen comes from FILE_SYSTEM_ATTRIBUTE_INFO call */ + /* NB flags can come from FILE_SYSTEM_DEVICE_INFO call */ +} __packed; + +struct smb_version_ops { + u16 (*get_cmd_val)(struct ksmbd_work *swork); + int (*init_rsp_hdr)(struct ksmbd_work *swork); + void (*set_rsp_status)(struct ksmbd_work *swork, __le32 err); + int (*allocate_rsp_buf)(struct ksmbd_work *work); + int (*set_rsp_credits)(struct ksmbd_work *work); + int (*check_user_session)(struct ksmbd_work *work); + int (*get_ksmbd_tcon)(struct ksmbd_work *work); + bool (*is_sign_req)(struct ksmbd_work *work, unsigned int command); + int (*check_sign_req)(struct ksmbd_work *work); + void (*set_sign_rsp)(struct ksmbd_work *work); + int (*generate_signingkey)(struct ksmbd_session *sess, struct ksmbd_conn *conn); + int (*generate_encryptionkey)(struct ksmbd_session *sess); + bool (*is_transform_hdr)(void *buf); + int (*decrypt_req)(struct ksmbd_work *work); + int (*encrypt_resp)(struct ksmbd_work *work); +}; + +struct smb_version_cmds { + int (*proc)(struct ksmbd_work *swork); +}; + +static inline size_t +smb2_hdr_size_no_buflen(struct smb_version_values *vals) +{ + return vals->header_size - 4; +} + +int ksmbd_min_protocol(void); +int ksmbd_max_protocol(void); + +int ksmbd_lookup_protocol_idx(char *str); + +int ksmbd_verify_smb_message(struct ksmbd_work *work); +bool ksmbd_smb_request(struct ksmbd_conn *conn); + +int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); + +int ksmbd_init_smb_server(struct ksmbd_work *work); + +bool ksmbd_pdu_size_has_room(unsigned int pdu); + +struct ksmbd_kstat; +int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, + int info_level, + struct ksmbd_file *dir, + struct ksmbd_dir_info *d_info, + char *search_pattern, + int (*fn)(struct ksmbd_conn *, + int, + struct ksmbd_dir_info *, + struct user_namespace *, + struct ksmbd_kstat *)); + +int ksmbd_extract_shortname(struct ksmbd_conn *conn, + const char *longname, + char *shortname); + +int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command); + +int ksmbd_smb_check_shared_mode(struct file *filp, struct ksmbd_file *curr_fp); +int ksmbd_override_fsids(struct ksmbd_work *work); +void ksmbd_revert_fsids(struct ksmbd_work *work); + +unsigned int ksmbd_server_side_copy_max_chunk_count(void); +unsigned int ksmbd_server_side_copy_max_chunk_size(void); +unsigned int ksmbd_server_side_copy_max_total_size(void); +bool is_asterisk(char *p); +__le32 smb_map_generic_desired_access(__le32 daccess); + 
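+/*
+ * The helpers below manipulate the 4-byte RFC 1002 header that prefixes
+ * every SMB PDU on the wire: one type byte followed by a 3-byte big-endian
+ * payload length (hence the 0xffffff mask).  A hypothetical sender-side
+ * sketch, with pdu_len and wire_len as illustrative names only:
+ *
+ *	*(__be32 *)buf = 0;		// RFC1002_SESSION_MESSAGE, length 0
+ *	inc_rfc1001_len(buf, pdu_len);	// account for the PDU body
+ *	wire_len = 4 + get_rfc1002_len(buf);
+ */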
+static inline unsigned int get_rfc1002_len(void *buf)
+{
+	return be32_to_cpu(*((__be32 *)buf)) & 0xffffff;
+}
+
+static inline void inc_rfc1001_len(void *buf, int count)
+{
+	be32_add_cpu((__be32 *)buf, count);
+}
+#endif /* __SMB_COMMON_H__ */
diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c
new file mode 100644
index 000000000000..5456e3ad943e
--- /dev/null
+++ b/fs/ksmbd/smbacl.c
@@ -0,0 +1,1366 @@
+// SPDX-License-Identifier: LGPL-2.1+
+/*
+ *   Copyright (C) International Business Machines  Corp., 2007,2008
+ *   Author(s): Steve French (sfrench@us.ibm.com)
+ *   Copyright (C) 2020 Samsung Electronics Co., Ltd.
+ *   Author(s): Namjae Jeon <linkinjeon@kernel.org>
+ */
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "smbacl.h"
+#include "smb_common.h"
+#include "server.h"
+#include "misc.h"
+#include "mgmt/share_config.h"
+
+static const struct smb_sid domain = {1, 4, {0, 0, 0, 0, 0, 5},
+	{cpu_to_le32(21), cpu_to_le32(1), cpu_to_le32(2), cpu_to_le32(3),
+	 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* security id for creator owner */
+static const struct smb_sid creator_owner = {
+	1, 1, {0, 0, 0, 0, 0, 3}, {0} };
+/* security id for creator group */
+static const struct smb_sid creator_group = {
+	1, 1, {0, 0, 0, 0, 0, 3}, {cpu_to_le32(1)} };
+
+/* security id for everyone/world system group */
+static const struct smb_sid sid_everyone = {
+	1, 1, {0, 0, 0, 0, 0, 1}, {0} };
+/* security id for Authenticated Users system group */
+static const struct smb_sid sid_authusers = {
+	1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} };
+
+/* S-1-22-1 Unmapped Unix users */
+static const struct smb_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22},
+	{cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* S-1-22-2 Unmapped Unix groups */
+static const struct smb_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22},
+	{cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/*
+ * See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx
+ */
+
+/* S-1-5-88 MS NFS and Apple style UID/GID/mode */
+
+/* S-1-5-88-1 Unix uid */
+static const struct smb_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5},
+	{cpu_to_le32(88),
+	 cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* S-1-5-88-2 Unix gid */
+static const struct smb_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5},
+	{cpu_to_le32(88),
+	 cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/* S-1-5-88-3 Unix mode */
+static const struct smb_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5},
+	{cpu_to_le32(88),
+	 cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} };
+
+/*
+ * Compare two SIDs (roughly the equivalent of a UUID for a user or group):
+ * returns zero if they match and non-zero if they do not.
+ */
+int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid)
+{
+	int i;
+	int num_subauth, num_sat, num_saw;
+
+	if (!ctsid || !cwsid)
+		return 1;
+
+	/* compare the revision */
+	if (ctsid->revision != cwsid->revision) {
+		if (ctsid->revision > cwsid->revision)
+			return 1;
+		else
+			return -1;
+	}
+
+	/* compare all of the six auth values */
+	for (i = 0; i < NUM_AUTHS; ++i) {
+		if (ctsid->authority[i] != cwsid->authority[i]) {
+			if (ctsid->authority[i] > cwsid->authority[i])
+				return 1;
+			else
+				return -1;
+		}
+	}
+
+	/* compare all of the subauth values if any */
+	num_sat = ctsid->num_subauth;
+	num_saw = cwsid->num_subauth;
+	num_subauth = num_sat < num_saw ?
num_sat : num_saw; + if (num_subauth) { + for (i = 0; i < num_subauth; ++i) { + if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { + if (le32_to_cpu(ctsid->sub_auth[i]) > + le32_to_cpu(cwsid->sub_auth[i])) + return 1; + else + return -1; + } + } + } + + return 0; /* sids compare/match */ +} + +static void smb_copy_sid(struct smb_sid *dst, const struct smb_sid *src) +{ + int i; + + dst->revision = src->revision; + dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES); + for (i = 0; i < NUM_AUTHS; ++i) + dst->authority[i] = src->authority[i]; + for (i = 0; i < dst->num_subauth; ++i) + dst->sub_auth[i] = src->sub_auth[i]; +} + +/* + * change posix mode to reflect permissions + * pmode is the existing mode (we only want to overwrite part of this + * bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007 + */ +static umode_t access_flags_to_mode(struct smb_fattr *fattr, __le32 ace_flags, + int type) +{ + __u32 flags = le32_to_cpu(ace_flags); + umode_t mode = 0; + + if (flags & GENERIC_ALL) { + mode = 0777; + ksmbd_debug(SMB, "all perms\n"); + return mode; + } + + if ((flags & GENERIC_READ) || (flags & FILE_READ_RIGHTS)) + mode = 0444; + if ((flags & GENERIC_WRITE) || (flags & FILE_WRITE_RIGHTS)) { + mode |= 0222; + if (S_ISDIR(fattr->cf_mode)) + mode |= 0111; + } + if ((flags & GENERIC_EXECUTE) || (flags & FILE_EXEC_RIGHTS)) + mode |= 0111; + + if (type == ACCESS_DENIED_ACE_TYPE || type == ACCESS_DENIED_OBJECT_ACE_TYPE) + mode = ~mode; + + ksmbd_debug(SMB, "access flags 0x%x mode now %04o\n", flags, mode); + + return mode; +} + +/* + * Generate access flags to reflect permissions mode is the existing mode. + * This function is called for every ACE in the DACL whose SID matches + * with either owner or group or everyone. 
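+ * For example, a 0640 mode evaluated with bits_to_use == 0700 keeps only
+ * the owner bits (0600) and yields SET_FILE_READ_RIGHTS | FILE_WRITE_RIGHTS,
+ * while the same mode evaluated with 0007 keeps nothing, in which case the
+ * caller (fill_ace_for_sid) falls back to SET_MINIMUM_RIGHTS.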
+ */ +static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, + __u32 *pace_flags) +{ + /* reset access mask */ + *pace_flags = 0x0; + + /* bits to use are either S_IRWXU or S_IRWXG or S_IRWXO */ + mode &= bits_to_use; + + /* + * check for R/W/X UGO since we do not know whose flags + * is this but we have cleared all the bits sans RWX for + * either user or group or other as per bits_to_use + */ + if (mode & 0444) + *pace_flags |= SET_FILE_READ_RIGHTS; + if (mode & 0222) + *pace_flags |= FILE_WRITE_RIGHTS; + if (mode & 0111) + *pace_flags |= SET_FILE_EXEC_RIGHTS; + + ksmbd_debug(SMB, "mode: %o, access flags now 0x%x\n", + mode, *pace_flags); +} + +static __u16 fill_ace_for_sid(struct smb_ace *pntace, + const struct smb_sid *psid, int type, int flags, + umode_t mode, umode_t bits) +{ + int i; + __u16 size = 0; + __u32 access_req = 0; + + pntace->type = type; + pntace->flags = flags; + mode_to_access_flags(mode, bits, &access_req); + if (!access_req) + access_req = SET_MINIMUM_RIGHTS; + pntace->access_req = cpu_to_le32(access_req); + + pntace->sid.revision = psid->revision; + pntace->sid.num_subauth = psid->num_subauth; + for (i = 0; i < NUM_AUTHS; i++) + pntace->sid.authority[i] = psid->authority[i]; + for (i = 0; i < psid->num_subauth; i++) + pntace->sid.sub_auth[i] = psid->sub_auth[i]; + + size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4); + pntace->size = cpu_to_le16(size); + + return size; +} + +void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid) +{ + switch (sidtype) { + case SIDOWNER: + smb_copy_sid(ssid, &server_conf.domain_sid); + break; + case SIDUNIX_USER: + smb_copy_sid(ssid, &sid_unix_users); + break; + case SIDUNIX_GROUP: + smb_copy_sid(ssid, &sid_unix_groups); + break; + case SIDCREATOR_OWNER: + smb_copy_sid(ssid, &creator_owner); + return; + case SIDCREATOR_GROUP: + smb_copy_sid(ssid, &creator_group); + return; + case SIDNFS_USER: + smb_copy_sid(ssid, &sid_unix_NFS_users); + break; + case SIDNFS_GROUP: + smb_copy_sid(ssid, &sid_unix_NFS_groups); + break; + case SIDNFS_MODE: + smb_copy_sid(ssid, &sid_unix_NFS_mode); + break; + default: + return; + } + + /* RID */ + ssid->sub_auth[ssid->num_subauth] = cpu_to_le32(cid); + ssid->num_subauth++; +} + +static int sid_to_id(struct user_namespace *user_ns, + struct smb_sid *psid, uint sidtype, + struct smb_fattr *fattr) +{ + int rc = -EINVAL; + + /* + * If we have too many subauthorities, then something is really wrong. + * Just return an error. 
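+	 * Otherwise the uid/gid is recovered from the SID's last
+	 * sub-authority (its RID), mirroring how id_to_sid() appends it.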
+ */ + if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) { + pr_err("%s: %u subauthorities is too many!\n", + __func__, psid->num_subauth); + return -EIO; + } + + if (sidtype == SIDOWNER) { + kuid_t uid; + uid_t id; + + id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); + if (id > 0) { + uid = make_kuid(user_ns, id); + if (uid_valid(uid) && kuid_has_mapping(user_ns, uid)) { + fattr->cf_uid = uid; + rc = 0; + } + } + } else { + kgid_t gid; + gid_t id; + + id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); + if (id > 0) { + gid = make_kgid(user_ns, id); + if (gid_valid(gid) && kgid_has_mapping(user_ns, gid)) { + fattr->cf_gid = gid; + rc = 0; + } + } + } + + return rc; +} + +void posix_state_to_acl(struct posix_acl_state *state, + struct posix_acl_entry *pace) +{ + int i; + + pace->e_tag = ACL_USER_OBJ; + pace->e_perm = state->owner.allow; + for (i = 0; i < state->users->n; i++) { + pace++; + pace->e_tag = ACL_USER; + pace->e_uid = state->users->aces[i].uid; + pace->e_perm = state->users->aces[i].perms.allow; + } + + pace++; + pace->e_tag = ACL_GROUP_OBJ; + pace->e_perm = state->group.allow; + + for (i = 0; i < state->groups->n; i++) { + pace++; + pace->e_tag = ACL_GROUP; + pace->e_gid = state->groups->aces[i].gid; + pace->e_perm = state->groups->aces[i].perms.allow; + } + + if (state->users->n || state->groups->n) { + pace++; + pace->e_tag = ACL_MASK; + pace->e_perm = state->mask.allow; + } + + pace++; + pace->e_tag = ACL_OTHER; + pace->e_perm = state->other.allow; +} + +int init_acl_state(struct posix_acl_state *state, int cnt) +{ + int alloc; + + memset(state, 0, sizeof(struct posix_acl_state)); + /* + * In the worst case, each individual acl could be for a distinct + * named user or group, but we don't know which, so we allocate + * enough space for either: + */ + alloc = sizeof(struct posix_ace_state_array) + + cnt * sizeof(struct posix_user_ace_state); + state->users = kzalloc(alloc, GFP_KERNEL); + if (!state->users) + return -ENOMEM; + state->groups = kzalloc(alloc, GFP_KERNEL); + if (!state->groups) { + kfree(state->users); + return -ENOMEM; + } + return 0; +} + +void free_acl_state(struct posix_acl_state *state) +{ + kfree(state->users); + kfree(state->groups); +} + +static void parse_dacl(struct user_namespace *user_ns, + struct smb_acl *pdacl, char *end_of_acl, + struct smb_sid *pownersid, struct smb_sid *pgrpsid, + struct smb_fattr *fattr) +{ + int i, ret; + int num_aces = 0; + int acl_size; + char *acl_base; + struct smb_ace **ppace; + struct posix_acl_entry *cf_pace, *cf_pdace; + struct posix_acl_state acl_state, default_acl_state; + umode_t mode = 0, acl_mode; + bool owner_found = false, group_found = false, others_found = false; + + if (!pdacl) + return; + + /* validate that we do not go past end of acl */ + if (end_of_acl <= (char *)pdacl || + end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { + pr_err("ACL too small to parse DACL\n"); + return; + } + + ksmbd_debug(SMB, "DACL revision %d size %d num aces %d\n", + le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), + le32_to_cpu(pdacl->num_aces)); + + acl_base = (char *)pdacl; + acl_size = sizeof(struct smb_acl); + + num_aces = le32_to_cpu(pdacl->num_aces); + if (num_aces <= 0) + return; + + if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) + return; + + ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); + if (!ppace) + return; + + ret = init_acl_state(&acl_state, num_aces); + if (ret) + return; + ret = init_acl_state(&default_acl_state, num_aces); + if (ret) { + 
free_acl_state(&acl_state); + return; + } + + /* + * reset rwx permissions for user/group/other. + * Also, if num_aces is 0 i.e. DACL has no ACEs, + * user/group/other have no permissions + */ + for (i = 0; i < num_aces; ++i) { + ppace[i] = (struct smb_ace *)(acl_base + acl_size); + acl_base = (char *)ppace[i]; + acl_size = le16_to_cpu(ppace[i]->size); + ppace[i]->access_req = + smb_map_generic_desired_access(ppace[i]->access_req); + + if (!(compare_sids(&ppace[i]->sid, &sid_unix_NFS_mode))) { + fattr->cf_mode = + le32_to_cpu(ppace[i]->sid.sub_auth[2]); + break; + } else if (!compare_sids(&ppace[i]->sid, pownersid)) { + acl_mode = access_flags_to_mode(fattr, + ppace[i]->access_req, + ppace[i]->type); + acl_mode &= 0700; + + if (!owner_found) { + mode &= ~(0700); + mode |= acl_mode; + } + owner_found = true; + } else if (!compare_sids(&ppace[i]->sid, pgrpsid) || + ppace[i]->sid.sub_auth[ppace[i]->sid.num_subauth - 1] == + DOMAIN_USER_RID_LE) { + acl_mode = access_flags_to_mode(fattr, + ppace[i]->access_req, + ppace[i]->type); + acl_mode &= 0070; + if (!group_found) { + mode &= ~(0070); + mode |= acl_mode; + } + group_found = true; + } else if (!compare_sids(&ppace[i]->sid, &sid_everyone)) { + acl_mode = access_flags_to_mode(fattr, + ppace[i]->access_req, + ppace[i]->type); + acl_mode &= 0007; + if (!others_found) { + mode &= ~(0007); + mode |= acl_mode; + } + others_found = true; + } else if (!compare_sids(&ppace[i]->sid, &creator_owner)) { + continue; + } else if (!compare_sids(&ppace[i]->sid, &creator_group)) { + continue; + } else if (!compare_sids(&ppace[i]->sid, &sid_authusers)) { + continue; + } else { + struct smb_fattr temp_fattr; + + acl_mode = access_flags_to_mode(fattr, ppace[i]->access_req, + ppace[i]->type); + temp_fattr.cf_uid = INVALID_UID; + ret = sid_to_id(user_ns, &ppace[i]->sid, SIDOWNER, &temp_fattr); + if (ret || uid_eq(temp_fattr.cf_uid, INVALID_UID)) { + pr_err("%s: Error %d mapping Owner SID to uid\n", + __func__, ret); + continue; + } + + acl_state.owner.allow = ((acl_mode & 0700) >> 6) | 0004; + acl_state.users->aces[acl_state.users->n].uid = + temp_fattr.cf_uid; + acl_state.users->aces[acl_state.users->n++].perms.allow = + ((acl_mode & 0700) >> 6) | 0004; + default_acl_state.owner.allow = ((acl_mode & 0700) >> 6) | 0004; + default_acl_state.users->aces[default_acl_state.users->n].uid = + temp_fattr.cf_uid; + default_acl_state.users->aces[default_acl_state.users->n++].perms.allow = + ((acl_mode & 0700) >> 6) | 0004; + } + } + kfree(ppace); + + if (owner_found) { + /* The owner must be set to at least read-only. 
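+		 * That is what the "| 0004" below encodes: the owner rwx
+		 * bits are shifted down into a POSIX permission triplet and
+		 * the read bit is forced on.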
*/ + acl_state.owner.allow = ((mode & 0700) >> 6) | 0004; + acl_state.users->aces[acl_state.users->n].uid = fattr->cf_uid; + acl_state.users->aces[acl_state.users->n++].perms.allow = + ((mode & 0700) >> 6) | 0004; + default_acl_state.owner.allow = ((mode & 0700) >> 6) | 0004; + default_acl_state.users->aces[default_acl_state.users->n].uid = + fattr->cf_uid; + default_acl_state.users->aces[default_acl_state.users->n++].perms.allow = + ((mode & 0700) >> 6) | 0004; + } + + if (group_found) { + acl_state.group.allow = (mode & 0070) >> 3; + acl_state.groups->aces[acl_state.groups->n].gid = + fattr->cf_gid; + acl_state.groups->aces[acl_state.groups->n++].perms.allow = + (mode & 0070) >> 3; + default_acl_state.group.allow = (mode & 0070) >> 3; + default_acl_state.groups->aces[default_acl_state.groups->n].gid = + fattr->cf_gid; + default_acl_state.groups->aces[default_acl_state.groups->n++].perms.allow = + (mode & 0070) >> 3; + } + + if (others_found) { + fattr->cf_mode &= ~(0007); + fattr->cf_mode |= mode & 0007; + + acl_state.other.allow = mode & 0007; + default_acl_state.other.allow = mode & 0007; + } + + if (acl_state.users->n || acl_state.groups->n) { + acl_state.mask.allow = 0x07; + + if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { + fattr->cf_acls = + posix_acl_alloc(acl_state.users->n + + acl_state.groups->n + 4, GFP_KERNEL); + if (fattr->cf_acls) { + cf_pace = fattr->cf_acls->a_entries; + posix_state_to_acl(&acl_state, cf_pace); + } + } + } + + if (default_acl_state.users->n || default_acl_state.groups->n) { + default_acl_state.mask.allow = 0x07; + + if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { + fattr->cf_dacls = + posix_acl_alloc(default_acl_state.users->n + + default_acl_state.groups->n + 4, GFP_KERNEL); + if (fattr->cf_dacls) { + cf_pdace = fattr->cf_dacls->a_entries; + posix_state_to_acl(&default_acl_state, cf_pdace); + } + } + } + free_acl_state(&acl_state); + free_acl_state(&default_acl_state); +} + +static void set_posix_acl_entries_dacl(struct user_namespace *user_ns, + struct smb_ace *pndace, + struct smb_fattr *fattr, u32 *num_aces, + u16 *size, u32 nt_aces_num) +{ + struct posix_acl_entry *pace; + struct smb_sid *sid; + struct smb_ace *ntace; + int i, j; + + if (!fattr->cf_acls) + goto posix_default_acl; + + pace = fattr->cf_acls->a_entries; + for (i = 0; i < fattr->cf_acls->a_count; i++, pace++) { + int flags = 0; + + sid = kmalloc(sizeof(struct smb_sid), GFP_KERNEL); + if (!sid) + break; + + if (pace->e_tag == ACL_USER) { + uid_t uid; + unsigned int sid_type = SIDOWNER; + + uid = from_kuid(user_ns, pace->e_uid); + if (!uid) + sid_type = SIDUNIX_USER; + id_to_sid(uid, sid_type, sid); + } else if (pace->e_tag == ACL_GROUP) { + gid_t gid; + + gid = from_kgid(user_ns, pace->e_gid); + id_to_sid(gid, SIDUNIX_GROUP, sid); + } else if (pace->e_tag == ACL_OTHER && !nt_aces_num) { + smb_copy_sid(sid, &sid_everyone); + } else { + kfree(sid); + continue; + } + ntace = pndace; + for (j = 0; j < nt_aces_num; j++) { + if (ntace->sid.sub_auth[ntace->sid.num_subauth - 1] == + sid->sub_auth[sid->num_subauth - 1]) + goto pass_same_sid; + ntace = (struct smb_ace *)((char *)ntace + + le16_to_cpu(ntace->size)); + } + + if (S_ISDIR(fattr->cf_mode) && pace->e_tag == ACL_OTHER) + flags = 0x03; + + ntace = (struct smb_ace *)((char *)pndace + *size); + *size += fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, flags, + pace->e_perm, 0777); + (*num_aces)++; + if (pace->e_tag == ACL_USER) + ntace->access_req |= + FILE_DELETE_LE | FILE_DELETE_CHILD_LE; + + if (S_ISDIR(fattr->cf_mode) && + (pace->e_tag == ACL_USER || 
pace->e_tag == ACL_GROUP)) { + ntace = (struct smb_ace *)((char *)pndace + *size); + *size += fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, + 0x03, pace->e_perm, 0777); + (*num_aces)++; + if (pace->e_tag == ACL_USER) + ntace->access_req |= + FILE_DELETE_LE | FILE_DELETE_CHILD_LE; + } + +pass_same_sid: + kfree(sid); + } + + if (nt_aces_num) + return; + +posix_default_acl: + if (!fattr->cf_dacls) + return; + + pace = fattr->cf_dacls->a_entries; + for (i = 0; i < fattr->cf_dacls->a_count; i++, pace++) { + sid = kmalloc(sizeof(struct smb_sid), GFP_KERNEL); + if (!sid) + break; + + if (pace->e_tag == ACL_USER) { + uid_t uid; + + uid = from_kuid(user_ns, pace->e_uid); + id_to_sid(uid, SIDCREATOR_OWNER, sid); + } else if (pace->e_tag == ACL_GROUP) { + gid_t gid; + + gid = from_kgid(user_ns, pace->e_gid); + id_to_sid(gid, SIDCREATOR_GROUP, sid); + } else { + kfree(sid); + continue; + } + + ntace = (struct smb_ace *)((char *)pndace + *size); + *size += fill_ace_for_sid(ntace, sid, ACCESS_ALLOWED, 0x0b, + pace->e_perm, 0777); + (*num_aces)++; + if (pace->e_tag == ACL_USER) + ntace->access_req |= + FILE_DELETE_LE | FILE_DELETE_CHILD_LE; + kfree(sid); + } +} + +static void set_ntacl_dacl(struct user_namespace *user_ns, + struct smb_acl *pndacl, + struct smb_acl *nt_dacl, + const struct smb_sid *pownersid, + const struct smb_sid *pgrpsid, + struct smb_fattr *fattr) +{ + struct smb_ace *ntace, *pndace; + int nt_num_aces = le32_to_cpu(nt_dacl->num_aces), num_aces = 0; + unsigned short size = 0; + int i; + + pndace = (struct smb_ace *)((char *)pndacl + sizeof(struct smb_acl)); + if (nt_num_aces) { + ntace = (struct smb_ace *)((char *)nt_dacl + sizeof(struct smb_acl)); + for (i = 0; i < nt_num_aces; i++) { + memcpy((char *)pndace + size, ntace, le16_to_cpu(ntace->size)); + size += le16_to_cpu(ntace->size); + ntace = (struct smb_ace *)((char *)ntace + le16_to_cpu(ntace->size)); + num_aces++; + } + } + + set_posix_acl_entries_dacl(user_ns, pndace, fattr, + &num_aces, &size, nt_num_aces); + pndacl->num_aces = cpu_to_le32(num_aces); + pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size); +} + +static void set_mode_dacl(struct user_namespace *user_ns, + struct smb_acl *pndacl, struct smb_fattr *fattr) +{ + struct smb_ace *pace, *pndace; + u32 num_aces = 0; + u16 size = 0, ace_size = 0; + uid_t uid; + const struct smb_sid *sid; + + pace = pndace = (struct smb_ace *)((char *)pndacl + sizeof(struct smb_acl)); + + if (fattr->cf_acls) { + set_posix_acl_entries_dacl(user_ns, pndace, fattr, + &num_aces, &size, num_aces); + goto out; + } + + /* owner RID */ + uid = from_kuid(user_ns, fattr->cf_uid); + if (uid) + sid = &server_conf.domain_sid; + else + sid = &sid_unix_users; + ace_size = fill_ace_for_sid(pace, sid, ACCESS_ALLOWED, 0, + fattr->cf_mode, 0700); + pace->sid.sub_auth[pace->sid.num_subauth++] = cpu_to_le32(uid); + pace->size = cpu_to_le16(ace_size + 4); + size += le16_to_cpu(pace->size); + pace = (struct smb_ace *)((char *)pndace + size); + + /* Group RID */ + ace_size = fill_ace_for_sid(pace, &sid_unix_groups, + ACCESS_ALLOWED, 0, fattr->cf_mode, 0070); + pace->sid.sub_auth[pace->sid.num_subauth++] = + cpu_to_le32(from_kgid(user_ns, fattr->cf_gid)); + pace->size = cpu_to_le16(ace_size + 4); + size += le16_to_cpu(pace->size); + pace = (struct smb_ace *)((char *)pndace + size); + num_aces = 3; + + if (S_ISDIR(fattr->cf_mode)) { + pace = (struct smb_ace *)((char *)pndace + size); + + /* creator owner */ + size += fill_ace_for_sid(pace, &creator_owner, ACCESS_ALLOWED, + 0x0b, fattr->cf_mode, 0700); + pace 
= (struct smb_ace *)((char *)pndace + size);
+
+		/* creator group */
+		size += fill_ace_for_sid(pace, &creator_group, ACCESS_ALLOWED,
+					 0x0b, fattr->cf_mode, 0070);
+		pace = (struct smb_ace *)((char *)pndace + size);
+		num_aces = 5;
+	}
+
+	/* other */
+	size += fill_ace_for_sid(pace, &sid_everyone, ACCESS_ALLOWED, 0,
+				 fattr->cf_mode, 0007);
+
+out:
+	pndacl->num_aces = cpu_to_le32(num_aces);
+	pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size);
+}
+
+static int parse_sid(struct smb_sid *psid, char *end_of_acl)
+{
+	/*
+	 * Validate that we do not go past the end of the ACL - the SID must
+	 * be at least 8 bytes long (assuming no sub-auths - e.g. the null
+	 * SID).
+	 */
+	if (end_of_acl < (char *)psid + 8) {
+		pr_err("ACL too small to parse SID %p\n", psid);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/* Convert CIFS ACL to POSIX form */
+int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd,
+		   int acl_len, struct smb_fattr *fattr)
+{
+	int rc = 0;
+	struct smb_sid *owner_sid_ptr, *group_sid_ptr;
+	struct smb_acl *dacl_ptr; /* no need for SACL ptr */
+	char *end_of_acl = ((char *)pntsd) + acl_len;
+	__u32 dacloffset;
+	int pntsd_type;
+
+	if (!pntsd)
+		return -EIO;
+
+	owner_sid_ptr = (struct smb_sid *)((char *)pntsd +
+			le32_to_cpu(pntsd->osidoffset));
+	group_sid_ptr = (struct smb_sid *)((char *)pntsd +
+			le32_to_cpu(pntsd->gsidoffset));
+	dacloffset = le32_to_cpu(pntsd->dacloffset);
+	dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset);
+	ksmbd_debug(SMB,
+		    "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n",
+		    pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset),
+		    le32_to_cpu(pntsd->gsidoffset),
+		    le32_to_cpu(pntsd->sacloffset), dacloffset);
+
+	pntsd_type = le16_to_cpu(pntsd->type);
+	if (!(pntsd_type & DACL_PRESENT)) {
+		ksmbd_debug(SMB, "DACL_PRESENT in DACL type is not set\n");
+		return rc;
+	}
+
+	pntsd->type = cpu_to_le16(DACL_PRESENT);
+
+	if (pntsd->osidoffset) {
+		rc = parse_sid(owner_sid_ptr, end_of_acl);
+		if (rc) {
+			pr_err("%s: Error %d parsing Owner SID\n", __func__, rc);
+			return rc;
+		}
+
+		rc = sid_to_id(user_ns, owner_sid_ptr, SIDOWNER, fattr);
+		if (rc) {
+			pr_err("%s: Error %d mapping Owner SID to uid\n",
+			       __func__, rc);
+			owner_sid_ptr = NULL;
+		}
+	}
+
+	if (pntsd->gsidoffset) {
+		rc = parse_sid(group_sid_ptr, end_of_acl);
+		if (rc) {
+			pr_err("%s: Error %d parsing Group SID\n",
+			       __func__, rc);
+			return rc;
+		}
+		rc = sid_to_id(user_ns, group_sid_ptr, SIDUNIX_GROUP, fattr);
+		if (rc) {
+			pr_err("%s: Error %d mapping Group SID to gid\n",
+			       __func__, rc);
+			group_sid_ptr = NULL;
+		}
+	}
+
+	if ((pntsd_type & (DACL_AUTO_INHERITED | DACL_AUTO_INHERIT_REQ)) ==
+	    (DACL_AUTO_INHERITED | DACL_AUTO_INHERIT_REQ))
+		pntsd->type |= cpu_to_le16(DACL_AUTO_INHERITED);
+	if (pntsd_type & DACL_PROTECTED)
+		pntsd->type |= cpu_to_le16(DACL_PROTECTED);
+
+	if (dacloffset) {
+		parse_dacl(user_ns, dacl_ptr, end_of_acl,
+			   owner_sid_ptr, group_sid_ptr, fattr);
+	}
+
+	return 0;
+}
+
+/* Convert permission bits from mode to equivalent CIFS ACL */
+int build_sec_desc(struct user_namespace *user_ns,
+		   struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd,
+		   int addition_info, __u32 *secdesclen,
+		   struct smb_fattr *fattr)
+{
+	int rc = 0;
+	__u32 offset;
+	struct smb_sid *owner_sid_ptr, *group_sid_ptr;
+	struct smb_sid *nowner_sid_ptr, *ngroup_sid_ptr;
+	struct smb_acl *dacl_ptr = NULL; /* no need for SACL ptr */
+	uid_t uid;
+	gid_t gid;
+	unsigned int sid_type = SIDOWNER;
+
+	nowner_sid_ptr = kmalloc(sizeof(struct smb_sid),
GFP_KERNEL); + if (!nowner_sid_ptr) + return -ENOMEM; + + uid = from_kuid(user_ns, fattr->cf_uid); + if (!uid) + sid_type = SIDUNIX_USER; + id_to_sid(uid, sid_type, nowner_sid_ptr); + + ngroup_sid_ptr = kmalloc(sizeof(struct smb_sid), GFP_KERNEL); + if (!ngroup_sid_ptr) { + kfree(nowner_sid_ptr); + return -ENOMEM; + } + + gid = from_kgid(user_ns, fattr->cf_gid); + id_to_sid(gid, SIDUNIX_GROUP, ngroup_sid_ptr); + + offset = sizeof(struct smb_ntsd); + pntsd->sacloffset = 0; + pntsd->revision = cpu_to_le16(1); + pntsd->type = cpu_to_le16(SELF_RELATIVE); + if (ppntsd) + pntsd->type |= ppntsd->type; + + if (addition_info & OWNER_SECINFO) { + pntsd->osidoffset = cpu_to_le32(offset); + owner_sid_ptr = (struct smb_sid *)((char *)pntsd + offset); + smb_copy_sid(owner_sid_ptr, nowner_sid_ptr); + offset += 1 + 1 + 6 + (nowner_sid_ptr->num_subauth * 4); + } + + if (addition_info & GROUP_SECINFO) { + pntsd->gsidoffset = cpu_to_le32(offset); + group_sid_ptr = (struct smb_sid *)((char *)pntsd + offset); + smb_copy_sid(group_sid_ptr, ngroup_sid_ptr); + offset += 1 + 1 + 6 + (ngroup_sid_ptr->num_subauth * 4); + } + + if (addition_info & DACL_SECINFO) { + pntsd->type |= cpu_to_le16(DACL_PRESENT); + dacl_ptr = (struct smb_acl *)((char *)pntsd + offset); + dacl_ptr->revision = cpu_to_le16(2); + dacl_ptr->size = cpu_to_le16(sizeof(struct smb_acl)); + dacl_ptr->num_aces = 0; + + if (!ppntsd) { + set_mode_dacl(user_ns, dacl_ptr, fattr); + } else if (!ppntsd->dacloffset) { + goto out; + } else { + struct smb_acl *ppdacl_ptr; + + ppdacl_ptr = (struct smb_acl *)((char *)ppntsd + + le32_to_cpu(ppntsd->dacloffset)); + set_ntacl_dacl(user_ns, dacl_ptr, ppdacl_ptr, + nowner_sid_ptr, ngroup_sid_ptr, fattr); + } + pntsd->dacloffset = cpu_to_le32(offset); + offset += le16_to_cpu(dacl_ptr->size); + } + +out: + kfree(nowner_sid_ptr); + kfree(ngroup_sid_ptr); + *secdesclen = offset; + return rc; +} + +static void smb_set_ace(struct smb_ace *ace, const struct smb_sid *sid, u8 type, + u8 flags, __le32 access_req) +{ + ace->type = type; + ace->flags = flags; + ace->access_req = access_req; + smb_copy_sid(&ace->sid, sid); + ace->size = cpu_to_le16(1 + 1 + 2 + 4 + 1 + 1 + 6 + (sid->num_subauth * 4)); +} + +int smb_inherit_dacl(struct ksmbd_conn *conn, + struct path *path, + unsigned int uid, unsigned int gid) +{ + const struct smb_sid *psid, *creator = NULL; + struct smb_ace *parent_aces, *aces; + struct smb_acl *parent_pdacl; + struct smb_ntsd *parent_pntsd = NULL; + struct smb_sid owner_sid, group_sid; + struct dentry *parent = path->dentry->d_parent; + struct user_namespace *user_ns = mnt_user_ns(path->mnt); + int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0; + int rc = 0, num_aces, dacloffset, pntsd_type, acl_len; + char *aces_base; + bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode); + + acl_len = ksmbd_vfs_get_sd_xattr(conn, user_ns, + parent, &parent_pntsd); + if (acl_len <= 0) + return -ENOENT; + dacloffset = le32_to_cpu(parent_pntsd->dacloffset); + if (!dacloffset) { + rc = -EINVAL; + goto free_parent_pntsd; + } + + parent_pdacl = (struct smb_acl *)((char *)parent_pntsd + dacloffset); + num_aces = le32_to_cpu(parent_pdacl->num_aces); + pntsd_type = le16_to_cpu(parent_pntsd->type); + + aces_base = kmalloc(sizeof(struct smb_ace) * num_aces * 2, GFP_KERNEL); + if (!aces_base) { + rc = -ENOMEM; + goto free_parent_pntsd; + } + + aces = (struct smb_ace *)aces_base; + parent_aces = (struct smb_ace *)((char *)parent_pdacl + + sizeof(struct smb_acl)); + + if (pntsd_type & DACL_AUTO_INHERITED) + inherited_flags = 
INHERITED_ACE; + + for (i = 0; i < num_aces; i++) { + flags = parent_aces->flags; + if (!smb_inherit_flags(flags, is_dir)) + goto pass; + if (is_dir) { + flags &= ~(INHERIT_ONLY_ACE | INHERITED_ACE); + if (!(flags & CONTAINER_INHERIT_ACE)) + flags |= INHERIT_ONLY_ACE; + if (flags & NO_PROPAGATE_INHERIT_ACE) + flags = 0; + } else { + flags = 0; + } + + if (!compare_sids(&creator_owner, &parent_aces->sid)) { + creator = &creator_owner; + id_to_sid(uid, SIDOWNER, &owner_sid); + psid = &owner_sid; + } else if (!compare_sids(&creator_group, &parent_aces->sid)) { + creator = &creator_group; + id_to_sid(gid, SIDUNIX_GROUP, &group_sid); + psid = &group_sid; + } else { + creator = NULL; + psid = &parent_aces->sid; + } + + if (is_dir && creator && flags & CONTAINER_INHERIT_ACE) { + smb_set_ace(aces, psid, parent_aces->type, inherited_flags, + parent_aces->access_req); + nt_size += le16_to_cpu(aces->size); + ace_cnt++; + aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size)); + flags |= INHERIT_ONLY_ACE; + psid = creator; + } else if (is_dir && !(parent_aces->flags & NO_PROPAGATE_INHERIT_ACE)) { + psid = &parent_aces->sid; + } + + smb_set_ace(aces, psid, parent_aces->type, flags | inherited_flags, + parent_aces->access_req); + nt_size += le16_to_cpu(aces->size); + aces = (struct smb_ace *)((char *)aces + le16_to_cpu(aces->size)); + ace_cnt++; +pass: + parent_aces = + (struct smb_ace *)((char *)parent_aces + le16_to_cpu(parent_aces->size)); + } + + if (nt_size > 0) { + struct smb_ntsd *pntsd; + struct smb_acl *pdacl; + struct smb_sid *powner_sid = NULL, *pgroup_sid = NULL; + int powner_sid_size = 0, pgroup_sid_size = 0, pntsd_size; + + if (parent_pntsd->osidoffset) { + powner_sid = (struct smb_sid *)((char *)parent_pntsd + + le32_to_cpu(parent_pntsd->osidoffset)); + powner_sid_size = 1 + 1 + 6 + (powner_sid->num_subauth * 4); + } + if (parent_pntsd->gsidoffset) { + pgroup_sid = (struct smb_sid *)((char *)parent_pntsd + + le32_to_cpu(parent_pntsd->gsidoffset)); + pgroup_sid_size = 1 + 1 + 6 + (pgroup_sid->num_subauth * 4); + } + + pntsd = kzalloc(sizeof(struct smb_ntsd) + powner_sid_size + + pgroup_sid_size + sizeof(struct smb_acl) + + nt_size, GFP_KERNEL); + if (!pntsd) { + rc = -ENOMEM; + goto free_aces_base; + } + + pntsd->revision = cpu_to_le16(1); + pntsd->type = cpu_to_le16(SELF_RELATIVE | DACL_PRESENT); + if (le16_to_cpu(parent_pntsd->type) & DACL_AUTO_INHERITED) + pntsd->type |= cpu_to_le16(DACL_AUTO_INHERITED); + pntsd_size = sizeof(struct smb_ntsd); + pntsd->osidoffset = parent_pntsd->osidoffset; + pntsd->gsidoffset = parent_pntsd->gsidoffset; + pntsd->dacloffset = parent_pntsd->dacloffset; + + if (pntsd->osidoffset) { + struct smb_sid *owner_sid = (struct smb_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + memcpy(owner_sid, powner_sid, powner_sid_size); + pntsd_size += powner_sid_size; + } + + if (pntsd->gsidoffset) { + struct smb_sid *group_sid = (struct smb_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + memcpy(group_sid, pgroup_sid, pgroup_sid_size); + pntsd_size += pgroup_sid_size; + } + + if (pntsd->dacloffset) { + struct smb_ace *pace; + + pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset)); + pdacl->revision = cpu_to_le16(2); + pdacl->size = cpu_to_le16(sizeof(struct smb_acl) + nt_size); + pdacl->num_aces = cpu_to_le32(ace_cnt); + pace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); + memcpy(pace, aces_base, nt_size); + pntsd_size += sizeof(struct smb_acl) + nt_size; + } + + ksmbd_vfs_set_sd_xattr(conn, user_ns, + 
path->dentry, pntsd, pntsd_size); + kfree(pntsd); + } + +free_aces_base: + kfree(aces_base); +free_parent_pntsd: + kfree(parent_pntsd); + return rc; +} + +bool smb_inherit_flags(int flags, bool is_dir) +{ + if (!is_dir) + return (flags & OBJECT_INHERIT_ACE) != 0; + + if (flags & OBJECT_INHERIT_ACE && !(flags & NO_PROPAGATE_INHERIT_ACE)) + return true; + + if (flags & CONTAINER_INHERIT_ACE) + return true; + return false; +} + +int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, + __le32 *pdaccess, int uid) +{ + struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct smb_ntsd *pntsd = NULL; + struct smb_acl *pdacl; + struct posix_acl *posix_acls; + int rc = 0, acl_size; + struct smb_sid sid; + int granted = le32_to_cpu(*pdaccess & ~FILE_MAXIMAL_ACCESS_LE); + struct smb_ace *ace; + int i, found = 0; + unsigned int access_bits = 0; + struct smb_ace *others_ace = NULL; + struct posix_acl_entry *pa_entry; + unsigned int sid_type = SIDOWNER; + char *end_of_acl; + + ksmbd_debug(SMB, "check permission using windows acl\n"); + acl_size = ksmbd_vfs_get_sd_xattr(conn, user_ns, + path->dentry, &pntsd); + if (acl_size <= 0 || !pntsd || !pntsd->dacloffset) { + kfree(pntsd); + return 0; + } + + pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset)); + end_of_acl = ((char *)pntsd) + acl_size; + if (end_of_acl <= (char *)pdacl) { + kfree(pntsd); + return 0; + } + + if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size) || + le16_to_cpu(pdacl->size) < sizeof(struct smb_acl)) { + kfree(pntsd); + return 0; + } + + if (!pdacl->num_aces) { + if (!(le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) && + *pdaccess & ~(FILE_READ_CONTROL_LE | FILE_WRITE_DAC_LE)) { + rc = -EACCES; + goto err_out; + } + kfree(pntsd); + return 0; + } + + if (*pdaccess & FILE_MAXIMAL_ACCESS_LE) { + granted = READ_CONTROL | WRITE_DAC | FILE_READ_ATTRIBUTES | + DELETE; + + ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); + for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) { + granted |= le32_to_cpu(ace->access_req); + ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size)); + if (end_of_acl < (char *)ace) + goto err_out; + } + + if (!pdacl->num_aces) + granted = GENERIC_ALL_FLAGS; + } + + if (!uid) + sid_type = SIDUNIX_USER; + id_to_sid(uid, sid_type, &sid); + + ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); + for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) { + if (!compare_sids(&sid, &ace->sid) || + !compare_sids(&sid_unix_NFS_mode, &ace->sid)) { + found = 1; + break; + } + if (!compare_sids(&sid_everyone, &ace->sid)) + others_ace = ace; + + ace = (struct smb_ace *)((char *)ace + le16_to_cpu(ace->size)); + if (end_of_acl < (char *)ace) + goto err_out; + } + + if (*pdaccess & FILE_MAXIMAL_ACCESS_LE && found) { + granted = READ_CONTROL | WRITE_DAC | FILE_READ_ATTRIBUTES | + DELETE; + + granted |= le32_to_cpu(ace->access_req); + + if (!pdacl->num_aces) + granted = GENERIC_ALL_FLAGS; + } + + if (IS_ENABLED(CONFIG_FS_POSIX_ACL)) { + posix_acls = get_acl(d_inode(path->dentry), ACL_TYPE_ACCESS); + if (posix_acls && !found) { + unsigned int id = -1; + + pa_entry = posix_acls->a_entries; + for (i = 0; i < posix_acls->a_count; i++, pa_entry++) { + if (pa_entry->e_tag == ACL_USER) + id = from_kuid(user_ns, + pa_entry->e_uid); + else if (pa_entry->e_tag == ACL_GROUP) + id = from_kgid(user_ns, + pa_entry->e_gid); + else + continue; + + if (id == uid) { + mode_to_access_flags(pa_entry->e_perm, + 0777, + &access_bits); + if (!access_bits) + access_bits = + 
SET_MINIMUM_RIGHTS; + goto check_access_bits; + } + } + } + if (posix_acls) + posix_acl_release(posix_acls); + } + + if (!found) { + if (others_ace) { + ace = others_ace; + } else { + ksmbd_debug(SMB, "Can't find corresponding sid\n"); + rc = -EACCES; + goto err_out; + } + } + + switch (ace->type) { + case ACCESS_ALLOWED_ACE_TYPE: + access_bits = le32_to_cpu(ace->access_req); + break; + case ACCESS_DENIED_ACE_TYPE: + case ACCESS_DENIED_CALLBACK_ACE_TYPE: + access_bits = le32_to_cpu(~ace->access_req); + break; + } + +check_access_bits: + if (granted & + ~(access_bits | FILE_READ_ATTRIBUTES | READ_CONTROL | WRITE_DAC | DELETE)) { + ksmbd_debug(SMB, "Access denied with winACL, granted : %x, access_req : %x\n", + granted, le32_to_cpu(ace->access_req)); + rc = -EACCES; + goto err_out; + } + + *pdaccess = cpu_to_le32(granted); +err_out: + kfree(pntsd); + return rc; +} + +int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, + struct path *path, struct smb_ntsd *pntsd, int ntsd_len, + bool type_check) +{ + int rc; + struct smb_fattr fattr = {{0}}; + struct inode *inode = d_inode(path->dentry); + struct user_namespace *user_ns = mnt_user_ns(path->mnt); + struct iattr newattrs; + + fattr.cf_uid = INVALID_UID; + fattr.cf_gid = INVALID_GID; + fattr.cf_mode = inode->i_mode; + + rc = parse_sec_desc(user_ns, pntsd, ntsd_len, &fattr); + if (rc) + goto out; + + newattrs.ia_valid = ATTR_CTIME; + if (!uid_eq(fattr.cf_uid, INVALID_UID)) { + newattrs.ia_valid |= ATTR_UID; + newattrs.ia_uid = fattr.cf_uid; + } + if (!gid_eq(fattr.cf_gid, INVALID_GID)) { + newattrs.ia_valid |= ATTR_GID; + newattrs.ia_gid = fattr.cf_gid; + } + newattrs.ia_valid |= ATTR_MODE; + newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777); + + inode_lock(inode); + rc = notify_change(user_ns, path->dentry, &newattrs, NULL); + inode_unlock(inode); + if (rc) + goto out; + + ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry); + /* Update posix acls */ + if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { + rc = set_posix_acl(user_ns, inode, + ACL_TYPE_ACCESS, fattr.cf_acls); + if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) + rc = set_posix_acl(user_ns, inode, + ACL_TYPE_DEFAULT, fattr.cf_dacls); + } + + /* Check it only calling from SD BUFFER context */ + if (type_check && !(le16_to_cpu(pntsd->type) & DACL_PRESENT)) + goto out; + + if (test_share_config_flag(tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) { + /* Update WinACL in xattr */ + ksmbd_vfs_remove_sd_xattrs(user_ns, path->dentry); + ksmbd_vfs_set_sd_xattr(conn, user_ns, + path->dentry, pntsd, ntsd_len); + } + +out: + posix_acl_release(fattr.cf_acls); + posix_acl_release(fattr.cf_dacls); + mark_inode_dirty(inode); + return rc; +} + +void ksmbd_init_domain(u32 *sub_auth) +{ + int i; + + memcpy(&server_conf.domain_sid, &domain, sizeof(struct smb_sid)); + for (i = 0; i < 3; ++i) + server_conf.domain_sid.sub_auth[i + 1] = cpu_to_le32(sub_auth[i]); +} diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h new file mode 100644 index 000000000000..940f686a1d95 --- /dev/null +++ b/fs/ksmbd/smbacl.h @@ -0,0 +1,212 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* + * Copyright (c) International Business Machines Corp., 2007 + * Author(s): Steve French (sfrench@us.ibm.com) + * Modified by Namjae Jeon (linkinjeon@kernel.org) + */ + +#ifndef _SMBACL_H +#define _SMBACL_H + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/posix_acl.h> + +#include "mgmt/tree_connect.h" + +#define NUM_AUTHS (6) /* number of authority fields */ +#define 
SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ + +/* + * ACE types - see MS-DTYP 2.4.4.1 + */ +enum { + ACCESS_ALLOWED, + ACCESS_DENIED, +}; + +/* + * Security ID types + */ +enum { + SIDOWNER = 1, + SIDGROUP, + SIDCREATOR_OWNER, + SIDCREATOR_GROUP, + SIDUNIX_USER, + SIDUNIX_GROUP, + SIDNFS_USER, + SIDNFS_GROUP, + SIDNFS_MODE, +}; + +/* Revision for ACLs */ +#define SD_REVISION 1 + +/* Control flags for Security Descriptor */ +#define OWNER_DEFAULTED 0x0001 +#define GROUP_DEFAULTED 0x0002 +#define DACL_PRESENT 0x0004 +#define DACL_DEFAULTED 0x0008 +#define SACL_PRESENT 0x0010 +#define SACL_DEFAULTED 0x0020 +#define DACL_TRUSTED 0x0040 +#define SERVER_SECURITY 0x0080 +#define DACL_AUTO_INHERIT_REQ 0x0100 +#define SACL_AUTO_INHERIT_REQ 0x0200 +#define DACL_AUTO_INHERITED 0x0400 +#define SACL_AUTO_INHERITED 0x0800 +#define DACL_PROTECTED 0x1000 +#define SACL_PROTECTED 0x2000 +#define RM_CONTROL_VALID 0x4000 +#define SELF_RELATIVE 0x8000 + +/* ACE types - see MS-DTYP 2.4.4.1 */ +#define ACCESS_ALLOWED_ACE_TYPE 0x00 +#define ACCESS_DENIED_ACE_TYPE 0x01 +#define SYSTEM_AUDIT_ACE_TYPE 0x02 +#define SYSTEM_ALARM_ACE_TYPE 0x03 +#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */ +#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */ +#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11 +#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12 +#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13 + +/* ACE flags */ +#define OBJECT_INHERIT_ACE 0x01 +#define CONTAINER_INHERIT_ACE 0x02 +#define NO_PROPAGATE_INHERIT_ACE 0x04 +#define INHERIT_ONLY_ACE 0x08 +#define INHERITED_ACE 0x10 +#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40 +#define FAILED_ACCESS_ACE_FLAG 0x80 + +/* + * Maximum size of a string representation of a SID: + * + * The fields are unsigned values in decimal. So: + * + * u8: max 3 bytes in decimal + * u32: max 10 bytes in decimal + * + * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator + * + * For authority field, max is when all 6 values are non-zero and it must be + * represented in hex. So "-0x" + 12 hex digits. 
+ * + * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-') + */ +#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1) +#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */ + +#define DOMAIN_USER_RID_LE cpu_to_le32(513) + +struct ksmbd_conn; + +struct smb_ntsd { + __le16 revision; /* revision level */ + __le16 type; + __le32 osidoffset; + __le32 gsidoffset; + __le32 sacloffset; + __le32 dacloffset; +} __packed; + +struct smb_sid { + __u8 revision; /* revision level */ + __u8 num_subauth; + __u8 authority[NUM_AUTHS]; + __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */ +} __packed; + +/* size of a struct smb_sid, sans sub_auth array */ +#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS) + +struct smb_acl { + __le16 revision; /* revision level */ + __le16 size; + __le32 num_aces; +} __packed; + +struct smb_ace { + __u8 type; + __u8 flags; + __le16 size; + __le32 access_req; + struct smb_sid sid; /* i.e. SID of the user or group who gets these perms */ +} __packed; + +struct smb_fattr { + kuid_t cf_uid; + kgid_t cf_gid; + umode_t cf_mode; + __le32 daccess; + struct posix_acl *cf_acls; + struct posix_acl *cf_dacls; +}; + +struct posix_ace_state { + u32 allow; + u32 deny; +}; + +struct posix_user_ace_state { + union { + kuid_t uid; + kgid_t gid; + }; + struct posix_ace_state perms; +}; + +struct posix_ace_state_array { + int n; + struct posix_user_ace_state aces[]; +}; + +/* + * while processing the nfsv4 ace, this maintains the partial permissions + * calculated so far: + */ + +struct posix_acl_state { + struct posix_ace_state owner; + struct posix_ace_state group; + struct posix_ace_state other; + struct posix_ace_state everyone; + struct posix_ace_state mask; /* deny unused in this case */ + struct posix_ace_state_array *users; + struct posix_ace_state_array *groups; +}; + +int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, + int acl_len, struct smb_fattr *fattr); +int build_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, + struct smb_ntsd *ppntsd, int addition_info, + __u32 *secdesclen, struct smb_fattr *fattr); +int init_acl_state(struct posix_acl_state *state, int cnt); +void free_acl_state(struct posix_acl_state *state); +void posix_state_to_acl(struct posix_acl_state *state, + struct posix_acl_entry *pace); +int compare_sids(const struct smb_sid *ctsid, const struct smb_sid *cwsid); +bool smb_inherit_flags(int flags, bool is_dir); +int smb_inherit_dacl(struct ksmbd_conn *conn, struct path *path, + unsigned int uid, unsigned int gid); +int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, + __le32 *pdaccess, int uid); +int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, + struct path *path, struct smb_ntsd *pntsd, int ntsd_len, + bool type_check); +void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid); +void ksmbd_init_domain(u32 *sub_auth); +#endif /* _SMBACL_H */ diff --git a/fs/ksmbd/smbfsctl.h b/fs/ksmbd/smbfsctl.h new file mode 100644 index 000000000000..b98418aae20c --- /dev/null +++ b/fs/ksmbd/smbfsctl.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* + * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions + * + * Copyright (c) International Business Machines Corp., 2002,2009 + * Author(s): Steve French (sfrench@us.ibm.com) + */ + +/* IOCTL information */ +/* + * List of ioctl/fsctl function codes that are or could be useful in the + * future to remote clients like the cifs or SMB2 clients.
There is probably + * a slightly larger set of fsctls that NTFS local filesystem could handle, + * including the seven below that we do not have struct definitions for. + * Even with protocol definitions for most of these now available, we still + * need to do some experimentation to identify which are practical to do + * remotely. Some of the following, such as the encryption/compression ones + * could be invoked from tools via a specialized hook into the VFS rather + * than via the standard vfs entry points + */ + +#ifndef __KSMBD_SMBFSCTL_H +#define __KSMBD_SMBFSCTL_H + +#define FSCTL_DFS_GET_REFERRALS 0x00060194 +#define FSCTL_DFS_GET_REFERRALS_EX 0x000601B0 +#define FSCTL_REQUEST_OPLOCK_LEVEL_1 0x00090000 +#define FSCTL_REQUEST_OPLOCK_LEVEL_2 0x00090004 +#define FSCTL_REQUEST_BATCH_OPLOCK 0x00090008 +#define FSCTL_LOCK_VOLUME 0x00090018 +#define FSCTL_UNLOCK_VOLUME 0x0009001C +#define FSCTL_IS_PATHNAME_VALID 0x0009002C /* BB add struct */ +#define FSCTL_GET_COMPRESSION 0x0009003C /* BB add struct */ +#define FSCTL_SET_COMPRESSION 0x0009C040 /* BB add struct */ +#define FSCTL_QUERY_FAT_BPB 0x00090058 /* BB add struct */ +/* Verify the next FSCTL number, we had it as 0x00090090 before */ +#define FSCTL_FILESYSTEM_GET_STATS 0x00090060 /* BB add struct */ +#define FSCTL_GET_NTFS_VOLUME_DATA 0x00090064 /* BB add struct */ +#define FSCTL_GET_RETRIEVAL_POINTERS 0x00090073 /* BB add struct */ +#define FSCTL_IS_VOLUME_DIRTY 0x00090078 /* BB add struct */ +#define FSCTL_ALLOW_EXTENDED_DASD_IO 0x00090083 /* BB add struct */ +#define FSCTL_REQUEST_FILTER_OPLOCK 0x0009008C +#define FSCTL_FIND_FILES_BY_SID 0x0009008F /* BB add struct */ +#define FSCTL_SET_OBJECT_ID 0x00090098 /* BB add struct */ +#define FSCTL_GET_OBJECT_ID 0x0009009C /* BB add struct */ +#define FSCTL_DELETE_OBJECT_ID 0x000900A0 /* BB add struct */ +#define FSCTL_SET_REPARSE_POINT 0x000900A4 /* BB add struct */ +#define FSCTL_GET_REPARSE_POINT 0x000900A8 /* BB add struct */ +#define FSCTL_DELETE_REPARSE_POINT 0x000900AC /* BB add struct */ +#define FSCTL_SET_OBJECT_ID_EXTENDED 0x000900BC /* BB add struct */ +#define FSCTL_CREATE_OR_GET_OBJECT_ID 0x000900C0 /* BB add struct */ +#define FSCTL_SET_SPARSE 0x000900C4 /* BB add struct */ +#define FSCTL_SET_ZERO_DATA 0x000980C8 /* BB add struct */ +#define FSCTL_SET_ENCRYPTION 0x000900D7 /* BB add struct */ +#define FSCTL_ENCRYPTION_FSCTL_IO 0x000900DB /* BB add struct */ +#define FSCTL_WRITE_RAW_ENCRYPTED 0x000900DF /* BB add struct */ +#define FSCTL_READ_RAW_ENCRYPTED 0x000900E3 /* BB add struct */ +#define FSCTL_READ_FILE_USN_DATA 0x000900EB /* BB add struct */ +#define FSCTL_WRITE_USN_CLOSE_RECORD 0x000900EF /* BB add struct */ +#define FSCTL_SIS_COPYFILE 0x00090100 /* BB add struct */ +#define FSCTL_RECALL_FILE 0x00090117 /* BB add struct */ +#define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ +#define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ +#define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ +#define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ +#define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ +#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344 +#define FSCTL_SIS_LINK_FILES 0x0009C104 +#define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ +#define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ +/* strange that the number for this op is not sequential with previous op */ +#define FSCTL_PIPE_WAIT 0x00110018 /* BB add struct */ +#define FSCTL_REQUEST_RESUME_KEY 0x00140078 +#define 
FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */ +#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */ +#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204 +#define FSCTL_QUERY_NETWORK_INTERFACE_INFO 0x001401FC +#define FSCTL_COPYCHUNK 0x001440F2 +#define FSCTL_COPYCHUNK_WRITE 0x001480F2 + +#define IO_REPARSE_TAG_MOUNT_POINT 0xA0000003 +#define IO_REPARSE_TAG_HSM 0xC0000004 +#define IO_REPARSE_TAG_SIS 0x80000007 + +/* WSL reparse tags */ +#define IO_REPARSE_TAG_LX_SYMLINK_LE cpu_to_le32(0xA000001D) +#define IO_REPARSE_TAG_AF_UNIX_LE cpu_to_le32(0x80000023) +#define IO_REPARSE_TAG_LX_FIFO_LE cpu_to_le32(0x80000024) +#define IO_REPARSE_TAG_LX_CHR_LE cpu_to_le32(0x80000025) +#define IO_REPARSE_TAG_LX_BLK_LE cpu_to_le32(0x80000026) +#endif /* __KSMBD_SMBFSCTL_H */ diff --git a/fs/ksmbd/smbstatus.h b/fs/ksmbd/smbstatus.h new file mode 100644 index 000000000000..108a8b6ed24a --- /dev/null +++ b/fs/ksmbd/smbstatus.h @@ -0,0 +1,1822 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* + * fs/cifs/smb2status.h + * + * SMB2 Status code (network error) definitions + * Definitions are from MS-ERREF + * + * Copyright (c) International Business Machines Corp., 2009,2011 + * Author(s): Steve French (sfrench@us.ibm.com) + */ + +/* + * 0 1 2 3 4 5 6 7 8 9 0 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F + * SEV C N <-------Facility--------> <------Error Status Code------> + * + * C is set if "customer defined" error, N bit is reserved and MBZ + */ + +#define STATUS_SEVERITY_SUCCESS cpu_to_le32(0x0000) +#define STATUS_SEVERITY_INFORMATIONAL cpu_to_le32(0x0001) +#define STATUS_SEVERITY_WARNING cpu_to_le32(0x0002) +#define STATUS_SEVERITY_ERROR cpu_to_le32(0x0003) + +struct ntstatus { + /* Facility is the high 12 bits of the following field */ + __le32 Facility; /* low 2 bits Severity, next is Customer, then rsrvd */ + __le32 Code; +}; + +#define STATUS_SUCCESS 0x00000000 +#define STATUS_WAIT_0 cpu_to_le32(0x00000000) +#define STATUS_WAIT_1 cpu_to_le32(0x00000001) +#define STATUS_WAIT_2 cpu_to_le32(0x00000002) +#define STATUS_WAIT_3 cpu_to_le32(0x00000003) +#define STATUS_WAIT_63 cpu_to_le32(0x0000003F) +#define STATUS_ABANDONED cpu_to_le32(0x00000080) +#define STATUS_ABANDONED_WAIT_0 cpu_to_le32(0x00000080) +#define STATUS_ABANDONED_WAIT_63 cpu_to_le32(0x000000BF) +#define STATUS_USER_APC cpu_to_le32(0x000000C0) +#define STATUS_KERNEL_APC cpu_to_le32(0x00000100) +#define STATUS_ALERTED cpu_to_le32(0x00000101) +#define STATUS_TIMEOUT cpu_to_le32(0x00000102) +#define STATUS_PENDING cpu_to_le32(0x00000103) +#define STATUS_REPARSE cpu_to_le32(0x00000104) +#define STATUS_MORE_ENTRIES cpu_to_le32(0x00000105) +#define STATUS_NOT_ALL_ASSIGNED cpu_to_le32(0x00000106) +#define STATUS_SOME_NOT_MAPPED cpu_to_le32(0x00000107) +#define STATUS_OPLOCK_BREAK_IN_PROGRESS cpu_to_le32(0x00000108) +#define STATUS_VOLUME_MOUNTED cpu_to_le32(0x00000109) +#define STATUS_RXACT_COMMITTED cpu_to_le32(0x0000010A) +#define STATUS_NOTIFY_CLEANUP cpu_to_le32(0x0000010B) +#define STATUS_NOTIFY_ENUM_DIR cpu_to_le32(0x0000010C) +#define STATUS_NO_QUOTAS_FOR_ACCOUNT cpu_to_le32(0x0000010D) +#define STATUS_PRIMARY_TRANSPORT_CONNECT_FAILED cpu_to_le32(0x0000010E) +#define STATUS_PAGE_FAULT_TRANSITION cpu_to_le32(0x00000110) +#define STATUS_PAGE_FAULT_DEMAND_ZERO cpu_to_le32(0x00000111) +#define STATUS_PAGE_FAULT_COPY_ON_WRITE cpu_to_le32(0x00000112) +#define STATUS_PAGE_FAULT_GUARD_PAGE cpu_to_le32(0x00000113) +#define STATUS_PAGE_FAULT_PAGING_FILE cpu_to_le32(0x00000114) +#define STATUS_CACHE_PAGE_LOCKED cpu_to_le32(0x00000115) 
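The block comment at the top of this header gives the MS-ERREF bit layout of a 32-bit NTSTATUS: two severity bits, a "customer defined" bit, a reserved N bit that must be zero, a 12-bit facility, and a 16-bit status code. As a reading aid for the constant table that follows, here is a minimal userspace sketch of that decomposition; the ntstatus_unpack helper is hypothetical (not part of this patch), and it assumes a host-endian value, whereas the constants in this header are wrapped in cpu_to_le32() for the wire.

#include <stdint.h>
#include <stdio.h>

struct nt_fields {
	unsigned severity;   /* bits 31-30: 0 success, 1 informational, 2 warning, 3 error */
	unsigned customer;   /* bit 29: set for "customer defined" codes */
	unsigned facility;   /* bits 27-16: facility; bit 28 (N) is reserved and MBZ */
	unsigned code;       /* bits 15-0: error status code */
};

static struct nt_fields ntstatus_unpack(uint32_t status)
{
	struct nt_fields f = {
		.severity = (status >> 30) & 0x3,
		.customer = (status >> 29) & 0x1,
		.facility = (status >> 16) & 0xFFF,
		.code     = status & 0xFFFF,
	};
	return f;
}

int main(void)
{
	/* 0xC0000022 is STATUS_ACCESS_DENIED: severity 3 (error), facility 0, code 0x22 */
	struct nt_fields f = ntstatus_unpack(0xC0000022);
	printf("sev=%u customer=%u facility=0x%03X code=0x%04X\n",
	       f.severity, f.customer, f.facility, f.code);
	return 0;
}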
+#define STATUS_CRASH_DUMP cpu_to_le32(0x00000116) +#define STATUS_BUFFER_ALL_ZEROS cpu_to_le32(0x00000117) +#define STATUS_REPARSE_OBJECT cpu_to_le32(0x00000118) +#define STATUS_RESOURCE_REQUIREMENTS_CHANGED cpu_to_le32(0x00000119) +#define STATUS_TRANSLATION_COMPLETE cpu_to_le32(0x00000120) +#define STATUS_DS_MEMBERSHIP_EVALUATED_LOCALLY cpu_to_le32(0x00000121) +#define STATUS_NOTHING_TO_TERMINATE cpu_to_le32(0x00000122) +#define STATUS_PROCESS_NOT_IN_JOB cpu_to_le32(0x00000123) +#define STATUS_PROCESS_IN_JOB cpu_to_le32(0x00000124) +#define STATUS_VOLSNAP_HIBERNATE_READY cpu_to_le32(0x00000125) +#define STATUS_FSFILTER_OP_COMPLETED_SUCCESSFULLY cpu_to_le32(0x00000126) +#define STATUS_INTERRUPT_VECTOR_ALREADY_CONNECTED cpu_to_le32(0x00000127) +#define STATUS_INTERRUPT_STILL_CONNECTED cpu_to_le32(0x00000128) +#define STATUS_PROCESS_CLONED cpu_to_le32(0x00000129) +#define STATUS_FILE_LOCKED_WITH_ONLY_READERS cpu_to_le32(0x0000012A) +#define STATUS_FILE_LOCKED_WITH_WRITERS cpu_to_le32(0x0000012B) +#define STATUS_RESOURCEMANAGER_READ_ONLY cpu_to_le32(0x00000202) +#define STATUS_WAIT_FOR_OPLOCK cpu_to_le32(0x00000367) +#define DBG_EXCEPTION_HANDLED cpu_to_le32(0x00010001) +#define DBG_CONTINUE cpu_to_le32(0x00010002) +#define STATUS_FLT_IO_COMPLETE cpu_to_le32(0x001C0001) +#define STATUS_OBJECT_NAME_EXISTS cpu_to_le32(0x40000000) +#define STATUS_THREAD_WAS_SUSPENDED cpu_to_le32(0x40000001) +#define STATUS_WORKING_SET_LIMIT_RANGE cpu_to_le32(0x40000002) +#define STATUS_IMAGE_NOT_AT_BASE cpu_to_le32(0x40000003) +#define STATUS_RXACT_STATE_CREATED cpu_to_le32(0x40000004) +#define STATUS_SEGMENT_NOTIFICATION cpu_to_le32(0x40000005) +#define STATUS_LOCAL_USER_SESSION_KEY cpu_to_le32(0x40000006) +#define STATUS_BAD_CURRENT_DIRECTORY cpu_to_le32(0x40000007) +#define STATUS_SERIAL_MORE_WRITES cpu_to_le32(0x40000008) +#define STATUS_REGISTRY_RECOVERED cpu_to_le32(0x40000009) +#define STATUS_FT_READ_RECOVERY_FROM_BACKUP cpu_to_le32(0x4000000A) +#define STATUS_FT_WRITE_RECOVERY cpu_to_le32(0x4000000B) +#define STATUS_SERIAL_COUNTER_TIMEOUT cpu_to_le32(0x4000000C) +#define STATUS_NULL_LM_PASSWORD cpu_to_le32(0x4000000D) +#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH cpu_to_le32(0x4000000E) +#define STATUS_RECEIVE_PARTIAL cpu_to_le32(0x4000000F) +#define STATUS_RECEIVE_EXPEDITED cpu_to_le32(0x40000010) +#define STATUS_RECEIVE_PARTIAL_EXPEDITED cpu_to_le32(0x40000011) +#define STATUS_EVENT_DONE cpu_to_le32(0x40000012) +#define STATUS_EVENT_PENDING cpu_to_le32(0x40000013) +#define STATUS_CHECKING_FILE_SYSTEM cpu_to_le32(0x40000014) +#define STATUS_FATAL_APP_EXIT cpu_to_le32(0x40000015) +#define STATUS_PREDEFINED_HANDLE cpu_to_le32(0x40000016) +#define STATUS_WAS_UNLOCKED cpu_to_le32(0x40000017) +#define STATUS_SERVICE_NOTIFICATION cpu_to_le32(0x40000018) +#define STATUS_WAS_LOCKED cpu_to_le32(0x40000019) +#define STATUS_LOG_HARD_ERROR cpu_to_le32(0x4000001A) +#define STATUS_ALREADY_WIN32 cpu_to_le32(0x4000001B) +#define STATUS_WX86_UNSIMULATE cpu_to_le32(0x4000001C) +#define STATUS_WX86_CONTINUE cpu_to_le32(0x4000001D) +#define STATUS_WX86_SINGLE_STEP cpu_to_le32(0x4000001E) +#define STATUS_WX86_BREAKPOINT cpu_to_le32(0x4000001F) +#define STATUS_WX86_EXCEPTION_CONTINUE cpu_to_le32(0x40000020) +#define STATUS_WX86_EXCEPTION_LASTCHANCE cpu_to_le32(0x40000021) +#define STATUS_WX86_EXCEPTION_CHAIN cpu_to_le32(0x40000022) +#define STATUS_IMAGE_MACHINE_TYPE_MISMATCH_EXE cpu_to_le32(0x40000023) +#define STATUS_NO_YIELD_PERFORMED cpu_to_le32(0x40000024) +#define STATUS_TIMER_RESUME_IGNORED cpu_to_le32(0x40000025) 
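A server returning these values generally has to translate POSIX error codes coming back from the VFS into NTSTATUS before putting them on the wire. The sketch below shows the shape of such a translation using the raw values of a few constants defined in this header; the map_errno_to_ntstatus helper and its table are illustrative only, not ksmbd's actual mapping.

#include <errno.h>
#include <stdint.h>

/* Raw (host-endian) values of a few NTSTATUS constants from this header. */
#define NT_STATUS_SUCCESS         0x00000000
#define NT_STATUS_UNSUCCESSFUL    0xC0000001
#define NT_STATUS_NO_MEMORY       0xC0000017
#define NT_STATUS_ACCESS_DENIED   0xC0000022
#define NT_STATUS_NOT_A_DIRECTORY 0xC0000103

/* Hypothetical helper: kernel-style negative errno in, NTSTATUS out. */
static uint32_t map_errno_to_ntstatus(int err)
{
	switch (err) {
	case 0:        return NT_STATUS_SUCCESS;
	case -ENOMEM:  return NT_STATUS_NO_MEMORY;
	case -EACCES:  return NT_STATUS_ACCESS_DENIED;
	case -ENOTDIR: return NT_STATUS_NOT_A_DIRECTORY;
	default:       return NT_STATUS_UNSUCCESSFUL; /* generic catch-all */
	}
}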
+#define STATUS_ARBITRATION_UNHANDLED cpu_to_le32(0x40000026) +#define STATUS_CARDBUS_NOT_SUPPORTED cpu_to_le32(0x40000027) +#define STATUS_WX86_CREATEWX86TIB cpu_to_le32(0x40000028) +#define STATUS_MP_PROCESSOR_MISMATCH cpu_to_le32(0x40000029) +#define STATUS_HIBERNATED cpu_to_le32(0x4000002A) +#define STATUS_RESUME_HIBERNATION cpu_to_le32(0x4000002B) +#define STATUS_FIRMWARE_UPDATED cpu_to_le32(0x4000002C) +#define STATUS_DRIVERS_LEAKING_LOCKED_PAGES cpu_to_le32(0x4000002D) +#define STATUS_MESSAGE_RETRIEVED cpu_to_le32(0x4000002E) +#define STATUS_SYSTEM_POWERSTATE_TRANSITION cpu_to_le32(0x4000002F) +#define STATUS_ALPC_CHECK_COMPLETION_LIST cpu_to_le32(0x40000030) +#define STATUS_SYSTEM_POWERSTATE_COMPLEX_TRANSITION cpu_to_le32(0x40000031) +#define STATUS_ACCESS_AUDIT_BY_POLICY cpu_to_le32(0x40000032) +#define STATUS_ABANDON_HIBERFILE cpu_to_le32(0x40000033) +#define STATUS_BIZRULES_NOT_ENABLED cpu_to_le32(0x40000034) +#define STATUS_WAKE_SYSTEM cpu_to_le32(0x40000294) +#define STATUS_DS_SHUTTING_DOWN cpu_to_le32(0x40000370) +#define DBG_REPLY_LATER cpu_to_le32(0x40010001) +#define DBG_UNABLE_TO_PROVIDE_HANDLE cpu_to_le32(0x40010002) +#define DBG_TERMINATE_THREAD cpu_to_le32(0x40010003) +#define DBG_TERMINATE_PROCESS cpu_to_le32(0x40010004) +#define DBG_CONTROL_C cpu_to_le32(0x40010005) +#define DBG_PRINTEXCEPTION_C cpu_to_le32(0x40010006) +#define DBG_RIPEXCEPTION cpu_to_le32(0x40010007) +#define DBG_CONTROL_BREAK cpu_to_le32(0x40010008) +#define DBG_COMMAND_EXCEPTION cpu_to_le32(0x40010009) +#define RPC_NT_UUID_LOCAL_ONLY cpu_to_le32(0x40020056) +#define RPC_NT_SEND_INCOMPLETE cpu_to_le32(0x400200AF) +#define STATUS_CTX_CDM_CONNECT cpu_to_le32(0x400A0004) +#define STATUS_CTX_CDM_DISCONNECT cpu_to_le32(0x400A0005) +#define STATUS_SXS_RELEASE_ACTIVATION_CONTEXT cpu_to_le32(0x4015000D) +#define STATUS_RECOVERY_NOT_NEEDED cpu_to_le32(0x40190034) +#define STATUS_RM_ALREADY_STARTED cpu_to_le32(0x40190035) +#define STATUS_LOG_NO_RESTART cpu_to_le32(0x401A000C) +#define STATUS_VIDEO_DRIVER_DEBUG_REPORT_REQUEST cpu_to_le32(0x401B00EC) +#define STATUS_GRAPHICS_PARTIAL_DATA_POPULATED cpu_to_le32(0x401E000A) +#define STATUS_GRAPHICS_DRIVER_MISMATCH cpu_to_le32(0x401E0117) +#define STATUS_GRAPHICS_MODE_NOT_PINNED cpu_to_le32(0x401E0307) +#define STATUS_GRAPHICS_NO_PREFERRED_MODE cpu_to_le32(0x401E031E) +#define STATUS_GRAPHICS_DATASET_IS_EMPTY cpu_to_le32(0x401E034B) +#define STATUS_GRAPHICS_NO_MORE_ELEMENTS_IN_DATASET cpu_to_le32(0x401E034C) +#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_PINNED \ + cpu_to_le32(0x401E0351) +#define STATUS_GRAPHICS_UNKNOWN_CHILD_STATUS cpu_to_le32(0x401E042F) +#define STATUS_GRAPHICS_LEADLINK_START_DEFERRED cpu_to_le32(0x401E0437) +#define STATUS_GRAPHICS_POLLING_TOO_FREQUENTLY cpu_to_le32(0x401E0439) +#define STATUS_GRAPHICS_START_DEFERRED cpu_to_le32(0x401E043A) +#define STATUS_NDIS_INDICATION_REQUIRED cpu_to_le32(0x40230001) +#define STATUS_GUARD_PAGE_VIOLATION cpu_to_le32(0x80000001) +#define STATUS_DATATYPE_MISALIGNMENT cpu_to_le32(0x80000002) +#define STATUS_BREAKPOINT cpu_to_le32(0x80000003) +#define STATUS_SINGLE_STEP cpu_to_le32(0x80000004) +#define STATUS_BUFFER_OVERFLOW cpu_to_le32(0x80000005) +#define STATUS_NO_MORE_FILES cpu_to_le32(0x80000006) +#define STATUS_WAKE_SYSTEM_DEBUGGER cpu_to_le32(0x80000007) +#define STATUS_HANDLES_CLOSED cpu_to_le32(0x8000000A) +#define STATUS_NO_INHERITANCE cpu_to_le32(0x8000000B) +#define STATUS_GUID_SUBSTITUTION_MADE cpu_to_le32(0x8000000C) +#define STATUS_PARTIAL_COPY cpu_to_le32(0x8000000D) +#define 
STATUS_DEVICE_PAPER_EMPTY cpu_to_le32(0x8000000E) +#define STATUS_DEVICE_POWERED_OFF cpu_to_le32(0x8000000F) +#define STATUS_DEVICE_OFF_LINE cpu_to_le32(0x80000010) +#define STATUS_DEVICE_BUSY cpu_to_le32(0x80000011) +#define STATUS_NO_MORE_EAS cpu_to_le32(0x80000012) +#define STATUS_INVALID_EA_NAME cpu_to_le32(0x80000013) +#define STATUS_EA_LIST_INCONSISTENT cpu_to_le32(0x80000014) +#define STATUS_INVALID_EA_FLAG cpu_to_le32(0x80000015) +#define STATUS_VERIFY_REQUIRED cpu_to_le32(0x80000016) +#define STATUS_EXTRANEOUS_INFORMATION cpu_to_le32(0x80000017) +#define STATUS_RXACT_COMMIT_NECESSARY cpu_to_le32(0x80000018) +#define STATUS_NO_MORE_ENTRIES cpu_to_le32(0x8000001A) +#define STATUS_FILEMARK_DETECTED cpu_to_le32(0x8000001B) +#define STATUS_MEDIA_CHANGED cpu_to_le32(0x8000001C) +#define STATUS_BUS_RESET cpu_to_le32(0x8000001D) +#define STATUS_END_OF_MEDIA cpu_to_le32(0x8000001E) +#define STATUS_BEGINNING_OF_MEDIA cpu_to_le32(0x8000001F) +#define STATUS_MEDIA_CHECK cpu_to_le32(0x80000020) +#define STATUS_SETMARK_DETECTED cpu_to_le32(0x80000021) +#define STATUS_NO_DATA_DETECTED cpu_to_le32(0x80000022) +#define STATUS_REDIRECTOR_HAS_OPEN_HANDLES cpu_to_le32(0x80000023) +#define STATUS_SERVER_HAS_OPEN_HANDLES cpu_to_le32(0x80000024) +#define STATUS_ALREADY_DISCONNECTED cpu_to_le32(0x80000025) +#define STATUS_LONGJUMP cpu_to_le32(0x80000026) +#define STATUS_CLEANER_CARTRIDGE_INSTALLED cpu_to_le32(0x80000027) +#define STATUS_PLUGPLAY_QUERY_VETOED cpu_to_le32(0x80000028) +#define STATUS_UNWIND_CONSOLIDATE cpu_to_le32(0x80000029) +#define STATUS_REGISTRY_HIVE_RECOVERED cpu_to_le32(0x8000002A) +#define STATUS_DLL_MIGHT_BE_INSECURE cpu_to_le32(0x8000002B) +#define STATUS_DLL_MIGHT_BE_INCOMPATIBLE cpu_to_le32(0x8000002C) +#define STATUS_STOPPED_ON_SYMLINK cpu_to_le32(0x8000002D) +#define STATUS_DEVICE_REQUIRES_CLEANING cpu_to_le32(0x80000288) +#define STATUS_DEVICE_DOOR_OPEN cpu_to_le32(0x80000289) +#define STATUS_DATA_LOST_REPAIR cpu_to_le32(0x80000803) +#define DBG_EXCEPTION_NOT_HANDLED cpu_to_le32(0x80010001) +#define STATUS_CLUSTER_NODE_ALREADY_UP cpu_to_le32(0x80130001) +#define STATUS_CLUSTER_NODE_ALREADY_DOWN cpu_to_le32(0x80130002) +#define STATUS_CLUSTER_NETWORK_ALREADY_ONLINE cpu_to_le32(0x80130003) +#define STATUS_CLUSTER_NETWORK_ALREADY_OFFLINE cpu_to_le32(0x80130004) +#define STATUS_CLUSTER_NODE_ALREADY_MEMBER cpu_to_le32(0x80130005) +#define STATUS_COULD_NOT_RESIZE_LOG cpu_to_le32(0x80190009) +#define STATUS_NO_TXF_METADATA cpu_to_le32(0x80190029) +#define STATUS_CANT_RECOVER_WITH_HANDLE_OPEN cpu_to_le32(0x80190031) +#define STATUS_TXF_METADATA_ALREADY_PRESENT cpu_to_le32(0x80190041) +#define STATUS_TRANSACTION_SCOPE_CALLBACKS_NOT_SET cpu_to_le32(0x80190042) +#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD_RECOVERED \ + cpu_to_le32(0x801B00EB) +#define STATUS_FLT_BUFFER_TOO_SMALL cpu_to_le32(0x801C0001) +#define STATUS_FVE_PARTIAL_METADATA cpu_to_le32(0x80210001) +#define STATUS_UNSUCCESSFUL cpu_to_le32(0xC0000001) +#define STATUS_NOT_IMPLEMENTED cpu_to_le32(0xC0000002) +#define STATUS_INVALID_INFO_CLASS cpu_to_le32(0xC0000003) +#define STATUS_INFO_LENGTH_MISMATCH cpu_to_le32(0xC0000004) +#define STATUS_ACCESS_VIOLATION cpu_to_le32(0xC0000005) +#define STATUS_IN_PAGE_ERROR cpu_to_le32(0xC0000006) +#define STATUS_PAGEFILE_QUOTA cpu_to_le32(0xC0000007) +#define STATUS_INVALID_HANDLE cpu_to_le32(0xC0000008) +#define STATUS_BAD_INITIAL_STACK cpu_to_le32(0xC0000009) +#define STATUS_BAD_INITIAL_PC cpu_to_le32(0xC000000A) +#define STATUS_INVALID_CID cpu_to_le32(0xC000000B) +#define 
STATUS_TIMER_NOT_CANCELED cpu_to_le32(0xC000000C) +#define STATUS_INVALID_PARAMETER cpu_to_le32(0xC000000D) +#define STATUS_NO_SUCH_DEVICE cpu_to_le32(0xC000000E) +#define STATUS_NO_SUCH_FILE cpu_to_le32(0xC000000F) +#define STATUS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0000010) +#define STATUS_END_OF_FILE cpu_to_le32(0xC0000011) +#define STATUS_WRONG_VOLUME cpu_to_le32(0xC0000012) +#define STATUS_NO_MEDIA_IN_DEVICE cpu_to_le32(0xC0000013) +#define STATUS_UNRECOGNIZED_MEDIA cpu_to_le32(0xC0000014) +#define STATUS_NONEXISTENT_SECTOR cpu_to_le32(0xC0000015) +#define STATUS_MORE_PROCESSING_REQUIRED cpu_to_le32(0xC0000016) +#define STATUS_NO_MEMORY cpu_to_le32(0xC0000017) +#define STATUS_CONFLICTING_ADDRESSES cpu_to_le32(0xC0000018) +#define STATUS_NOT_MAPPED_VIEW cpu_to_le32(0xC0000019) +#define STATUS_UNABLE_TO_FREE_VM cpu_to_le32(0xC000001A) +#define STATUS_UNABLE_TO_DELETE_SECTION cpu_to_le32(0xC000001B) +#define STATUS_INVALID_SYSTEM_SERVICE cpu_to_le32(0xC000001C) +#define STATUS_ILLEGAL_INSTRUCTION cpu_to_le32(0xC000001D) +#define STATUS_INVALID_LOCK_SEQUENCE cpu_to_le32(0xC000001E) +#define STATUS_INVALID_VIEW_SIZE cpu_to_le32(0xC000001F) +#define STATUS_INVALID_FILE_FOR_SECTION cpu_to_le32(0xC0000020) +#define STATUS_ALREADY_COMMITTED cpu_to_le32(0xC0000021) +#define STATUS_ACCESS_DENIED cpu_to_le32(0xC0000022) +#define STATUS_BUFFER_TOO_SMALL cpu_to_le32(0xC0000023) +#define STATUS_OBJECT_TYPE_MISMATCH cpu_to_le32(0xC0000024) +#define STATUS_NONCONTINUABLE_EXCEPTION cpu_to_le32(0xC0000025) +#define STATUS_INVALID_DISPOSITION cpu_to_le32(0xC0000026) +#define STATUS_UNWIND cpu_to_le32(0xC0000027) +#define STATUS_BAD_STACK cpu_to_le32(0xC0000028) +#define STATUS_INVALID_UNWIND_TARGET cpu_to_le32(0xC0000029) +#define STATUS_NOT_LOCKED cpu_to_le32(0xC000002A) +#define STATUS_PARITY_ERROR cpu_to_le32(0xC000002B) +#define STATUS_UNABLE_TO_DECOMMIT_VM cpu_to_le32(0xC000002C) +#define STATUS_NOT_COMMITTED cpu_to_le32(0xC000002D) +#define STATUS_INVALID_PORT_ATTRIBUTES cpu_to_le32(0xC000002E) +#define STATUS_PORT_MESSAGE_TOO_LONG cpu_to_le32(0xC000002F) +#define STATUS_INVALID_PARAMETER_MIX cpu_to_le32(0xC0000030) +#define STATUS_INVALID_QUOTA_LOWER cpu_to_le32(0xC0000031) +#define STATUS_DISK_CORRUPT_ERROR cpu_to_le32(0xC0000032) +#define STATUS_OBJECT_NAME_INVALID cpu_to_le32(0xC0000033) +#define STATUS_OBJECT_NAME_NOT_FOUND cpu_to_le32(0xC0000034) +#define STATUS_OBJECT_NAME_COLLISION cpu_to_le32(0xC0000035) +#define STATUS_PORT_DISCONNECTED cpu_to_le32(0xC0000037) +#define STATUS_DEVICE_ALREADY_ATTACHED cpu_to_le32(0xC0000038) +#define STATUS_OBJECT_PATH_INVALID cpu_to_le32(0xC0000039) +#define STATUS_OBJECT_PATH_NOT_FOUND cpu_to_le32(0xC000003A) +#define STATUS_OBJECT_PATH_SYNTAX_BAD cpu_to_le32(0xC000003B) +#define STATUS_DATA_OVERRUN cpu_to_le32(0xC000003C) +#define STATUS_DATA_LATE_ERROR cpu_to_le32(0xC000003D) +#define STATUS_DATA_ERROR cpu_to_le32(0xC000003E) +#define STATUS_CRC_ERROR cpu_to_le32(0xC000003F) +#define STATUS_SECTION_TOO_BIG cpu_to_le32(0xC0000040) +#define STATUS_PORT_CONNECTION_REFUSED cpu_to_le32(0xC0000041) +#define STATUS_INVALID_PORT_HANDLE cpu_to_le32(0xC0000042) +#define STATUS_SHARING_VIOLATION cpu_to_le32(0xC0000043) +#define STATUS_QUOTA_EXCEEDED cpu_to_le32(0xC0000044) +#define STATUS_INVALID_PAGE_PROTECTION cpu_to_le32(0xC0000045) +#define STATUS_MUTANT_NOT_OWNED cpu_to_le32(0xC0000046) +#define STATUS_SEMAPHORE_LIMIT_EXCEEDED cpu_to_le32(0xC0000047) +#define STATUS_PORT_ALREADY_SET cpu_to_le32(0xC0000048) +#define STATUS_SECTION_NOT_IMAGE 
cpu_to_le32(0xC0000049) +#define STATUS_SUSPEND_COUNT_EXCEEDED cpu_to_le32(0xC000004A) +#define STATUS_THREAD_IS_TERMINATING cpu_to_le32(0xC000004B) +#define STATUS_BAD_WORKING_SET_LIMIT cpu_to_le32(0xC000004C) +#define STATUS_INCOMPATIBLE_FILE_MAP cpu_to_le32(0xC000004D) +#define STATUS_SECTION_PROTECTION cpu_to_le32(0xC000004E) +#define STATUS_EAS_NOT_SUPPORTED cpu_to_le32(0xC000004F) +#define STATUS_EA_TOO_LARGE cpu_to_le32(0xC0000050) +#define STATUS_NONEXISTENT_EA_ENTRY cpu_to_le32(0xC0000051) +#define STATUS_NO_EAS_ON_FILE cpu_to_le32(0xC0000052) +#define STATUS_EA_CORRUPT_ERROR cpu_to_le32(0xC0000053) +#define STATUS_FILE_LOCK_CONFLICT cpu_to_le32(0xC0000054) +#define STATUS_LOCK_NOT_GRANTED cpu_to_le32(0xC0000055) +#define STATUS_DELETE_PENDING cpu_to_le32(0xC0000056) +#define STATUS_CTL_FILE_NOT_SUPPORTED cpu_to_le32(0xC0000057) +#define STATUS_UNKNOWN_REVISION cpu_to_le32(0xC0000058) +#define STATUS_REVISION_MISMATCH cpu_to_le32(0xC0000059) +#define STATUS_INVALID_OWNER cpu_to_le32(0xC000005A) +#define STATUS_INVALID_PRIMARY_GROUP cpu_to_le32(0xC000005B) +#define STATUS_NO_IMPERSONATION_TOKEN cpu_to_le32(0xC000005C) +#define STATUS_CANT_DISABLE_MANDATORY cpu_to_le32(0xC000005D) +#define STATUS_NO_LOGON_SERVERS cpu_to_le32(0xC000005E) +#define STATUS_NO_SUCH_LOGON_SESSION cpu_to_le32(0xC000005F) +#define STATUS_NO_SUCH_PRIVILEGE cpu_to_le32(0xC0000060) +#define STATUS_PRIVILEGE_NOT_HELD cpu_to_le32(0xC0000061) +#define STATUS_INVALID_ACCOUNT_NAME cpu_to_le32(0xC0000062) +#define STATUS_USER_EXISTS cpu_to_le32(0xC0000063) +#define STATUS_NO_SUCH_USER cpu_to_le32(0xC0000064) +#define STATUS_GROUP_EXISTS cpu_to_le32(0xC0000065) +#define STATUS_NO_SUCH_GROUP cpu_to_le32(0xC0000066) +#define STATUS_MEMBER_IN_GROUP cpu_to_le32(0xC0000067) +#define STATUS_MEMBER_NOT_IN_GROUP cpu_to_le32(0xC0000068) +#define STATUS_LAST_ADMIN cpu_to_le32(0xC0000069) +#define STATUS_WRONG_PASSWORD cpu_to_le32(0xC000006A) +#define STATUS_ILL_FORMED_PASSWORD cpu_to_le32(0xC000006B) +#define STATUS_PASSWORD_RESTRICTION cpu_to_le32(0xC000006C) +#define STATUS_LOGON_FAILURE cpu_to_le32(0xC000006D) +#define STATUS_ACCOUNT_RESTRICTION cpu_to_le32(0xC000006E) +#define STATUS_INVALID_LOGON_HOURS cpu_to_le32(0xC000006F) +#define STATUS_INVALID_WORKSTATION cpu_to_le32(0xC0000070) +#define STATUS_PASSWORD_EXPIRED cpu_to_le32(0xC0000071) +#define STATUS_ACCOUNT_DISABLED cpu_to_le32(0xC0000072) +#define STATUS_NONE_MAPPED cpu_to_le32(0xC0000073) +#define STATUS_TOO_MANY_LUIDS_REQUESTED cpu_to_le32(0xC0000074) +#define STATUS_LUIDS_EXHAUSTED cpu_to_le32(0xC0000075) +#define STATUS_INVALID_SUB_AUTHORITY cpu_to_le32(0xC0000076) +#define STATUS_INVALID_ACL cpu_to_le32(0xC0000077) +#define STATUS_INVALID_SID cpu_to_le32(0xC0000078) +#define STATUS_INVALID_SECURITY_DESCR cpu_to_le32(0xC0000079) +#define STATUS_PROCEDURE_NOT_FOUND cpu_to_le32(0xC000007A) +#define STATUS_INVALID_IMAGE_FORMAT cpu_to_le32(0xC000007B) +#define STATUS_NO_TOKEN cpu_to_le32(0xC000007C) +#define STATUS_BAD_INHERITANCE_ACL cpu_to_le32(0xC000007D) +#define STATUS_RANGE_NOT_LOCKED cpu_to_le32(0xC000007E) +#define STATUS_DISK_FULL cpu_to_le32(0xC000007F) +#define STATUS_SERVER_DISABLED cpu_to_le32(0xC0000080) +#define STATUS_SERVER_NOT_DISABLED cpu_to_le32(0xC0000081) +#define STATUS_TOO_MANY_GUIDS_REQUESTED cpu_to_le32(0xC0000082) +#define STATUS_GUIDS_EXHAUSTED cpu_to_le32(0xC0000083) +#define STATUS_INVALID_ID_AUTHORITY cpu_to_le32(0xC0000084) +#define STATUS_AGENTS_EXHAUSTED cpu_to_le32(0xC0000085) +#define STATUS_INVALID_VOLUME_LABEL 
cpu_to_le32(0xC0000086) +#define STATUS_SECTION_NOT_EXTENDED cpu_to_le32(0xC0000087) +#define STATUS_NOT_MAPPED_DATA cpu_to_le32(0xC0000088) +#define STATUS_RESOURCE_DATA_NOT_FOUND cpu_to_le32(0xC0000089) +#define STATUS_RESOURCE_TYPE_NOT_FOUND cpu_to_le32(0xC000008A) +#define STATUS_RESOURCE_NAME_NOT_FOUND cpu_to_le32(0xC000008B) +#define STATUS_ARRAY_BOUNDS_EXCEEDED cpu_to_le32(0xC000008C) +#define STATUS_FLOAT_DENORMAL_OPERAND cpu_to_le32(0xC000008D) +#define STATUS_FLOAT_DIVIDE_BY_ZERO cpu_to_le32(0xC000008E) +#define STATUS_FLOAT_INEXACT_RESULT cpu_to_le32(0xC000008F) +#define STATUS_FLOAT_INVALID_OPERATION cpu_to_le32(0xC0000090) +#define STATUS_FLOAT_OVERFLOW cpu_to_le32(0xC0000091) +#define STATUS_FLOAT_STACK_CHECK cpu_to_le32(0xC0000092) +#define STATUS_FLOAT_UNDERFLOW cpu_to_le32(0xC0000093) +#define STATUS_INTEGER_DIVIDE_BY_ZERO cpu_to_le32(0xC0000094) +#define STATUS_INTEGER_OVERFLOW cpu_to_le32(0xC0000095) +#define STATUS_PRIVILEGED_INSTRUCTION cpu_to_le32(0xC0000096) +#define STATUS_TOO_MANY_PAGING_FILES cpu_to_le32(0xC0000097) +#define STATUS_FILE_INVALID cpu_to_le32(0xC0000098) +#define STATUS_ALLOTTED_SPACE_EXCEEDED cpu_to_le32(0xC0000099) +#define STATUS_INSUFFICIENT_RESOURCES cpu_to_le32(0xC000009A) +#define STATUS_DFS_EXIT_PATH_FOUND cpu_to_le32(0xC000009B) +#define STATUS_DEVICE_DATA_ERROR cpu_to_le32(0xC000009C) +#define STATUS_DEVICE_NOT_CONNECTED cpu_to_le32(0xC000009D) +#define STATUS_DEVICE_POWER_FAILURE cpu_to_le32(0xC000009E) +#define STATUS_FREE_VM_NOT_AT_BASE cpu_to_le32(0xC000009F) +#define STATUS_MEMORY_NOT_ALLOCATED cpu_to_le32(0xC00000A0) +#define STATUS_WORKING_SET_QUOTA cpu_to_le32(0xC00000A1) +#define STATUS_MEDIA_WRITE_PROTECTED cpu_to_le32(0xC00000A2) +#define STATUS_DEVICE_NOT_READY cpu_to_le32(0xC00000A3) +#define STATUS_INVALID_GROUP_ATTRIBUTES cpu_to_le32(0xC00000A4) +#define STATUS_BAD_IMPERSONATION_LEVEL cpu_to_le32(0xC00000A5) +#define STATUS_CANT_OPEN_ANONYMOUS cpu_to_le32(0xC00000A6) +#define STATUS_BAD_VALIDATION_CLASS cpu_to_le32(0xC00000A7) +#define STATUS_BAD_TOKEN_TYPE cpu_to_le32(0xC00000A8) +#define STATUS_BAD_MASTER_BOOT_RECORD cpu_to_le32(0xC00000A9) +#define STATUS_INSTRUCTION_MISALIGNMENT cpu_to_le32(0xC00000AA) +#define STATUS_INSTANCE_NOT_AVAILABLE cpu_to_le32(0xC00000AB) +#define STATUS_PIPE_NOT_AVAILABLE cpu_to_le32(0xC00000AC) +#define STATUS_INVALID_PIPE_STATE cpu_to_le32(0xC00000AD) +#define STATUS_PIPE_BUSY cpu_to_le32(0xC00000AE) +#define STATUS_ILLEGAL_FUNCTION cpu_to_le32(0xC00000AF) +#define STATUS_PIPE_DISCONNECTED cpu_to_le32(0xC00000B0) +#define STATUS_PIPE_CLOSING cpu_to_le32(0xC00000B1) +#define STATUS_PIPE_CONNECTED cpu_to_le32(0xC00000B2) +#define STATUS_PIPE_LISTENING cpu_to_le32(0xC00000B3) +#define STATUS_INVALID_READ_MODE cpu_to_le32(0xC00000B4) +#define STATUS_IO_TIMEOUT cpu_to_le32(0xC00000B5) +#define STATUS_FILE_FORCED_CLOSED cpu_to_le32(0xC00000B6) +#define STATUS_PROFILING_NOT_STARTED cpu_to_le32(0xC00000B7) +#define STATUS_PROFILING_NOT_STOPPED cpu_to_le32(0xC00000B8) +#define STATUS_COULD_NOT_INTERPRET cpu_to_le32(0xC00000B9) +#define STATUS_FILE_IS_A_DIRECTORY cpu_to_le32(0xC00000BA) +#define STATUS_NOT_SUPPORTED cpu_to_le32(0xC00000BB) +#define STATUS_REMOTE_NOT_LISTENING cpu_to_le32(0xC00000BC) +#define STATUS_DUPLICATE_NAME cpu_to_le32(0xC00000BD) +#define STATUS_BAD_NETWORK_PATH cpu_to_le32(0xC00000BE) +#define STATUS_NETWORK_BUSY cpu_to_le32(0xC00000BF) +#define STATUS_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC00000C0) +#define STATUS_TOO_MANY_COMMANDS cpu_to_le32(0xC00000C1) +#define 
STATUS_ADAPTER_HARDWARE_ERROR cpu_to_le32(0xC00000C2) +#define STATUS_INVALID_NETWORK_RESPONSE cpu_to_le32(0xC00000C3) +#define STATUS_UNEXPECTED_NETWORK_ERROR cpu_to_le32(0xC00000C4) +#define STATUS_BAD_REMOTE_ADAPTER cpu_to_le32(0xC00000C5) +#define STATUS_PRINT_QUEUE_FULL cpu_to_le32(0xC00000C6) +#define STATUS_NO_SPOOL_SPACE cpu_to_le32(0xC00000C7) +#define STATUS_PRINT_CANCELLED cpu_to_le32(0xC00000C8) +#define STATUS_NETWORK_NAME_DELETED cpu_to_le32(0xC00000C9) +#define STATUS_NETWORK_ACCESS_DENIED cpu_to_le32(0xC00000CA) +#define STATUS_BAD_DEVICE_TYPE cpu_to_le32(0xC00000CB) +#define STATUS_BAD_NETWORK_NAME cpu_to_le32(0xC00000CC) +#define STATUS_TOO_MANY_NAMES cpu_to_le32(0xC00000CD) +#define STATUS_TOO_MANY_SESSIONS cpu_to_le32(0xC00000CE) +#define STATUS_SHARING_PAUSED cpu_to_le32(0xC00000CF) +#define STATUS_REQUEST_NOT_ACCEPTED cpu_to_le32(0xC00000D0) +#define STATUS_REDIRECTOR_PAUSED cpu_to_le32(0xC00000D1) +#define STATUS_NET_WRITE_FAULT cpu_to_le32(0xC00000D2) +#define STATUS_PROFILING_AT_LIMIT cpu_to_le32(0xC00000D3) +#define STATUS_NOT_SAME_DEVICE cpu_to_le32(0xC00000D4) +#define STATUS_FILE_RENAMED cpu_to_le32(0xC00000D5) +#define STATUS_VIRTUAL_CIRCUIT_CLOSED cpu_to_le32(0xC00000D6) +#define STATUS_NO_SECURITY_ON_OBJECT cpu_to_le32(0xC00000D7) +#define STATUS_CANT_WAIT cpu_to_le32(0xC00000D8) +#define STATUS_PIPE_EMPTY cpu_to_le32(0xC00000D9) +#define STATUS_CANT_ACCESS_DOMAIN_INFO cpu_to_le32(0xC00000DA) +#define STATUS_CANT_TERMINATE_SELF cpu_to_le32(0xC00000DB) +#define STATUS_INVALID_SERVER_STATE cpu_to_le32(0xC00000DC) +#define STATUS_INVALID_DOMAIN_STATE cpu_to_le32(0xC00000DD) +#define STATUS_INVALID_DOMAIN_ROLE cpu_to_le32(0xC00000DE) +#define STATUS_NO_SUCH_DOMAIN cpu_to_le32(0xC00000DF) +#define STATUS_DOMAIN_EXISTS cpu_to_le32(0xC00000E0) +#define STATUS_DOMAIN_LIMIT_EXCEEDED cpu_to_le32(0xC00000E1) +#define STATUS_OPLOCK_NOT_GRANTED cpu_to_le32(0xC00000E2) +#define STATUS_INVALID_OPLOCK_PROTOCOL cpu_to_le32(0xC00000E3) +#define STATUS_INTERNAL_DB_CORRUPTION cpu_to_le32(0xC00000E4) +#define STATUS_INTERNAL_ERROR cpu_to_le32(0xC00000E5) +#define STATUS_GENERIC_NOT_MAPPED cpu_to_le32(0xC00000E6) +#define STATUS_BAD_DESCRIPTOR_FORMAT cpu_to_le32(0xC00000E7) +#define STATUS_INVALID_USER_BUFFER cpu_to_le32(0xC00000E8) +#define STATUS_UNEXPECTED_IO_ERROR cpu_to_le32(0xC00000E9) +#define STATUS_UNEXPECTED_MM_CREATE_ERR cpu_to_le32(0xC00000EA) +#define STATUS_UNEXPECTED_MM_MAP_ERROR cpu_to_le32(0xC00000EB) +#define STATUS_UNEXPECTED_MM_EXTEND_ERR cpu_to_le32(0xC00000EC) +#define STATUS_NOT_LOGON_PROCESS cpu_to_le32(0xC00000ED) +#define STATUS_LOGON_SESSION_EXISTS cpu_to_le32(0xC00000EE) +#define STATUS_INVALID_PARAMETER_1 cpu_to_le32(0xC00000EF) +#define STATUS_INVALID_PARAMETER_2 cpu_to_le32(0xC00000F0) +#define STATUS_INVALID_PARAMETER_3 cpu_to_le32(0xC00000F1) +#define STATUS_INVALID_PARAMETER_4 cpu_to_le32(0xC00000F2) +#define STATUS_INVALID_PARAMETER_5 cpu_to_le32(0xC00000F3) +#define STATUS_INVALID_PARAMETER_6 cpu_to_le32(0xC00000F4) +#define STATUS_INVALID_PARAMETER_7 cpu_to_le32(0xC00000F5) +#define STATUS_INVALID_PARAMETER_8 cpu_to_le32(0xC00000F6) +#define STATUS_INVALID_PARAMETER_9 cpu_to_le32(0xC00000F7) +#define STATUS_INVALID_PARAMETER_10 cpu_to_le32(0xC00000F8) +#define STATUS_INVALID_PARAMETER_11 cpu_to_le32(0xC00000F9) +#define STATUS_INVALID_PARAMETER_12 cpu_to_le32(0xC00000FA) +#define STATUS_REDIRECTOR_NOT_STARTED cpu_to_le32(0xC00000FB) +#define STATUS_REDIRECTOR_STARTED cpu_to_le32(0xC00000FC) +#define STATUS_STACK_OVERFLOW 
cpu_to_le32(0xC00000FD) +#define STATUS_NO_SUCH_PACKAGE cpu_to_le32(0xC00000FE) +#define STATUS_BAD_FUNCTION_TABLE cpu_to_le32(0xC00000FF) +#define STATUS_VARIABLE_NOT_FOUND cpu_to_le32(0xC0000100) +#define STATUS_DIRECTORY_NOT_EMPTY cpu_to_le32(0xC0000101) +#define STATUS_FILE_CORRUPT_ERROR cpu_to_le32(0xC0000102) +#define STATUS_NOT_A_DIRECTORY cpu_to_le32(0xC0000103) +#define STATUS_BAD_LOGON_SESSION_STATE cpu_to_le32(0xC0000104) +#define STATUS_LOGON_SESSION_COLLISION cpu_to_le32(0xC0000105) +#define STATUS_NAME_TOO_LONG cpu_to_le32(0xC0000106) +#define STATUS_FILES_OPEN cpu_to_le32(0xC0000107) +#define STATUS_CONNECTION_IN_USE cpu_to_le32(0xC0000108) +#define STATUS_MESSAGE_NOT_FOUND cpu_to_le32(0xC0000109) +#define STATUS_PROCESS_IS_TERMINATING cpu_to_le32(0xC000010A) +#define STATUS_INVALID_LOGON_TYPE cpu_to_le32(0xC000010B) +#define STATUS_NO_GUID_TRANSLATION cpu_to_le32(0xC000010C) +#define STATUS_CANNOT_IMPERSONATE cpu_to_le32(0xC000010D) +#define STATUS_IMAGE_ALREADY_LOADED cpu_to_le32(0xC000010E) +#define STATUS_ABIOS_NOT_PRESENT cpu_to_le32(0xC000010F) +#define STATUS_ABIOS_LID_NOT_EXIST cpu_to_le32(0xC0000110) +#define STATUS_ABIOS_LID_ALREADY_OWNED cpu_to_le32(0xC0000111) +#define STATUS_ABIOS_NOT_LID_OWNER cpu_to_le32(0xC0000112) +#define STATUS_ABIOS_INVALID_COMMAND cpu_to_le32(0xC0000113) +#define STATUS_ABIOS_INVALID_LID cpu_to_le32(0xC0000114) +#define STATUS_ABIOS_SELECTOR_NOT_AVAILABLE cpu_to_le32(0xC0000115) +#define STATUS_ABIOS_INVALID_SELECTOR cpu_to_le32(0xC0000116) +#define STATUS_NO_LDT cpu_to_le32(0xC0000117) +#define STATUS_INVALID_LDT_SIZE cpu_to_le32(0xC0000118) +#define STATUS_INVALID_LDT_OFFSET cpu_to_le32(0xC0000119) +#define STATUS_INVALID_LDT_DESCRIPTOR cpu_to_le32(0xC000011A) +#define STATUS_INVALID_IMAGE_NE_FORMAT cpu_to_le32(0xC000011B) +#define STATUS_RXACT_INVALID_STATE cpu_to_le32(0xC000011C) +#define STATUS_RXACT_COMMIT_FAILURE cpu_to_le32(0xC000011D) +#define STATUS_MAPPED_FILE_SIZE_ZERO cpu_to_le32(0xC000011E) +#define STATUS_TOO_MANY_OPENED_FILES cpu_to_le32(0xC000011F) +#define STATUS_CANCELLED cpu_to_le32(0xC0000120) +#define STATUS_CANNOT_DELETE cpu_to_le32(0xC0000121) +#define STATUS_INVALID_COMPUTER_NAME cpu_to_le32(0xC0000122) +#define STATUS_FILE_DELETED cpu_to_le32(0xC0000123) +#define STATUS_SPECIAL_ACCOUNT cpu_to_le32(0xC0000124) +#define STATUS_SPECIAL_GROUP cpu_to_le32(0xC0000125) +#define STATUS_SPECIAL_USER cpu_to_le32(0xC0000126) +#define STATUS_MEMBERS_PRIMARY_GROUP cpu_to_le32(0xC0000127) +#define STATUS_FILE_CLOSED cpu_to_le32(0xC0000128) +#define STATUS_TOO_MANY_THREADS cpu_to_le32(0xC0000129) +#define STATUS_THREAD_NOT_IN_PROCESS cpu_to_le32(0xC000012A) +#define STATUS_TOKEN_ALREADY_IN_USE cpu_to_le32(0xC000012B) +#define STATUS_PAGEFILE_QUOTA_EXCEEDED cpu_to_le32(0xC000012C) +#define STATUS_COMMITMENT_LIMIT cpu_to_le32(0xC000012D) +#define STATUS_INVALID_IMAGE_LE_FORMAT cpu_to_le32(0xC000012E) +#define STATUS_INVALID_IMAGE_NOT_MZ cpu_to_le32(0xC000012F) +#define STATUS_INVALID_IMAGE_PROTECT cpu_to_le32(0xC0000130) +#define STATUS_INVALID_IMAGE_WIN_16 cpu_to_le32(0xC0000131) +#define STATUS_LOGON_SERVER_CONFLICT cpu_to_le32(0xC0000132) +#define STATUS_TIME_DIFFERENCE_AT_DC cpu_to_le32(0xC0000133) +#define STATUS_SYNCHRONIZATION_REQUIRED cpu_to_le32(0xC0000134) +#define STATUS_DLL_NOT_FOUND cpu_to_le32(0xC0000135) +#define STATUS_OPEN_FAILED cpu_to_le32(0xC0000136) +#define STATUS_IO_PRIVILEGE_FAILED cpu_to_le32(0xC0000137) +#define STATUS_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000138) +#define STATUS_ENTRYPOINT_NOT_FOUND 
cpu_to_le32(0xC0000139) +#define STATUS_CONTROL_C_EXIT cpu_to_le32(0xC000013A) +#define STATUS_LOCAL_DISCONNECT cpu_to_le32(0xC000013B) +#define STATUS_REMOTE_DISCONNECT cpu_to_le32(0xC000013C) +#define STATUS_REMOTE_RESOURCES cpu_to_le32(0xC000013D) +#define STATUS_LINK_FAILED cpu_to_le32(0xC000013E) +#define STATUS_LINK_TIMEOUT cpu_to_le32(0xC000013F) +#define STATUS_INVALID_CONNECTION cpu_to_le32(0xC0000140) +#define STATUS_INVALID_ADDRESS cpu_to_le32(0xC0000141) +#define STATUS_DLL_INIT_FAILED cpu_to_le32(0xC0000142) +#define STATUS_MISSING_SYSTEMFILE cpu_to_le32(0xC0000143) +#define STATUS_UNHANDLED_EXCEPTION cpu_to_le32(0xC0000144) +#define STATUS_APP_INIT_FAILURE cpu_to_le32(0xC0000145) +#define STATUS_PAGEFILE_CREATE_FAILED cpu_to_le32(0xC0000146) +#define STATUS_NO_PAGEFILE cpu_to_le32(0xC0000147) +#define STATUS_INVALID_LEVEL cpu_to_le32(0xC0000148) +#define STATUS_WRONG_PASSWORD_CORE cpu_to_le32(0xC0000149) +#define STATUS_ILLEGAL_FLOAT_CONTEXT cpu_to_le32(0xC000014A) +#define STATUS_PIPE_BROKEN cpu_to_le32(0xC000014B) +#define STATUS_REGISTRY_CORRUPT cpu_to_le32(0xC000014C) +#define STATUS_REGISTRY_IO_FAILED cpu_to_le32(0xC000014D) +#define STATUS_NO_EVENT_PAIR cpu_to_le32(0xC000014E) +#define STATUS_UNRECOGNIZED_VOLUME cpu_to_le32(0xC000014F) +#define STATUS_SERIAL_NO_DEVICE_INITED cpu_to_le32(0xC0000150) +#define STATUS_NO_SUCH_ALIAS cpu_to_le32(0xC0000151) +#define STATUS_MEMBER_NOT_IN_ALIAS cpu_to_le32(0xC0000152) +#define STATUS_MEMBER_IN_ALIAS cpu_to_le32(0xC0000153) +#define STATUS_ALIAS_EXISTS cpu_to_le32(0xC0000154) +#define STATUS_LOGON_NOT_GRANTED cpu_to_le32(0xC0000155) +#define STATUS_TOO_MANY_SECRETS cpu_to_le32(0xC0000156) +#define STATUS_SECRET_TOO_LONG cpu_to_le32(0xC0000157) +#define STATUS_INTERNAL_DB_ERROR cpu_to_le32(0xC0000158) +#define STATUS_FULLSCREEN_MODE cpu_to_le32(0xC0000159) +#define STATUS_TOO_MANY_CONTEXT_IDS cpu_to_le32(0xC000015A) +#define STATUS_LOGON_TYPE_NOT_GRANTED cpu_to_le32(0xC000015B) +#define STATUS_NOT_REGISTRY_FILE cpu_to_le32(0xC000015C) +#define STATUS_NT_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000015D) +#define STATUS_DOMAIN_CTRLR_CONFIG_ERROR cpu_to_le32(0xC000015E) +#define STATUS_FT_MISSING_MEMBER cpu_to_le32(0xC000015F) +#define STATUS_ILL_FORMED_SERVICE_ENTRY cpu_to_le32(0xC0000160) +#define STATUS_ILLEGAL_CHARACTER cpu_to_le32(0xC0000161) +#define STATUS_UNMAPPABLE_CHARACTER cpu_to_le32(0xC0000162) +#define STATUS_UNDEFINED_CHARACTER cpu_to_le32(0xC0000163) +#define STATUS_FLOPPY_VOLUME cpu_to_le32(0xC0000164) +#define STATUS_FLOPPY_ID_MARK_NOT_FOUND cpu_to_le32(0xC0000165) +#define STATUS_FLOPPY_WRONG_CYLINDER cpu_to_le32(0xC0000166) +#define STATUS_FLOPPY_UNKNOWN_ERROR cpu_to_le32(0xC0000167) +#define STATUS_FLOPPY_BAD_REGISTERS cpu_to_le32(0xC0000168) +#define STATUS_DISK_RECALIBRATE_FAILED cpu_to_le32(0xC0000169) +#define STATUS_DISK_OPERATION_FAILED cpu_to_le32(0xC000016A) +#define STATUS_DISK_RESET_FAILED cpu_to_le32(0xC000016B) +#define STATUS_SHARED_IRQ_BUSY cpu_to_le32(0xC000016C) +#define STATUS_FT_ORPHANING cpu_to_le32(0xC000016D) +#define STATUS_BIOS_FAILED_TO_CONNECT_INTERRUPT cpu_to_le32(0xC000016E) +#define STATUS_PARTITION_FAILURE cpu_to_le32(0xC0000172) +#define STATUS_INVALID_BLOCK_LENGTH cpu_to_le32(0xC0000173) +#define STATUS_DEVICE_NOT_PARTITIONED cpu_to_le32(0xC0000174) +#define STATUS_UNABLE_TO_LOCK_MEDIA cpu_to_le32(0xC0000175) +#define STATUS_UNABLE_TO_UNLOAD_MEDIA cpu_to_le32(0xC0000176) +#define STATUS_EOM_OVERFLOW cpu_to_le32(0xC0000177) +#define STATUS_NO_MEDIA cpu_to_le32(0xC0000178) +#define 
STATUS_NO_SUCH_MEMBER cpu_to_le32(0xC000017A) +#define STATUS_INVALID_MEMBER cpu_to_le32(0xC000017B) +#define STATUS_KEY_DELETED cpu_to_le32(0xC000017C) +#define STATUS_NO_LOG_SPACE cpu_to_le32(0xC000017D) +#define STATUS_TOO_MANY_SIDS cpu_to_le32(0xC000017E) +#define STATUS_LM_CROSS_ENCRYPTION_REQUIRED cpu_to_le32(0xC000017F) +#define STATUS_KEY_HAS_CHILDREN cpu_to_le32(0xC0000180) +#define STATUS_CHILD_MUST_BE_VOLATILE cpu_to_le32(0xC0000181) +#define STATUS_DEVICE_CONFIGURATION_ERROR cpu_to_le32(0xC0000182) +#define STATUS_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC0000183) +#define STATUS_INVALID_DEVICE_STATE cpu_to_le32(0xC0000184) +#define STATUS_IO_DEVICE_ERROR cpu_to_le32(0xC0000185) +#define STATUS_DEVICE_PROTOCOL_ERROR cpu_to_le32(0xC0000186) +#define STATUS_BACKUP_CONTROLLER cpu_to_le32(0xC0000187) +#define STATUS_LOG_FILE_FULL cpu_to_le32(0xC0000188) +#define STATUS_TOO_LATE cpu_to_le32(0xC0000189) +#define STATUS_NO_TRUST_LSA_SECRET cpu_to_le32(0xC000018A) +#define STATUS_NO_TRUST_SAM_ACCOUNT cpu_to_le32(0xC000018B) +#define STATUS_TRUSTED_DOMAIN_FAILURE cpu_to_le32(0xC000018C) +#define STATUS_TRUSTED_RELATIONSHIP_FAILURE cpu_to_le32(0xC000018D) +#define STATUS_EVENTLOG_FILE_CORRUPT cpu_to_le32(0xC000018E) +#define STATUS_EVENTLOG_CANT_START cpu_to_le32(0xC000018F) +#define STATUS_TRUST_FAILURE cpu_to_le32(0xC0000190) +#define STATUS_MUTANT_LIMIT_EXCEEDED cpu_to_le32(0xC0000191) +#define STATUS_NETLOGON_NOT_STARTED cpu_to_le32(0xC0000192) +#define STATUS_ACCOUNT_EXPIRED cpu_to_le32(0xC0000193) +#define STATUS_POSSIBLE_DEADLOCK cpu_to_le32(0xC0000194) +#define STATUS_NETWORK_CREDENTIAL_CONFLICT cpu_to_le32(0xC0000195) +#define STATUS_REMOTE_SESSION_LIMIT cpu_to_le32(0xC0000196) +#define STATUS_EVENTLOG_FILE_CHANGED cpu_to_le32(0xC0000197) +#define STATUS_NOLOGON_INTERDOMAIN_TRUST_ACCOUNT cpu_to_le32(0xC0000198) +#define STATUS_NOLOGON_WORKSTATION_TRUST_ACCOUNT cpu_to_le32(0xC0000199) +#define STATUS_NOLOGON_SERVER_TRUST_ACCOUNT cpu_to_le32(0xC000019A) +#define STATUS_DOMAIN_TRUST_INCONSISTENT cpu_to_le32(0xC000019B) +#define STATUS_FS_DRIVER_REQUIRED cpu_to_le32(0xC000019C) +#define STATUS_IMAGE_ALREADY_LOADED_AS_DLL cpu_to_le32(0xC000019D) +#define STATUS_NETWORK_OPEN_RESTRICTION cpu_to_le32(0xC0000201) +#define STATUS_NO_USER_SESSION_KEY cpu_to_le32(0xC0000202) +#define STATUS_USER_SESSION_DELETED cpu_to_le32(0xC0000203) +#define STATUS_RESOURCE_LANG_NOT_FOUND cpu_to_le32(0xC0000204) +#define STATUS_INSUFF_SERVER_RESOURCES cpu_to_le32(0xC0000205) +#define STATUS_INVALID_BUFFER_SIZE cpu_to_le32(0xC0000206) +#define STATUS_INVALID_ADDRESS_COMPONENT cpu_to_le32(0xC0000207) +#define STATUS_INVALID_ADDRESS_WILDCARD cpu_to_le32(0xC0000208) +#define STATUS_TOO_MANY_ADDRESSES cpu_to_le32(0xC0000209) +#define STATUS_ADDRESS_ALREADY_EXISTS cpu_to_le32(0xC000020A) +#define STATUS_ADDRESS_CLOSED cpu_to_le32(0xC000020B) +#define STATUS_CONNECTION_DISCONNECTED cpu_to_le32(0xC000020C) +#define STATUS_CONNECTION_RESET cpu_to_le32(0xC000020D) +#define STATUS_TOO_MANY_NODES cpu_to_le32(0xC000020E) +#define STATUS_TRANSACTION_ABORTED cpu_to_le32(0xC000020F) +#define STATUS_TRANSACTION_TIMED_OUT cpu_to_le32(0xC0000210) +#define STATUS_TRANSACTION_NO_RELEASE cpu_to_le32(0xC0000211) +#define STATUS_TRANSACTION_NO_MATCH cpu_to_le32(0xC0000212) +#define STATUS_TRANSACTION_RESPONDED cpu_to_le32(0xC0000213) +#define STATUS_TRANSACTION_INVALID_ID cpu_to_le32(0xC0000214) +#define STATUS_TRANSACTION_INVALID_TYPE cpu_to_le32(0xC0000215) +#define STATUS_NOT_SERVER_SESSION cpu_to_le32(0xC0000216) +#define 
STATUS_NOT_CLIENT_SESSION cpu_to_le32(0xC0000217) +#define STATUS_CANNOT_LOAD_REGISTRY_FILE cpu_to_le32(0xC0000218) +#define STATUS_DEBUG_ATTACH_FAILED cpu_to_le32(0xC0000219) +#define STATUS_SYSTEM_PROCESS_TERMINATED cpu_to_le32(0xC000021A) +#define STATUS_DATA_NOT_ACCEPTED cpu_to_le32(0xC000021B) +#define STATUS_NO_BROWSER_SERVERS_FOUND cpu_to_le32(0xC000021C) +#define STATUS_VDM_HARD_ERROR cpu_to_le32(0xC000021D) +#define STATUS_DRIVER_CANCEL_TIMEOUT cpu_to_le32(0xC000021E) +#define STATUS_REPLY_MESSAGE_MISMATCH cpu_to_le32(0xC000021F) +#define STATUS_MAPPED_ALIGNMENT cpu_to_le32(0xC0000220) +#define STATUS_IMAGE_CHECKSUM_MISMATCH cpu_to_le32(0xC0000221) +#define STATUS_LOST_WRITEBEHIND_DATA cpu_to_le32(0xC0000222) +#define STATUS_CLIENT_SERVER_PARAMETERS_INVALID cpu_to_le32(0xC0000223) +#define STATUS_PASSWORD_MUST_CHANGE cpu_to_le32(0xC0000224) +#define STATUS_NOT_FOUND cpu_to_le32(0xC0000225) +#define STATUS_NOT_TINY_STREAM cpu_to_le32(0xC0000226) +#define STATUS_RECOVERY_FAILURE cpu_to_le32(0xC0000227) +#define STATUS_STACK_OVERFLOW_READ cpu_to_le32(0xC0000228) +#define STATUS_FAIL_CHECK cpu_to_le32(0xC0000229) +#define STATUS_DUPLICATE_OBJECTID cpu_to_le32(0xC000022A) +#define STATUS_OBJECTID_EXISTS cpu_to_le32(0xC000022B) +#define STATUS_CONVERT_TO_LARGE cpu_to_le32(0xC000022C) +#define STATUS_RETRY cpu_to_le32(0xC000022D) +#define STATUS_FOUND_OUT_OF_SCOPE cpu_to_le32(0xC000022E) +#define STATUS_ALLOCATE_BUCKET cpu_to_le32(0xC000022F) +#define STATUS_PROPSET_NOT_FOUND cpu_to_le32(0xC0000230) +#define STATUS_MARSHALL_OVERFLOW cpu_to_le32(0xC0000231) +#define STATUS_INVALID_VARIANT cpu_to_le32(0xC0000232) +#define STATUS_DOMAIN_CONTROLLER_NOT_FOUND cpu_to_le32(0xC0000233) +#define STATUS_ACCOUNT_LOCKED_OUT cpu_to_le32(0xC0000234) +#define STATUS_HANDLE_NOT_CLOSABLE cpu_to_le32(0xC0000235) +#define STATUS_CONNECTION_REFUSED cpu_to_le32(0xC0000236) +#define STATUS_GRACEFUL_DISCONNECT cpu_to_le32(0xC0000237) +#define STATUS_ADDRESS_ALREADY_ASSOCIATED cpu_to_le32(0xC0000238) +#define STATUS_ADDRESS_NOT_ASSOCIATED cpu_to_le32(0xC0000239) +#define STATUS_CONNECTION_INVALID cpu_to_le32(0xC000023A) +#define STATUS_CONNECTION_ACTIVE cpu_to_le32(0xC000023B) +#define STATUS_NETWORK_UNREACHABLE cpu_to_le32(0xC000023C) +#define STATUS_HOST_UNREACHABLE cpu_to_le32(0xC000023D) +#define STATUS_PROTOCOL_UNREACHABLE cpu_to_le32(0xC000023E) +#define STATUS_PORT_UNREACHABLE cpu_to_le32(0xC000023F) +#define STATUS_REQUEST_ABORTED cpu_to_le32(0xC0000240) +#define STATUS_CONNECTION_ABORTED cpu_to_le32(0xC0000241) +#define STATUS_BAD_COMPRESSION_BUFFER cpu_to_le32(0xC0000242) +#define STATUS_USER_MAPPED_FILE cpu_to_le32(0xC0000243) +#define STATUS_AUDIT_FAILED cpu_to_le32(0xC0000244) +#define STATUS_TIMER_RESOLUTION_NOT_SET cpu_to_le32(0xC0000245) +#define STATUS_CONNECTION_COUNT_LIMIT cpu_to_le32(0xC0000246) +#define STATUS_LOGIN_TIME_RESTRICTION cpu_to_le32(0xC0000247) +#define STATUS_LOGIN_WKSTA_RESTRICTION cpu_to_le32(0xC0000248) +#define STATUS_IMAGE_MP_UP_MISMATCH cpu_to_le32(0xC0000249) +#define STATUS_INSUFFICIENT_LOGON_INFO cpu_to_le32(0xC0000250) +#define STATUS_BAD_DLL_ENTRYPOINT cpu_to_le32(0xC0000251) +#define STATUS_BAD_SERVICE_ENTRYPOINT cpu_to_le32(0xC0000252) +#define STATUS_LPC_REPLY_LOST cpu_to_le32(0xC0000253) +#define STATUS_IP_ADDRESS_CONFLICT1 cpu_to_le32(0xC0000254) +#define STATUS_IP_ADDRESS_CONFLICT2 cpu_to_le32(0xC0000255) +#define STATUS_REGISTRY_QUOTA_LIMIT cpu_to_le32(0xC0000256) +#define STATUS_PATH_NOT_COVERED cpu_to_le32(0xC0000257) +#define STATUS_NO_CALLBACK_ACTIVE 
cpu_to_le32(0xC0000258)
+#define STATUS_LICENSE_QUOTA_EXCEEDED cpu_to_le32(0xC0000259)
+#define STATUS_PWD_TOO_SHORT cpu_to_le32(0xC000025A)
+#define STATUS_PWD_TOO_RECENT cpu_to_le32(0xC000025B)
+#define STATUS_PWD_HISTORY_CONFLICT cpu_to_le32(0xC000025C)
+#define STATUS_PLUGPLAY_NO_DEVICE cpu_to_le32(0xC000025E)
+#define STATUS_UNSUPPORTED_COMPRESSION cpu_to_le32(0xC000025F)
+#define STATUS_INVALID_HW_PROFILE cpu_to_le32(0xC0000260)
+#define STATUS_INVALID_PLUGPLAY_DEVICE_PATH cpu_to_le32(0xC0000261)
+#define STATUS_DRIVER_ORDINAL_NOT_FOUND cpu_to_le32(0xC0000262)
+#define STATUS_DRIVER_ENTRYPOINT_NOT_FOUND cpu_to_le32(0xC0000263)
+#define STATUS_RESOURCE_NOT_OWNED cpu_to_le32(0xC0000264)
+#define STATUS_TOO_MANY_LINKS cpu_to_le32(0xC0000265)
+#define STATUS_QUOTA_LIST_INCONSISTENT cpu_to_le32(0xC0000266)
+#define STATUS_FILE_IS_OFFLINE cpu_to_le32(0xC0000267)
+#define STATUS_EVALUATION_EXPIRATION cpu_to_le32(0xC0000268)
+#define STATUS_ILLEGAL_DLL_RELOCATION cpu_to_le32(0xC0000269)
+#define STATUS_LICENSE_VIOLATION cpu_to_le32(0xC000026A)
+#define STATUS_DLL_INIT_FAILED_LOGOFF cpu_to_le32(0xC000026B)
+#define STATUS_DRIVER_UNABLE_TO_LOAD cpu_to_le32(0xC000026C)
+#define STATUS_DFS_UNAVAILABLE cpu_to_le32(0xC000026D)
+#define STATUS_VOLUME_DISMOUNTED cpu_to_le32(0xC000026E)
+#define STATUS_WX86_INTERNAL_ERROR cpu_to_le32(0xC000026F)
+#define STATUS_WX86_FLOAT_STACK_CHECK cpu_to_le32(0xC0000270)
+#define STATUS_VALIDATE_CONTINUE cpu_to_le32(0xC0000271)
+#define STATUS_NO_MATCH cpu_to_le32(0xC0000272)
+#define STATUS_NO_MORE_MATCHES cpu_to_le32(0xC0000273)
+#define STATUS_NOT_A_REPARSE_POINT cpu_to_le32(0xC0000275)
+#define STATUS_IO_REPARSE_TAG_INVALID cpu_to_le32(0xC0000276)
+#define STATUS_IO_REPARSE_TAG_MISMATCH cpu_to_le32(0xC0000277)
+#define STATUS_IO_REPARSE_DATA_INVALID cpu_to_le32(0xC0000278)
+#define STATUS_IO_REPARSE_TAG_NOT_HANDLED cpu_to_le32(0xC0000279)
+#define STATUS_REPARSE_POINT_NOT_RESOLVED cpu_to_le32(0xC0000280)
+#define STATUS_DIRECTORY_IS_A_REPARSE_POINT cpu_to_le32(0xC0000281)
+#define STATUS_RANGE_LIST_CONFLICT cpu_to_le32(0xC0000282)
+#define STATUS_SOURCE_ELEMENT_EMPTY cpu_to_le32(0xC0000283)
+#define STATUS_DESTINATION_ELEMENT_FULL cpu_to_le32(0xC0000284)
+#define STATUS_ILLEGAL_ELEMENT_ADDRESS cpu_to_le32(0xC0000285)
+#define STATUS_MAGAZINE_NOT_PRESENT cpu_to_le32(0xC0000286)
+#define STATUS_REINITIALIZATION_NEEDED cpu_to_le32(0xC0000287)
+#define STATUS_ENCRYPTION_FAILED cpu_to_le32(0xC000028A)
+#define STATUS_DECRYPTION_FAILED cpu_to_le32(0xC000028B)
+#define STATUS_RANGE_NOT_FOUND cpu_to_le32(0xC000028C)
+#define STATUS_NO_RECOVERY_POLICY cpu_to_le32(0xC000028D)
+#define STATUS_NO_EFS cpu_to_le32(0xC000028E)
+#define STATUS_WRONG_EFS cpu_to_le32(0xC000028F)
+#define STATUS_NO_USER_KEYS cpu_to_le32(0xC0000290)
+#define STATUS_FILE_NOT_ENCRYPTED cpu_to_le32(0xC0000291)
+#define STATUS_NOT_EXPORT_FORMAT cpu_to_le32(0xC0000292)
+#define STATUS_FILE_ENCRYPTED cpu_to_le32(0xC0000293)
+#define STATUS_WMI_GUID_NOT_FOUND cpu_to_le32(0xC0000295)
+#define STATUS_WMI_INSTANCE_NOT_FOUND cpu_to_le32(0xC0000296)
+#define STATUS_WMI_ITEMID_NOT_FOUND cpu_to_le32(0xC0000297)
+#define STATUS_WMI_TRY_AGAIN cpu_to_le32(0xC0000298)
+#define STATUS_SHARED_POLICY cpu_to_le32(0xC0000299)
+#define STATUS_POLICY_OBJECT_NOT_FOUND cpu_to_le32(0xC000029A)
+#define STATUS_POLICY_ONLY_IN_DS cpu_to_le32(0xC000029B)
+#define STATUS_VOLUME_NOT_UPGRADED cpu_to_le32(0xC000029C)
+#define STATUS_REMOTE_STORAGE_NOT_ACTIVE cpu_to_le32(0xC000029D)
+#define STATUS_REMOTE_STORAGE_MEDIA_ERROR cpu_to_le32(0xC000029E)
+#define STATUS_NO_TRACKING_SERVICE cpu_to_le32(0xC000029F)
+#define STATUS_SERVER_SID_MISMATCH cpu_to_le32(0xC00002A0)
+#define STATUS_DS_NO_ATTRIBUTE_OR_VALUE cpu_to_le32(0xC00002A1)
+#define STATUS_DS_INVALID_ATTRIBUTE_SYNTAX cpu_to_le32(0xC00002A2)
+#define STATUS_DS_ATTRIBUTE_TYPE_UNDEFINED cpu_to_le32(0xC00002A3)
+#define STATUS_DS_ATTRIBUTE_OR_VALUE_EXISTS cpu_to_le32(0xC00002A4)
+#define STATUS_DS_BUSY cpu_to_le32(0xC00002A5)
+#define STATUS_DS_UNAVAILABLE cpu_to_le32(0xC00002A6)
+#define STATUS_DS_NO_RIDS_ALLOCATED cpu_to_le32(0xC00002A7)
+#define STATUS_DS_NO_MORE_RIDS cpu_to_le32(0xC00002A8)
+#define STATUS_DS_INCORRECT_ROLE_OWNER cpu_to_le32(0xC00002A9)
+#define STATUS_DS_RIDMGR_INIT_ERROR cpu_to_le32(0xC00002AA)
+#define STATUS_DS_OBJ_CLASS_VIOLATION cpu_to_le32(0xC00002AB)
+#define STATUS_DS_CANT_ON_NON_LEAF cpu_to_le32(0xC00002AC)
+#define STATUS_DS_CANT_ON_RDN cpu_to_le32(0xC00002AD)
+#define STATUS_DS_CANT_MOD_OBJ_CLASS cpu_to_le32(0xC00002AE)
+#define STATUS_DS_CROSS_DOM_MOVE_FAILED cpu_to_le32(0xC00002AF)
+#define STATUS_DS_GC_NOT_AVAILABLE cpu_to_le32(0xC00002B0)
+#define STATUS_DIRECTORY_SERVICE_REQUIRED cpu_to_le32(0xC00002B1)
+#define STATUS_REPARSE_ATTRIBUTE_CONFLICT cpu_to_le32(0xC00002B2)
+#define STATUS_CANT_ENABLE_DENY_ONLY cpu_to_le32(0xC00002B3)
+#define STATUS_FLOAT_MULTIPLE_FAULTS cpu_to_le32(0xC00002B4)
+#define STATUS_FLOAT_MULTIPLE_TRAPS cpu_to_le32(0xC00002B5)
+#define STATUS_DEVICE_REMOVED cpu_to_le32(0xC00002B6)
+#define STATUS_JOURNAL_DELETE_IN_PROGRESS cpu_to_le32(0xC00002B7)
+#define STATUS_JOURNAL_NOT_ACTIVE cpu_to_le32(0xC00002B8)
+#define STATUS_NOINTERFACE cpu_to_le32(0xC00002B9)
+#define STATUS_DS_ADMIN_LIMIT_EXCEEDED cpu_to_le32(0xC00002C1)
+#define STATUS_DRIVER_FAILED_SLEEP cpu_to_le32(0xC00002C2)
+#define STATUS_MUTUAL_AUTHENTICATION_FAILED cpu_to_le32(0xC00002C3)
+#define STATUS_CORRUPT_SYSTEM_FILE cpu_to_le32(0xC00002C4)
+#define STATUS_DATATYPE_MISALIGNMENT_ERROR cpu_to_le32(0xC00002C5)
+#define STATUS_WMI_READ_ONLY cpu_to_le32(0xC00002C6)
+#define STATUS_WMI_SET_FAILURE cpu_to_le32(0xC00002C7)
+#define STATUS_COMMITMENT_MINIMUM cpu_to_le32(0xC00002C8)
+#define STATUS_REG_NAT_CONSUMPTION cpu_to_le32(0xC00002C9)
+#define STATUS_TRANSPORT_FULL cpu_to_le32(0xC00002CA)
+#define STATUS_DS_SAM_INIT_FAILURE cpu_to_le32(0xC00002CB)
+#define STATUS_ONLY_IF_CONNECTED cpu_to_le32(0xC00002CC)
+#define STATUS_DS_SENSITIVE_GROUP_VIOLATION cpu_to_le32(0xC00002CD)
+#define STATUS_PNP_RESTART_ENUMERATION cpu_to_le32(0xC00002CE)
+#define STATUS_JOURNAL_ENTRY_DELETED cpu_to_le32(0xC00002CF)
+#define STATUS_DS_CANT_MOD_PRIMARYGROUPID cpu_to_le32(0xC00002D0)
+#define STATUS_SYSTEM_IMAGE_BAD_SIGNATURE cpu_to_le32(0xC00002D1)
+#define STATUS_PNP_REBOOT_REQUIRED cpu_to_le32(0xC00002D2)
+#define STATUS_POWER_STATE_INVALID cpu_to_le32(0xC00002D3)
+#define STATUS_DS_INVALID_GROUP_TYPE cpu_to_le32(0xC00002D4)
+#define STATUS_DS_NO_NEST_GLOBALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D5)
+#define STATUS_DS_NO_NEST_LOCALGROUP_IN_MIXEDDOMAIN cpu_to_le32(0xC00002D6)
+#define STATUS_DS_GLOBAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D7)
+#define STATUS_DS_GLOBAL_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC00002D8)
+#define STATUS_DS_UNIVERSAL_CANT_HAVE_LOCAL_MEMBER cpu_to_le32(0xC00002D9)
+#define STATUS_DS_GLOBAL_CANT_HAVE_CROSSDOMAIN_MEMBER cpu_to_le32(0xC00002DA)
+#define STATUS_DS_LOCAL_CANT_HAVE_CROSSDOMAIN_LOCAL_MEMBER \
+	cpu_to_le32(0xC00002DB)
+#define STATUS_DS_HAVE_PRIMARY_MEMBERS cpu_to_le32(0xC00002DC)
+#define STATUS_WMI_NOT_SUPPORTED cpu_to_le32(0xC00002DD)
+#define STATUS_INSUFFICIENT_POWER cpu_to_le32(0xC00002DE)
+#define STATUS_SAM_NEED_BOOTKEY_PASSWORD cpu_to_le32(0xC00002DF)
+#define STATUS_SAM_NEED_BOOTKEY_FLOPPY cpu_to_le32(0xC00002E0)
+#define STATUS_DS_CANT_START cpu_to_le32(0xC00002E1)
+#define STATUS_DS_INIT_FAILURE cpu_to_le32(0xC00002E2)
+#define STATUS_SAM_INIT_FAILURE cpu_to_le32(0xC00002E3)
+#define STATUS_DS_GC_REQUIRED cpu_to_le32(0xC00002E4)
+#define STATUS_DS_LOCAL_MEMBER_OF_LOCAL_ONLY cpu_to_le32(0xC00002E5)
+#define STATUS_DS_NO_FPO_IN_UNIVERSAL_GROUPS cpu_to_le32(0xC00002E6)
+#define STATUS_DS_MACHINE_ACCOUNT_QUOTA_EXCEEDED cpu_to_le32(0xC00002E7)
+#define STATUS_MULTIPLE_FAULT_VIOLATION cpu_to_le32(0xC00002E8)
+#define STATUS_CURRENT_DOMAIN_NOT_ALLOWED cpu_to_le32(0xC00002E9)
+#define STATUS_CANNOT_MAKE cpu_to_le32(0xC00002EA)
+#define STATUS_SYSTEM_SHUTDOWN cpu_to_le32(0xC00002EB)
+#define STATUS_DS_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002EC)
+#define STATUS_DS_SAM_INIT_FAILURE_CONSOLE cpu_to_le32(0xC00002ED)
+#define STATUS_UNFINISHED_CONTEXT_DELETED cpu_to_le32(0xC00002EE)
+#define STATUS_NO_TGT_REPLY cpu_to_le32(0xC00002EF)
+#define STATUS_OBJECTID_NOT_FOUND cpu_to_le32(0xC00002F0)
+#define STATUS_NO_IP_ADDRESSES cpu_to_le32(0xC00002F1)
+#define STATUS_WRONG_CREDENTIAL_HANDLE cpu_to_le32(0xC00002F2)
+#define STATUS_CRYPTO_SYSTEM_INVALID cpu_to_le32(0xC00002F3)
+#define STATUS_MAX_REFERRALS_EXCEEDED cpu_to_le32(0xC00002F4)
+#define STATUS_MUST_BE_KDC cpu_to_le32(0xC00002F5)
+#define STATUS_STRONG_CRYPTO_NOT_SUPPORTED cpu_to_le32(0xC00002F6)
+#define STATUS_TOO_MANY_PRINCIPALS cpu_to_le32(0xC00002F7)
+#define STATUS_NO_PA_DATA cpu_to_le32(0xC00002F8)
+#define STATUS_PKINIT_NAME_MISMATCH cpu_to_le32(0xC00002F9)
+#define STATUS_SMARTCARD_LOGON_REQUIRED cpu_to_le32(0xC00002FA)
+#define STATUS_KDC_INVALID_REQUEST cpu_to_le32(0xC00002FB)
+#define STATUS_KDC_UNABLE_TO_REFER cpu_to_le32(0xC00002FC)
+#define STATUS_KDC_UNKNOWN_ETYPE cpu_to_le32(0xC00002FD)
+#define STATUS_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FE)
+#define STATUS_SERVER_SHUTDOWN_IN_PROGRESS cpu_to_le32(0xC00002FF)
+#define STATUS_NOT_SUPPORTED_ON_SBS cpu_to_le32(0xC0000300)
+#define STATUS_WMI_GUID_DISCONNECTED cpu_to_le32(0xC0000301)
+#define STATUS_WMI_ALREADY_DISABLED cpu_to_le32(0xC0000302)
+#define STATUS_WMI_ALREADY_ENABLED cpu_to_le32(0xC0000303)
+#define STATUS_MFT_TOO_FRAGMENTED cpu_to_le32(0xC0000304)
+#define STATUS_COPY_PROTECTION_FAILURE cpu_to_le32(0xC0000305)
+#define STATUS_CSS_AUTHENTICATION_FAILURE cpu_to_le32(0xC0000306)
+#define STATUS_CSS_KEY_NOT_PRESENT cpu_to_le32(0xC0000307)
+#define STATUS_CSS_KEY_NOT_ESTABLISHED cpu_to_le32(0xC0000308)
+#define STATUS_CSS_SCRAMBLED_SECTOR cpu_to_le32(0xC0000309)
+#define STATUS_CSS_REGION_MISMATCH cpu_to_le32(0xC000030A)
+#define STATUS_CSS_RESETS_EXHAUSTED cpu_to_le32(0xC000030B)
+#define STATUS_PKINIT_FAILURE cpu_to_le32(0xC0000320)
+#define STATUS_SMARTCARD_SUBSYSTEM_FAILURE cpu_to_le32(0xC0000321)
+#define STATUS_NO_KERB_KEY cpu_to_le32(0xC0000322)
+#define STATUS_HOST_DOWN cpu_to_le32(0xC0000350)
+#define STATUS_UNSUPPORTED_PREAUTH cpu_to_le32(0xC0000351)
+#define STATUS_EFS_ALG_BLOB_TOO_BIG cpu_to_le32(0xC0000352)
+#define STATUS_PORT_NOT_SET cpu_to_le32(0xC0000353)
+#define STATUS_DEBUGGER_INACTIVE cpu_to_le32(0xC0000354)
+#define STATUS_DS_VERSION_CHECK_FAILURE cpu_to_le32(0xC0000355)
+#define STATUS_AUDITING_DISABLED cpu_to_le32(0xC0000356)
+#define STATUS_PRENT4_MACHINE_ACCOUNT cpu_to_le32(0xC0000357)
+#define STATUS_DS_AG_CANT_HAVE_UNIVERSAL_MEMBER cpu_to_le32(0xC0000358)
+#define STATUS_INVALID_IMAGE_WIN_32 cpu_to_le32(0xC0000359)
+#define STATUS_INVALID_IMAGE_WIN_64 cpu_to_le32(0xC000035A)
+#define STATUS_BAD_BINDINGS cpu_to_le32(0xC000035B)
+#define STATUS_NETWORK_SESSION_EXPIRED cpu_to_le32(0xC000035C)
+#define STATUS_APPHELP_BLOCK cpu_to_le32(0xC000035D)
+#define STATUS_ALL_SIDS_FILTERED cpu_to_le32(0xC000035E)
+#define STATUS_NOT_SAFE_MODE_DRIVER cpu_to_le32(0xC000035F)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_DEFAULT cpu_to_le32(0xC0000361)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PATH cpu_to_le32(0xC0000362)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_PUBLISHER cpu_to_le32(0xC0000363)
+#define STATUS_ACCESS_DISABLED_BY_POLICY_OTHER cpu_to_le32(0xC0000364)
+#define STATUS_FAILED_DRIVER_ENTRY cpu_to_le32(0xC0000365)
+#define STATUS_DEVICE_ENUMERATION_ERROR cpu_to_le32(0xC0000366)
+#define STATUS_MOUNT_POINT_NOT_RESOLVED cpu_to_le32(0xC0000368)
+#define STATUS_INVALID_DEVICE_OBJECT_PARAMETER cpu_to_le32(0xC0000369)
+#define STATUS_MCA_OCCURRED cpu_to_le32(0xC000036A)
+#define STATUS_DRIVER_BLOCKED_CRITICAL cpu_to_le32(0xC000036B)
+#define STATUS_DRIVER_BLOCKED cpu_to_le32(0xC000036C)
+#define STATUS_DRIVER_DATABASE_ERROR cpu_to_le32(0xC000036D)
+#define STATUS_SYSTEM_HIVE_TOO_LARGE cpu_to_le32(0xC000036E)
+#define STATUS_INVALID_IMPORT_OF_NON_DLL cpu_to_le32(0xC000036F)
+#define STATUS_NO_SECRETS cpu_to_le32(0xC0000371)
+#define STATUS_ACCESS_DISABLED_NO_SAFER_UI_BY_POLICY cpu_to_le32(0xC0000372)
+#define STATUS_FAILED_STACK_SWITCH cpu_to_le32(0xC0000373)
+#define STATUS_HEAP_CORRUPTION cpu_to_le32(0xC0000374)
+#define STATUS_SMARTCARD_WRONG_PIN cpu_to_le32(0xC0000380)
+#define STATUS_SMARTCARD_CARD_BLOCKED cpu_to_le32(0xC0000381)
+#define STATUS_SMARTCARD_CARD_NOT_AUTHENTICATED cpu_to_le32(0xC0000382)
+#define STATUS_SMARTCARD_NO_CARD cpu_to_le32(0xC0000383)
+#define STATUS_SMARTCARD_NO_KEY_CONTAINER cpu_to_le32(0xC0000384)
+#define STATUS_SMARTCARD_NO_CERTIFICATE cpu_to_le32(0xC0000385)
+#define STATUS_SMARTCARD_NO_KEYSET cpu_to_le32(0xC0000386)
+#define STATUS_SMARTCARD_IO_ERROR cpu_to_le32(0xC0000387)
+#define STATUS_DOWNGRADE_DETECTED cpu_to_le32(0xC0000388)
+#define STATUS_SMARTCARD_CERT_REVOKED cpu_to_le32(0xC0000389)
+#define STATUS_ISSUING_CA_UNTRUSTED cpu_to_le32(0xC000038A)
+#define STATUS_REVOCATION_OFFLINE_C cpu_to_le32(0xC000038B)
+#define STATUS_PKINIT_CLIENT_FAILURE cpu_to_le32(0xC000038C)
+#define STATUS_SMARTCARD_CERT_EXPIRED cpu_to_le32(0xC000038D)
+#define STATUS_DRIVER_FAILED_PRIOR_UNLOAD cpu_to_le32(0xC000038E)
+#define STATUS_SMARTCARD_SILENT_CONTEXT cpu_to_le32(0xC000038F)
+#define STATUS_PER_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000401)
+#define STATUS_ALL_USER_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000402)
+#define STATUS_USER_DELETE_TRUST_QUOTA_EXCEEDED cpu_to_le32(0xC0000403)
+#define STATUS_DS_NAME_NOT_UNIQUE cpu_to_le32(0xC0000404)
+#define STATUS_DS_DUPLICATE_ID_FOUND cpu_to_le32(0xC0000405)
+#define STATUS_DS_GROUP_CONVERSION_ERROR cpu_to_le32(0xC0000406)
+#define STATUS_VOLSNAP_PREPARE_HIBERNATE cpu_to_le32(0xC0000407)
+#define STATUS_USER2USER_REQUIRED cpu_to_le32(0xC0000408)
+#define STATUS_STACK_BUFFER_OVERRUN cpu_to_le32(0xC0000409)
+#define STATUS_NO_S4U_PROT_SUPPORT cpu_to_le32(0xC000040A)
+#define STATUS_CROSSREALM_DELEGATION_FAILURE cpu_to_le32(0xC000040B)
+#define STATUS_REVOCATION_OFFLINE_KDC cpu_to_le32(0xC000040C)
+#define STATUS_ISSUING_CA_UNTRUSTED_KDC cpu_to_le32(0xC000040D)
+#define STATUS_KDC_CERT_EXPIRED cpu_to_le32(0xC000040E)
+#define STATUS_KDC_CERT_REVOKED cpu_to_le32(0xC000040F)
+#define STATUS_PARAMETER_QUOTA_EXCEEDED cpu_to_le32(0xC0000410)
+#define STATUS_HIBERNATION_FAILURE cpu_to_le32(0xC0000411)
+#define STATUS_DELAY_LOAD_FAILED cpu_to_le32(0xC0000412)
+#define STATUS_AUTHENTICATION_FIREWALL_FAILED cpu_to_le32(0xC0000413)
+#define STATUS_VDM_DISALLOWED cpu_to_le32(0xC0000414)
+#define STATUS_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC0000415)
+#define STATUS_INSUFFICIENT_RESOURCE_FOR_SPECIFIED_SHARED_SECTION_SIZE \
+	cpu_to_le32(0xC0000416)
+#define STATUS_INVALID_CRUNTIME_PARAMETER cpu_to_le32(0xC0000417)
+#define STATUS_NTLM_BLOCKED cpu_to_le32(0xC0000418)
+#define STATUS_ASSERTION_FAILURE cpu_to_le32(0xC0000420)
+#define STATUS_VERIFIER_STOP cpu_to_le32(0xC0000421)
+#define STATUS_CALLBACK_POP_STACK cpu_to_le32(0xC0000423)
+#define STATUS_INCOMPATIBLE_DRIVER_BLOCKED cpu_to_le32(0xC0000424)
+#define STATUS_HIVE_UNLOADED cpu_to_le32(0xC0000425)
+#define STATUS_COMPRESSION_DISABLED cpu_to_le32(0xC0000426)
+#define STATUS_FILE_SYSTEM_LIMITATION cpu_to_le32(0xC0000427)
+#define STATUS_INVALID_IMAGE_HASH cpu_to_le32(0xC0000428)
+#define STATUS_NOT_CAPABLE cpu_to_le32(0xC0000429)
+#define STATUS_REQUEST_OUT_OF_SEQUENCE cpu_to_le32(0xC000042A)
+#define STATUS_IMPLEMENTATION_LIMIT cpu_to_le32(0xC000042B)
+#define STATUS_ELEVATION_REQUIRED cpu_to_le32(0xC000042C)
+#define STATUS_BEYOND_VDL cpu_to_le32(0xC0000432)
+#define STATUS_ENCOUNTERED_WRITE_IN_PROGRESS cpu_to_le32(0xC0000433)
+#define STATUS_PTE_CHANGED cpu_to_le32(0xC0000434)
+#define STATUS_PURGE_FAILED cpu_to_le32(0xC0000435)
+#define STATUS_CRED_REQUIRES_CONFIRMATION cpu_to_le32(0xC0000440)
+#define STATUS_CS_ENCRYPTION_INVALID_SERVER_RESPONSE cpu_to_le32(0xC0000441)
+#define STATUS_CS_ENCRYPTION_UNSUPPORTED_SERVER cpu_to_le32(0xC0000442)
+#define STATUS_CS_ENCRYPTION_EXISTING_ENCRYPTED_FILE cpu_to_le32(0xC0000443)
+#define STATUS_CS_ENCRYPTION_NEW_ENCRYPTED_FILE cpu_to_le32(0xC0000444)
+#define STATUS_CS_ENCRYPTION_FILE_NOT_CSE cpu_to_le32(0xC0000445)
+#define STATUS_INVALID_LABEL cpu_to_le32(0xC0000446)
+#define STATUS_DRIVER_PROCESS_TERMINATED cpu_to_le32(0xC0000450)
+#define STATUS_AMBIGUOUS_SYSTEM_DEVICE cpu_to_le32(0xC0000451)
+#define STATUS_SYSTEM_DEVICE_NOT_FOUND cpu_to_le32(0xC0000452)
+#define STATUS_RESTART_BOOT_APPLICATION cpu_to_le32(0xC0000453)
+#define STATUS_INVALID_TASK_NAME cpu_to_le32(0xC0000500)
+#define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501)
+#define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502)
+#define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503)
+#define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700)
+#define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701)
+#define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702)
+#define STATUS_REQUEST_CANCELED cpu_to_le32(0xC0000703)
+#define STATUS_RECURSIVE_DISPATCH cpu_to_le32(0xC0000704)
+#define STATUS_LPC_RECEIVE_BUFFER_EXPECTED cpu_to_le32(0xC0000705)
+#define STATUS_LPC_INVALID_CONNECTION_USAGE cpu_to_le32(0xC0000706)
+#define STATUS_LPC_REQUESTS_NOT_ALLOWED cpu_to_le32(0xC0000707)
+#define STATUS_RESOURCE_IN_USE cpu_to_le32(0xC0000708)
+#define STATUS_HARDWARE_MEMORY_ERROR cpu_to_le32(0xC0000709)
+#define STATUS_THREADPOOL_HANDLE_EXCEPTION cpu_to_le32(0xC000070A)
+#define STATUS_THREADPOOL_SET_EVENT_ON_COMPLETION_FAILED cpu_to_le32(0xC000070B)
+#define STATUS_THREADPOOL_RELEASE_SEMAPHORE_ON_COMPLETION_FAILED \
+	cpu_to_le32(0xC000070C)
+#define STATUS_THREADPOOL_RELEASE_MUTEX_ON_COMPLETION_FAILED \
+	cpu_to_le32(0xC000070D)
+#define STATUS_THREADPOOL_FREE_LIBRARY_ON_COMPLETION_FAILED \
+	cpu_to_le32(0xC000070E)
+#define STATUS_THREADPOOL_RELEASED_DURING_OPERATION cpu_to_le32(0xC000070F)
+#define STATUS_CALLBACK_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000710)
+#define STATUS_APC_RETURNED_WHILE_IMPERSONATING cpu_to_le32(0xC0000711)
+#define STATUS_PROCESS_IS_PROTECTED cpu_to_le32(0xC0000712)
+#define STATUS_MCA_EXCEPTION cpu_to_le32(0xC0000713)
+#define STATUS_CERTIFICATE_MAPPING_NOT_UNIQUE cpu_to_le32(0xC0000714)
+#define STATUS_SYMLINK_CLASS_DISABLED cpu_to_le32(0xC0000715)
+#define STATUS_INVALID_IDN_NORMALIZATION cpu_to_le32(0xC0000716)
+#define STATUS_NO_UNICODE_TRANSLATION cpu_to_le32(0xC0000717)
+#define STATUS_ALREADY_REGISTERED cpu_to_le32(0xC0000718)
+#define STATUS_CONTEXT_MISMATCH cpu_to_le32(0xC0000719)
+#define STATUS_PORT_ALREADY_HAS_COMPLETION_LIST cpu_to_le32(0xC000071A)
+#define STATUS_CALLBACK_RETURNED_THREAD_PRIORITY cpu_to_le32(0xC000071B)
+#define STATUS_INVALID_THREAD cpu_to_le32(0xC000071C)
+#define STATUS_CALLBACK_RETURNED_TRANSACTION cpu_to_le32(0xC000071D)
+#define STATUS_CALLBACK_RETURNED_LDR_LOCK cpu_to_le32(0xC000071E)
+#define STATUS_CALLBACK_RETURNED_LANG cpu_to_le32(0xC000071F)
+#define STATUS_CALLBACK_RETURNED_PRI_BACK cpu_to_le32(0xC0000720)
+#define STATUS_CALLBACK_RETURNED_THREAD_AFFINITY cpu_to_le32(0xC0000721)
+#define STATUS_DISK_REPAIR_DISABLED cpu_to_le32(0xC0000800)
+#define STATUS_DS_DOMAIN_RENAME_IN_PROGRESS cpu_to_le32(0xC0000801)
+#define STATUS_DISK_QUOTA_EXCEEDED cpu_to_le32(0xC0000802)
+#define STATUS_CONTENT_BLOCKED cpu_to_le32(0xC0000804)
+#define STATUS_BAD_CLUSTERS cpu_to_le32(0xC0000805)
+#define STATUS_VOLUME_DIRTY cpu_to_le32(0xC0000806)
+#define STATUS_FILE_CHECKED_OUT cpu_to_le32(0xC0000901)
+#define STATUS_CHECKOUT_REQUIRED cpu_to_le32(0xC0000902)
+#define STATUS_BAD_FILE_TYPE cpu_to_le32(0xC0000903)
+#define STATUS_FILE_TOO_LARGE cpu_to_le32(0xC0000904)
+#define STATUS_FORMS_AUTH_REQUIRED cpu_to_le32(0xC0000905)
+#define STATUS_VIRUS_INFECTED cpu_to_le32(0xC0000906)
+#define STATUS_VIRUS_DELETED cpu_to_le32(0xC0000907)
+#define STATUS_BAD_MCFG_TABLE cpu_to_le32(0xC0000908)
+#define STATUS_WOW_ASSERTION cpu_to_le32(0xC0009898)
+#define STATUS_INVALID_SIGNATURE cpu_to_le32(0xC000A000)
+#define STATUS_HMAC_NOT_SUPPORTED cpu_to_le32(0xC000A001)
+#define STATUS_IPSEC_QUEUE_OVERFLOW cpu_to_le32(0xC000A010)
+#define STATUS_ND_QUEUE_OVERFLOW cpu_to_le32(0xC000A011)
+#define STATUS_HOPLIMIT_EXCEEDED cpu_to_le32(0xC000A012)
+#define STATUS_PROTOCOL_NOT_SUPPORTED cpu_to_le32(0xC000A013)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_DISCONNECTED \
+	cpu_to_le32(0xC000A080)
+#define STATUS_LOST_WRITEBEHIND_DATA_NETWORK_SERVER_ERROR \
+	cpu_to_le32(0xC000A081)
+#define STATUS_LOST_WRITEBEHIND_DATA_LOCAL_DISK_ERROR cpu_to_le32(0xC000A082)
+#define STATUS_XML_PARSE_ERROR cpu_to_le32(0xC000A083)
+#define STATUS_XMLDSIG_ERROR cpu_to_le32(0xC000A084)
+#define STATUS_WRONG_COMPARTMENT cpu_to_le32(0xC000A085)
+#define STATUS_AUTHIP_FAILURE cpu_to_le32(0xC000A086)
+#define DBG_NO_STATE_CHANGE cpu_to_le32(0xC0010001)
+#define DBG_APP_NOT_IDLE cpu_to_le32(0xC0010002)
+#define RPC_NT_INVALID_STRING_BINDING cpu_to_le32(0xC0020001)
+#define RPC_NT_WRONG_KIND_OF_BINDING cpu_to_le32(0xC0020002)
+#define RPC_NT_INVALID_BINDING cpu_to_le32(0xC0020003)
+#define RPC_NT_PROTSEQ_NOT_SUPPORTED cpu_to_le32(0xC0020004)
+#define RPC_NT_INVALID_RPC_PROTSEQ cpu_to_le32(0xC0020005)
+#define RPC_NT_INVALID_STRING_UUID cpu_to_le32(0xC0020006)
+#define RPC_NT_INVALID_ENDPOINT_FORMAT
cpu_to_le32(0xC0020007)
+#define RPC_NT_INVALID_NET_ADDR cpu_to_le32(0xC0020008)
+#define RPC_NT_NO_ENDPOINT_FOUND cpu_to_le32(0xC0020009)
+#define RPC_NT_INVALID_TIMEOUT cpu_to_le32(0xC002000A)
+#define RPC_NT_OBJECT_NOT_FOUND cpu_to_le32(0xC002000B)
+#define RPC_NT_ALREADY_REGISTERED cpu_to_le32(0xC002000C)
+#define RPC_NT_TYPE_ALREADY_REGISTERED cpu_to_le32(0xC002000D)
+#define RPC_NT_ALREADY_LISTENING cpu_to_le32(0xC002000E)
+#define RPC_NT_NO_PROTSEQS_REGISTERED cpu_to_le32(0xC002000F)
+#define RPC_NT_NOT_LISTENING cpu_to_le32(0xC0020010)
+#define RPC_NT_UNKNOWN_MGR_TYPE cpu_to_le32(0xC0020011)
+#define RPC_NT_UNKNOWN_IF cpu_to_le32(0xC0020012)
+#define RPC_NT_NO_BINDINGS cpu_to_le32(0xC0020013)
+#define RPC_NT_NO_PROTSEQS cpu_to_le32(0xC0020014)
+#define RPC_NT_CANT_CREATE_ENDPOINT cpu_to_le32(0xC0020015)
+#define RPC_NT_OUT_OF_RESOURCES cpu_to_le32(0xC0020016)
+#define RPC_NT_SERVER_UNAVAILABLE cpu_to_le32(0xC0020017)
+#define RPC_NT_SERVER_TOO_BUSY cpu_to_le32(0xC0020018)
+#define RPC_NT_INVALID_NETWORK_OPTIONS cpu_to_le32(0xC0020019)
+#define RPC_NT_NO_CALL_ACTIVE cpu_to_le32(0xC002001A)
+#define RPC_NT_CALL_FAILED cpu_to_le32(0xC002001B)
+#define RPC_NT_CALL_FAILED_DNE cpu_to_le32(0xC002001C)
+#define RPC_NT_PROTOCOL_ERROR cpu_to_le32(0xC002001D)
+#define RPC_NT_UNSUPPORTED_TRANS_SYN cpu_to_le32(0xC002001F)
+#define RPC_NT_UNSUPPORTED_TYPE cpu_to_le32(0xC0020021)
+#define RPC_NT_INVALID_TAG cpu_to_le32(0xC0020022)
+#define RPC_NT_INVALID_BOUND cpu_to_le32(0xC0020023)
+#define RPC_NT_NO_ENTRY_NAME cpu_to_le32(0xC0020024)
+#define RPC_NT_INVALID_NAME_SYNTAX cpu_to_le32(0xC0020025)
+#define RPC_NT_UNSUPPORTED_NAME_SYNTAX cpu_to_le32(0xC0020026)
+#define RPC_NT_UUID_NO_ADDRESS cpu_to_le32(0xC0020028)
+#define RPC_NT_DUPLICATE_ENDPOINT cpu_to_le32(0xC0020029)
+#define RPC_NT_UNKNOWN_AUTHN_TYPE cpu_to_le32(0xC002002A)
+#define RPC_NT_MAX_CALLS_TOO_SMALL cpu_to_le32(0xC002002B)
+#define RPC_NT_STRING_TOO_LONG cpu_to_le32(0xC002002C)
+#define RPC_NT_PROTSEQ_NOT_FOUND cpu_to_le32(0xC002002D)
+#define RPC_NT_PROCNUM_OUT_OF_RANGE cpu_to_le32(0xC002002E)
+#define RPC_NT_BINDING_HAS_NO_AUTH cpu_to_le32(0xC002002F)
+#define RPC_NT_UNKNOWN_AUTHN_SERVICE cpu_to_le32(0xC0020030)
+#define RPC_NT_UNKNOWN_AUTHN_LEVEL cpu_to_le32(0xC0020031)
+#define RPC_NT_INVALID_AUTH_IDENTITY cpu_to_le32(0xC0020032)
+#define RPC_NT_UNKNOWN_AUTHZ_SERVICE cpu_to_le32(0xC0020033)
+#define EPT_NT_INVALID_ENTRY cpu_to_le32(0xC0020034)
+#define EPT_NT_CANT_PERFORM_OP cpu_to_le32(0xC0020035)
+#define EPT_NT_NOT_REGISTERED cpu_to_le32(0xC0020036)
+#define RPC_NT_NOTHING_TO_EXPORT cpu_to_le32(0xC0020037)
+#define RPC_NT_INCOMPLETE_NAME cpu_to_le32(0xC0020038)
+#define RPC_NT_INVALID_VERS_OPTION cpu_to_le32(0xC0020039)
+#define RPC_NT_NO_MORE_MEMBERS cpu_to_le32(0xC002003A)
+#define RPC_NT_NOT_ALL_OBJS_UNEXPORTED cpu_to_le32(0xC002003B)
+#define RPC_NT_INTERFACE_NOT_FOUND cpu_to_le32(0xC002003C)
+#define RPC_NT_ENTRY_ALREADY_EXISTS cpu_to_le32(0xC002003D)
+#define RPC_NT_ENTRY_NOT_FOUND cpu_to_le32(0xC002003E)
+#define RPC_NT_NAME_SERVICE_UNAVAILABLE cpu_to_le32(0xC002003F)
+#define RPC_NT_INVALID_NAF_ID cpu_to_le32(0xC0020040)
+#define RPC_NT_CANNOT_SUPPORT cpu_to_le32(0xC0020041)
+#define RPC_NT_NO_CONTEXT_AVAILABLE cpu_to_le32(0xC0020042)
+#define RPC_NT_INTERNAL_ERROR cpu_to_le32(0xC0020043)
+#define RPC_NT_ZERO_DIVIDE cpu_to_le32(0xC0020044)
+#define RPC_NT_ADDRESS_ERROR cpu_to_le32(0xC0020045)
+#define RPC_NT_FP_DIV_ZERO cpu_to_le32(0xC0020046)
+#define RPC_NT_FP_UNDERFLOW cpu_to_le32(0xC0020047)
+#define RPC_NT_FP_OVERFLOW cpu_to_le32(0xC0020048)
+#define RPC_NT_CALL_IN_PROGRESS cpu_to_le32(0xC0020049)
+#define RPC_NT_NO_MORE_BINDINGS cpu_to_le32(0xC002004A)
+#define RPC_NT_GROUP_MEMBER_NOT_FOUND cpu_to_le32(0xC002004B)
+#define EPT_NT_CANT_CREATE cpu_to_le32(0xC002004C)
+#define RPC_NT_INVALID_OBJECT cpu_to_le32(0xC002004D)
+#define RPC_NT_NO_INTERFACES cpu_to_le32(0xC002004F)
+#define RPC_NT_CALL_CANCELLED cpu_to_le32(0xC0020050)
+#define RPC_NT_BINDING_INCOMPLETE cpu_to_le32(0xC0020051)
+#define RPC_NT_COMM_FAILURE cpu_to_le32(0xC0020052)
+#define RPC_NT_UNSUPPORTED_AUTHN_LEVEL cpu_to_le32(0xC0020053)
+#define RPC_NT_NO_PRINC_NAME cpu_to_le32(0xC0020054)
+#define RPC_NT_NOT_RPC_ERROR cpu_to_le32(0xC0020055)
+#define RPC_NT_SEC_PKG_ERROR cpu_to_le32(0xC0020057)
+#define RPC_NT_NOT_CANCELLED cpu_to_le32(0xC0020058)
+#define RPC_NT_INVALID_ASYNC_HANDLE cpu_to_le32(0xC0020062)
+#define RPC_NT_INVALID_ASYNC_CALL cpu_to_le32(0xC0020063)
+#define RPC_NT_PROXY_ACCESS_DENIED cpu_to_le32(0xC0020064)
+#define RPC_NT_NO_MORE_ENTRIES cpu_to_le32(0xC0030001)
+#define RPC_NT_SS_CHAR_TRANS_OPEN_FAIL cpu_to_le32(0xC0030002)
+#define RPC_NT_SS_CHAR_TRANS_SHORT_FILE cpu_to_le32(0xC0030003)
+#define RPC_NT_SS_IN_NULL_CONTEXT cpu_to_le32(0xC0030004)
+#define RPC_NT_SS_CONTEXT_MISMATCH cpu_to_le32(0xC0030005)
+#define RPC_NT_SS_CONTEXT_DAMAGED cpu_to_le32(0xC0030006)
+#define RPC_NT_SS_HANDLES_MISMATCH cpu_to_le32(0xC0030007)
+#define RPC_NT_SS_CANNOT_GET_CALL_HANDLE cpu_to_le32(0xC0030008)
+#define RPC_NT_NULL_REF_POINTER cpu_to_le32(0xC0030009)
+#define RPC_NT_ENUM_VALUE_OUT_OF_RANGE cpu_to_le32(0xC003000A)
+#define RPC_NT_BYTE_COUNT_TOO_SMALL cpu_to_le32(0xC003000B)
+#define RPC_NT_BAD_STUB_DATA cpu_to_le32(0xC003000C)
+#define RPC_NT_INVALID_ES_ACTION cpu_to_le32(0xC0030059)
+#define RPC_NT_WRONG_ES_VERSION cpu_to_le32(0xC003005A)
+#define RPC_NT_WRONG_STUB_VERSION cpu_to_le32(0xC003005B)
+#define RPC_NT_INVALID_PIPE_OBJECT cpu_to_le32(0xC003005C)
+#define RPC_NT_INVALID_PIPE_OPERATION cpu_to_le32(0xC003005D)
+#define RPC_NT_WRONG_PIPE_VERSION cpu_to_le32(0xC003005E)
+#define RPC_NT_PIPE_CLOSED cpu_to_le32(0xC003005F)
+#define RPC_NT_PIPE_DISCIPLINE_ERROR cpu_to_le32(0xC0030060)
+#define RPC_NT_PIPE_EMPTY cpu_to_le32(0xC0030061)
+#define STATUS_PNP_BAD_MPS_TABLE cpu_to_le32(0xC0040035)
+#define STATUS_PNP_TRANSLATION_FAILED cpu_to_le32(0xC0040036)
+#define STATUS_PNP_IRQ_TRANSLATION_FAILED cpu_to_le32(0xC0040037)
+#define STATUS_PNP_INVALID_ID cpu_to_le32(0xC0040038)
+#define STATUS_IO_REISSUE_AS_CACHED cpu_to_le32(0xC0040039)
+#define STATUS_CTX_WINSTATION_NAME_INVALID cpu_to_le32(0xC00A0001)
+#define STATUS_CTX_INVALID_PD cpu_to_le32(0xC00A0002)
+#define STATUS_CTX_PD_NOT_FOUND cpu_to_le32(0xC00A0003)
+#define STATUS_CTX_CLOSE_PENDING cpu_to_le32(0xC00A0006)
+#define STATUS_CTX_NO_OUTBUF cpu_to_le32(0xC00A0007)
+#define STATUS_CTX_MODEM_INF_NOT_FOUND cpu_to_le32(0xC00A0008)
+#define STATUS_CTX_INVALID_MODEMNAME cpu_to_le32(0xC00A0009)
+#define STATUS_CTX_RESPONSE_ERROR cpu_to_le32(0xC00A000A)
+#define STATUS_CTX_MODEM_RESPONSE_TIMEOUT cpu_to_le32(0xC00A000B)
+#define STATUS_CTX_MODEM_RESPONSE_NO_CARRIER cpu_to_le32(0xC00A000C)
+#define STATUS_CTX_MODEM_RESPONSE_NO_DIALTONE cpu_to_le32(0xC00A000D)
+#define STATUS_CTX_MODEM_RESPONSE_BUSY cpu_to_le32(0xC00A000E)
+#define STATUS_CTX_MODEM_RESPONSE_VOICE cpu_to_le32(0xC00A000F)
+#define STATUS_CTX_TD_ERROR cpu_to_le32(0xC00A0010)
+#define STATUS_CTX_LICENSE_CLIENT_INVALID cpu_to_le32(0xC00A0012)
+#define STATUS_CTX_LICENSE_NOT_AVAILABLE
cpu_to_le32(0xC00A0013)
+#define STATUS_CTX_LICENSE_EXPIRED cpu_to_le32(0xC00A0014)
+#define STATUS_CTX_WINSTATION_NOT_FOUND cpu_to_le32(0xC00A0015)
+#define STATUS_CTX_WINSTATION_NAME_COLLISION cpu_to_le32(0xC00A0016)
+#define STATUS_CTX_WINSTATION_BUSY cpu_to_le32(0xC00A0017)
+#define STATUS_CTX_BAD_VIDEO_MODE cpu_to_le32(0xC00A0018)
+#define STATUS_CTX_GRAPHICS_INVALID cpu_to_le32(0xC00A0022)
+#define STATUS_CTX_NOT_CONSOLE cpu_to_le32(0xC00A0024)
+#define STATUS_CTX_CLIENT_QUERY_TIMEOUT cpu_to_le32(0xC00A0026)
+#define STATUS_CTX_CONSOLE_DISCONNECT cpu_to_le32(0xC00A0027)
+#define STATUS_CTX_CONSOLE_CONNECT cpu_to_le32(0xC00A0028)
+#define STATUS_CTX_SHADOW_DENIED cpu_to_le32(0xC00A002A)
+#define STATUS_CTX_WINSTATION_ACCESS_DENIED cpu_to_le32(0xC00A002B)
+#define STATUS_CTX_INVALID_WD cpu_to_le32(0xC00A002E)
+#define STATUS_CTX_WD_NOT_FOUND cpu_to_le32(0xC00A002F)
+#define STATUS_CTX_SHADOW_INVALID cpu_to_le32(0xC00A0030)
+#define STATUS_CTX_SHADOW_DISABLED cpu_to_le32(0xC00A0031)
+#define STATUS_RDP_PROTOCOL_ERROR cpu_to_le32(0xC00A0032)
+#define STATUS_CTX_CLIENT_LICENSE_NOT_SET cpu_to_le32(0xC00A0033)
+#define STATUS_CTX_CLIENT_LICENSE_IN_USE cpu_to_le32(0xC00A0034)
+#define STATUS_CTX_SHADOW_ENDED_BY_MODE_CHANGE cpu_to_le32(0xC00A0035)
+#define STATUS_CTX_SHADOW_NOT_RUNNING cpu_to_le32(0xC00A0036)
+#define STATUS_CTX_LOGON_DISABLED cpu_to_le32(0xC00A0037)
+#define STATUS_CTX_SECURITY_LAYER_ERROR cpu_to_le32(0xC00A0038)
+#define STATUS_TS_INCOMPATIBLE_SESSIONS cpu_to_le32(0xC00A0039)
+#define STATUS_MUI_FILE_NOT_FOUND cpu_to_le32(0xC00B0001)
+#define STATUS_MUI_INVALID_FILE cpu_to_le32(0xC00B0002)
+#define STATUS_MUI_INVALID_RC_CONFIG cpu_to_le32(0xC00B0003)
+#define STATUS_MUI_INVALID_LOCALE_NAME cpu_to_le32(0xC00B0004)
+#define STATUS_MUI_INVALID_ULTIMATEFALLBACK_NAME cpu_to_le32(0xC00B0005)
+#define STATUS_MUI_FILE_NOT_LOADED cpu_to_le32(0xC00B0006)
+#define STATUS_RESOURCE_ENUM_USER_STOP cpu_to_le32(0xC00B0007)
+#define STATUS_CLUSTER_INVALID_NODE cpu_to_le32(0xC0130001)
+#define STATUS_CLUSTER_NODE_EXISTS cpu_to_le32(0xC0130002)
+#define STATUS_CLUSTER_JOIN_IN_PROGRESS cpu_to_le32(0xC0130003)
+#define STATUS_CLUSTER_NODE_NOT_FOUND cpu_to_le32(0xC0130004)
+#define STATUS_CLUSTER_LOCAL_NODE_NOT_FOUND cpu_to_le32(0xC0130005)
+#define STATUS_CLUSTER_NETWORK_EXISTS cpu_to_le32(0xC0130006)
+#define STATUS_CLUSTER_NETWORK_NOT_FOUND cpu_to_le32(0xC0130007)
+#define STATUS_CLUSTER_NETINTERFACE_EXISTS cpu_to_le32(0xC0130008)
+#define STATUS_CLUSTER_NETINTERFACE_NOT_FOUND cpu_to_le32(0xC0130009)
+#define STATUS_CLUSTER_INVALID_REQUEST cpu_to_le32(0xC013000A)
+#define STATUS_CLUSTER_INVALID_NETWORK_PROVIDER cpu_to_le32(0xC013000B)
+#define STATUS_CLUSTER_NODE_DOWN cpu_to_le32(0xC013000C)
+#define STATUS_CLUSTER_NODE_UNREACHABLE cpu_to_le32(0xC013000D)
+#define STATUS_CLUSTER_NODE_NOT_MEMBER cpu_to_le32(0xC013000E)
+#define STATUS_CLUSTER_JOIN_NOT_IN_PROGRESS cpu_to_le32(0xC013000F)
+#define STATUS_CLUSTER_INVALID_NETWORK cpu_to_le32(0xC0130010)
+#define STATUS_CLUSTER_NO_NET_ADAPTERS cpu_to_le32(0xC0130011)
+#define STATUS_CLUSTER_NODE_UP cpu_to_le32(0xC0130012)
+#define STATUS_CLUSTER_NODE_PAUSED cpu_to_le32(0xC0130013)
+#define STATUS_CLUSTER_NODE_NOT_PAUSED cpu_to_le32(0xC0130014)
+#define STATUS_CLUSTER_NO_SECURITY_CONTEXT cpu_to_le32(0xC0130015)
+#define STATUS_CLUSTER_NETWORK_NOT_INTERNAL cpu_to_le32(0xC0130016)
+#define STATUS_CLUSTER_POISONED cpu_to_le32(0xC0130017)
+#define STATUS_ACPI_INVALID_OPCODE cpu_to_le32(0xC0140001)
+#define STATUS_ACPI_STACK_OVERFLOW cpu_to_le32(0xC0140002)
+#define STATUS_ACPI_ASSERT_FAILED cpu_to_le32(0xC0140003)
+#define STATUS_ACPI_INVALID_INDEX cpu_to_le32(0xC0140004)
+#define STATUS_ACPI_INVALID_ARGUMENT cpu_to_le32(0xC0140005)
+#define STATUS_ACPI_FATAL cpu_to_le32(0xC0140006)
+#define STATUS_ACPI_INVALID_SUPERNAME cpu_to_le32(0xC0140007)
+#define STATUS_ACPI_INVALID_ARGTYPE cpu_to_le32(0xC0140008)
+#define STATUS_ACPI_INVALID_OBJTYPE cpu_to_le32(0xC0140009)
+#define STATUS_ACPI_INVALID_TARGETTYPE cpu_to_le32(0xC014000A)
+#define STATUS_ACPI_INCORRECT_ARGUMENT_COUNT cpu_to_le32(0xC014000B)
+#define STATUS_ACPI_ADDRESS_NOT_MAPPED cpu_to_le32(0xC014000C)
+#define STATUS_ACPI_INVALID_EVENTTYPE cpu_to_le32(0xC014000D)
+#define STATUS_ACPI_HANDLER_COLLISION cpu_to_le32(0xC014000E)
+#define STATUS_ACPI_INVALID_DATA cpu_to_le32(0xC014000F)
+#define STATUS_ACPI_INVALID_REGION cpu_to_le32(0xC0140010)
+#define STATUS_ACPI_INVALID_ACCESS_SIZE cpu_to_le32(0xC0140011)
+#define STATUS_ACPI_ACQUIRE_GLOBAL_LOCK cpu_to_le32(0xC0140012)
+#define STATUS_ACPI_ALREADY_INITIALIZED cpu_to_le32(0xC0140013)
+#define STATUS_ACPI_NOT_INITIALIZED cpu_to_le32(0xC0140014)
+#define STATUS_ACPI_INVALID_MUTEX_LEVEL cpu_to_le32(0xC0140015)
+#define STATUS_ACPI_MUTEX_NOT_OWNED cpu_to_le32(0xC0140016)
+#define STATUS_ACPI_MUTEX_NOT_OWNER cpu_to_le32(0xC0140017)
+#define STATUS_ACPI_RS_ACCESS cpu_to_le32(0xC0140018)
+#define STATUS_ACPI_INVALID_TABLE cpu_to_le32(0xC0140019)
+#define STATUS_ACPI_REG_HANDLER_FAILED cpu_to_le32(0xC0140020)
+#define STATUS_ACPI_POWER_REQUEST_FAILED cpu_to_le32(0xC0140021)
+#define STATUS_SXS_SECTION_NOT_FOUND cpu_to_le32(0xC0150001)
+#define STATUS_SXS_CANT_GEN_ACTCTX cpu_to_le32(0xC0150002)
+#define STATUS_SXS_INVALID_ACTCTXDATA_FORMAT cpu_to_le32(0xC0150003)
+#define STATUS_SXS_ASSEMBLY_NOT_FOUND cpu_to_le32(0xC0150004)
+#define STATUS_SXS_MANIFEST_FORMAT_ERROR cpu_to_le32(0xC0150005)
+#define STATUS_SXS_MANIFEST_PARSE_ERROR cpu_to_le32(0xC0150006)
+#define STATUS_SXS_ACTIVATION_CONTEXT_DISABLED cpu_to_le32(0xC0150007)
+#define STATUS_SXS_KEY_NOT_FOUND cpu_to_le32(0xC0150008)
+#define STATUS_SXS_VERSION_CONFLICT cpu_to_le32(0xC0150009)
+#define STATUS_SXS_WRONG_SECTION_TYPE cpu_to_le32(0xC015000A)
+#define STATUS_SXS_THREAD_QUERIES_DISABLED cpu_to_le32(0xC015000B)
+#define STATUS_SXS_ASSEMBLY_MISSING cpu_to_le32(0xC015000C)
+#define STATUS_SXS_PROCESS_DEFAULT_ALREADY_SET cpu_to_le32(0xC015000E)
+#define STATUS_SXS_EARLY_DEACTIVATION cpu_to_le32(0xC015000F)
+#define STATUS_SXS_INVALID_DEACTIVATION cpu_to_le32(0xC0150010)
+#define STATUS_SXS_MULTIPLE_DEACTIVATION cpu_to_le32(0xC0150011)
+#define STATUS_SXS_SYSTEM_DEFAULT_ACTIVATION_CONTEXT_EMPTY \
+	cpu_to_le32(0xC0150012)
+#define STATUS_SXS_PROCESS_TERMINATION_REQUESTED cpu_to_le32(0xC0150013)
+#define STATUS_SXS_CORRUPT_ACTIVATION_STACK cpu_to_le32(0xC0150014)
+#define STATUS_SXS_CORRUPTION cpu_to_le32(0xC0150015)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_VALUE cpu_to_le32(0xC0150016)
+#define STATUS_SXS_INVALID_IDENTITY_ATTRIBUTE_NAME cpu_to_le32(0xC0150017)
+#define STATUS_SXS_IDENTITY_DUPLICATE_ATTRIBUTE cpu_to_le32(0xC0150018)
+#define STATUS_SXS_IDENTITY_PARSE_ERROR cpu_to_le32(0xC0150019)
+#define STATUS_SXS_COMPONENT_STORE_CORRUPT cpu_to_le32(0xC015001A)
+#define STATUS_SXS_FILE_HASH_MISMATCH cpu_to_le32(0xC015001B)
+#define STATUS_SXS_MANIFEST_IDENTITY_SAME_BUT_CONTENTS_DIFFERENT \
+	cpu_to_le32(0xC015001C)
+#define STATUS_SXS_IDENTITIES_DIFFERENT cpu_to_le32(0xC015001D)
+#define STATUS_SXS_ASSEMBLY_IS_NOT_A_DEPLOYMENT cpu_to_le32(0xC015001E)
+#define STATUS_SXS_FILE_NOT_PART_OF_ASSEMBLY cpu_to_le32(0xC015001F)
+#define STATUS_ADVANCED_INSTALLER_FAILED cpu_to_le32(0xC0150020)
+#define STATUS_XML_ENCODING_MISMATCH cpu_to_le32(0xC0150021)
+#define STATUS_SXS_MANIFEST_TOO_BIG cpu_to_le32(0xC0150022)
+#define STATUS_SXS_SETTING_NOT_REGISTERED cpu_to_le32(0xC0150023)
+#define STATUS_SXS_TRANSACTION_CLOSURE_INCOMPLETE cpu_to_le32(0xC0150024)
+#define STATUS_SMI_PRIMITIVE_INSTALLER_FAILED cpu_to_le32(0xC0150025)
+#define STATUS_GENERIC_COMMAND_FAILED cpu_to_le32(0xC0150026)
+#define STATUS_SXS_FILE_HASH_MISSING cpu_to_le32(0xC0150027)
+#define STATUS_TRANSACTIONAL_CONFLICT cpu_to_le32(0xC0190001)
+#define STATUS_INVALID_TRANSACTION cpu_to_le32(0xC0190002)
+#define STATUS_TRANSACTION_NOT_ACTIVE cpu_to_le32(0xC0190003)
+#define STATUS_TM_INITIALIZATION_FAILED cpu_to_le32(0xC0190004)
+#define STATUS_RM_NOT_ACTIVE cpu_to_le32(0xC0190005)
+#define STATUS_RM_METADATA_CORRUPT cpu_to_le32(0xC0190006)
+#define STATUS_TRANSACTION_NOT_JOINED cpu_to_le32(0xC0190007)
+#define STATUS_DIRECTORY_NOT_RM cpu_to_le32(0xC0190008)
+#define STATUS_TRANSACTIONS_UNSUPPORTED_REMOTE cpu_to_le32(0xC019000A)
+#define STATUS_LOG_RESIZE_INVALID_SIZE cpu_to_le32(0xC019000B)
+#define STATUS_REMOTE_FILE_VERSION_MISMATCH cpu_to_le32(0xC019000C)
+#define STATUS_CRM_PROTOCOL_ALREADY_EXISTS cpu_to_le32(0xC019000F)
+#define STATUS_TRANSACTION_PROPAGATION_FAILED cpu_to_le32(0xC0190010)
+#define STATUS_CRM_PROTOCOL_NOT_FOUND cpu_to_le32(0xC0190011)
+#define STATUS_TRANSACTION_SUPERIOR_EXISTS cpu_to_le32(0xC0190012)
+#define STATUS_TRANSACTION_REQUEST_NOT_VALID cpu_to_le32(0xC0190013)
+#define STATUS_TRANSACTION_NOT_REQUESTED cpu_to_le32(0xC0190014)
+#define STATUS_TRANSACTION_ALREADY_ABORTED cpu_to_le32(0xC0190015)
+#define STATUS_TRANSACTION_ALREADY_COMMITTED cpu_to_le32(0xC0190016)
+#define STATUS_TRANSACTION_INVALID_MARSHALL_BUFFER cpu_to_le32(0xC0190017)
+#define STATUS_CURRENT_TRANSACTION_NOT_VALID cpu_to_le32(0xC0190018)
+#define STATUS_LOG_GROWTH_FAILED cpu_to_le32(0xC0190019)
+#define STATUS_OBJECT_NO_LONGER_EXISTS cpu_to_le32(0xC0190021)
+#define STATUS_STREAM_MINIVERSION_NOT_FOUND cpu_to_le32(0xC0190022)
+#define STATUS_STREAM_MINIVERSION_NOT_VALID cpu_to_le32(0xC0190023)
+#define STATUS_MINIVERSION_INACCESSIBLE_FROM_SPECIFIED_TRANSACTION \
+	cpu_to_le32(0xC0190024)
+#define STATUS_CANT_OPEN_MINIVERSION_WITH_MODIFY_INTENT cpu_to_le32(0xC0190025)
+#define STATUS_CANT_CREATE_MORE_STREAM_MINIVERSIONS cpu_to_le32(0xC0190026)
+#define STATUS_HANDLE_NO_LONGER_VALID cpu_to_le32(0xC0190028)
+#define STATUS_LOG_CORRUPTION_DETECTED cpu_to_le32(0xC0190030)
+#define STATUS_RM_DISCONNECTED cpu_to_le32(0xC0190032)
+#define STATUS_ENLISTMENT_NOT_SUPERIOR cpu_to_le32(0xC0190033)
+#define STATUS_FILE_IDENTITY_NOT_PERSISTENT cpu_to_le32(0xC0190036)
+#define STATUS_CANT_BREAK_TRANSACTIONAL_DEPENDENCY cpu_to_le32(0xC0190037)
+#define STATUS_CANT_CROSS_RM_BOUNDARY cpu_to_le32(0xC0190038)
+#define STATUS_TXF_DIR_NOT_EMPTY cpu_to_le32(0xC0190039)
+#define STATUS_INDOUBT_TRANSACTIONS_EXIST cpu_to_le32(0xC019003A)
+#define STATUS_TM_VOLATILE cpu_to_le32(0xC019003B)
+#define STATUS_ROLLBACK_TIMER_EXPIRED cpu_to_le32(0xC019003C)
+#define STATUS_TXF_ATTRIBUTE_CORRUPT cpu_to_le32(0xC019003D)
+#define STATUS_EFS_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC019003E)
+#define STATUS_TRANSACTIONAL_OPEN_NOT_ALLOWED cpu_to_le32(0xC019003F)
+#define STATUS_TRANSACTED_MAPPING_UNSUPPORTED_REMOTE cpu_to_le32(0xC0190040)
+#define STATUS_TRANSACTION_REQUIRED_PROMOTION cpu_to_le32(0xC0190043)
+#define STATUS_CANNOT_EXECUTE_FILE_IN_TRANSACTION cpu_to_le32(0xC0190044)
+#define STATUS_TRANSACTIONS_NOT_FROZEN cpu_to_le32(0xC0190045)
+#define STATUS_TRANSACTION_FREEZE_IN_PROGRESS cpu_to_le32(0xC0190046)
+#define STATUS_NOT_SNAPSHOT_VOLUME cpu_to_le32(0xC0190047)
+#define STATUS_NO_SAVEPOINT_WITH_OPEN_FILES cpu_to_le32(0xC0190048)
+#define STATUS_SPARSE_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190049)
+#define STATUS_TM_IDENTITY_MISMATCH cpu_to_le32(0xC019004A)
+#define STATUS_FLOATED_SECTION cpu_to_le32(0xC019004B)
+#define STATUS_CANNOT_ACCEPT_TRANSACTED_WORK cpu_to_le32(0xC019004C)
+#define STATUS_CANNOT_ABORT_TRANSACTIONS cpu_to_le32(0xC019004D)
+#define STATUS_TRANSACTION_NOT_FOUND cpu_to_le32(0xC019004E)
+#define STATUS_RESOURCEMANAGER_NOT_FOUND cpu_to_le32(0xC019004F)
+#define STATUS_ENLISTMENT_NOT_FOUND cpu_to_le32(0xC0190050)
+#define STATUS_TRANSACTIONMANAGER_NOT_FOUND cpu_to_le32(0xC0190051)
+#define STATUS_TRANSACTIONMANAGER_NOT_ONLINE cpu_to_le32(0xC0190052)
+#define STATUS_TRANSACTIONMANAGER_RECOVERY_NAME_COLLISION \
+	cpu_to_le32(0xC0190053)
+#define STATUS_TRANSACTION_NOT_ROOT cpu_to_le32(0xC0190054)
+#define STATUS_TRANSACTION_OBJECT_EXPIRED cpu_to_le32(0xC0190055)
+#define STATUS_COMPRESSION_NOT_ALLOWED_IN_TRANSACTION cpu_to_le32(0xC0190056)
+#define STATUS_TRANSACTION_RESPONSE_NOT_ENLISTED cpu_to_le32(0xC0190057)
+#define STATUS_TRANSACTION_RECORD_TOO_LONG cpu_to_le32(0xC0190058)
+#define STATUS_NO_LINK_TRACKING_IN_TRANSACTION cpu_to_le32(0xC0190059)
+#define STATUS_OPERATION_NOT_SUPPORTED_IN_TRANSACTION cpu_to_le32(0xC019005A)
+#define STATUS_TRANSACTION_INTEGRITY_VIOLATED cpu_to_le32(0xC019005B)
+#define STATUS_LOG_SECTOR_INVALID cpu_to_le32(0xC01A0001)
+#define STATUS_LOG_SECTOR_PARITY_INVALID cpu_to_le32(0xC01A0002)
+#define STATUS_LOG_SECTOR_REMAPPED cpu_to_le32(0xC01A0003)
+#define STATUS_LOG_BLOCK_INCOMPLETE cpu_to_le32(0xC01A0004)
+#define STATUS_LOG_INVALID_RANGE cpu_to_le32(0xC01A0005)
+#define STATUS_LOG_BLOCKS_EXHAUSTED cpu_to_le32(0xC01A0006)
+#define STATUS_LOG_READ_CONTEXT_INVALID cpu_to_le32(0xC01A0007)
+#define STATUS_LOG_RESTART_INVALID cpu_to_le32(0xC01A0008)
+#define STATUS_LOG_BLOCK_VERSION cpu_to_le32(0xC01A0009)
+#define STATUS_LOG_BLOCK_INVALID cpu_to_le32(0xC01A000A)
+#define STATUS_LOG_READ_MODE_INVALID cpu_to_le32(0xC01A000B)
+#define STATUS_LOG_METADATA_CORRUPT cpu_to_le32(0xC01A000D)
+#define STATUS_LOG_METADATA_INVALID cpu_to_le32(0xC01A000E)
+#define STATUS_LOG_METADATA_INCONSISTENT cpu_to_le32(0xC01A000F)
+#define STATUS_LOG_RESERVATION_INVALID cpu_to_le32(0xC01A0010)
+#define STATUS_LOG_CANT_DELETE cpu_to_le32(0xC01A0011)
+#define STATUS_LOG_CONTAINER_LIMIT_EXCEEDED cpu_to_le32(0xC01A0012)
+#define STATUS_LOG_START_OF_LOG cpu_to_le32(0xC01A0013)
+#define STATUS_LOG_POLICY_ALREADY_INSTALLED cpu_to_le32(0xC01A0014)
+#define STATUS_LOG_POLICY_NOT_INSTALLED cpu_to_le32(0xC01A0015)
+#define STATUS_LOG_POLICY_INVALID cpu_to_le32(0xC01A0016)
+#define STATUS_LOG_POLICY_CONFLICT cpu_to_le32(0xC01A0017)
+#define STATUS_LOG_PINNED_ARCHIVE_TAIL cpu_to_le32(0xC01A0018)
+#define STATUS_LOG_RECORD_NONEXISTENT cpu_to_le32(0xC01A0019)
+#define STATUS_LOG_RECORDS_RESERVED_INVALID cpu_to_le32(0xC01A001A)
+#define STATUS_LOG_SPACE_RESERVED_INVALID cpu_to_le32(0xC01A001B)
+#define STATUS_LOG_TAIL_INVALID cpu_to_le32(0xC01A001C)
+#define STATUS_LOG_FULL cpu_to_le32(0xC01A001D)
+#define STATUS_LOG_MULTIPLEXED cpu_to_le32(0xC01A001E)
+#define STATUS_LOG_DEDICATED cpu_to_le32(0xC01A001F)
+#define STATUS_LOG_ARCHIVE_NOT_IN_PROGRESS cpu_to_le32(0xC01A0020)
+#define STATUS_LOG_ARCHIVE_IN_PROGRESS cpu_to_le32(0xC01A0021)
+#define STATUS_LOG_EPHEMERAL cpu_to_le32(0xC01A0022)
+#define STATUS_LOG_NOT_ENOUGH_CONTAINERS cpu_to_le32(0xC01A0023)
+#define STATUS_LOG_CLIENT_ALREADY_REGISTERED cpu_to_le32(0xC01A0024)
+#define STATUS_LOG_CLIENT_NOT_REGISTERED cpu_to_le32(0xC01A0025)
+#define STATUS_LOG_FULL_HANDLER_IN_PROGRESS cpu_to_le32(0xC01A0026)
+#define STATUS_LOG_CONTAINER_READ_FAILED cpu_to_le32(0xC01A0027)
+#define STATUS_LOG_CONTAINER_WRITE_FAILED cpu_to_le32(0xC01A0028)
+#define STATUS_LOG_CONTAINER_OPEN_FAILED cpu_to_le32(0xC01A0029)
+#define STATUS_LOG_CONTAINER_STATE_INVALID cpu_to_le32(0xC01A002A)
+#define STATUS_LOG_STATE_INVALID cpu_to_le32(0xC01A002B)
+#define STATUS_LOG_PINNED cpu_to_le32(0xC01A002C)
+#define STATUS_LOG_METADATA_FLUSH_FAILED cpu_to_le32(0xC01A002D)
+#define STATUS_LOG_INCONSISTENT_SECURITY cpu_to_le32(0xC01A002E)
+#define STATUS_LOG_APPENDED_FLUSH_FAILED cpu_to_le32(0xC01A002F)
+#define STATUS_LOG_PINNED_RESERVATION cpu_to_le32(0xC01A0030)
+#define STATUS_VIDEO_HUNG_DISPLAY_DRIVER_THREAD cpu_to_le32(0xC01B00EA)
+#define STATUS_FLT_NO_HANDLER_DEFINED cpu_to_le32(0xC01C0001)
+#define STATUS_FLT_CONTEXT_ALREADY_DEFINED cpu_to_le32(0xC01C0002)
+#define STATUS_FLT_INVALID_ASYNCHRONOUS_REQUEST cpu_to_le32(0xC01C0003)
+#define STATUS_FLT_DISALLOW_FAST_IO cpu_to_le32(0xC01C0004)
+#define STATUS_FLT_INVALID_NAME_REQUEST cpu_to_le32(0xC01C0005)
+#define STATUS_FLT_NOT_SAFE_TO_POST_OPERATION cpu_to_le32(0xC01C0006)
+#define STATUS_FLT_NOT_INITIALIZED cpu_to_le32(0xC01C0007)
+#define STATUS_FLT_FILTER_NOT_READY cpu_to_le32(0xC01C0008)
+#define STATUS_FLT_POST_OPERATION_CLEANUP cpu_to_le32(0xC01C0009)
+#define STATUS_FLT_INTERNAL_ERROR cpu_to_le32(0xC01C000A)
+#define STATUS_FLT_DELETING_OBJECT cpu_to_le32(0xC01C000B)
+#define STATUS_FLT_MUST_BE_NONPAGED_POOL cpu_to_le32(0xC01C000C)
+#define STATUS_FLT_DUPLICATE_ENTRY cpu_to_le32(0xC01C000D)
+#define STATUS_FLT_CBDQ_DISABLED cpu_to_le32(0xC01C000E)
+#define STATUS_FLT_DO_NOT_ATTACH cpu_to_le32(0xC01C000F)
+#define STATUS_FLT_DO_NOT_DETACH cpu_to_le32(0xC01C0010)
+#define STATUS_FLT_INSTANCE_ALTITUDE_COLLISION cpu_to_le32(0xC01C0011)
+#define STATUS_FLT_INSTANCE_NAME_COLLISION cpu_to_le32(0xC01C0012)
+#define STATUS_FLT_FILTER_NOT_FOUND cpu_to_le32(0xC01C0013)
+#define STATUS_FLT_VOLUME_NOT_FOUND cpu_to_le32(0xC01C0014)
+#define STATUS_FLT_INSTANCE_NOT_FOUND cpu_to_le32(0xC01C0015)
+#define STATUS_FLT_CONTEXT_ALLOCATION_NOT_FOUND cpu_to_le32(0xC01C0016)
+#define STATUS_FLT_INVALID_CONTEXT_REGISTRATION cpu_to_le32(0xC01C0017)
+#define STATUS_FLT_NAME_CACHE_MISS cpu_to_le32(0xC01C0018)
+#define STATUS_FLT_NO_DEVICE_OBJECT cpu_to_le32(0xC01C0019)
+#define STATUS_FLT_VOLUME_ALREADY_MOUNTED cpu_to_le32(0xC01C001A)
+#define STATUS_FLT_ALREADY_ENLISTED cpu_to_le32(0xC01C001B)
+#define STATUS_FLT_CONTEXT_ALREADY_LINKED cpu_to_le32(0xC01C001C)
+#define STATUS_FLT_NO_WAITER_FOR_REPLY cpu_to_le32(0xC01C0020)
+#define STATUS_MONITOR_NO_DESCRIPTOR cpu_to_le32(0xC01D0001)
+#define STATUS_MONITOR_UNKNOWN_DESCRIPTOR_FORMAT cpu_to_le32(0xC01D0002)
+#define STATUS_MONITOR_INVALID_DESCRIPTOR_CHECKSUM cpu_to_le32(0xC01D0003)
+#define STATUS_MONITOR_INVALID_STANDARD_TIMING_BLOCK cpu_to_le32(0xC01D0004)
+#define STATUS_MONITOR_WMI_DATABLOCK_REGISTRATION_FAILED cpu_to_le32(0xC01D0005)
+#define STATUS_MONITOR_INVALID_SERIAL_NUMBER_MONDSC_BLOCK \
+	cpu_to_le32(0xC01D0006)
+#define STATUS_MONITOR_INVALID_USER_FRIENDLY_MONDSC_BLOCK \
+	cpu_to_le32(0xC01D0007)
+#define STATUS_MONITOR_NO_MORE_DESCRIPTOR_DATA
cpu_to_le32(0xC01D0008)
+#define STATUS_MONITOR_INVALID_DETAILED_TIMING_BLOCK cpu_to_le32(0xC01D0009)
+#define STATUS_GRAPHICS_NOT_EXCLUSIVE_MODE_OWNER cpu_to_le32(0xC01E0000)
+#define STATUS_GRAPHICS_INSUFFICIENT_DMA_BUFFER cpu_to_le32(0xC01E0001)
+#define STATUS_GRAPHICS_INVALID_DISPLAY_ADAPTER cpu_to_le32(0xC01E0002)
+#define STATUS_GRAPHICS_ADAPTER_WAS_RESET cpu_to_le32(0xC01E0003)
+#define STATUS_GRAPHICS_INVALID_DRIVER_MODEL cpu_to_le32(0xC01E0004)
+#define STATUS_GRAPHICS_PRESENT_MODE_CHANGED cpu_to_le32(0xC01E0005)
+#define STATUS_GRAPHICS_PRESENT_OCCLUDED cpu_to_le32(0xC01E0006)
+#define STATUS_GRAPHICS_PRESENT_DENIED cpu_to_le32(0xC01E0007)
+#define STATUS_GRAPHICS_CANNOTCOLORCONVERT cpu_to_le32(0xC01E0008)
+#define STATUS_GRAPHICS_NO_VIDEO_MEMORY cpu_to_le32(0xC01E0100)
+#define STATUS_GRAPHICS_CANT_LOCK_MEMORY cpu_to_le32(0xC01E0101)
+#define STATUS_GRAPHICS_ALLOCATION_BUSY cpu_to_le32(0xC01E0102)
+#define STATUS_GRAPHICS_TOO_MANY_REFERENCES cpu_to_le32(0xC01E0103)
+#define STATUS_GRAPHICS_TRY_AGAIN_LATER cpu_to_le32(0xC01E0104)
+#define STATUS_GRAPHICS_TRY_AGAIN_NOW cpu_to_le32(0xC01E0105)
+#define STATUS_GRAPHICS_ALLOCATION_INVALID cpu_to_le32(0xC01E0106)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNAVAILABLE cpu_to_le32(0xC01E0107)
+#define STATUS_GRAPHICS_UNSWIZZLING_APERTURE_UNSUPPORTED cpu_to_le32(0xC01E0108)
+#define STATUS_GRAPHICS_CANT_EVICT_PINNED_ALLOCATION cpu_to_le32(0xC01E0109)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_USAGE cpu_to_le32(0xC01E0110)
+#define STATUS_GRAPHICS_CANT_RENDER_LOCKED_ALLOCATION cpu_to_le32(0xC01E0111)
+#define STATUS_GRAPHICS_ALLOCATION_CLOSED cpu_to_le32(0xC01E0112)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_INSTANCE cpu_to_le32(0xC01E0113)
+#define STATUS_GRAPHICS_INVALID_ALLOCATION_HANDLE cpu_to_le32(0xC01E0114)
+#define STATUS_GRAPHICS_WRONG_ALLOCATION_DEVICE cpu_to_le32(0xC01E0115)
+#define STATUS_GRAPHICS_ALLOCATION_CONTENT_LOST cpu_to_le32(0xC01E0116)
+#define STATUS_GRAPHICS_GPU_EXCEPTION_ON_DEVICE cpu_to_le32(0xC01E0200)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0300)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_NOT_SUPPORTED cpu_to_le32(0xC01E0301)
+#define STATUS_GRAPHICS_VIDPN_TOPOLOGY_CURRENTLY_NOT_SUPPORTED \
+	cpu_to_le32(0xC01E0302)
+#define STATUS_GRAPHICS_INVALID_VIDPN cpu_to_le32(0xC01E0303)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE cpu_to_le32(0xC01E0304)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET cpu_to_le32(0xC01E0305)
+#define STATUS_GRAPHICS_VIDPN_MODALITY_NOT_SUPPORTED cpu_to_le32(0xC01E0306)
+#define STATUS_GRAPHICS_INVALID_VIDPN_SOURCEMODESET cpu_to_le32(0xC01E0308)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGETMODESET cpu_to_le32(0xC01E0309)
+#define STATUS_GRAPHICS_INVALID_FREQUENCY cpu_to_le32(0xC01E030A)
+#define STATUS_GRAPHICS_INVALID_ACTIVE_REGION cpu_to_le32(0xC01E030B)
+#define STATUS_GRAPHICS_INVALID_TOTAL_REGION cpu_to_le32(0xC01E030C)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_SOURCE_MODE \
+	cpu_to_le32(0xC01E0310)
+#define STATUS_GRAPHICS_INVALID_VIDEO_PRESENT_TARGET_MODE \
+	cpu_to_le32(0xC01E0311)
+#define STATUS_GRAPHICS_PINNED_MODE_MUST_REMAIN_IN_SET cpu_to_le32(0xC01E0312)
+#define STATUS_GRAPHICS_PATH_ALREADY_IN_TOPOLOGY cpu_to_le32(0xC01E0313)
+#define STATUS_GRAPHICS_MODE_ALREADY_IN_MODESET cpu_to_le32(0xC01E0314)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTSOURCESET cpu_to_le32(0xC01E0315)
+#define STATUS_GRAPHICS_INVALID_VIDEOPRESENTTARGETSET cpu_to_le32(0xC01E0316)
+#define STATUS_GRAPHICS_SOURCE_ALREADY_IN_SET cpu_to_le32(0xC01E0317)
+#define STATUS_GRAPHICS_TARGET_ALREADY_IN_SET cpu_to_le32(0xC01E0318)
+#define STATUS_GRAPHICS_INVALID_VIDPN_PRESENT_PATH cpu_to_le32(0xC01E0319)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_VIDPN_TOPOLOGY cpu_to_le32(0xC01E031A)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGESET \
+	cpu_to_le32(0xC01E031B)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE cpu_to_le32(0xC01E031C)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_NOT_IN_SET cpu_to_le32(0xC01E031D)
+#define STATUS_GRAPHICS_FREQUENCYRANGE_ALREADY_IN_SET cpu_to_le32(0xC01E031F)
+#define STATUS_GRAPHICS_STALE_MODESET cpu_to_le32(0xC01E0320)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCEMODESET cpu_to_le32(0xC01E0321)
+#define STATUS_GRAPHICS_INVALID_MONITOR_SOURCE_MODE cpu_to_le32(0xC01E0322)
+#define STATUS_GRAPHICS_NO_RECOMMENDED_FUNCTIONAL_VIDPN cpu_to_le32(0xC01E0323)
+#define STATUS_GRAPHICS_MODE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0324)
+#define STATUS_GRAPHICS_EMPTY_ADAPTER_MONITOR_MODE_SUPPORT_INTERSECTION \
+	cpu_to_le32(0xC01E0325)
+#define STATUS_GRAPHICS_VIDEO_PRESENT_TARGETS_LESS_THAN_SOURCES \
+	cpu_to_le32(0xC01E0326)
+#define STATUS_GRAPHICS_PATH_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0327)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_SOURCE \
+	cpu_to_le32(0xC01E0328)
+#define STATUS_GRAPHICS_ADAPTER_MUST_HAVE_AT_LEAST_ONE_TARGET \
+	cpu_to_le32(0xC01E0329)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTORSET cpu_to_le32(0xC01E032A)
+#define STATUS_GRAPHICS_INVALID_MONITORDESCRIPTOR cpu_to_le32(0xC01E032B)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_NOT_IN_SET cpu_to_le32(0xC01E032C)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ALREADY_IN_SET cpu_to_le32(0xC01E032D)
+#define STATUS_GRAPHICS_MONITORDESCRIPTOR_ID_MUST_BE_UNIQUE \
+	cpu_to_le32(0xC01E032E)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TARGET_SUBSET_TYPE cpu_to_le32(0xC01E032F)
+#define STATUS_GRAPHICS_RESOURCES_NOT_RELATED cpu_to_le32(0xC01E0330)
+#define STATUS_GRAPHICS_SOURCE_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0331)
+#define STATUS_GRAPHICS_TARGET_ID_MUST_BE_UNIQUE cpu_to_le32(0xC01E0332)
+#define STATUS_GRAPHICS_NO_AVAILABLE_VIDPN_TARGET cpu_to_le32(0xC01E0333)
+#define STATUS_GRAPHICS_MONITOR_COULD_NOT_BE_ASSOCIATED_WITH_ADAPTER \
+	cpu_to_le32(0xC01E0334)
+#define STATUS_GRAPHICS_NO_VIDPNMGR cpu_to_le32(0xC01E0335)
+#define STATUS_GRAPHICS_NO_ACTIVE_VIDPN cpu_to_le32(0xC01E0336)
+#define STATUS_GRAPHICS_STALE_VIDPN_TOPOLOGY cpu_to_le32(0xC01E0337)
+#define STATUS_GRAPHICS_MONITOR_NOT_CONNECTED cpu_to_le32(0xC01E0338)
+#define STATUS_GRAPHICS_SOURCE_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0339)
+#define STATUS_GRAPHICS_INVALID_PRIMARYSURFACE_SIZE cpu_to_le32(0xC01E033A)
+#define STATUS_GRAPHICS_INVALID_VISIBLEREGION_SIZE cpu_to_le32(0xC01E033B)
+#define STATUS_GRAPHICS_INVALID_STRIDE cpu_to_le32(0xC01E033C)
+#define STATUS_GRAPHICS_INVALID_PIXELFORMAT cpu_to_le32(0xC01E033D)
+#define STATUS_GRAPHICS_INVALID_COLORBASIS cpu_to_le32(0xC01E033E)
+#define STATUS_GRAPHICS_INVALID_PIXELVALUEACCESSMODE cpu_to_le32(0xC01E033F)
+#define STATUS_GRAPHICS_TARGET_NOT_IN_TOPOLOGY cpu_to_le32(0xC01E0340)
+#define STATUS_GRAPHICS_NO_DISPLAY_MODE_MANAGEMENT_SUPPORT \
+	cpu_to_le32(0xC01E0341)
+#define STATUS_GRAPHICS_VIDPN_SOURCE_IN_USE cpu_to_le32(0xC01E0342)
+#define STATUS_GRAPHICS_CANT_ACCESS_ACTIVE_VIDPN cpu_to_le32(0xC01E0343)
+#define STATUS_GRAPHICS_INVALID_PATH_IMPORTANCE_ORDINAL cpu_to_le32(0xC01E0344)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_GEOMETRY_TRANSFORMATION \
+	cpu_to_le32(0xC01E0345)
+#define STATUS_GRAPHICS_PATH_CONTENT_GEOMETRY_TRANSFORMATION_NOT_SUPPORTED \
+	cpu_to_le32(0xC01E0346)
+#define STATUS_GRAPHICS_INVALID_GAMMA_RAMP cpu_to_le32(0xC01E0347)
+#define STATUS_GRAPHICS_GAMMA_RAMP_NOT_SUPPORTED cpu_to_le32(0xC01E0348)
+#define STATUS_GRAPHICS_MULTISAMPLING_NOT_SUPPORTED cpu_to_le32(0xC01E0349)
+#define STATUS_GRAPHICS_MODE_NOT_IN_MODESET cpu_to_le32(0xC01E034A)
+#define STATUS_GRAPHICS_INVALID_VIDPN_TOPOLOGY_RECOMMENDATION_REASON \
+	cpu_to_le32(0xC01E034D)
+#define STATUS_GRAPHICS_INVALID_PATH_CONTENT_TYPE cpu_to_le32(0xC01E034E)
+#define STATUS_GRAPHICS_INVALID_COPYPROTECTION_TYPE cpu_to_le32(0xC01E034F)
+#define STATUS_GRAPHICS_UNASSIGNED_MODESET_ALREADY_EXISTS \
+	cpu_to_le32(0xC01E0350)
+#define STATUS_GRAPHICS_INVALID_SCANLINE_ORDERING cpu_to_le32(0xC01E0352)
+#define STATUS_GRAPHICS_TOPOLOGY_CHANGES_NOT_ALLOWED cpu_to_le32(0xC01E0353)
+#define STATUS_GRAPHICS_NO_AVAILABLE_IMPORTANCE_ORDINALS cpu_to_le32(0xC01E0354)
+#define STATUS_GRAPHICS_INCOMPATIBLE_PRIVATE_FORMAT cpu_to_le32(0xC01E0355)
+#define STATUS_GRAPHICS_INVALID_MODE_PRUNING_ALGORITHM cpu_to_le32(0xC01E0356)
+#define STATUS_GRAPHICS_INVALID_MONITOR_CAPABILITY_ORIGIN \
+	cpu_to_le32(0xC01E0357)
+#define STATUS_GRAPHICS_INVALID_MONITOR_FREQUENCYRANGE_CONSTRAINT \
+	cpu_to_le32(0xC01E0358)
+#define STATUS_GRAPHICS_MAX_NUM_PATHS_REACHED cpu_to_le32(0xC01E0359)
+#define STATUS_GRAPHICS_CANCEL_VIDPN_TOPOLOGY_AUGMENTATION \
+	cpu_to_le32(0xC01E035A)
+#define STATUS_GRAPHICS_INVALID_CLIENT_TYPE cpu_to_le32(0xC01E035B)
+#define STATUS_GRAPHICS_CLIENTVIDPN_NOT_SET cpu_to_le32(0xC01E035C)
+#define STATUS_GRAPHICS_SPECIFIED_CHILD_ALREADY_CONNECTED \
+	cpu_to_le32(0xC01E0400)
+#define STATUS_GRAPHICS_CHILD_DESCRIPTOR_NOT_SUPPORTED cpu_to_le32(0xC01E0401)
+#define STATUS_GRAPHICS_NOT_A_LINKED_ADAPTER cpu_to_le32(0xC01E0430)
+#define STATUS_GRAPHICS_LEADLINK_NOT_ENUMERATED cpu_to_le32(0xC01E0431)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_ENUMERATED cpu_to_le32(0xC01E0432)
+#define STATUS_GRAPHICS_ADAPTER_CHAIN_NOT_READY cpu_to_le32(0xC01E0433)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_STARTED cpu_to_le32(0xC01E0434)
+#define STATUS_GRAPHICS_CHAINLINKS_NOT_POWERED_ON cpu_to_le32(0xC01E0435)
+#define STATUS_GRAPHICS_INCONSISTENT_DEVICE_LINK_STATE cpu_to_le32(0xC01E0436)
+#define STATUS_GRAPHICS_NOT_POST_DEVICE_DRIVER cpu_to_le32(0xC01E0438)
+#define STATUS_GRAPHICS_ADAPTER_ACCESS_NOT_EXCLUDED cpu_to_le32(0xC01E043B)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_COPP_SEMANTICS \
+	cpu_to_le32(0xC01E051C)
+#define STATUS_GRAPHICS_OPM_INVALID_INFORMATION_REQUEST cpu_to_le32(0xC01E051D)
+#define STATUS_GRAPHICS_OPM_DRIVER_INTERNAL_ERROR cpu_to_le32(0xC01E051E)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_DOES_NOT_HAVE_OPM_SEMANTICS \
+	cpu_to_le32(0xC01E051F)
+#define STATUS_GRAPHICS_OPM_SIGNALING_NOT_SUPPORTED cpu_to_le32(0xC01E0520)
+#define STATUS_GRAPHICS_OPM_INVALID_CONFIGURATION_REQUEST \
+	cpu_to_le32(0xC01E0521)
+#define STATUS_GRAPHICS_OPM_NOT_SUPPORTED cpu_to_le32(0xC01E0500)
+#define STATUS_GRAPHICS_COPP_NOT_SUPPORTED cpu_to_le32(0xC01E0501)
+#define STATUS_GRAPHICS_UAB_NOT_SUPPORTED cpu_to_le32(0xC01E0502)
+#define STATUS_GRAPHICS_OPM_INVALID_ENCRYPTED_PARAMETERS cpu_to_le32(0xC01E0503)
+#define STATUS_GRAPHICS_OPM_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E0504)
+#define STATUS_GRAPHICS_OPM_NO_PROTECTED_OUTPUTS_EXIST cpu_to_le32(0xC01E0505)
+#define STATUS_GRAPHICS_PVP_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME \
+	cpu_to_le32(0xC01E0506)
+#define STATUS_GRAPHICS_PVP_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP \
+	cpu_to_le32(0xC01E0507)
+#define STATUS_GRAPHICS_PVP_MIRRORING_DEVICES_NOT_SUPPORTED \
+	cpu_to_le32(0xC01E0508)
+#define STATUS_GRAPHICS_OPM_INVALID_POINTER cpu_to_le32(0xC01E050A)
+#define STATUS_GRAPHICS_OPM_INTERNAL_ERROR cpu_to_le32(0xC01E050B)
+#define STATUS_GRAPHICS_OPM_INVALID_HANDLE cpu_to_le32(0xC01E050C)
+#define STATUS_GRAPHICS_PVP_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE \
+	cpu_to_le32(0xC01E050D)
+#define STATUS_GRAPHICS_PVP_INVALID_CERTIFICATE_LENGTH cpu_to_le32(0xC01E050E)
+#define STATUS_GRAPHICS_OPM_SPANNING_MODE_ENABLED cpu_to_le32(0xC01E050F)
+#define STATUS_GRAPHICS_OPM_THEATER_MODE_ENABLED cpu_to_le32(0xC01E0510)
+#define STATUS_GRAPHICS_PVP_HFS_FAILED cpu_to_le32(0xC01E0511)
+#define STATUS_GRAPHICS_OPM_INVALID_SRM cpu_to_le32(0xC01E0512)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_HDCP cpu_to_le32(0xC01E0513)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_ACP cpu_to_le32(0xC01E0514)
+#define STATUS_GRAPHICS_OPM_OUTPUT_DOES_NOT_SUPPORT_CGMSA \
+	cpu_to_le32(0xC01E0515)
+#define STATUS_GRAPHICS_OPM_HDCP_SRM_NEVER_SET cpu_to_le32(0xC01E0516)
+#define STATUS_GRAPHICS_OPM_RESOLUTION_TOO_HIGH cpu_to_le32(0xC01E0517)
+#define STATUS_GRAPHICS_OPM_ALL_HDCP_HARDWARE_ALREADY_IN_USE \
+	cpu_to_le32(0xC01E0518)
+#define STATUS_GRAPHICS_OPM_PROTECTED_OUTPUT_NO_LONGER_EXISTS \
+	cpu_to_le32(0xC01E051A)
+#define STATUS_GRAPHICS_OPM_SESSION_TYPE_CHANGE_IN_PROGRESS \
+	cpu_to_le32(0xC01E051B)
+#define STATUS_GRAPHICS_I2C_NOT_SUPPORTED cpu_to_le32(0xC01E0580)
+#define STATUS_GRAPHICS_I2C_DEVICE_DOES_NOT_EXIST cpu_to_le32(0xC01E0581)
+#define STATUS_GRAPHICS_I2C_ERROR_TRANSMITTING_DATA cpu_to_le32(0xC01E0582)
+#define STATUS_GRAPHICS_I2C_ERROR_RECEIVING_DATA cpu_to_le32(0xC01E0583)
+#define STATUS_GRAPHICS_DDCCI_VCP_NOT_SUPPORTED cpu_to_le32(0xC01E0584)
+#define STATUS_GRAPHICS_DDCCI_INVALID_DATA cpu_to_le32(0xC01E0585)
+#define STATUS_GRAPHICS_DDCCI_MONITOR_RETURNED_INVALID_TIMING_STATUS_BYTE \
+	cpu_to_le32(0xC01E0586)
+#define STATUS_GRAPHICS_DDCCI_INVALID_CAPABILITIES_STRING \
+	cpu_to_le32(0xC01E0587)
+#define STATUS_GRAPHICS_MCA_INTERNAL_ERROR cpu_to_le32(0xC01E0588)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_COMMAND cpu_to_le32(0xC01E0589)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_LENGTH cpu_to_le32(0xC01E058A)
+#define STATUS_GRAPHICS_DDCCI_INVALID_MESSAGE_CHECKSUM cpu_to_le32(0xC01E058B)
+#define STATUS_GRAPHICS_INVALID_PHYSICAL_MONITOR_HANDLE cpu_to_le32(0xC01E058C)
+#define STATUS_GRAPHICS_MONITOR_NO_LONGER_EXISTS cpu_to_le32(0xC01E058D)
+#define STATUS_GRAPHICS_ONLY_CONSOLE_SESSION_SUPPORTED cpu_to_le32(0xC01E05E0)
+#define STATUS_GRAPHICS_NO_DISPLAY_DEVICE_CORRESPONDS_TO_NAME \
+	cpu_to_le32(0xC01E05E1)
+#define STATUS_GRAPHICS_DISPLAY_DEVICE_NOT_ATTACHED_TO_DESKTOP \
+	cpu_to_le32(0xC01E05E2)
+#define STATUS_GRAPHICS_MIRRORING_DEVICES_NOT_SUPPORTED cpu_to_le32(0xC01E05E3)
+#define STATUS_GRAPHICS_INVALID_POINTER cpu_to_le32(0xC01E05E4)
+#define STATUS_GRAPHICS_NO_MONITORS_CORRESPOND_TO_DISPLAY_DEVICE \
+	cpu_to_le32(0xC01E05E5)
+#define STATUS_GRAPHICS_PARAMETER_ARRAY_TOO_SMALL cpu_to_le32(0xC01E05E6)
+#define STATUS_GRAPHICS_INTERNAL_ERROR cpu_to_le32(0xC01E05E7)
+#define STATUS_GRAPHICS_SESSION_TYPE_CHANGE_IN_PROGRESS cpu_to_le32(0xC01E05E8)
+#define STATUS_FVE_LOCKED_VOLUME cpu_to_le32(0xC0210000)
+#define STATUS_FVE_NOT_ENCRYPTED cpu_to_le32(0xC0210001)
+#define STATUS_FVE_BAD_INFORMATION cpu_to_le32(0xC0210002)
+#define STATUS_FVE_TOO_SMALL cpu_to_le32(0xC0210003)
+#define STATUS_FVE_FAILED_WRONG_FS cpu_to_le32(0xC0210004)
+#define STATUS_FVE_FAILED_BAD_FS cpu_to_le32(0xC0210005)
+#define STATUS_FVE_FS_NOT_EXTENDED cpu_to_le32(0xC0210006)
+#define STATUS_FVE_FS_MOUNTED cpu_to_le32(0xC0210007)
+#define STATUS_FVE_NO_LICENSE cpu_to_le32(0xC0210008)
+#define STATUS_FVE_ACTION_NOT_ALLOWED cpu_to_le32(0xC0210009)
+#define STATUS_FVE_BAD_DATA cpu_to_le32(0xC021000A)
+#define STATUS_FVE_VOLUME_NOT_BOUND cpu_to_le32(0xC021000B)
+#define STATUS_FVE_NOT_DATA_VOLUME cpu_to_le32(0xC021000C)
+#define STATUS_FVE_CONV_READ_ERROR cpu_to_le32(0xC021000D)
+#define STATUS_FVE_CONV_WRITE_ERROR cpu_to_le32(0xC021000E)
+#define STATUS_FVE_OVERLAPPED_UPDATE cpu_to_le32(0xC021000F)
+#define STATUS_FVE_FAILED_SECTOR_SIZE cpu_to_le32(0xC0210010)
+#define STATUS_FVE_FAILED_AUTHENTICATION cpu_to_le32(0xC0210011)
+#define STATUS_FVE_NOT_OS_VOLUME cpu_to_le32(0xC0210012)
+#define STATUS_FVE_KEYFILE_NOT_FOUND cpu_to_le32(0xC0210013)
+#define STATUS_FVE_KEYFILE_INVALID cpu_to_le32(0xC0210014)
+#define STATUS_FVE_KEYFILE_NO_VMK cpu_to_le32(0xC0210015)
+#define STATUS_FVE_TPM_DISABLED cpu_to_le32(0xC0210016)
+#define STATUS_FVE_TPM_SRK_AUTH_NOT_ZERO cpu_to_le32(0xC0210017)
+#define STATUS_FVE_TPM_INVALID_PCR cpu_to_le32(0xC0210018)
+#define STATUS_FVE_TPM_NO_VMK cpu_to_le32(0xC0210019)
+#define STATUS_FVE_PIN_INVALID cpu_to_le32(0xC021001A)
+#define STATUS_FVE_AUTH_INVALID_APPLICATION cpu_to_le32(0xC021001B)
+#define STATUS_FVE_AUTH_INVALID_CONFIG cpu_to_le32(0xC021001C)
+#define STATUS_FVE_DEBUGGER_ENABLED cpu_to_le32(0xC021001D)
+#define STATUS_FVE_DRY_RUN_FAILED cpu_to_le32(0xC021001E)
+#define STATUS_FVE_BAD_METADATA_POINTER cpu_to_le32(0xC021001F)
+#define STATUS_FVE_OLD_METADATA_COPY cpu_to_le32(0xC0210020)
+#define STATUS_FVE_REBOOT_REQUIRED cpu_to_le32(0xC0210021)
+#define STATUS_FVE_RAW_ACCESS cpu_to_le32(0xC0210022)
+#define STATUS_FVE_RAW_BLOCKED cpu_to_le32(0xC0210023)
+#define STATUS_FWP_CALLOUT_NOT_FOUND cpu_to_le32(0xC0220001)
+#define STATUS_FWP_CONDITION_NOT_FOUND cpu_to_le32(0xC0220002)
+#define STATUS_FWP_FILTER_NOT_FOUND cpu_to_le32(0xC0220003)
+#define STATUS_FWP_LAYER_NOT_FOUND cpu_to_le32(0xC0220004)
+#define STATUS_FWP_PROVIDER_NOT_FOUND cpu_to_le32(0xC0220005)
+#define STATUS_FWP_PROVIDER_CONTEXT_NOT_FOUND cpu_to_le32(0xC0220006)
+#define STATUS_FWP_SUBLAYER_NOT_FOUND cpu_to_le32(0xC0220007)
+#define STATUS_FWP_NOT_FOUND cpu_to_le32(0xC0220008)
+#define STATUS_FWP_ALREADY_EXISTS cpu_to_le32(0xC0220009)
+#define STATUS_FWP_IN_USE cpu_to_le32(0xC022000A)
+#define STATUS_FWP_DYNAMIC_SESSION_IN_PROGRESS cpu_to_le32(0xC022000B)
+#define STATUS_FWP_WRONG_SESSION cpu_to_le32(0xC022000C)
+#define STATUS_FWP_NO_TXN_IN_PROGRESS cpu_to_le32(0xC022000D)
+#define STATUS_FWP_TXN_IN_PROGRESS cpu_to_le32(0xC022000E)
+#define STATUS_FWP_TXN_ABORTED cpu_to_le32(0xC022000F)
+#define STATUS_FWP_SESSION_ABORTED cpu_to_le32(0xC0220010)
+#define STATUS_FWP_INCOMPATIBLE_TXN cpu_to_le32(0xC0220011)
+#define STATUS_FWP_TIMEOUT cpu_to_le32(0xC0220012)
+#define STATUS_FWP_NET_EVENTS_DISABLED cpu_to_le32(0xC0220013)
+#define STATUS_FWP_INCOMPATIBLE_LAYER cpu_to_le32(0xC0220014)
+#define STATUS_FWP_KM_CLIENTS_ONLY cpu_to_le32(0xC0220015)
+#define STATUS_FWP_LIFETIME_MISMATCH cpu_to_le32(0xC0220016)
+#define STATUS_FWP_BUILTIN_OBJECT cpu_to_le32(0xC0220017)
+#define STATUS_FWP_TOO_MANY_BOOTTIME_FILTERS cpu_to_le32(0xC0220018)
+#define STATUS_FWP_TOO_MANY_CALLOUTS cpu_to_le32(0xC0220018)
+#define STATUS_FWP_NOTIFICATION_DROPPED cpu_to_le32(0xC0220019)
+#define STATUS_FWP_TRAFFIC_MISMATCH cpu_to_le32(0xC022001A)
+#define STATUS_FWP_INCOMPATIBLE_SA_STATE cpu_to_le32(0xC022001B)
+#define STATUS_FWP_NULL_POINTER cpu_to_le32(0xC022001C)
+#define STATUS_FWP_INVALID_ENUMERATOR cpu_to_le32(0xC022001D)
+#define STATUS_FWP_INVALID_FLAGS cpu_to_le32(0xC022001E)
+#define STATUS_FWP_INVALID_NET_MASK cpu_to_le32(0xC022001F)
+#define STATUS_FWP_INVALID_RANGE cpu_to_le32(0xC0220020)
+#define STATUS_FWP_INVALID_INTERVAL cpu_to_le32(0xC0220021)
+#define STATUS_FWP_ZERO_LENGTH_ARRAY cpu_to_le32(0xC0220022)
+#define STATUS_FWP_NULL_DISPLAY_NAME cpu_to_le32(0xC0220023)
+#define STATUS_FWP_INVALID_ACTION_TYPE cpu_to_le32(0xC0220024)
+#define STATUS_FWP_INVALID_WEIGHT cpu_to_le32(0xC0220025)
+#define STATUS_FWP_MATCH_TYPE_MISMATCH cpu_to_le32(0xC0220026)
+#define STATUS_FWP_TYPE_MISMATCH cpu_to_le32(0xC0220027)
+#define STATUS_FWP_OUT_OF_BOUNDS cpu_to_le32(0xC0220028)
+#define STATUS_FWP_RESERVED cpu_to_le32(0xC0220029)
+#define STATUS_FWP_DUPLICATE_CONDITION cpu_to_le32(0xC022002A)
+#define STATUS_FWP_DUPLICATE_KEYMOD cpu_to_le32(0xC022002B)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002C)
+#define STATUS_FWP_ACTION_INCOMPATIBLE_WITH_SUBLAYER cpu_to_le32(0xC022002D)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_LAYER cpu_to_le32(0xC022002E)
+#define STATUS_FWP_CONTEXT_INCOMPATIBLE_WITH_CALLOUT cpu_to_le32(0xC022002F)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_METHOD cpu_to_le32(0xC0220030)
+#define STATUS_FWP_INCOMPATIBLE_DH_GROUP cpu_to_le32(0xC0220031)
+#define STATUS_FWP_EM_NOT_SUPPORTED cpu_to_le32(0xC0220032)
+#define STATUS_FWP_NEVER_MATCH cpu_to_le32(0xC0220033)
+#define STATUS_FWP_PROVIDER_CONTEXT_MISMATCH cpu_to_le32(0xC0220034)
+#define STATUS_FWP_INVALID_PARAMETER cpu_to_le32(0xC0220035)
+#define STATUS_FWP_TOO_MANY_SUBLAYERS cpu_to_le32(0xC0220036)
+#define STATUS_FWP_CALLOUT_NOTIFICATION_FAILED cpu_to_le32(0xC0220037)
+#define STATUS_FWP_INCOMPATIBLE_AUTH_CONFIG cpu_to_le32(0xC0220038)
+#define STATUS_FWP_INCOMPATIBLE_CIPHER_CONFIG cpu_to_le32(0xC0220039)
+#define STATUS_FWP_TCPIP_NOT_READY cpu_to_le32(0xC0220100)
+#define STATUS_FWP_INJECT_HANDLE_CLOSING cpu_to_le32(0xC0220101)
+#define STATUS_FWP_INJECT_HANDLE_STALE cpu_to_le32(0xC0220102)
+#define STATUS_FWP_CANNOT_PEND cpu_to_le32(0xC0220103)
+#define STATUS_NDIS_CLOSING cpu_to_le32(0xC0230002)
+#define STATUS_NDIS_BAD_VERSION cpu_to_le32(0xC0230004)
+#define STATUS_NDIS_BAD_CHARACTERISTICS cpu_to_le32(0xC0230005)
+#define STATUS_NDIS_ADAPTER_NOT_FOUND cpu_to_le32(0xC0230006)
+#define STATUS_NDIS_OPEN_FAILED cpu_to_le32(0xC0230007)
+#define STATUS_NDIS_DEVICE_FAILED cpu_to_le32(0xC0230008)
+#define STATUS_NDIS_MULTICAST_FULL cpu_to_le32(0xC0230009)
+#define STATUS_NDIS_MULTICAST_EXISTS cpu_to_le32(0xC023000A)
+#define STATUS_NDIS_MULTICAST_NOT_FOUND cpu_to_le32(0xC023000B)
+#define STATUS_NDIS_REQUEST_ABORTED cpu_to_le32(0xC023000C)
+#define STATUS_NDIS_RESET_IN_PROGRESS cpu_to_le32(0xC023000D)
+#define STATUS_NDIS_INVALID_PACKET cpu_to_le32(0xC023000F)
+#define STATUS_NDIS_INVALID_DEVICE_REQUEST cpu_to_le32(0xC0230010)
+#define STATUS_NDIS_ADAPTER_NOT_READY cpu_to_le32(0xC0230011)
+#define STATUS_NDIS_INVALID_LENGTH cpu_to_le32(0xC0230014)
+#define STATUS_NDIS_INVALID_DATA cpu_to_le32(0xC0230015)
+#define STATUS_NDIS_BUFFER_TOO_SHORT cpu_to_le32(0xC0230016)
+#define STATUS_NDIS_INVALID_OID cpu_to_le32(0xC0230017)
+#define STATUS_NDIS_ADAPTER_REMOVED cpu_to_le32(0xC0230018)
+#define STATUS_NDIS_UNSUPPORTED_MEDIA cpu_to_le32(0xC0230019)
+#define STATUS_NDIS_GROUP_ADDRESS_IN_USE cpu_to_le32(0xC023001A)
+#define STATUS_NDIS_FILE_NOT_FOUND cpu_to_le32(0xC023001B) +#define STATUS_NDIS_ERROR_READING_FILE cpu_to_le32(0xC023001C) +#define STATUS_NDIS_ALREADY_MAPPED cpu_to_le32(0xC023001D) +#define STATUS_NDIS_RESOURCE_CONFLICT cpu_to_le32(0xC023001E) +#define STATUS_NDIS_MEDIA_DISCONNECTED cpu_to_le32(0xC023001F) +#define STATUS_NDIS_INVALID_ADDRESS cpu_to_le32(0xC0230022) +#define STATUS_NDIS_PAUSED cpu_to_le32(0xC023002A) +#define STATUS_NDIS_INTERFACE_NOT_FOUND cpu_to_le32(0xC023002B) +#define STATUS_NDIS_UNSUPPORTED_REVISION cpu_to_le32(0xC023002C) +#define STATUS_NDIS_INVALID_PORT cpu_to_le32(0xC023002D) +#define STATUS_NDIS_INVALID_PORT_STATE cpu_to_le32(0xC023002E) +#define STATUS_NDIS_LOW_POWER_STATE cpu_to_le32(0xC023002F) +#define STATUS_NDIS_NOT_SUPPORTED cpu_to_le32(0xC02300BB) +#define STATUS_NDIS_DOT11_AUTO_CONFIG_ENABLED cpu_to_le32(0xC0232000) +#define STATUS_NDIS_DOT11_MEDIA_IN_USE cpu_to_le32(0xC0232001) +#define STATUS_NDIS_DOT11_POWER_STATE_INVALID cpu_to_le32(0xC0232002) +#define STATUS_IPSEC_BAD_SPI cpu_to_le32(0xC0360001) +#define STATUS_IPSEC_SA_LIFETIME_EXPIRED cpu_to_le32(0xC0360002) +#define STATUS_IPSEC_WRONG_SA cpu_to_le32(0xC0360003) +#define STATUS_IPSEC_REPLAY_CHECK_FAILED cpu_to_le32(0xC0360004) +#define STATUS_IPSEC_INVALID_PACKET cpu_to_le32(0xC0360005) +#define STATUS_IPSEC_INTEGRITY_CHECK_FAILED cpu_to_le32(0xC0360006) +#define STATUS_IPSEC_CLEAR_TEXT_DROP cpu_to_le32(0xC0360007) + +#define STATUS_NO_PREAUTH_INTEGRITY_HASH_OVERLAP cpu_to_le32(0xC05D0000) +#define STATUS_INVALID_LOCK_RANGE cpu_to_le32(0xC00001a1) diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c new file mode 100644 index 000000000000..44aea33a67fa --- /dev/null +++ b/fs/ksmbd/transport_ipc.c @@ -0,0 +1,874 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#include <linux/jhash.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/mutex.h> +#include <linux/wait.h> +#include <linux/hashtable.h> +#include <net/net_namespace.h> +#include <net/genetlink.h> +#include <linux/socket.h> +#include <linux/workqueue.h> + +#include "vfs_cache.h" +#include "transport_ipc.h" +#include "server.h" +#include "smb_common.h" + +#include "mgmt/user_config.h" +#include "mgmt/share_config.h" +#include "mgmt/user_session.h" +#include "mgmt/tree_connect.h" +#include "mgmt/ksmbd_ida.h" +#include "connection.h" +#include "transport_tcp.h" + +#define IPC_WAIT_TIMEOUT (2 * HZ) + +#define IPC_MSG_HASH_BITS 3 +static DEFINE_HASHTABLE(ipc_msg_table, IPC_MSG_HASH_BITS); +static DECLARE_RWSEM(ipc_msg_table_lock); +static DEFINE_MUTEX(startup_lock); + +static DEFINE_IDA(ipc_ida); + +static unsigned int ksmbd_tools_pid; + +static bool ksmbd_ipc_validate_version(struct genl_info *m) +{ + if (m->genlhdr->version != KSMBD_GENL_VERSION) { + pr_err("%s. ksmbd: %d, kernel module: %d. 
%s.\n", + "Daemon and kernel module version mismatch", + m->genlhdr->version, + KSMBD_GENL_VERSION, + "User-space ksmbd should terminate"); + return false; + } + return true; +} + +struct ksmbd_ipc_msg { + unsigned int type; + unsigned int sz; + unsigned char payload[]; +}; + +struct ipc_msg_table_entry { + unsigned int handle; + unsigned int type; + wait_queue_head_t wait; + struct hlist_node ipc_table_hlist; + + void *response; +}; + +static struct delayed_work ipc_timer_work; + +static int handle_startup_event(struct sk_buff *skb, struct genl_info *info); +static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info); +static int handle_generic_event(struct sk_buff *skb, struct genl_info *info); +static int ksmbd_ipc_heartbeat_request(void); + +static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = { + [KSMBD_EVENT_UNSPEC] = { + .len = 0, + }, + [KSMBD_EVENT_HEARTBEAT_REQUEST] = { + .len = sizeof(struct ksmbd_heartbeat), + }, + [KSMBD_EVENT_STARTING_UP] = { + .len = sizeof(struct ksmbd_startup_request), + }, + [KSMBD_EVENT_SHUTTING_DOWN] = { + .len = sizeof(struct ksmbd_shutdown_request), + }, + [KSMBD_EVENT_LOGIN_REQUEST] = { + .len = sizeof(struct ksmbd_login_request), + }, + [KSMBD_EVENT_LOGIN_RESPONSE] = { + .len = sizeof(struct ksmbd_login_response), + }, + [KSMBD_EVENT_SHARE_CONFIG_REQUEST] = { + .len = sizeof(struct ksmbd_share_config_request), + }, + [KSMBD_EVENT_SHARE_CONFIG_RESPONSE] = { + .len = sizeof(struct ksmbd_share_config_response), + }, + [KSMBD_EVENT_TREE_CONNECT_REQUEST] = { + .len = sizeof(struct ksmbd_tree_connect_request), + }, + [KSMBD_EVENT_TREE_CONNECT_RESPONSE] = { + .len = sizeof(struct ksmbd_tree_connect_response), + }, + [KSMBD_EVENT_TREE_DISCONNECT_REQUEST] = { + .len = sizeof(struct ksmbd_tree_disconnect_request), + }, + [KSMBD_EVENT_LOGOUT_REQUEST] = { + .len = sizeof(struct ksmbd_logout_request), + }, + [KSMBD_EVENT_RPC_REQUEST] = { + }, + [KSMBD_EVENT_RPC_RESPONSE] = { + }, + [KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST] = { + }, + [KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE] = { + }, +}; + +static struct genl_ops ksmbd_genl_ops[] = { + { + .cmd = KSMBD_EVENT_UNSPEC, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_HEARTBEAT_REQUEST, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_STARTING_UP, + .doit = handle_startup_event, + }, + { + .cmd = KSMBD_EVENT_SHUTTING_DOWN, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_LOGIN_REQUEST, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_LOGIN_RESPONSE, + .doit = handle_generic_event, + }, + { + .cmd = KSMBD_EVENT_SHARE_CONFIG_REQUEST, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_SHARE_CONFIG_RESPONSE, + .doit = handle_generic_event, + }, + { + .cmd = KSMBD_EVENT_TREE_CONNECT_REQUEST, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_TREE_CONNECT_RESPONSE, + .doit = handle_generic_event, + }, + { + .cmd = KSMBD_EVENT_TREE_DISCONNECT_REQUEST, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_LOGOUT_REQUEST, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_RPC_REQUEST, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_RPC_RESPONSE, + .doit = handle_generic_event, + }, + { + .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST, + .doit = handle_unsupported_event, + }, + { + .cmd = KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE, + .doit = handle_generic_event, + }, +}; + +static struct genl_family ksmbd_genl_family = { + .name = KSMBD_GENL_NAME, + .version = 
KSMBD_GENL_VERSION, + .hdrsize = 0, + .maxattr = KSMBD_EVENT_MAX, + .netnsok = true, + .module = THIS_MODULE, + .ops = ksmbd_genl_ops, + .n_ops = ARRAY_SIZE(ksmbd_genl_ops), +}; + +static void ksmbd_nl_init_fixup(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ksmbd_genl_ops); i++) + ksmbd_genl_ops[i].validate = GENL_DONT_VALIDATE_STRICT | + GENL_DONT_VALIDATE_DUMP; + + ksmbd_genl_family.policy = ksmbd_nl_policy; +} + +static int rpc_context_flags(struct ksmbd_session *sess) +{ + if (user_guest(sess->user)) + return KSMBD_RPC_RESTRICTED_CONTEXT; + return 0; +} + +static void ipc_update_last_active(void) +{ + if (server_conf.ipc_timeout) + server_conf.ipc_last_active = jiffies; +} + +static struct ksmbd_ipc_msg *ipc_msg_alloc(size_t sz) +{ + struct ksmbd_ipc_msg *msg; + size_t msg_sz = sz + sizeof(struct ksmbd_ipc_msg); + + msg = kvmalloc(msg_sz, GFP_KERNEL | __GFP_ZERO); + if (msg) + msg->sz = sz; + return msg; +} + +static void ipc_msg_free(struct ksmbd_ipc_msg *msg) +{ + kvfree(msg); +} + +static void ipc_msg_handle_free(int handle) +{ + if (handle >= 0) + ksmbd_release_id(&ipc_ida, handle); +} + +static int handle_response(int type, void *payload, size_t sz) +{ + unsigned int handle = *(unsigned int *)payload; + struct ipc_msg_table_entry *entry; + int ret = 0; + + ipc_update_last_active(); + down_read(&ipc_msg_table_lock); + hash_for_each_possible(ipc_msg_table, entry, ipc_table_hlist, handle) { + if (handle != entry->handle) + continue; + + entry->response = NULL; + /* + * Response message type value should be equal to + * request message type + 1. + */ + if (entry->type + 1 != type) { + pr_err("Waiting for IPC type %d, got %d. Ignore.\n", + entry->type + 1, type); + } + + entry->response = kvmalloc(sz, GFP_KERNEL | __GFP_ZERO); + if (!entry->response) { + ret = -ENOMEM; + break; + } + + memcpy(entry->response, payload, sz); + wake_up_interruptible(&entry->wait); + ret = 0; + break; + } + up_read(&ipc_msg_table_lock); + + return ret; +} + +static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) +{ + int ret; + + ksmbd_set_fd_limit(req->file_max); + server_conf.flags = req->flags; + server_conf.signing = req->signing; + server_conf.tcp_port = req->tcp_port; + server_conf.ipc_timeout = req->ipc_timeout * HZ; + server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL; + server_conf.share_fake_fscaps = req->share_fake_fscaps; + ksmbd_init_domain(req->sub_auth); + + if (req->smb2_max_read) + init_smb2_max_read_size(req->smb2_max_read); + if (req->smb2_max_write) + init_smb2_max_write_size(req->smb2_max_write); + if (req->smb2_max_trans) + init_smb2_max_trans_size(req->smb2_max_trans); + + ret = ksmbd_set_netbios_name(req->netbios_name); + ret |= ksmbd_set_server_string(req->server_string); + ret |= ksmbd_set_work_group(req->work_group); + ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req), + req->ifc_list_sz); + if (ret) { + pr_err("Server configuration error: %s %s %s\n", + req->netbios_name, req->server_string, + req->work_group); + return ret; + } + + if (req->min_prot[0]) { + ret = ksmbd_lookup_protocol_idx(req->min_prot); + if (ret >= 0) + server_conf.min_protocol = ret; + } + if (req->max_prot[0]) { + ret = ksmbd_lookup_protocol_idx(req->max_prot); + if (ret >= 0) + server_conf.max_protocol = ret; + } + + if (server_conf.ipc_timeout) + schedule_delayed_work(&ipc_timer_work, server_conf.ipc_timeout); + return 0; +} + +static int handle_startup_event(struct sk_buff *skb, struct genl_info *info) +{ + int ret = 0; + +#ifdef 
CONFIG_SMB_SERVER_CHECK_CAP_NET_ADMIN + if (!netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; +#endif + + if (!ksmbd_ipc_validate_version(info)) + return -EINVAL; + + if (!info->attrs[KSMBD_EVENT_STARTING_UP]) + return -EINVAL; + + mutex_lock(&startup_lock); + if (!ksmbd_server_configurable()) { + mutex_unlock(&startup_lock); + pr_err("Server reset is in progress, can't start daemon\n"); + return -EINVAL; + } + + if (ksmbd_tools_pid) { + if (ksmbd_ipc_heartbeat_request() == 0) { + ret = -EINVAL; + goto out; + } + + pr_err("Reconnect to a new user space daemon\n"); + } else { + struct ksmbd_startup_request *req; + + req = nla_data(info->attrs[info->genlhdr->cmd]); + ret = ipc_server_config_on_startup(req); + if (ret) + goto out; + server_queue_ctrl_init_work(); + } + + ksmbd_tools_pid = info->snd_portid; + ipc_update_last_active(); + +out: + mutex_unlock(&startup_lock); + return ret; +} + +static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info) +{ + pr_err("Unknown IPC event: %d, ignore.\n", info->genlhdr->cmd); + return -EINVAL; +} + +static int handle_generic_event(struct sk_buff *skb, struct genl_info *info) +{ + void *payload; + int sz; + int type = info->genlhdr->cmd; + +#ifdef CONFIG_SMB_SERVER_CHECK_CAP_NET_ADMIN + if (!netlink_capable(skb, CAP_NET_ADMIN)) + return -EPERM; +#endif + + if (type >= KSMBD_EVENT_MAX) { + WARN_ON(1); + return -EINVAL; + } + + if (!ksmbd_ipc_validate_version(info)) + return -EINVAL; + + if (!info->attrs[type]) + return -EINVAL; + + payload = nla_data(info->attrs[info->genlhdr->cmd]); + sz = nla_len(info->attrs[info->genlhdr->cmd]); + return handle_response(type, payload, sz); +} + +static int ipc_msg_send(struct ksmbd_ipc_msg *msg) +{ + struct genlmsghdr *nlh; + struct sk_buff *skb; + int ret = -EINVAL; + + if (!ksmbd_tools_pid) + return ret; + + skb = genlmsg_new(msg->sz, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + nlh = genlmsg_put(skb, 0, 0, &ksmbd_genl_family, 0, msg->type); + if (!nlh) + goto out; + + ret = nla_put(skb, msg->type, msg->sz, msg->payload); + if (ret) { + genlmsg_cancel(skb, nlh); + goto out; + } + + genlmsg_end(skb, nlh); + ret = genlmsg_unicast(&init_net, skb, ksmbd_tools_pid); + if (!ret) + ipc_update_last_active(); + return ret; + +out: + nlmsg_free(skb); + return ret; +} + +static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle) +{ + struct ipc_msg_table_entry entry; + int ret; + + if ((int)handle < 0) + return NULL; + + entry.type = msg->type; + entry.response = NULL; + init_waitqueue_head(&entry.wait); + + down_write(&ipc_msg_table_lock); + entry.handle = handle; + hash_add(ipc_msg_table, &entry.ipc_table_hlist, entry.handle); + up_write(&ipc_msg_table_lock); + + ret = ipc_msg_send(msg); + if (ret) + goto out; + + ret = wait_event_interruptible_timeout(entry.wait, + entry.response != NULL, + IPC_WAIT_TIMEOUT); +out: + down_write(&ipc_msg_table_lock); + hash_del(&entry.ipc_table_hlist); + up_write(&ipc_msg_table_lock); + return entry.response; +} + +static int ksmbd_ipc_heartbeat_request(void) +{ + struct ksmbd_ipc_msg *msg; + int ret; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_heartbeat)); + if (!msg) + return -EINVAL; + + msg->type = KSMBD_EVENT_HEARTBEAT_REQUEST; + ret = ipc_msg_send(msg); + ipc_msg_free(msg); + return ret; +} + +struct ksmbd_login_response *ksmbd_ipc_login_request(const char *account) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_login_request *req; + struct ksmbd_login_response *resp; + + if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) + return 
NULL; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_login_request)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_LOGIN_REQUEST; + req = (struct ksmbd_login_request *)msg->payload; + req->handle = ksmbd_acquire_id(&ipc_ida); + strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_handle_free(req->handle); + ipc_msg_free(msg); + return resp; +} + +struct ksmbd_spnego_authen_response * +ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_spnego_authen_request *req; + struct ksmbd_spnego_authen_response *resp; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_spnego_authen_request) + + blob_len + 1); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST; + req = (struct ksmbd_spnego_authen_request *)msg->payload; + req->handle = ksmbd_acquire_id(&ipc_ida); + req->spnego_blob_len = blob_len; + memcpy(req->spnego_blob, spnego_blob, blob_len); + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_handle_free(req->handle); + ipc_msg_free(msg); + return resp; +} + +struct ksmbd_tree_connect_response * +ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess, + struct ksmbd_share_config *share, + struct ksmbd_tree_connect *tree_conn, + struct sockaddr *peer_addr) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_tree_connect_request *req; + struct ksmbd_tree_connect_response *resp; + + if (strlen(user_name(sess->user)) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) + return NULL; + + if (strlen(share->name) >= KSMBD_REQ_MAX_SHARE_NAME) + return NULL; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_tree_connect_request)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_TREE_CONNECT_REQUEST; + req = (struct ksmbd_tree_connect_request *)msg->payload; + + req->handle = ksmbd_acquire_id(&ipc_ida); + req->account_flags = sess->user->flags; + req->session_id = sess->id; + req->connect_id = tree_conn->id; + strscpy(req->account, user_name(sess->user), KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); + strscpy(req->share, share->name, KSMBD_REQ_MAX_SHARE_NAME); + snprintf(req->peer_addr, sizeof(req->peer_addr), "%pIS", peer_addr); + + if (peer_addr->sa_family == AF_INET6) + req->flags |= KSMBD_TREE_CONN_FLAG_REQUEST_IPV6; + if (test_session_flag(sess, CIFDS_SESSION_FLAG_SMB2)) + req->flags |= KSMBD_TREE_CONN_FLAG_REQUEST_SMB2; + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_handle_free(req->handle); + ipc_msg_free(msg); + return resp; +} + +int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id, + unsigned long long connect_id) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_tree_disconnect_request *req; + int ret; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_tree_disconnect_request)); + if (!msg) + return -ENOMEM; + + msg->type = KSMBD_EVENT_TREE_DISCONNECT_REQUEST; + req = (struct ksmbd_tree_disconnect_request *)msg->payload; + req->session_id = session_id; + req->connect_id = connect_id; + + ret = ipc_msg_send(msg); + ipc_msg_free(msg); + return ret; +} + +int ksmbd_ipc_logout_request(const char *account) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_logout_request *req; + int ret; + + if (strlen(account) >= KSMBD_REQ_MAX_ACCOUNT_NAME_SZ) + return -EINVAL; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_logout_request)); + if (!msg) + return -ENOMEM; + + msg->type = KSMBD_EVENT_LOGOUT_REQUEST; + req = (struct ksmbd_logout_request *)msg->payload; + strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); + + ret = ipc_msg_send(msg); + 
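	/*
	 * Like tree disconnect above, logout is fire-and-forget: the message
	 * is pushed to the daemon with ipc_msg_send() and no response is
	 * awaited, so no ipc_ida handle is allocated for it.
	 */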
ipc_msg_free(msg); + return ret; +} + +struct ksmbd_share_config_response * +ksmbd_ipc_share_config_request(const char *name) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_share_config_request *req; + struct ksmbd_share_config_response *resp; + + if (strlen(name) >= KSMBD_REQ_MAX_SHARE_NAME) + return NULL; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_share_config_request)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_SHARE_CONFIG_REQUEST; + req = (struct ksmbd_share_config_request *)msg->payload; + req->handle = ksmbd_acquire_id(&ipc_ida); + strscpy(req->share_name, name, KSMBD_REQ_MAX_SHARE_NAME); + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_handle_free(req->handle); + ipc_msg_free(msg); + return resp; +} + +struct ksmbd_rpc_command *ksmbd_rpc_open(struct ksmbd_session *sess, int handle) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_rpc_command *req; + struct ksmbd_rpc_command *resp; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_RPC_REQUEST; + req = (struct ksmbd_rpc_command *)msg->payload; + req->handle = handle; + req->flags = ksmbd_session_rpc_method(sess, handle); + req->flags |= KSMBD_RPC_OPEN_METHOD; + req->payload_sz = 0; + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_free(msg); + return resp; +} + +struct ksmbd_rpc_command *ksmbd_rpc_close(struct ksmbd_session *sess, int handle) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_rpc_command *req; + struct ksmbd_rpc_command *resp; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_RPC_REQUEST; + req = (struct ksmbd_rpc_command *)msg->payload; + req->handle = handle; + req->flags = ksmbd_session_rpc_method(sess, handle); + req->flags |= KSMBD_RPC_CLOSE_METHOD; + req->payload_sz = 0; + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_free(msg); + return resp; +} + +struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle, + void *payload, size_t payload_sz) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_rpc_command *req; + struct ksmbd_rpc_command *resp; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_RPC_REQUEST; + req = (struct ksmbd_rpc_command *)msg->payload; + req->handle = handle; + req->flags = ksmbd_session_rpc_method(sess, handle); + req->flags |= rpc_context_flags(sess); + req->flags |= KSMBD_RPC_WRITE_METHOD; + req->payload_sz = payload_sz; + memcpy(req->payload, payload, payload_sz); + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_free(msg); + return resp; +} + +struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_rpc_command *req; + struct ksmbd_rpc_command *resp; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command)); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_RPC_REQUEST; + req = (struct ksmbd_rpc_command *)msg->payload; + req->handle = handle; + req->flags = ksmbd_session_rpc_method(sess, handle); + req->flags |= rpc_context_flags(sess); + req->flags |= KSMBD_RPC_READ_METHOD; + req->payload_sz = 0; + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_free(msg); + return resp; +} + +struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle, + void *payload, size_t payload_sz) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_rpc_command *req; + struct ksmbd_rpc_command *resp; + + msg = 
ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_RPC_REQUEST; + req = (struct ksmbd_rpc_command *)msg->payload; + req->handle = handle; + req->flags = ksmbd_session_rpc_method(sess, handle); + req->flags |= rpc_context_flags(sess); + req->flags |= KSMBD_RPC_IOCTL_METHOD; + req->payload_sz = payload_sz; + memcpy(req->payload, payload, payload_sz); + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_free(msg); + return resp; +} + +struct ksmbd_rpc_command *ksmbd_rpc_rap(struct ksmbd_session *sess, void *payload, + size_t payload_sz) +{ + struct ksmbd_ipc_msg *msg; + struct ksmbd_rpc_command *req; + struct ksmbd_rpc_command *resp; + + msg = ipc_msg_alloc(sizeof(struct ksmbd_rpc_command) + payload_sz + 1); + if (!msg) + return NULL; + + msg->type = KSMBD_EVENT_RPC_REQUEST; + req = (struct ksmbd_rpc_command *)msg->payload; + req->handle = ksmbd_acquire_id(&ipc_ida); + req->flags = rpc_context_flags(sess); + req->flags |= KSMBD_RPC_RAP_METHOD; + req->payload_sz = payload_sz; + memcpy(req->payload, payload, payload_sz); + + resp = ipc_msg_send_request(msg, req->handle); + ipc_msg_handle_free(req->handle); + ipc_msg_free(msg); + return resp; +} + +static int __ipc_heartbeat(void) +{ + unsigned long delta; + + if (!ksmbd_server_running()) + return 0; + + if (time_after(jiffies, server_conf.ipc_last_active)) { + delta = (jiffies - server_conf.ipc_last_active); + } else { + ipc_update_last_active(); + schedule_delayed_work(&ipc_timer_work, + server_conf.ipc_timeout); + return 0; + } + + if (delta < server_conf.ipc_timeout) { + schedule_delayed_work(&ipc_timer_work, + server_conf.ipc_timeout - delta); + return 0; + } + + if (ksmbd_ipc_heartbeat_request() == 0) { + schedule_delayed_work(&ipc_timer_work, + server_conf.ipc_timeout); + return 0; + } + + mutex_lock(&startup_lock); + WRITE_ONCE(server_conf.state, SERVER_STATE_RESETTING); + server_conf.ipc_last_active = 0; + ksmbd_tools_pid = 0; + pr_err("No IPC daemon response for %lus\n", delta / HZ); + mutex_unlock(&startup_lock); + return -EINVAL; +} + +static void ipc_timer_heartbeat(struct work_struct *w) +{ + if (__ipc_heartbeat()) + server_queue_ctrl_reset_work(); +} + +int ksmbd_ipc_id_alloc(void) +{ + return ksmbd_acquire_id(&ipc_ida); +} + +void ksmbd_rpc_id_free(int handle) +{ + ksmbd_release_id(&ipc_ida, handle); +} + +void ksmbd_ipc_release(void) +{ + cancel_delayed_work_sync(&ipc_timer_work); + genl_unregister_family(&ksmbd_genl_family); +} + +void ksmbd_ipc_soft_reset(void) +{ + mutex_lock(&startup_lock); + ksmbd_tools_pid = 0; + cancel_delayed_work_sync(&ipc_timer_work); + mutex_unlock(&startup_lock); +} + +int ksmbd_ipc_init(void) +{ + int ret = 0; + + ksmbd_nl_init_fixup(); + INIT_DELAYED_WORK(&ipc_timer_work, ipc_timer_heartbeat); + + ret = genl_register_family(&ksmbd_genl_family); + if (ret) { + pr_err("Failed to register KSMBD netlink interface %d\n", ret); + cancel_delayed_work_sync(&ipc_timer_work); + } + + return ret; +} diff --git a/fs/ksmbd/transport_ipc.h b/fs/ksmbd/transport_ipc.h new file mode 100644 index 000000000000..9eacc895ffdb --- /dev/null +++ b/fs/ksmbd/transport_ipc.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. 
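 *
 * Declarations for the ksmbd <-> user-space daemon IPC layer: login,
 * share config, tree connect/disconnect, SPNEGO and RPC operations are
 * forwarded to the daemon over generic netlink.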
+ */ + +#ifndef __KSMBD_TRANSPORT_IPC_H__ +#define __KSMBD_TRANSPORT_IPC_H__ + +#include <linux/wait.h> + +#define KSMBD_IPC_MAX_PAYLOAD 4096 + +struct ksmbd_login_response * +ksmbd_ipc_login_request(const char *account); + +struct ksmbd_session; +struct ksmbd_share_config; +struct ksmbd_tree_connect; +struct sockaddr; + +struct ksmbd_tree_connect_response * +ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess, + struct ksmbd_share_config *share, + struct ksmbd_tree_connect *tree_conn, + struct sockaddr *peer_addr); +int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id, + unsigned long long connect_id); +int ksmbd_ipc_logout_request(const char *account); +struct ksmbd_share_config_response * +ksmbd_ipc_share_config_request(const char *name); +struct ksmbd_spnego_authen_response * +ksmbd_ipc_spnego_authen_request(const char *spnego_blob, int blob_len); +int ksmbd_ipc_id_alloc(void); +void ksmbd_rpc_id_free(int handle); +struct ksmbd_rpc_command *ksmbd_rpc_open(struct ksmbd_session *sess, int handle); +struct ksmbd_rpc_command *ksmbd_rpc_close(struct ksmbd_session *sess, int handle); +struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle, + void *payload, size_t payload_sz); +struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle); +struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle, + void *payload, size_t payload_sz); +struct ksmbd_rpc_command *ksmbd_rpc_rap(struct ksmbd_session *sess, void *payload, + size_t payload_sz); +void ksmbd_ipc_release(void); +void ksmbd_ipc_soft_reset(void); +int ksmbd_ipc_init(void); +#endif /* __KSMBD_TRANSPORT_IPC_H__ */ diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c new file mode 100644 index 000000000000..58f530056ac0 --- /dev/null +++ b/fs/ksmbd/transport_rdma.c @@ -0,0 +1,2058 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2017, Microsoft Corporation. + * Copyright (C) 2018, LG Electronics. + * + * Author(s): Long Li <longli@microsoft.com>, + * Hyunchul Lee <hyc.lee@gmail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
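 *
 * SMB Direct (SMB2 over RDMA) transport: implements the [MS-SMBD]
 * negotiation and data-transfer framing, send/receive credit accounting,
 * and the RDMA read/write paths used for large I/O.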
+ */ + +#define SUBMOD_NAME "smb_direct" + +#include <linux/kthread.h> +#include <linux/rwlock.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/highmem.h> +#include <linux/scatterlist.h> +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <rdma/rw.h> + +#include "glob.h" +#include "connection.h" +#include "smb_common.h" +#include "smbstatus.h" +#include "transport_rdma.h" + +#define SMB_DIRECT_PORT 5445 + +#define SMB_DIRECT_VERSION_LE cpu_to_le16(0x0100) + +/* SMB_DIRECT negotiation timeout in seconds */ +#define SMB_DIRECT_NEGOTIATE_TIMEOUT 120 + +#define SMB_DIRECT_MAX_SEND_SGES 8 +#define SMB_DIRECT_MAX_RECV_SGES 1 + +/* + * Default maximum number of outstanding RDMA read/write operations on this connection. + * This value may be decreased during QP creation if a hardware limit requires it. + */ +#define SMB_DIRECT_CM_INITIATOR_DEPTH 8 + +/* Maximum number of retries on data transfer operations */ +#define SMB_DIRECT_CM_RETRY 6 +/* No need to retry on Receiver Not Ready since SMB_DIRECT manages credits */ +#define SMB_DIRECT_CM_RNR_RETRY 0 + +/* + * User-configurable initial values per SMB_DIRECT transport connection, + * as defined in [MS-SMBD] 3.1.1.1. + * These may change after SMB_DIRECT negotiation. + */ +/* The local peer's maximum number of credits to grant to the peer */ +static int smb_direct_receive_credit_max = 255; + +/* The remote peer's credit request of the local peer */ +static int smb_direct_send_credit_target = 255; + +/* The maximum single-message size that can be sent to the remote peer */ +static int smb_direct_max_send_size = 8192; + +/* The maximum fragmented upper-layer payload receive size supported */ +static int smb_direct_max_fragmented_recv_size = 1024 * 1024; + +/* The maximum single-message size which can be received */ +static int smb_direct_max_receive_size = 8192; + +static int smb_direct_max_read_write_size = 1024 * 1024; + +static int smb_direct_max_outstanding_rw_ops = 8; + +static struct smb_direct_listener { + struct rdma_cm_id *cm_id; +} smb_direct_listener; + +static struct workqueue_struct *smb_direct_wq; + +enum smb_direct_status { + SMB_DIRECT_CS_NEW = 0, + SMB_DIRECT_CS_CONNECTED, + SMB_DIRECT_CS_DISCONNECTING, + SMB_DIRECT_CS_DISCONNECTED, +}; + +struct smb_direct_transport { + struct ksmbd_transport transport; + + enum smb_direct_status status; + bool full_packet_received; + wait_queue_head_t wait_status; + + struct rdma_cm_id *cm_id; + struct ib_cq *send_cq; + struct ib_cq *recv_cq; + struct ib_pd *pd; + struct ib_qp *qp; + + int max_send_size; + int max_recv_size; + int max_fragmented_send_size; + int max_fragmented_recv_size; + int max_rdma_rw_size; + + spinlock_t reassembly_queue_lock; + struct list_head reassembly_queue; + int reassembly_data_length; + int reassembly_queue_length; + int first_entry_offset; + wait_queue_head_t wait_reassembly_queue; + + spinlock_t receive_credit_lock; + int recv_credits; + int count_avail_recvmsg; + int recv_credit_max; + int recv_credit_target; + + spinlock_t recvmsg_queue_lock; + struct list_head recvmsg_queue; + + spinlock_t empty_recvmsg_queue_lock; + struct list_head empty_recvmsg_queue; + + int send_credit_target; + atomic_t send_credits; + spinlock_t lock_new_recv_credits; + int new_recv_credits; + atomic_t rw_avail_ops; + + wait_queue_head_t wait_send_credits; + wait_queue_head_t wait_rw_avail_ops; + + mempool_t *sendmsg_mempool; + struct kmem_cache *sendmsg_cache; + mempool_t *recvmsg_mempool; + struct kmem_cache *recvmsg_cache; + + wait_queue_head_t wait_send_payload_pending; + atomic_t
send_payload_pending; + wait_queue_head_t wait_send_pending; + atomic_t send_pending; + + struct delayed_work post_recv_credits_work; + struct work_struct send_immediate_work; + struct work_struct disconnect_work; + + bool negotiation_requested; +}; + +#define KSMBD_TRANS(t) ((struct ksmbd_transport *)&((t)->transport)) + +enum { + SMB_DIRECT_MSG_NEGOTIATE_REQ = 0, + SMB_DIRECT_MSG_DATA_TRANSFER +}; + +static struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops; + +struct smb_direct_send_ctx { + struct list_head msg_list; + int wr_cnt; + bool need_invalidate_rkey; + unsigned int remote_key; +}; + +struct smb_direct_sendmsg { + struct smb_direct_transport *transport; + struct ib_send_wr wr; + struct list_head list; + int num_sge; + struct ib_sge sge[SMB_DIRECT_MAX_SEND_SGES]; + struct ib_cqe cqe; + u8 packet[]; +}; + +struct smb_direct_recvmsg { + struct smb_direct_transport *transport; + struct list_head list; + int type; + struct ib_sge sge; + struct ib_cqe cqe; + bool first_segment; + u8 packet[]; +}; + +struct smb_direct_rdma_rw_msg { + struct smb_direct_transport *t; + struct ib_cqe cqe; + struct completion *completion; + struct rdma_rw_ctx rw_ctx; + struct sg_table sgt; + struct scatterlist sg_list[0]; +}; + +static inline int get_buf_page_count(void *buf, int size) +{ + return DIV_ROUND_UP((uintptr_t)buf + size, PAGE_SIZE) - + (uintptr_t)buf / PAGE_SIZE; +} + +static void smb_direct_destroy_pools(struct smb_direct_transport *transport); +static void smb_direct_post_recv_credits(struct work_struct *work); +static int smb_direct_post_send_data(struct smb_direct_transport *t, + struct smb_direct_send_ctx *send_ctx, + struct kvec *iov, int niov, + int remaining_data_length); + +static inline struct smb_direct_transport * +smb_trans_direct_transfort(struct ksmbd_transport *t) +{ + return container_of(t, struct smb_direct_transport, transport); +} + +static inline void +*smb_direct_recvmsg_payload(struct smb_direct_recvmsg *recvmsg) +{ + return (void *)recvmsg->packet; +} + +static inline bool is_receive_credit_post_required(int receive_credits, + int avail_recvmsg_count) +{ + return receive_credits <= (smb_direct_receive_credit_max >> 3) && + avail_recvmsg_count >= (receive_credits >> 2); +} + +static struct +smb_direct_recvmsg *get_free_recvmsg(struct smb_direct_transport *t) +{ + struct smb_direct_recvmsg *recvmsg = NULL; + + spin_lock(&t->recvmsg_queue_lock); + if (!list_empty(&t->recvmsg_queue)) { + recvmsg = list_first_entry(&t->recvmsg_queue, + struct smb_direct_recvmsg, + list); + list_del(&recvmsg->list); + } + spin_unlock(&t->recvmsg_queue_lock); + return recvmsg; +} + +static void put_recvmsg(struct smb_direct_transport *t, + struct smb_direct_recvmsg *recvmsg) +{ + ib_dma_unmap_single(t->cm_id->device, recvmsg->sge.addr, + recvmsg->sge.length, DMA_FROM_DEVICE); + + spin_lock(&t->recvmsg_queue_lock); + list_add(&recvmsg->list, &t->recvmsg_queue); + spin_unlock(&t->recvmsg_queue_lock); +} + +static struct +smb_direct_recvmsg *get_empty_recvmsg(struct smb_direct_transport *t) +{ + struct smb_direct_recvmsg *recvmsg = NULL; + + spin_lock(&t->empty_recvmsg_queue_lock); + if (!list_empty(&t->empty_recvmsg_queue)) { + recvmsg = list_first_entry(&t->empty_recvmsg_queue, + struct smb_direct_recvmsg, list); + list_del(&recvmsg->list); + } + spin_unlock(&t->empty_recvmsg_queue_lock); + return recvmsg; +} + +static void put_empty_recvmsg(struct smb_direct_transport *t, + struct smb_direct_recvmsg *recvmsg) +{ + ib_dma_unmap_single(t->cm_id->device, recvmsg->sge.addr, + 
recvmsg->sge.length, DMA_FROM_DEVICE); + + spin_lock(&t->empty_recvmsg_queue_lock); + list_add_tail(&recvmsg->list, &t->empty_recvmsg_queue); + spin_unlock(&t->empty_recvmsg_queue_lock); +} + +static void enqueue_reassembly(struct smb_direct_transport *t, + struct smb_direct_recvmsg *recvmsg, + int data_length) +{ + spin_lock(&t->reassembly_queue_lock); + list_add_tail(&recvmsg->list, &t->reassembly_queue); + t->reassembly_queue_length++; + /* + * Make sure reassembly_data_length is updated after list and + * reassembly_queue_length are updated. On the dequeue side + * reassembly_data_length is checked without a lock to determine + * whether reassembly_queue_length and the list are up to date + */ + virt_wmb(); + t->reassembly_data_length += data_length; + spin_unlock(&t->reassembly_queue_lock); +} + +static struct smb_direct_recvmsg *get_first_reassembly(struct smb_direct_transport *t) +{ + if (!list_empty(&t->reassembly_queue)) + return list_first_entry(&t->reassembly_queue, + struct smb_direct_recvmsg, list); + else + return NULL; +} + +static void smb_direct_disconnect_rdma_work(struct work_struct *work) +{ + struct smb_direct_transport *t = + container_of(work, struct smb_direct_transport, + disconnect_work); + + if (t->status == SMB_DIRECT_CS_CONNECTED) { + t->status = SMB_DIRECT_CS_DISCONNECTING; + rdma_disconnect(t->cm_id); + } +} + +static void +smb_direct_disconnect_rdma_connection(struct smb_direct_transport *t) +{ + if (t->status == SMB_DIRECT_CS_CONNECTED) + queue_work(smb_direct_wq, &t->disconnect_work); +} + +static void smb_direct_send_immediate_work(struct work_struct *work) +{ + struct smb_direct_transport *t = container_of(work, + struct smb_direct_transport, send_immediate_work); + + if (t->status != SMB_DIRECT_CS_CONNECTED) + return; + + smb_direct_post_send_data(t, NULL, NULL, 0, 0); +} + +static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) +{ + struct smb_direct_transport *t; + struct ksmbd_conn *conn; + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return NULL; + + t->cm_id = cm_id; + cm_id->context = t; + + t->status = SMB_DIRECT_CS_NEW; + init_waitqueue_head(&t->wait_status); + + spin_lock_init(&t->reassembly_queue_lock); + INIT_LIST_HEAD(&t->reassembly_queue); + t->reassembly_data_length = 0; + t->reassembly_queue_length = 0; + init_waitqueue_head(&t->wait_reassembly_queue); + init_waitqueue_head(&t->wait_send_credits); + init_waitqueue_head(&t->wait_rw_avail_ops); + + spin_lock_init(&t->receive_credit_lock); + spin_lock_init(&t->recvmsg_queue_lock); + INIT_LIST_HEAD(&t->recvmsg_queue); + + spin_lock_init(&t->empty_recvmsg_queue_lock); + INIT_LIST_HEAD(&t->empty_recvmsg_queue); + + init_waitqueue_head(&t->wait_send_payload_pending); + atomic_set(&t->send_payload_pending, 0); + init_waitqueue_head(&t->wait_send_pending); + atomic_set(&t->send_pending, 0); + + spin_lock_init(&t->lock_new_recv_credits); + + INIT_DELAYED_WORK(&t->post_recv_credits_work, + smb_direct_post_recv_credits); + INIT_WORK(&t->send_immediate_work, smb_direct_send_immediate_work); + INIT_WORK(&t->disconnect_work, smb_direct_disconnect_rdma_work); + + conn = ksmbd_conn_alloc(); + if (!conn) + goto err; + conn->transport = KSMBD_TRANS(t); + KSMBD_TRANS(t)->conn = conn; + KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops; + return t; +err: + kfree(t); + return NULL; +} + +static void free_transport(struct smb_direct_transport *t) +{ + struct smb_direct_recvmsg *recvmsg; + + wake_up_interruptible(&t->wait_send_credits); + + ksmbd_debug(RDMA, "wait for all sends posted to IB
to finish\n"); + wait_event(t->wait_send_payload_pending, + atomic_read(&t->send_payload_pending) == 0); + wait_event(t->wait_send_pending, + atomic_read(&t->send_pending) == 0); + + cancel_work_sync(&t->disconnect_work); + cancel_delayed_work_sync(&t->post_recv_credits_work); + cancel_work_sync(&t->send_immediate_work); + + if (t->qp) { + ib_drain_qp(t->qp); + ib_destroy_qp(t->qp); + } + + ksmbd_debug(RDMA, "drain the reassembly queue\n"); + do { + spin_lock(&t->reassembly_queue_lock); + recvmsg = get_first_reassembly(t); + if (recvmsg) { + list_del(&recvmsg->list); + spin_unlock(&t->reassembly_queue_lock); + put_recvmsg(t, recvmsg); + } else { + spin_unlock(&t->reassembly_queue_lock); + } + } while (recvmsg); + t->reassembly_data_length = 0; + + if (t->send_cq) + ib_free_cq(t->send_cq); + if (t->recv_cq) + ib_free_cq(t->recv_cq); + if (t->pd) + ib_dealloc_pd(t->pd); + if (t->cm_id) + rdma_destroy_id(t->cm_id); + + smb_direct_destroy_pools(t); + ksmbd_conn_free(KSMBD_TRANS(t)->conn); + kfree(t); +} + +static struct smb_direct_sendmsg +*smb_direct_alloc_sendmsg(struct smb_direct_transport *t) +{ + struct smb_direct_sendmsg *msg; + + msg = mempool_alloc(t->sendmsg_mempool, GFP_KERNEL); + if (!msg) + return ERR_PTR(-ENOMEM); + msg->transport = t; + INIT_LIST_HEAD(&msg->list); + msg->num_sge = 0; + return msg; +} + +static void smb_direct_free_sendmsg(struct smb_direct_transport *t, + struct smb_direct_sendmsg *msg) +{ + int i; + + if (msg->num_sge > 0) { + ib_dma_unmap_single(t->cm_id->device, + msg->sge[0].addr, msg->sge[0].length, + DMA_TO_DEVICE); + for (i = 1; i < msg->num_sge; i++) + ib_dma_unmap_page(t->cm_id->device, + msg->sge[i].addr, msg->sge[i].length, + DMA_TO_DEVICE); + } + mempool_free(msg, t->sendmsg_mempool); +} + +static int smb_direct_check_recvmsg(struct smb_direct_recvmsg *recvmsg) +{ + switch (recvmsg->type) { + case SMB_DIRECT_MSG_DATA_TRANSFER: { + struct smb_direct_data_transfer *req = + (struct smb_direct_data_transfer *)recvmsg->packet; + struct smb2_hdr *hdr = (struct smb2_hdr *)(recvmsg->packet + + le32_to_cpu(req->data_offset) - 4); + ksmbd_debug(RDMA, + "CreditGranted: %u, CreditRequested: %u, DataLength: %u, RemainingDataLength: %u, SMB: %x, Command: %u\n", + le16_to_cpu(req->credits_granted), + le16_to_cpu(req->credits_requested), + req->data_length, req->remaining_data_length, + hdr->ProtocolId, hdr->Command); + break; + } + case SMB_DIRECT_MSG_NEGOTIATE_REQ: { + struct smb_direct_negotiate_req *req = + (struct smb_direct_negotiate_req *)recvmsg->packet; + ksmbd_debug(RDMA, + "MinVersion: %u, MaxVersion: %u, CreditRequested: %u, MaxSendSize: %u, MaxRecvSize: %u, MaxFragmentedSize: %u\n", + le16_to_cpu(req->min_version), + le16_to_cpu(req->max_version), + le16_to_cpu(req->credits_requested), + le32_to_cpu(req->preferred_send_size), + le32_to_cpu(req->max_receive_size), + le32_to_cpu(req->max_fragmented_size)); + if (le16_to_cpu(req->min_version) > 0x0100 || + le16_to_cpu(req->max_version) < 0x0100) + return -EOPNOTSUPP; + if (le16_to_cpu(req->credits_requested) <= 0 || + le32_to_cpu(req->max_receive_size) <= 128 || + le32_to_cpu(req->max_fragmented_size) <= + 128 * 1024) + return -ECONNABORTED; + + break; + } + default: + return -EINVAL; + } + return 0; +} + +static void recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smb_direct_recvmsg *recvmsg; + struct smb_direct_transport *t; + + recvmsg = container_of(wc->wr_cqe, struct smb_direct_recvmsg, cqe); + t = recvmsg->transport; + + if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_RECV) { + if 
(wc->status != IB_WC_WR_FLUSH_ERR) { + pr_err("Recv error. status='%s (%d)' opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, + wc->opcode); + smb_direct_disconnect_rdma_connection(t); + } + put_empty_recvmsg(t, recvmsg); + return; + } + + ksmbd_debug(RDMA, "Recv completed. status='%s (%d)', opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, + wc->opcode); + + ib_dma_sync_single_for_cpu(wc->qp->device, recvmsg->sge.addr, + recvmsg->sge.length, DMA_FROM_DEVICE); + + switch (recvmsg->type) { + case SMB_DIRECT_MSG_NEGOTIATE_REQ: + t->negotiation_requested = true; + t->full_packet_received = true; + wake_up_interruptible(&t->wait_status); + break; + case SMB_DIRECT_MSG_DATA_TRANSFER: { + struct smb_direct_data_transfer *data_transfer = + (struct smb_direct_data_transfer *)recvmsg->packet; + int data_length = le32_to_cpu(data_transfer->data_length); + int avail_recvmsg_count, receive_credits; + + if (data_length) { + if (t->full_packet_received) + recvmsg->first_segment = true; + + if (le32_to_cpu(data_transfer->remaining_data_length)) + t->full_packet_received = false; + else + t->full_packet_received = true; + + enqueue_reassembly(t, recvmsg, data_length); + wake_up_interruptible(&t->wait_reassembly_queue); + + spin_lock(&t->receive_credit_lock); + receive_credits = --(t->recv_credits); + avail_recvmsg_count = t->count_avail_recvmsg; + spin_unlock(&t->receive_credit_lock); + } else { + put_empty_recvmsg(t, recvmsg); + + spin_lock(&t->receive_credit_lock); + receive_credits = --(t->recv_credits); + avail_recvmsg_count = ++(t->count_avail_recvmsg); + spin_unlock(&t->receive_credit_lock); + } + + t->recv_credit_target = + le16_to_cpu(data_transfer->credits_requested); + atomic_add(le16_to_cpu(data_transfer->credits_granted), + &t->send_credits); + + if (le16_to_cpu(data_transfer->flags) & + SMB_DIRECT_RESPONSE_REQUESTED) + queue_work(smb_direct_wq, &t->send_immediate_work); + + if (atomic_read(&t->send_credits) > 0) + wake_up_interruptible(&t->wait_send_credits); + + if (is_receive_credit_post_required(receive_credits, avail_recvmsg_count)) + mod_delayed_work(smb_direct_wq, + &t->post_recv_credits_work, 0); + break; + } + default: + break; + } +} + +static int smb_direct_post_recv(struct smb_direct_transport *t, + struct smb_direct_recvmsg *recvmsg) +{ + struct ib_recv_wr wr; + int ret; + + recvmsg->sge.addr = ib_dma_map_single(t->cm_id->device, + recvmsg->packet, t->max_recv_size, + DMA_FROM_DEVICE); + ret = ib_dma_mapping_error(t->cm_id->device, recvmsg->sge.addr); + if (ret) + return ret; + recvmsg->sge.length = t->max_recv_size; + recvmsg->sge.lkey = t->pd->local_dma_lkey; + recvmsg->cqe.done = recv_done; + + wr.wr_cqe = &recvmsg->cqe; + wr.next = NULL; + wr.sg_list = &recvmsg->sge; + wr.num_sge = 1; + + ret = ib_post_recv(t->qp, &wr, NULL); + if (ret) { + pr_err("Can't post recv: %d\n", ret); + ib_dma_unmap_single(t->cm_id->device, + recvmsg->sge.addr, recvmsg->sge.length, + DMA_FROM_DEVICE); + smb_direct_disconnect_rdma_connection(t); + return ret; + } + return ret; +} + +static int smb_direct_read(struct ksmbd_transport *t, char *buf, + unsigned int size) +{ + struct smb_direct_recvmsg *recvmsg; + struct smb_direct_data_transfer *data_transfer; + int to_copy, to_read, data_read, offset; + u32 data_length, remaining_data_length, data_offset; + int rc; + struct smb_direct_transport *st = smb_trans_direct_transfort(t); + +again: + if (st->status != SMB_DIRECT_CS_CONNECTED) { + pr_err("disconnected\n"); + return -ENOTCONN; + } + + /* + * No need to hold the reassembly queue 
lock all the time as we are + * the only one reading from the front of the queue. The transport + * may add more entries to the back of the queue at the same time. + */ + if (st->reassembly_data_length >= size) { + int queue_length; + int queue_removed = 0; + + /* + * Need to make sure reassembly_data_length is read before + * reading reassembly_queue_length and calling + * get_first_reassembly. This call is lock free + * because we never read the entries at the end of the queue, + * which are being updated in SOFTIRQ context as more data is + * received. + */ + virt_rmb(); + queue_length = st->reassembly_queue_length; + data_read = 0; + to_read = size; + offset = st->first_entry_offset; + while (data_read < size) { + recvmsg = get_first_reassembly(st); + data_transfer = smb_direct_recvmsg_payload(recvmsg); + data_length = le32_to_cpu(data_transfer->data_length); + remaining_data_length = + le32_to_cpu(data_transfer->remaining_data_length); + data_offset = le32_to_cpu(data_transfer->data_offset); + + /* + * The upper layer expects the RFC1002 length at the + * beginning of the payload. Return it to indicate + * the total length of the packet. This minimizes the + * change to the upper-layer packet processing logic and + * will eventually be removed when an intermediate + * transport layer is added. + */ + if (recvmsg->first_segment && size == 4) { + unsigned int rfc1002_len = + data_length + remaining_data_length; + *((__be32 *)buf) = cpu_to_be32(rfc1002_len); + data_read = 4; + recvmsg->first_segment = false; + ksmbd_debug(RDMA, + "returning rfc1002 length %d\n", + rfc1002_len); + goto read_rfc1002_done; + } + + to_copy = min_t(int, data_length - offset, to_read); + memcpy(buf + data_read, (char *)data_transfer + data_offset + offset, + to_copy); + + /* move on to the next buffer? */ + if (to_copy == data_length - offset) { + queue_length--; + /* + * No need to lock if we are not at the + * end of the queue + */ + if (queue_length) { + list_del(&recvmsg->list); + } else { + spin_lock_irq(&st->reassembly_queue_lock); + list_del(&recvmsg->list); + spin_unlock_irq(&st->reassembly_queue_lock); + } + queue_removed++; + put_recvmsg(st, recvmsg); + offset = 0; + } else { + offset += to_copy; + } + + to_read -= to_copy; + data_read += to_copy; + } + + spin_lock_irq(&st->reassembly_queue_lock); + st->reassembly_data_length -= data_read; + st->reassembly_queue_length -= queue_removed; + spin_unlock_irq(&st->reassembly_queue_lock); + + spin_lock(&st->receive_credit_lock); + st->count_avail_recvmsg += queue_removed; + if (is_receive_credit_post_required(st->recv_credits, st->count_avail_recvmsg)) { + spin_unlock(&st->receive_credit_lock); + mod_delayed_work(smb_direct_wq, + &st->post_recv_credits_work, 0); + } else { + spin_unlock(&st->receive_credit_lock); + } + + st->first_entry_offset = offset; + ksmbd_debug(RDMA, + "returning to thread data_read=%d reassembly_data_length=%d first_entry_offset=%d\n", + data_read, st->reassembly_data_length, + st->first_entry_offset); +read_rfc1002_done: + return data_read; + } + + ksmbd_debug(RDMA, "wait_event on more data\n"); + rc = wait_event_interruptible(st->wait_reassembly_queue, + st->reassembly_data_length >= size || + st->status != SMB_DIRECT_CS_CONNECTED); + if (rc) + return -EINTR; + + goto again; +} + +static void smb_direct_post_recv_credits(struct work_struct *work) +{ + struct smb_direct_transport *t = container_of(work, + struct smb_direct_transport, post_recv_credits_work.work); + struct smb_direct_recvmsg *recvmsg; + int receive_credits, credits = 0; + int ret; + int use_free = 1; +
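	/*
	 * Refill posted receives up to recv_credit_target: prefer fully free
	 * recvmsgs and fall back to the drained "empty" ones once the free
	 * list runs out. Every successful post becomes a receive credit that
	 * can later be granted back to the peer.
	 */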
spin_lock(&t->receive_credit_lock); + receive_credits = t->recv_credits; + spin_unlock(&t->receive_credit_lock); + + if (receive_credits < t->recv_credit_target) { + while (true) { + if (use_free) + recvmsg = get_free_recvmsg(t); + else + recvmsg = get_empty_recvmsg(t); + if (!recvmsg) { + if (use_free) { + use_free = 0; + continue; + } else { + break; + } + } + + recvmsg->type = SMB_DIRECT_MSG_DATA_TRANSFER; + recvmsg->first_segment = false; + + ret = smb_direct_post_recv(t, recvmsg); + if (ret) { + pr_err("Can't post recv: %d\n", ret); + put_recvmsg(t, recvmsg); + break; + } + credits++; + } + } + + spin_lock(&t->receive_credit_lock); + t->recv_credits += credits; + t->count_avail_recvmsg -= credits; + spin_unlock(&t->receive_credit_lock); + + spin_lock(&t->lock_new_recv_credits); + t->new_recv_credits += credits; + spin_unlock(&t->lock_new_recv_credits); + + if (credits) + queue_work(smb_direct_wq, &t->send_immediate_work); +} + +static void send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct smb_direct_sendmsg *sendmsg, *sibling; + struct smb_direct_transport *t; + struct list_head *pos, *prev, *end; + + sendmsg = container_of(wc->wr_cqe, struct smb_direct_sendmsg, cqe); + t = sendmsg->transport; + + ksmbd_debug(RDMA, "Send completed. status='%s (%d)', opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, + wc->opcode); + + if (wc->status != IB_WC_SUCCESS || wc->opcode != IB_WC_SEND) { + pr_err("Send error. status='%s (%d)', opcode=%d\n", + ib_wc_status_msg(wc->status), wc->status, + wc->opcode); + smb_direct_disconnect_rdma_connection(t); + } + + if (sendmsg->num_sge > 1) { + if (atomic_dec_and_test(&t->send_payload_pending)) + wake_up(&t->wait_send_payload_pending); + } else { + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); + } + + /* iterate and free the list of messages in reverse. the list's head + * is invalid. 
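 * For batched sends the head lived in the sender's on-stack send context
 * and may already have been reused, so walk backwards from the completed
 * message via prev pointers instead of using list_for_each.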
+ */ + for (pos = &sendmsg->list, prev = pos->prev, end = sendmsg->list.next; + prev != end; pos = prev, prev = prev->prev) { + sibling = container_of(pos, struct smb_direct_sendmsg, list); + smb_direct_free_sendmsg(t, sibling); + } + + sibling = container_of(pos, struct smb_direct_sendmsg, list); + smb_direct_free_sendmsg(t, sibling); +} + +static int manage_credits_prior_sending(struct smb_direct_transport *t) +{ + int new_credits; + + spin_lock(&t->lock_new_recv_credits); + new_credits = t->new_recv_credits; + t->new_recv_credits = 0; + spin_unlock(&t->lock_new_recv_credits); + + return new_credits; +} + +static int smb_direct_post_send(struct smb_direct_transport *t, + struct ib_send_wr *wr) +{ + int ret; + + if (wr->num_sge > 1) + atomic_inc(&t->send_payload_pending); + else + atomic_inc(&t->send_pending); + + ret = ib_post_send(t->qp, wr, NULL); + if (ret) { + pr_err("failed to post send: %d\n", ret); + if (wr->num_sge > 1) { + if (atomic_dec_and_test(&t->send_payload_pending)) + wake_up(&t->wait_send_payload_pending); + } else { + if (atomic_dec_and_test(&t->send_pending)) + wake_up(&t->wait_send_pending); + } + smb_direct_disconnect_rdma_connection(t); + } + return ret; +} + +static void smb_direct_send_ctx_init(struct smb_direct_transport *t, + struct smb_direct_send_ctx *send_ctx, + bool need_invalidate_rkey, + unsigned int remote_key) +{ + INIT_LIST_HEAD(&send_ctx->msg_list); + send_ctx->wr_cnt = 0; + send_ctx->need_invalidate_rkey = need_invalidate_rkey; + send_ctx->remote_key = remote_key; +} + +static int smb_direct_flush_send_list(struct smb_direct_transport *t, + struct smb_direct_send_ctx *send_ctx, + bool is_last) +{ + struct smb_direct_sendmsg *first, *last; + int ret; + + if (list_empty(&send_ctx->msg_list)) + return 0; + + first = list_first_entry(&send_ctx->msg_list, + struct smb_direct_sendmsg, + list); + last = list_last_entry(&send_ctx->msg_list, + struct smb_direct_sendmsg, + list); + + last->wr.send_flags = IB_SEND_SIGNALED; + last->wr.wr_cqe = &last->cqe; + if (is_last && send_ctx->need_invalidate_rkey) { + last->wr.opcode = IB_WR_SEND_WITH_INV; + last->wr.ex.invalidate_rkey = send_ctx->remote_key; + } + + ret = smb_direct_post_send(t, &first->wr); + if (!ret) { + smb_direct_send_ctx_init(t, send_ctx, + send_ctx->need_invalidate_rkey, + send_ctx->remote_key); + } else { + atomic_add(send_ctx->wr_cnt, &t->send_credits); + wake_up(&t->wait_send_credits); + list_for_each_entry_safe(first, last, &send_ctx->msg_list, + list) { + smb_direct_free_sendmsg(t, first); + } + } + return ret; +} + +static int wait_for_credits(struct smb_direct_transport *t, + wait_queue_head_t *waitq, atomic_t *credits) +{ + int ret; + + do { + if (atomic_dec_return(credits) >= 0) + return 0; + + atomic_inc(credits); + ret = wait_event_interruptible(*waitq, + atomic_read(credits) > 0 || + t->status != SMB_DIRECT_CS_CONNECTED); + + if (t->status != SMB_DIRECT_CS_CONNECTED) + return -ENOTCONN; + else if (ret < 0) + return ret; + } while (true); +} + +static int wait_for_send_credits(struct smb_direct_transport *t, + struct smb_direct_send_ctx *send_ctx) +{ + int ret; + + if (send_ctx && + (send_ctx->wr_cnt >= 16 || atomic_read(&t->send_credits) <= 1)) { + ret = smb_direct_flush_send_list(t, send_ctx, false); + if (ret) + return ret; + } + + return wait_for_credits(t, &t->wait_send_credits, &t->send_credits); +} + +static int smb_direct_create_header(struct smb_direct_transport *t, + int size, int remaining_data_length, + struct smb_direct_sendmsg **sendmsg_out) +{ + struct smb_direct_sendmsg 
*sendmsg; + struct smb_direct_data_transfer *packet; + int header_length; + int ret; + + sendmsg = smb_direct_alloc_sendmsg(t); + if (IS_ERR(sendmsg)) + return PTR_ERR(sendmsg); + + /* Fill in the packet header */ + packet = (struct smb_direct_data_transfer *)sendmsg->packet; + packet->credits_requested = cpu_to_le16(t->send_credit_target); + packet->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); + + packet->flags = 0; + packet->reserved = 0; + if (!size) + packet->data_offset = 0; + else + packet->data_offset = cpu_to_le32(24); + packet->data_length = cpu_to_le32(size); + packet->remaining_data_length = cpu_to_le32(remaining_data_length); + packet->padding = 0; + + ksmbd_debug(RDMA, + "credits_requested=%d credits_granted=%d data_offset=%d data_length=%d remaining_data_length=%d\n", + le16_to_cpu(packet->credits_requested), + le16_to_cpu(packet->credits_granted), + le32_to_cpu(packet->data_offset), + le32_to_cpu(packet->data_length), + le32_to_cpu(packet->remaining_data_length)); + + /* Map the packet to DMA */ + header_length = sizeof(struct smb_direct_data_transfer); + /* If this is a packet without payload, don't send padding */ + if (!size) + header_length = + offsetof(struct smb_direct_data_transfer, padding); + + sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device, + (void *)packet, + header_length, + DMA_TO_DEVICE); + ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr); + if (ret) { + smb_direct_free_sendmsg(t, sendmsg); + return ret; + } + + sendmsg->num_sge = 1; + sendmsg->sge[0].length = header_length; + sendmsg->sge[0].lkey = t->pd->local_dma_lkey; + + *sendmsg_out = sendmsg; + return 0; +} + +static int get_sg_list(void *buf, int size, struct scatterlist *sg_list, int nentries) +{ + bool high = is_vmalloc_addr(buf); + struct page *page; + int offset, len; + int i = 0; + + if (nentries < get_buf_page_count(buf, size)) + return -EINVAL; + + offset = offset_in_page(buf); + buf -= offset; + while (size > 0) { + len = min_t(int, PAGE_SIZE - offset, size); + if (high) + page = vmalloc_to_page(buf); + else + page = kmap_to_page(buf); + + if (!sg_list) + return -EINVAL; + sg_set_page(sg_list, page, len, offset); + sg_list = sg_next(sg_list); + + buf += PAGE_SIZE; + size -= len; + offset = 0; + i++; + } + return i; +} + +static int get_mapped_sg_list(struct ib_device *device, void *buf, int size, + struct scatterlist *sg_list, int nentries, + enum dma_data_direction dir) +{ + int npages; + + npages = get_sg_list(buf, size, sg_list, nentries); + if (npages <= 0) + return -EINVAL; + return ib_dma_map_sg(device, sg_list, npages, dir); +} + +static int post_sendmsg(struct smb_direct_transport *t, + struct smb_direct_send_ctx *send_ctx, + struct smb_direct_sendmsg *msg) +{ + int i; + + for (i = 0; i < msg->num_sge; i++) + ib_dma_sync_single_for_device(t->cm_id->device, + msg->sge[i].addr, msg->sge[i].length, + DMA_TO_DEVICE); + + msg->cqe.done = send_done; + msg->wr.opcode = IB_WR_SEND; + msg->wr.sg_list = &msg->sge[0]; + msg->wr.num_sge = msg->num_sge; + msg->wr.next = NULL; + + if (send_ctx) { + msg->wr.wr_cqe = NULL; + msg->wr.send_flags = 0; + if (!list_empty(&send_ctx->msg_list)) { + struct smb_direct_sendmsg *last; + + last = list_last_entry(&send_ctx->msg_list, + struct smb_direct_sendmsg, + list); + last->wr.next = &msg->wr; + } + list_add_tail(&msg->list, &send_ctx->msg_list); + send_ctx->wr_cnt++; + return 0; + } + + msg->wr.wr_cqe = &msg->cqe; + msg->wr.send_flags = IB_SEND_SIGNALED; + return smb_direct_post_send(t, &msg->wr); +} + +static 
int smb_direct_post_send_data(struct smb_direct_transport *t, + struct smb_direct_send_ctx *send_ctx, + struct kvec *iov, int niov, + int remaining_data_length) +{ + int i, j, ret; + struct smb_direct_sendmsg *msg; + int data_length; + struct scatterlist sg[SMB_DIRECT_MAX_SEND_SGES - 1]; + + ret = wait_for_send_credits(t, send_ctx); + if (ret) + return ret; + + data_length = 0; + for (i = 0; i < niov; i++) + data_length += iov[i].iov_len; + + ret = smb_direct_create_header(t, data_length, remaining_data_length, + &msg); + if (ret) { + atomic_inc(&t->send_credits); + return ret; + } + + for (i = 0; i < niov; i++) { + struct ib_sge *sge; + int sg_cnt; + + sg_init_table(sg, SMB_DIRECT_MAX_SEND_SGES - 1); + sg_cnt = get_mapped_sg_list(t->cm_id->device, + iov[i].iov_base, iov[i].iov_len, + sg, SMB_DIRECT_MAX_SEND_SGES - 1, + DMA_TO_DEVICE); + if (sg_cnt <= 0) { + pr_err("failed to map buffer\n"); + ret = -ENOMEM; + goto err; + } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES - 1) { + pr_err("buffer not fitted into sges\n"); + ret = -E2BIG; + ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt, + DMA_TO_DEVICE); + goto err; + } + + for (j = 0; j < sg_cnt; j++) { + sge = &msg->sge[msg->num_sge]; + sge->addr = sg_dma_address(&sg[j]); + sge->length = sg_dma_len(&sg[j]); + sge->lkey = t->pd->local_dma_lkey; + msg->num_sge++; + } + } + + ret = post_sendmsg(t, send_ctx, msg); + if (ret) + goto err; + return 0; +err: + smb_direct_free_sendmsg(t, msg); + atomic_inc(&t->send_credits); + return ret; +} + +static int smb_direct_writev(struct ksmbd_transport *t, + struct kvec *iov, int niovs, int buflen, + bool need_invalidate, unsigned int remote_key) +{ + struct smb_direct_transport *st = smb_trans_direct_transfort(t); + int remaining_data_length; + int start, i, j; + int max_iov_size = st->max_send_size - + sizeof(struct smb_direct_data_transfer); + int ret; + struct kvec vec; + struct smb_direct_send_ctx send_ctx; + + if (st->status != SMB_DIRECT_CS_CONNECTED) + return -ENOTCONN; + + //FIXME: skip RFC1002 header.. + buflen -= 4; + iov[0].iov_base += 4; + iov[0].iov_len -= 4; + + remaining_data_length = buflen; + ksmbd_debug(RDMA, "Sending smb (RDMA): smb_len=%u\n", buflen); + + smb_direct_send_ctx_init(st, &send_ctx, need_invalidate, remote_key); + start = i = 0; + buflen = 0; + while (true) { + buflen += iov[i].iov_len; + if (buflen > max_iov_size) { + if (i > start) { + remaining_data_length -= + (buflen - iov[i].iov_len); + ret = smb_direct_post_send_data(st, &send_ctx, + &iov[start], i - start, + remaining_data_length); + if (ret) + goto done; + } else { + /* iov[start] is too big, break it */ + int nvec = (buflen + max_iov_size - 1) / + max_iov_size; + + for (j = 0; j < nvec; j++) { + vec.iov_base = + (char *)iov[start].iov_base + + j * max_iov_size; + vec.iov_len = + min_t(int, max_iov_size, + buflen - max_iov_size * j); + remaining_data_length -= vec.iov_len; + ret = smb_direct_post_send_data(st, &send_ctx, &vec, 1, + remaining_data_length); + if (ret) + goto done; + } + i++; + if (i == niovs) + break; + } + start = i; + buflen = 0; + } else { + i++; + if (i == niovs) { + /* send out all remaining vecs */ + remaining_data_length -= buflen; + ret = smb_direct_post_send_data(st, &send_ctx, + &iov[start], i - start, + remaining_data_length); + if (ret) + goto done; + break; + } + } + } + +done: + ret = smb_direct_flush_send_list(st, &send_ctx, true); + + /* + * As an optimization, we don't wait for individual I/O to finish + * before sending the next one. 
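 * send_done() only decrements the pending counters as completions
 * arrive; nothing is serialized on a per-message basis.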
+ * Send them all and wait for the pending send count to reach 0, + * which means all the I/Os have gone out and we are good to return. + */ + + wait_event(st->wait_send_payload_pending, + atomic_read(&st->send_payload_pending) == 0); + return ret; +} + +static void read_write_done(struct ib_cq *cq, struct ib_wc *wc, + enum dma_data_direction dir) +{ + struct smb_direct_rdma_rw_msg *msg = container_of(wc->wr_cqe, + struct smb_direct_rdma_rw_msg, cqe); + struct smb_direct_transport *t = msg->t; + + if (wc->status != IB_WC_SUCCESS) { + pr_err("read/write error. opcode = %d, status = %s(%d)\n", + wc->opcode, ib_wc_status_msg(wc->status), wc->status); + smb_direct_disconnect_rdma_connection(t); + } + + if (atomic_inc_return(&t->rw_avail_ops) > 0) + wake_up(&t->wait_rw_avail_ops); + + rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + msg->sg_list, msg->sgt.nents, dir); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + complete(msg->completion); + kfree(msg); +} + +static void read_done(struct ib_cq *cq, struct ib_wc *wc) +{ + read_write_done(cq, wc, DMA_FROM_DEVICE); +} + +static void write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + read_write_done(cq, wc, DMA_TO_DEVICE); +} + +static int smb_direct_rdma_xmit(struct smb_direct_transport *t, void *buf, + int buf_len, u32 remote_key, u64 remote_offset, + u32 remote_len, bool is_read) +{ + struct smb_direct_rdma_rw_msg *msg; + int ret; + DECLARE_COMPLETION_ONSTACK(completion); + struct ib_send_wr *first_wr = NULL; + + ret = wait_for_credits(t, &t->wait_rw_avail_ops, &t->rw_avail_ops); + if (ret < 0) + return ret; + + /* TODO: mempool */ + msg = kmalloc(offsetof(struct smb_direct_rdma_rw_msg, sg_list) + + sizeof(struct scatterlist) * SG_CHUNK_SIZE, GFP_KERNEL); + if (!msg) { + atomic_inc(&t->rw_avail_ops); + return -ENOMEM; + } + + msg->sgt.sgl = &msg->sg_list[0]; + ret = sg_alloc_table_chained(&msg->sgt, + get_buf_page_count(buf, buf_len), + msg->sg_list, SG_CHUNK_SIZE); + if (ret) { + atomic_inc(&t->rw_avail_ops); + kfree(msg); + return -ENOMEM; + } + + ret = get_sg_list(buf, buf_len, msg->sgt.sgl, msg->sgt.orig_nents); + if (ret <= 0) { + pr_err("failed to get pages\n"); + goto err; + } + + ret = rdma_rw_ctx_init(&msg->rw_ctx, t->qp, t->qp->port, + msg->sg_list, get_buf_page_count(buf, buf_len), + 0, remote_offset, remote_key, + is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + if (ret < 0) { + pr_err("failed to init rdma_rw_ctx: %d\n", ret); + goto err; + } + + msg->t = t; + msg->cqe.done = is_read ? read_done : write_done; + msg->completion = &completion; + first_wr = rdma_rw_ctx_wrs(&msg->rw_ctx, t->qp, t->qp->port, + &msg->cqe, NULL); + + ret = ib_post_send(t->qp, first_wr, NULL); + if (ret) { + pr_err("failed to post send wr: %d\n", ret); + goto err; + } + + wait_for_completion(&completion); + return 0; + +err: + atomic_inc(&t->rw_avail_ops); + if (first_wr) + rdma_rw_ctx_destroy(&msg->rw_ctx, t->qp, t->qp->port, + msg->sg_list, msg->sgt.nents, + is_read ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE); + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); + kfree(msg); + return ret; +} + +static int smb_direct_rdma_write(struct ksmbd_transport *t, void *buf, + unsigned int buflen, u32 remote_key, + u64 remote_offset, u32 remote_len) +{ + return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, + remote_key, remote_offset, + remote_len, false); +} + +static int smb_direct_rdma_read(struct ksmbd_transport *t, void *buf, + unsigned int buflen, u32 remote_key, + u64 remote_offset, u32 remote_len) +{ + return smb_direct_rdma_xmit(smb_trans_direct_transfort(t), buf, buflen, + remote_key, remote_offset, + remote_len, true); +} + +static void smb_direct_disconnect(struct ksmbd_transport *t) +{ + struct smb_direct_transport *st = smb_trans_direct_transfort(t); + + ksmbd_debug(RDMA, "Disconnecting cm_id=%p\n", st->cm_id); + + smb_direct_disconnect_rdma_work(&st->disconnect_work); + wait_event_interruptible(st->wait_status, + st->status == SMB_DIRECT_CS_DISCONNECTED); + free_transport(st); +} + +static int smb_direct_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct smb_direct_transport *t = cm_id->context; + + ksmbd_debug(RDMA, "RDMA CM event. cm_id=%p event=%s (%d)\n", + cm_id, rdma_event_msg(event->event), event->event); + + switch (event->event) { + case RDMA_CM_EVENT_ESTABLISHED: { + t->status = SMB_DIRECT_CS_CONNECTED; + wake_up_interruptible(&t->wait_status); + break; + } + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_DISCONNECTED: { + t->status = SMB_DIRECT_CS_DISCONNECTED; + wake_up_interruptible(&t->wait_status); + wake_up_interruptible(&t->wait_reassembly_queue); + wake_up(&t->wait_send_credits); + break; + } + case RDMA_CM_EVENT_CONNECT_ERROR: { + t->status = SMB_DIRECT_CS_DISCONNECTED; + wake_up_interruptible(&t->wait_status); + break; + } + default: + pr_err("Unexpected RDMA CM event. cm_id=%p, event=%s (%d)\n", + cm_id, rdma_event_msg(event->event), + event->event); + break; + } + return 0; +} + +static void smb_direct_qpair_handler(struct ib_event *event, void *context) +{ + struct smb_direct_transport *t = context; + + ksmbd_debug(RDMA, "Received QP event. 
cm_id=%p, event=%s (%d)\n", + t->cm_id, ib_event_msg(event->event), event->event); + + switch (event->event) { + case IB_EVENT_CQ_ERR: + case IB_EVENT_QP_FATAL: + smb_direct_disconnect_rdma_connection(t); + break; + default: + break; + } +} + +static int smb_direct_send_negotiate_response(struct smb_direct_transport *t, + int failed) +{ + struct smb_direct_sendmsg *sendmsg; + struct smb_direct_negotiate_resp *resp; + int ret; + + sendmsg = smb_direct_alloc_sendmsg(t); + if (IS_ERR(sendmsg)) + return -ENOMEM; + + resp = (struct smb_direct_negotiate_resp *)sendmsg->packet; + if (failed) { + memset(resp, 0, sizeof(*resp)); + resp->min_version = cpu_to_le16(0x0100); + resp->max_version = cpu_to_le16(0x0100); + resp->status = STATUS_NOT_SUPPORTED; + } else { + resp->status = STATUS_SUCCESS; + resp->min_version = SMB_DIRECT_VERSION_LE; + resp->max_version = SMB_DIRECT_VERSION_LE; + resp->negotiated_version = SMB_DIRECT_VERSION_LE; + resp->reserved = 0; + resp->credits_requested = + cpu_to_le16(t->send_credit_target); + resp->credits_granted = cpu_to_le16(manage_credits_prior_sending(t)); + resp->max_readwrite_size = cpu_to_le32(t->max_rdma_rw_size); + resp->preferred_send_size = cpu_to_le32(t->max_send_size); + resp->max_receive_size = cpu_to_le32(t->max_recv_size); + resp->max_fragmented_size = + cpu_to_le32(t->max_fragmented_recv_size); + } + + sendmsg->sge[0].addr = ib_dma_map_single(t->cm_id->device, + (void *)resp, sizeof(*resp), + DMA_TO_DEVICE); + ret = ib_dma_mapping_error(t->cm_id->device, sendmsg->sge[0].addr); + if (ret) { + smb_direct_free_sendmsg(t, sendmsg); + return ret; + } + + sendmsg->num_sge = 1; + sendmsg->sge[0].length = sizeof(*resp); + sendmsg->sge[0].lkey = t->pd->local_dma_lkey; + + ret = post_sendmsg(t, NULL, sendmsg); + if (ret) { + smb_direct_free_sendmsg(t, sendmsg); + return ret; + } + + wait_event(t->wait_send_pending, + atomic_read(&t->send_pending) == 0); + return 0; +} + +static int smb_direct_accept_client(struct smb_direct_transport *t) +{ + struct rdma_conn_param conn_param; + struct ib_port_immutable port_immutable; + u32 ird_ord_hdr[2]; + int ret; + + memset(&conn_param, 0, sizeof(conn_param)); + conn_param.initiator_depth = min_t(u8, t->cm_id->device->attrs.max_qp_rd_atom, + SMB_DIRECT_CM_INITIATOR_DEPTH); + conn_param.responder_resources = 0; + + t->cm_id->device->ops.get_port_immutable(t->cm_id->device, + t->cm_id->port_num, + &port_immutable); + if (port_immutable.core_cap_flags & RDMA_CORE_PORT_IWARP) { + ird_ord_hdr[0] = conn_param.responder_resources; + ird_ord_hdr[1] = 1; + conn_param.private_data = ird_ord_hdr; + conn_param.private_data_len = sizeof(ird_ord_hdr); + } else { + conn_param.private_data = NULL; + conn_param.private_data_len = 0; + } + conn_param.retry_count = SMB_DIRECT_CM_RETRY; + conn_param.rnr_retry_count = SMB_DIRECT_CM_RNR_RETRY; + conn_param.flow_control = 0; + + ret = rdma_accept(t->cm_id, &conn_param); + if (ret) { + pr_err("error at rdma_accept: %d\n", ret); + return ret; + } + + wait_event_interruptible(t->wait_status, + t->status != SMB_DIRECT_CS_NEW); + if (t->status != SMB_DIRECT_CS_CONNECTED) + return -ENOTCONN; + return 0; +} + +static int smb_direct_negotiate(struct smb_direct_transport *t) +{ + int ret; + struct smb_direct_recvmsg *recvmsg; + struct smb_direct_negotiate_req *req; + + recvmsg = get_free_recvmsg(t); + if (!recvmsg) + return -ENOMEM; + recvmsg->type = SMB_DIRECT_MSG_NEGOTIATE_REQ; + + ret = smb_direct_post_recv(t, recvmsg); + if (ret) { + pr_err("Can't post recv: %d\n", ret); + goto out; + } + + 
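As a side note, the negotiate path that continues below clamps the transport's send/receive sizes against the limits the client advertised in its [MS-SMBD] 2.2.1 request, using the same min_t() pattern. A small userspace sketch of that clamping; every numeric value here is an assumption chosen for illustration, not a driver default:

#include <stdio.h>

struct limits {
	unsigned int preferred_send_size;
	unsigned int max_receive_size;
	unsigned int max_fragmented_size;
};

int main(void)
{
	struct limits req = { 1364, 8192, 131072 }; /* assumed client request */
	unsigned int max_recv_size = 8192;          /* assumed server cap */
	unsigned int max_send_size = 8192;

	/* never receive more than the peer will send ... */
	if (req.preferred_send_size < max_recv_size)
		max_recv_size = req.preferred_send_size;
	/* ... and never send more than the peer will accept */
	if (req.max_receive_size < max_send_size)
		max_send_size = req.max_receive_size;

	printf("negotiated: recv <= %u, send <= %u, fragmented <= %u\n",
	       max_recv_size, max_send_size, req.max_fragmented_size);
	return 0;
}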
t->negotiation_requested = false; + ret = smb_direct_accept_client(t); + if (ret) { + pr_err("Can't accept client\n"); + goto out; + } + + smb_direct_post_recv_credits(&t->post_recv_credits_work.work); + + ksmbd_debug(RDMA, "Waiting for SMB_DIRECT negotiate request\n"); + ret = wait_event_interruptible_timeout(t->wait_status, + t->negotiation_requested || + t->status == SMB_DIRECT_CS_DISCONNECTED, + SMB_DIRECT_NEGOTIATE_TIMEOUT * HZ); + if (ret <= 0 || t->status == SMB_DIRECT_CS_DISCONNECTED) { + ret = ret < 0 ? ret : -ETIMEDOUT; + goto out; + } + + ret = smb_direct_check_recvmsg(recvmsg); + if (ret == -ECONNABORTED) + goto out; + + req = (struct smb_direct_negotiate_req *)recvmsg->packet; + t->max_recv_size = min_t(int, t->max_recv_size, + le32_to_cpu(req->preferred_send_size)); + t->max_send_size = min_t(int, t->max_send_size, + le32_to_cpu(req->max_receive_size)); + t->max_fragmented_send_size = + le32_to_cpu(req->max_fragmented_size); + + ret = smb_direct_send_negotiate_response(t, ret); +out: + if (recvmsg) + put_recvmsg(t, recvmsg); + return ret; +} + +static int smb_direct_init_params(struct smb_direct_transport *t, + struct ib_qp_cap *cap) +{ + struct ib_device *device = t->cm_id->device; + int max_send_sges, max_pages, max_rw_wrs, max_send_wrs; + + /* We need 2 more sges, because an SMB_DIRECT header will be mapped, + * and the send buffer may not be page aligned. + */ + t->max_send_size = smb_direct_max_send_size; + max_send_sges = DIV_ROUND_UP(t->max_send_size, PAGE_SIZE) + 2; + if (max_send_sges > SMB_DIRECT_MAX_SEND_SGES) { + pr_err("max_send_size %d is too large\n", t->max_send_size); + return -EINVAL; + } + + /* + * Allow smb_direct_max_outstanding_rw_ops in-flight RDMA + * read/writes. The HCA guarantees at least max_send_sge sges for + * an RDMA read/write work request, and if memory registration is used, + * we need reg_mr and local_inv wrs for each read/write. 
+ */ + t->max_rdma_rw_size = smb_direct_max_read_write_size; + max_pages = DIV_ROUND_UP(t->max_rdma_rw_size, PAGE_SIZE) + 1; + max_rw_wrs = DIV_ROUND_UP(max_pages, SMB_DIRECT_MAX_SEND_SGES); + max_rw_wrs += rdma_rw_mr_factor(device, t->cm_id->port_num, + max_pages) * 2; + max_rw_wrs *= smb_direct_max_outstanding_rw_ops; + + max_send_wrs = smb_direct_send_credit_target + max_rw_wrs; + if (max_send_wrs > device->attrs.max_cqe || + max_send_wrs > device->attrs.max_qp_wr) { + pr_err("consider lowering send_credit_target = %d, or max_outstanding_rw_ops = %d\n", + smb_direct_send_credit_target, + smb_direct_max_outstanding_rw_ops); + pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", + device->attrs.max_cqe, device->attrs.max_qp_wr); + return -EINVAL; + } + + if (smb_direct_receive_credit_max > device->attrs.max_cqe || + smb_direct_receive_credit_max > device->attrs.max_qp_wr) { + pr_err("consider lowering receive_credit_max = %d\n", + smb_direct_receive_credit_max); + pr_err("Possible CQE overrun, device reporting max_cqe %d max_qp_wr %d\n", + device->attrs.max_cqe, device->attrs.max_qp_wr); + return -EINVAL; + } + + if (device->attrs.max_send_sge < SMB_DIRECT_MAX_SEND_SGES) { + pr_err("device max_send_sge = %d too small\n", + device->attrs.max_send_sge); + return -EINVAL; + } + if (device->attrs.max_recv_sge < SMB_DIRECT_MAX_RECV_SGES) { + pr_err("device max_recv_sge = %d too small\n", + device->attrs.max_recv_sge); + return -EINVAL; + } + + t->recv_credits = 0; + t->count_avail_recvmsg = 0; + + t->recv_credit_max = smb_direct_receive_credit_max; + t->recv_credit_target = 10; + t->new_recv_credits = 0; + + t->send_credit_target = smb_direct_send_credit_target; + atomic_set(&t->send_credits, 0); + atomic_set(&t->rw_avail_ops, smb_direct_max_outstanding_rw_ops); + + t->max_send_size = smb_direct_max_send_size; + t->max_recv_size = smb_direct_max_receive_size; + t->max_fragmented_recv_size = smb_direct_max_fragmented_recv_size; + + cap->max_send_wr = max_send_wrs; + cap->max_recv_wr = t->recv_credit_max; + cap->max_send_sge = SMB_DIRECT_MAX_SEND_SGES; + cap->max_recv_sge = SMB_DIRECT_MAX_RECV_SGES; + cap->max_inline_data = 0; + cap->max_rdma_ctxs = 0; + return 0; +} + +static void smb_direct_destroy_pools(struct smb_direct_transport *t) +{ + struct smb_direct_recvmsg *recvmsg; + + while ((recvmsg = get_free_recvmsg(t))) + mempool_free(recvmsg, t->recvmsg_mempool); + while ((recvmsg = get_empty_recvmsg(t))) + mempool_free(recvmsg, t->recvmsg_mempool); + + mempool_destroy(t->recvmsg_mempool); + t->recvmsg_mempool = NULL; + + kmem_cache_destroy(t->recvmsg_cache); + t->recvmsg_cache = NULL; + + mempool_destroy(t->sendmsg_mempool); + t->sendmsg_mempool = NULL; + + kmem_cache_destroy(t->sendmsg_cache); + t->sendmsg_cache = NULL; +} + +static int smb_direct_create_pools(struct smb_direct_transport *t) +{ + char name[80]; + int i; + struct smb_direct_recvmsg *recvmsg; + + snprintf(name, sizeof(name), "smb_direct_rqst_pool_%p", t); + t->sendmsg_cache = kmem_cache_create(name, + sizeof(struct smb_direct_sendmsg) + + sizeof(struct smb_direct_negotiate_resp), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!t->sendmsg_cache) + return -ENOMEM; + + t->sendmsg_mempool = mempool_create(t->send_credit_target, + mempool_alloc_slab, mempool_free_slab, + t->sendmsg_cache); + if (!t->sendmsg_mempool) + goto err; + + snprintf(name, sizeof(name), "smb_direct_resp_%p", t); + t->recvmsg_cache = kmem_cache_create(name, + sizeof(struct smb_direct_recvmsg) + + t->max_recv_size, + 0, 
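For a feel of the queue sizing computed above, here is a back-of-envelope userspace sketch. mr_factor stands in for the device-dependent result of rdma_rw_mr_factor(), and every numeric value is an assumption picked for illustration, not a ksmbd default:

#include <stdio.h>

#define PAGE_SIZE 4096
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	int max_rdma_rw_size = 1024 * 1024; /* assumed smb_direct_max_read_write_size */
	int max_send_sges = 8;              /* assumed SMB_DIRECT_MAX_SEND_SGES */
	int mr_factor = 4;                  /* assumed rdma_rw_mr_factor() result */
	int outstanding_rw_ops = 8;         /* assumed max_outstanding_rw_ops */
	int send_credit_target = 255;       /* assumed send_credit_target */

	int max_pages = DIV_ROUND_UP(max_rdma_rw_size, PAGE_SIZE) + 1;
	int max_rw_wrs = DIV_ROUND_UP(max_pages, max_send_sges);

	max_rw_wrs += mr_factor * 2;        /* reg_mr + local_inv per context */
	max_rw_wrs *= outstanding_rw_ops;

	/* this total must fit within the device's max_cqe and max_qp_wr */
	printf("max_send_wrs = %d\n", send_credit_target + max_rw_wrs);
	return 0;
}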
SLAB_HWCACHE_ALIGN, NULL); + if (!t->recvmsg_cache) + goto err; + + t->recvmsg_mempool = + mempool_create(t->recv_credit_max, mempool_alloc_slab, + mempool_free_slab, t->recvmsg_cache); + if (!t->recvmsg_mempool) + goto err; + + INIT_LIST_HEAD(&t->recvmsg_queue); + + for (i = 0; i < t->recv_credit_max; i++) { + recvmsg = mempool_alloc(t->recvmsg_mempool, GFP_KERNEL); + if (!recvmsg) + goto err; + recvmsg->transport = t; + list_add(&recvmsg->list, &t->recvmsg_queue); + } + t->count_avail_recvmsg = t->recv_credit_max; + + return 0; +err: + smb_direct_destroy_pools(t); + return -ENOMEM; +} + +static int smb_direct_create_qpair(struct smb_direct_transport *t, + struct ib_qp_cap *cap) +{ + int ret; + struct ib_qp_init_attr qp_attr; + + t->pd = ib_alloc_pd(t->cm_id->device, 0); + if (IS_ERR(t->pd)) { + pr_err("Can't create RDMA PD\n"); + ret = PTR_ERR(t->pd); + t->pd = NULL; + return ret; + } + + t->send_cq = ib_alloc_cq(t->cm_id->device, t, + t->send_credit_target, 0, IB_POLL_WORKQUEUE); + if (IS_ERR(t->send_cq)) { + pr_err("Can't create RDMA send CQ\n"); + ret = PTR_ERR(t->send_cq); + t->send_cq = NULL; + goto err; + } + + t->recv_cq = ib_alloc_cq(t->cm_id->device, t, + cap->max_send_wr + cap->max_rdma_ctxs, + 0, IB_POLL_WORKQUEUE); + if (IS_ERR(t->recv_cq)) { + pr_err("Can't create RDMA recv CQ\n"); + ret = PTR_ERR(t->recv_cq); + t->recv_cq = NULL; + goto err; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.event_handler = smb_direct_qpair_handler; + qp_attr.qp_context = t; + qp_attr.cap = *cap; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + qp_attr.send_cq = t->send_cq; + qp_attr.recv_cq = t->recv_cq; + qp_attr.port_num = ~0; + + ret = rdma_create_qp(t->cm_id, t->pd, &qp_attr); + if (ret) { + pr_err("Can't create RDMA QP: %d\n", ret); + goto err; + } + + t->qp = t->cm_id->qp; + t->cm_id->event_handler = smb_direct_cm_handler; + + return 0; +err: + if (t->qp) { + ib_destroy_qp(t->qp); + t->qp = NULL; + } + if (t->recv_cq) { + ib_destroy_cq(t->recv_cq); + t->recv_cq = NULL; + } + if (t->send_cq) { + ib_destroy_cq(t->send_cq); + t->send_cq = NULL; + } + if (t->pd) { + ib_dealloc_pd(t->pd); + t->pd = NULL; + } + return ret; +} + +static int smb_direct_prepare(struct ksmbd_transport *t) +{ + struct smb_direct_transport *st = smb_trans_direct_transfort(t); + int ret; + struct ib_qp_cap qp_cap; + + ret = smb_direct_init_params(st, &qp_cap); + if (ret) { + pr_err("Can't configure RDMA parameters\n"); + return ret; + } + + ret = smb_direct_create_pools(st); + if (ret) { + pr_err("Can't init RDMA pool: %d\n", ret); + return ret; + } + + ret = smb_direct_create_qpair(st, &qp_cap); + if (ret) { + pr_err("Can't accept RDMA client: %d\n", ret); + return ret; + } + + ret = smb_direct_negotiate(st); + if (ret) { + pr_err("Can't negotiate: %d\n", ret); + return ret; + } + + st->status = SMB_DIRECT_CS_CONNECTED; + return 0; +} + +static bool rdma_frwr_is_supported(struct ib_device_attr *attrs) +{ + if (!(attrs->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) + return false; + if (attrs->max_fast_reg_page_list_len == 0) + return false; + return true; +} + +static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id) +{ + struct smb_direct_transport *t; + + if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) { + ksmbd_debug(RDMA, + "Fast Registration Work Requests is not supported. 
device capabilities=%llx\n", + new_cm_id->device->attrs.device_cap_flags); + return -EPROTONOSUPPORT; + } + + t = alloc_transport(new_cm_id); + if (!t) + return -ENOMEM; + + KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, "ksmbd:r%u", + SMB_DIRECT_PORT); + if (IS_ERR(KSMBD_TRANS(t)->handler)) { + int ret = PTR_ERR(KSMBD_TRANS(t)->handler); + + pr_err("Can't start thread\n"); + free_transport(t); + return ret; + } + + return 0; +} + +static int smb_direct_listen_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: { + int ret = smb_direct_handle_connect_request(cm_id); + + if (ret) { + pr_err("Can't create transport: %d\n", ret); + return ret; + } + + ksmbd_debug(RDMA, "Received connection request. cm_id=%p\n", + cm_id); + break; + } + default: + pr_err("Unexpected listen event. cm_id=%p, event=%s (%d)\n", + cm_id, rdma_event_msg(event->event), event->event); + break; + } + return 0; +} + +static int smb_direct_listen(int port) +{ + int ret; + struct rdma_cm_id *cm_id; + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + .sin_port = htons(port), + }; + + cm_id = rdma_create_id(&init_net, smb_direct_listen_handler, + &smb_direct_listener, RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) { + pr_err("Can't create cm id: %ld\n", PTR_ERR(cm_id)); + return PTR_ERR(cm_id); + } + + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin); + if (ret) { + pr_err("Can't bind: %d\n", ret); + goto err; + } + + smb_direct_listener.cm_id = cm_id; + + ret = rdma_listen(cm_id, 10); + if (ret) { + pr_err("Can't listen: %d\n", ret); + goto err; + } + return 0; +err: + smb_direct_listener.cm_id = NULL; + rdma_destroy_id(cm_id); + return ret; +} + +int ksmbd_rdma_init(void) +{ + int ret; + + smb_direct_listener.cm_id = NULL; + + /* When a client is running out of send credits, the credits are + * granted by the server's sending a packet using this queue. + * This avoids the situation where a client cannot send packets + * for lack of credits. + */ + smb_direct_wq = alloc_workqueue("ksmbd-smb_direct-wq", + WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); + if (!smb_direct_wq) + return -ENOMEM; + + ret = smb_direct_listen(SMB_DIRECT_PORT); + if (ret) { + destroy_workqueue(smb_direct_wq); + smb_direct_wq = NULL; + pr_err("Can't listen: %d\n", ret); + return ret; + } + + ksmbd_debug(RDMA, "init RDMA listener. 
cm_id=%p\n", + smb_direct_listener.cm_id); + return 0; +} + +int ksmbd_rdma_destroy(void) +{ + if (smb_direct_listener.cm_id) + rdma_destroy_id(smb_direct_listener.cm_id); + smb_direct_listener.cm_id = NULL; + + if (smb_direct_wq) { + flush_workqueue(smb_direct_wq); + destroy_workqueue(smb_direct_wq); + smb_direct_wq = NULL; + } + return 0; +} + +bool ksmbd_rdma_capable_netdev(struct net_device *netdev) +{ + struct ib_device *ibdev; + bool rdma_capable = false; + + ibdev = ib_device_get_by_netdev(netdev, RDMA_DRIVER_UNKNOWN); + if (ibdev) { + if (rdma_frwr_is_supported(&ibdev->attrs)) + rdma_capable = true; + ib_device_put(ibdev); + } + return rdma_capable; +} + +static struct ksmbd_transport_ops ksmbd_smb_direct_transport_ops = { + .prepare = smb_direct_prepare, + .disconnect = smb_direct_disconnect, + .writev = smb_direct_writev, + .read = smb_direct_read, + .rdma_read = smb_direct_rdma_read, + .rdma_write = smb_direct_rdma_write, +}; diff --git a/fs/ksmbd/transport_rdma.h b/fs/ksmbd/transport_rdma.h new file mode 100644 index 000000000000..0fa8adc0776f --- /dev/null +++ b/fs/ksmbd/transport_rdma.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2017, Microsoft Corporation. + * Copyright (C) 2018, LG Electronics. + */ + +#ifndef __KSMBD_TRANSPORT_RDMA_H__ +#define __KSMBD_TRANSPORT_RDMA_H__ + +#define SMB_DIRECT_PORT 5445 + +/* SMB DIRECT negotiation request packet [MS-SMBD] 2.2.1 */ +struct smb_direct_negotiate_req { + __le16 min_version; + __le16 max_version; + __le16 reserved; + __le16 credits_requested; + __le32 preferred_send_size; + __le32 max_receive_size; + __le32 max_fragmented_size; +} __packed; + +/* SMB DIRECT negotiation response packet [MS-SMBD] 2.2.2 */ +struct smb_direct_negotiate_resp { + __le16 min_version; + __le16 max_version; + __le16 negotiated_version; + __le16 reserved; + __le16 credits_requested; + __le16 credits_granted; + __le32 status; + __le32 max_readwrite_size; + __le32 preferred_send_size; + __le32 max_receive_size; + __le32 max_fragmented_size; +} __packed; + +#define SMB_DIRECT_RESPONSE_REQUESTED 0x0001 + +/* SMB DIRECT data transfer packet with payload [MS-SMBD] 2.2.3 */ +struct smb_direct_data_transfer { + __le16 credits_requested; + __le16 credits_granted; + __le16 flags; + __le16 reserved; + __le32 remaining_data_length; + __le32 data_offset; + __le32 data_length; + __le32 padding; + __u8 buffer[]; +} __packed; + +#ifdef CONFIG_SMB_SERVER_SMBDIRECT +int ksmbd_rdma_init(void); +int ksmbd_rdma_destroy(void); +bool ksmbd_rdma_capable_netdev(struct net_device *netdev); +#else +static inline int ksmbd_rdma_init(void) { return 0; } +static inline int ksmbd_rdma_destroy(void) { return 0; } +static inline bool ksmbd_rdma_capable_netdev(struct net_device *netdev) { return false; } +#endif + +#endif /* __KSMBD_TRANSPORT_RDMA_H__ */ diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c new file mode 100644 index 000000000000..dc15a5ecd2e0 --- /dev/null +++ b/fs/ksmbd/transport_tcp.c @@ -0,0 +1,618 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/freezer.h> + +#include "smb_common.h" +#include "server.h" +#include "auth.h" +#include "connection.h" +#include "transport_tcp.h" + +#define IFACE_STATE_DOWN BIT(0) +#define IFACE_STATE_CONFIGURED BIT(1) + +struct interface { + struct task_struct *ksmbd_kthread; + struct socket *ksmbd_socket; + struct list_head entry; + char *name; + struct mutex sock_release_lock; + int state; +}; + +static LIST_HEAD(iface_list); + +static int bind_additional_ifaces; + +struct tcp_transport { + struct ksmbd_transport transport; + struct socket *sock; + struct kvec *iov; + unsigned int nr_iov; +}; + +static struct ksmbd_transport_ops ksmbd_tcp_transport_ops; + +static void tcp_stop_kthread(struct task_struct *kthread); +static struct interface *alloc_iface(char *ifname); + +#define KSMBD_TRANS(t) (&(t)->transport) +#define TCP_TRANS(t) ((struct tcp_transport *)container_of(t, \ + struct tcp_transport, transport)) + +static inline void ksmbd_tcp_nodelay(struct socket *sock) +{ + tcp_sock_set_nodelay(sock->sk); +} + +static inline void ksmbd_tcp_reuseaddr(struct socket *sock) +{ + sock_set_reuseaddr(sock->sk); +} + +static inline void ksmbd_tcp_rcv_timeout(struct socket *sock, s64 secs) +{ + lock_sock(sock->sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + sock->sk->sk_rcvtimeo = secs * HZ; + else + sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + release_sock(sock->sk); +} + +static inline void ksmbd_tcp_snd_timeout(struct socket *sock, s64 secs) +{ + sock_set_sndtimeo(sock->sk, secs); +} + +static struct tcp_transport *alloc_transport(struct socket *client_sk) +{ + struct tcp_transport *t; + struct ksmbd_conn *conn; + + t = kzalloc(sizeof(*t), GFP_KERNEL); + if (!t) + return NULL; + t->sock = client_sk; + + conn = ksmbd_conn_alloc(); + if (!conn) { + kfree(t); + return NULL; + } + + conn->transport = KSMBD_TRANS(t); + KSMBD_TRANS(t)->conn = conn; + KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops; + return t; +} + +static void free_transport(struct tcp_transport *t) +{ + kernel_sock_shutdown(t->sock, SHUT_RDWR); + sock_release(t->sock); + t->sock = NULL; + + ksmbd_conn_free(KSMBD_TRANS(t)->conn); + kfree(t->iov); + kfree(t); +} + +/** + * kvec_array_init() - initialize an IO vector segment + * @new: IO vector to be initialized + * @iov: base IO vector + * @nr_segs: number of segments in base iov + * @bytes: number of bytes already read, to be skipped in @iov + * + * Return: Number of IO segments + */ +static unsigned int kvec_array_init(struct kvec *new, struct kvec *iov, + unsigned int nr_segs, size_t bytes) +{ + size_t base = 0; + + while (bytes || !iov->iov_len) { + int copy = min(bytes, iov->iov_len); + + bytes -= copy; + base += copy; + if (iov->iov_len == base) { + iov++; + nr_segs--; + base = 0; + } + } + + memcpy(new, iov, sizeof(*iov) * nr_segs); + new->iov_base += base; + new->iov_len -= base; + return nr_segs; +} + +/** + * get_conn_iovec() - get connection iovec for reading from socket + * @t: TCP transport instance + * @nr_segs: number of segments in iov + * + * Return: existing or newly allocated iovec + */ +static struct kvec *get_conn_iovec(struct tcp_transport *t, unsigned int nr_segs) +{ + struct kvec *new_iov; + + if (t->iov && nr_segs <= t->nr_iov) + return t->iov; + + /* not big enough -- allocate a new one and release the old */ + new_iov = kmalloc_array(nr_segs, sizeof(*new_iov), GFP_KERNEL); + if (new_iov) { + kfree(t->iov); + t->iov = new_iov; + t->nr_iov = nr_segs; + } + return new_iov; +} + +static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa) +{ 
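The kvec helpers above implement the usual "resume a partial read" pattern: kvec_array_init() produces a trimmed copy of the iovec array that starts at the first unread byte, so a retried recvmsg never re-reads data. A self-contained userspace rendering of that contract (iovec_advance is a hypothetical name for this sketch):

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* trim 'consumed' bytes off the front of an iovec array, like
 * kvec_array_init() above */
static unsigned int iovec_advance(struct iovec *out, const struct iovec *in,
				  unsigned int nr, size_t consumed)
{
	size_t base = 0;

	while (consumed || !in->iov_len) {
		size_t step = consumed < in->iov_len ? consumed : in->iov_len;

		consumed -= step;
		base += step;
		if (base == in->iov_len) {
			in++;        /* this segment fully consumed */
			nr--;
			base = 0;
		}
	}
	memcpy(out, in, sizeof(*in) * nr);
	out->iov_base = (char *)out->iov_base + base;
	out->iov_len -= base;
	return nr;
}

int main(void)
{
	char a[4] = "abcd", b[4] = "efgh";
	struct iovec in[2] = { { a, 4 }, { b, 4 } }, out[2];

	/* 5 bytes already read: the trimmed array starts at 'f' */
	unsigned int n = iovec_advance(out, in, 2, 5);

	printf("%u segment(s), first unread byte '%c'\n",
	       n, *(char *)out[0].iov_base);
	return 0;
}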
+ switch (sa->sa_family) { + case AF_INET: + return ntohs(((struct sockaddr_in *)sa)->sin_port); + case AF_INET6: + return ntohs(((struct sockaddr_in6 *)sa)->sin6_port); + } + return 0; +} + +/** + * ksmbd_tcp_new_connection() - create a new tcp session + * @client_sk: socket associated with new connection + * + * Whenever a new connection is requested, create a conn thread + * (session thread) to handle incoming SMB requests from the connection + * + * Return: 0 on success, otherwise error + */ +static int ksmbd_tcp_new_connection(struct socket *client_sk) +{ + struct sockaddr *csin; + int rc = 0; + struct tcp_transport *t; + + t = alloc_transport(client_sk); + if (!t) + return -ENOMEM; + + csin = KSMBD_TCP_PEER_SOCKADDR(KSMBD_TRANS(t)->conn); + if (kernel_getpeername(client_sk, csin) < 0) { + pr_err("client ip resolution failed\n"); + rc = -EINVAL; + goto out_error; + } + + KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, + "ksmbd:%u", + ksmbd_tcp_get_port(csin)); + if (IS_ERR(KSMBD_TRANS(t)->handler)) { + pr_err("cannot start conn thread\n"); + rc = PTR_ERR(KSMBD_TRANS(t)->handler); + free_transport(t); + } + return rc; + +out_error: + free_transport(t); + return rc; +} + +/** + * ksmbd_kthread_fn() - listen for new SMB connections and call back into the server + * @p: arguments to forker thread + * + * Return: 0 on kthread termination + */ +static int ksmbd_kthread_fn(void *p) +{ + struct socket *client_sk = NULL; + struct interface *iface = (struct interface *)p; + int ret; + + while (!kthread_should_stop()) { + mutex_lock(&iface->sock_release_lock); + if (!iface->ksmbd_socket) { + mutex_unlock(&iface->sock_release_lock); + break; + } + ret = kernel_accept(iface->ksmbd_socket, &client_sk, + O_NONBLOCK); + mutex_unlock(&iface->sock_release_lock); + if (ret) { + if (ret == -EAGAIN) + /* check for new connections every 100 msecs */ + schedule_timeout_interruptible(HZ / 10); + continue; + } + + ksmbd_debug(CONN, "connect success: accepted new connection\n"); + client_sk->sk->sk_rcvtimeo = KSMBD_TCP_RECV_TIMEOUT; + client_sk->sk->sk_sndtimeo = KSMBD_TCP_SEND_TIMEOUT; + + ksmbd_tcp_new_connection(client_sk); + } + + ksmbd_debug(CONN, "releasing socket\n"); + return 0; +} + +/** + * ksmbd_tcp_run_kthread() - start forker thread + * @iface: pointer to struct interface + * + * Start the forker thread (ksmbd/0) at module init time to listen + * on port 445 for new SMB connection requests. 
It creates per-connection + * server threads (ksmbd/x). + * + * Return: 0 on success or error number + */ +static int ksmbd_tcp_run_kthread(struct interface *iface) +{ + int rc; + struct task_struct *kthread; + + kthread = kthread_run(ksmbd_kthread_fn, (void *)iface, "ksmbd-%s", + iface->name); + if (IS_ERR(kthread)) { + rc = PTR_ERR(kthread); + return rc; + } + iface->ksmbd_kthread = kthread; + + return 0; +} + +/** + * ksmbd_tcp_readv() - read data from socket in given iovec + * @t: TCP transport instance + * @iov_orig: base IO vector + * @nr_segs: number of segments in base iov + * @to_read: number of bytes to read from socket + * + * Return: on success return number of bytes read from socket, + * otherwise return error number + */ +static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig, + unsigned int nr_segs, unsigned int to_read) +{ + int length = 0; + int total_read; + unsigned int segs; + struct msghdr ksmbd_msg; + struct kvec *iov; + struct ksmbd_conn *conn = KSMBD_TRANS(t)->conn; + + iov = get_conn_iovec(t, nr_segs); + if (!iov) + return -ENOMEM; + + ksmbd_msg.msg_control = NULL; + ksmbd_msg.msg_controllen = 0; + + for (total_read = 0; to_read; total_read += length, to_read -= length) { + try_to_freeze(); + + if (!ksmbd_conn_alive(conn)) { + total_read = -ESHUTDOWN; + break; + } + segs = kvec_array_init(iov, iov_orig, nr_segs, total_read); + + length = kernel_recvmsg(t->sock, &ksmbd_msg, + iov, segs, to_read, 0); + + if (length == -EINTR) { + total_read = -ESHUTDOWN; + break; + } else if (conn->status == KSMBD_SESS_NEED_RECONNECT) { + total_read = -EAGAIN; + break; + } else if (length == -ERESTARTSYS || length == -EAGAIN) { + usleep_range(1000, 2000); + length = 0; + continue; + } else if (length <= 0) { + total_read = -EAGAIN; + break; + } + } + return total_read; +} + +/** + * ksmbd_tcp_read() - read data from socket in given buffer + * @t: TCP transport instance + * @buf: buffer to store read data from socket + * @to_read: number of bytes to read from socket + * + * Return: on success return number of bytes read from socket, + * otherwise return error number + */ +static int ksmbd_tcp_read(struct ksmbd_transport *t, char *buf, unsigned int to_read) +{ + struct kvec iov; + + iov.iov_base = buf; + iov.iov_len = to_read; + + return ksmbd_tcp_readv(TCP_TRANS(t), &iov, 1, to_read); +} + +static int ksmbd_tcp_writev(struct ksmbd_transport *t, struct kvec *iov, + int nvecs, int size, bool need_invalidate, + unsigned int remote_key) + +{ + struct msghdr smb_msg = {.msg_flags = MSG_NOSIGNAL}; + + return kernel_sendmsg(TCP_TRANS(t)->sock, &smb_msg, iov, nvecs, size); +} + +static void ksmbd_tcp_disconnect(struct ksmbd_transport *t) +{ + free_transport(TCP_TRANS(t)); +} + +static void tcp_destroy_socket(struct socket *ksmbd_socket) +{ + int ret; + + if (!ksmbd_socket) + return; + + /* set timeouts to zero */ + ksmbd_tcp_rcv_timeout(ksmbd_socket, 0); + ksmbd_tcp_snd_timeout(ksmbd_socket, 0); + + ret = kernel_sock_shutdown(ksmbd_socket, SHUT_RDWR); + if (ret) + pr_err("Failed to shutdown socket: %d\n", ret); + sock_release(ksmbd_socket); +} + +/** + * create_socket - create socket for ksmbd/0 + * @iface: interface to create the listening socket on + * + * Return: 0 on success or error number + */ +static int create_socket(struct interface *iface) +{ + int ret; + struct sockaddr_in6 sin6; + struct sockaddr_in sin; + struct socket *ksmbd_socket; + bool ipv4 = false; + + ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); + if (ret) { + pr_err("Can't create socket for ipv6, try ipv4: %d\n", ret); + ret = 
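An aside on the socket setup around this point: it prefers a single IPv6 listener (which on most systems also accepts v4-mapped peers) and falls back to IPv4 only when the IPv6 socket cannot be created. A userspace sketch of the same fallback policy, under those assumptions and with an arbitrary test port:

#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

static int make_listener(unsigned short port)
{
	int fd = socket(AF_INET6, SOCK_STREAM, 0);

	if (fd >= 0) {
		struct sockaddr_in6 sin6 = { 0 };

		sin6.sin6_family = AF_INET6;
		sin6.sin6_addr = in6addr_any;
		sin6.sin6_port = htons(port);
		if (bind(fd, (struct sockaddr *)&sin6, sizeof(sin6)) == 0 &&
		    listen(fd, 10) == 0)
			return fd;
		close(fd);
	}

	/* IPv6 unavailable: retry with a plain IPv4 socket */
	fd = socket(AF_INET, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	{
		struct sockaddr_in sin = { 0 };

		sin.sin_family = AF_INET;
		sin.sin_addr.s_addr = htonl(INADDR_ANY);
		sin.sin_port = htons(port);
		if (bind(fd, (struct sockaddr *)&sin, sizeof(sin)) ||
		    listen(fd, 10)) {
			close(fd);
			return -1;
		}
	}
	return fd;
}

int main(void)
{
	int fd = make_listener(14450); /* arbitrary unprivileged test port */

	return fd < 0;
}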
sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, + &ksmbd_socket); + if (ret) { + pr_err("Can't create socket for ipv4: %d\n", ret); + goto out_error; + } + + sin.sin_family = PF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(server_conf.tcp_port); + ipv4 = true; + } else { + sin6.sin6_family = PF_INET6; + sin6.sin6_addr = in6addr_any; + sin6.sin6_port = htons(server_conf.tcp_port); + } + + ksmbd_tcp_nodelay(ksmbd_socket); + ksmbd_tcp_reuseaddr(ksmbd_socket); + + ret = sock_setsockopt(ksmbd_socket, + SOL_SOCKET, + SO_BINDTODEVICE, + KERNEL_SOCKPTR(iface->name), + strlen(iface->name)); + if (ret != -ENODEV && ret < 0) { + pr_err("Failed to set SO_BINDTODEVICE: %d\n", ret); + goto out_error; + } + + if (ipv4) + ret = kernel_bind(ksmbd_socket, (struct sockaddr *)&sin, + sizeof(sin)); + else + ret = kernel_bind(ksmbd_socket, (struct sockaddr *)&sin6, + sizeof(sin6)); + if (ret) { + pr_err("Failed to bind socket: %d\n", ret); + goto out_error; + } + + ksmbd_socket->sk->sk_rcvtimeo = KSMBD_TCP_RECV_TIMEOUT; + ksmbd_socket->sk->sk_sndtimeo = KSMBD_TCP_SEND_TIMEOUT; + + ret = kernel_listen(ksmbd_socket, KSMBD_SOCKET_BACKLOG); + if (ret) { + pr_err("Port listen() error: %d\n", ret); + goto out_error; + } + + iface->ksmbd_socket = ksmbd_socket; + ret = ksmbd_tcp_run_kthread(iface); + if (ret) { + pr_err("Can't start ksmbd main kthread: %d\n", ret); + goto out_error; + } + iface->state = IFACE_STATE_CONFIGURED; + + return 0; + +out_error: + tcp_destroy_socket(ksmbd_socket); + iface->ksmbd_socket = NULL; + return ret; +} + +static int ksmbd_netdev_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct net_device *netdev = netdev_notifier_info_to_dev(ptr); + struct interface *iface; + int ret, found = 0; + + switch (event) { + case NETDEV_UP: + if (netdev->priv_flags & IFF_BRIDGE_PORT) + return NOTIFY_OK; + + list_for_each_entry(iface, &iface_list, entry) { + if (!strcmp(iface->name, netdev->name)) { + found = 1; + if (iface->state != IFACE_STATE_DOWN) + break; + ret = create_socket(iface); + if (ret) + return NOTIFY_OK; + break; + } + } + if (!found && bind_additional_ifaces) { + iface = alloc_iface(kstrdup(netdev->name, GFP_KERNEL)); + if (!iface) + return NOTIFY_OK; + ret = create_socket(iface); + if (ret) + break; + } + break; + case NETDEV_DOWN: + list_for_each_entry(iface, &iface_list, entry) { + if (!strcmp(iface->name, netdev->name) && + iface->state == IFACE_STATE_CONFIGURED) { + tcp_stop_kthread(iface->ksmbd_kthread); + iface->ksmbd_kthread = NULL; + mutex_lock(&iface->sock_release_lock); + tcp_destroy_socket(iface->ksmbd_socket); + iface->ksmbd_socket = NULL; + mutex_unlock(&iface->sock_release_lock); + + iface->state = IFACE_STATE_DOWN; + break; + } + } + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block ksmbd_netdev_notifier = { + .notifier_call = ksmbd_netdev_event, +}; + +int ksmbd_tcp_init(void) +{ + register_netdevice_notifier(&ksmbd_netdev_notifier); + + return 0; +} + +static void tcp_stop_kthread(struct task_struct *kthread) +{ + int ret; + + if (!kthread) + return; + + ret = kthread_stop(kthread); + if (ret) + pr_err("failed to stop forker thread\n"); +} + +void ksmbd_tcp_destroy(void) +{ + struct interface *iface, *tmp; + + unregister_netdevice_notifier(&ksmbd_netdev_notifier); + + list_for_each_entry_safe(iface, tmp, &iface_list, entry) { + list_del(&iface->entry); + kfree(iface->name); + kfree(iface); + } +} + +static struct interface *alloc_iface(char *ifname) +{ + struct interface *iface; + + if (!ifname) + 
return NULL; + + iface = kzalloc(sizeof(struct interface), GFP_KERNEL); + if (!iface) { + kfree(ifname); + return NULL; + } + + iface->name = ifname; + iface->state = IFACE_STATE_DOWN; + list_add(&iface->entry, &iface_list); + mutex_init(&iface->sock_release_lock); + return iface; +} + +int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz) +{ + int sz = 0; + + if (!ifc_list_sz) { + struct net_device *netdev; + + rtnl_lock(); + for_each_netdev(&init_net, netdev) { + if (netdev->priv_flags & IFF_BRIDGE_PORT) + continue; + if (!alloc_iface(kstrdup(netdev->name, GFP_KERNEL))) + return -ENOMEM; + } + rtnl_unlock(); + bind_additional_ifaces = 1; + return 0; + } + + while (ifc_list_sz > 0) { + if (!alloc_iface(kstrdup(ifc_list, GFP_KERNEL))) + return -ENOMEM; + + sz = strlen(ifc_list); + if (!sz) + break; + + ifc_list += sz + 1; + ifc_list_sz -= (sz + 1); + } + + bind_additional_ifaces = 0; + + return 0; +} + +static struct ksmbd_transport_ops ksmbd_tcp_transport_ops = { + .read = ksmbd_tcp_read, + .writev = ksmbd_tcp_writev, + .disconnect = ksmbd_tcp_disconnect, +}; diff --git a/fs/ksmbd/transport_tcp.h b/fs/ksmbd/transport_tcp.h new file mode 100644 index 000000000000..e338bebe322f --- /dev/null +++ b/fs/ksmbd/transport_tcp.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#ifndef __KSMBD_TRANSPORT_TCP_H__ +#define __KSMBD_TRANSPORT_TCP_H__ + +int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz); +int ksmbd_tcp_init(void); +void ksmbd_tcp_destroy(void); + +#endif /* __KSMBD_TRANSPORT_TCP_H__ */ diff --git a/fs/ksmbd/unicode.c b/fs/ksmbd/unicode.c new file mode 100644 index 000000000000..a0db699ddafd --- /dev/null +++ b/fs/ksmbd/unicode.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Some of the source code in this file came from fs/cifs/cifs_unicode.c + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * Modified by Steve French (sfrench@us.ibm.com) + * Modified by Namjae Jeon (linkinjeon@kernel.org) + */ +#include <linux/fs.h> +#include <linux/slab.h> +#include <asm/unaligned.h> +#include "glob.h" +#include "unicode.h" +#include "uniupr.h" +#include "smb_common.h" + +/* + * smb_utf16_bytes() - how long will a string be after conversion? + * @from: pointer to input string + * @maxbytes: don't go past this many bytes of input string + * @codepage: destination codepage + * + * Walk a utf16le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + * + * Return: string length after conversion + */ +static int smb_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; + + for (i = 0; i < maxwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + + charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); + if (charlen > 0) + outlen += charlen; + else + outlen++; + } + + return outlen; +} + +/* + * cifs_mapchar() - convert a host-endian char to proper char in codepage + * @target: where converted character should be copied + * @src_char: 2 byte host-endian source character + * @cp: codepage to which character should be converted + * @mapchar: should character be mapped according to mapchars mount option? 
+ * + * This function handles the conversion of a single character. It is the + * responsibility of the caller to ensure that the target buffer is large + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). + * + * Return: string length after conversion + */ +static int +cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, + bool mapchar) +{ + int len = 1; + + if (!mapchar) + goto cp_convert; + + /* + * BB: Cannot handle remapping UNI_SLASH until all the calls to + * build_path_from_dentry are modified, as they use slash as + * separator. + */ + switch (src_char) { + case UNI_COLON: + *target = ':'; + break; + case UNI_ASTERISK: + *target = '*'; + break; + case UNI_QUESTION: + *target = '?'; + break; + case UNI_PIPE: + *target = '|'; + break; + case UNI_GRTRTHAN: + *target = '>'; + break; + case UNI_LESSTHAN: + *target = '<'; + break; + default: + goto cp_convert; + } + +out: + return len; + +cp_convert: + len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); + if (len <= 0) { + *target = '?'; + len = 1; + } + + goto out; +} + +/* + * is_char_allowed() - check for valid character + * @ch: input character to be checked + * + * Return: 1 if char is allowed, otherwise 0 + */ +static inline int is_char_allowed(char *ch) +{ + /* check for control chars, wildcards etc. */ + if (!(*ch & 0x80) && + (*ch <= 0x1f || + *ch == '?' || *ch == '"' || *ch == '<' || + *ch == '>' || *ch == '|')) + return 0; + + return 1; +} + +/* + * smb_from_utf16() - convert utf16le string to local charset + * @to: destination buffer + * @from: source buffer + * @tolen: destination buffer size (in bytes) + * @fromlen: source buffer size (in bytes) + * @codepage: codepage to which characters should be converted + * @mapchar: should characters be remapped according to the mapchars option? + * + * Convert a little-endian utf16le string (as sent by the server) to a string + * in the provided codepage. The tolen and fromlen parameters are to ensure + * that the code doesn't walk off of the end of the buffer (which is always + * a danger if the alignment of the source buffer is off). The destination + * string is always properly null terminated and fits in the destination + * buffer. Returns the length of the destination string in bytes (including + * null terminator). + * + * Note that some windows versions actually send multiword UTF-16 characters + * instead of straight UTF16-2. The linux nls routines however aren't able to + * deal with those characters properly. In the event that we get some of + * those characters, they won't be translated properly. + * + * Return: string length after conversion + */ +static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, + const struct nls_table *codepage, bool mapchar) +{ + int i, charlen, safelen; + int outlen = 0; + int nullsize = nls_nullsize(codepage); + int fromwords = fromlen / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp; + + /* + * because the chars can be of varying widths, we need to take care + * not to overflow the destination buffer when we get close to the + * end of it. Until we get to this offset, we don't need to check + * for overflow however. 
+ */ + safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); + + for (i = 0; i < fromwords; i++) { + ftmp = get_unaligned_le16(&from[i]); + if (ftmp == 0) + break; + + /* + * check to see if converting this character might make the + * conversion bleed into the null terminator + */ + if (outlen >= safelen) { + charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar); + if ((outlen + charlen) > (tolen - nullsize)) + break; + } + + /* put converted char into 'to' buffer */ + charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); + outlen += charlen; + } + + /* properly null-terminate string */ + for (i = 0; i < nullsize; i++) + to[outlen++] = 0; + + return outlen; +} + +/* + * smb_strtoUTF16() - Convert character string to unicode string + * @to: destination buffer + * @from: source buffer + * @len: destination buffer size (in bytes) + * @codepage: codepage to which characters should be converted + * + * Return: string length after conversion + */ +int smb_strtoUTF16(__le16 *to, const char *from, int len, + const struct nls_table *codepage) +{ + int charlen; + int i; + wchar_t wchar_to; /* needed to quiet sparse */ + + /* special case for utf8 to handle no plane0 chars */ + if (!strcmp(codepage->charset, "utf8")) { + /* + * convert utf8 -> utf16; we assume there is enough space, as the + * caller should have checked that the conversion does not overflow. + * In the destination, len is the length in wchar_t (16 bit) units. + */ + i = utf8s_to_utf16s(from, len, UTF16_LITTLE_ENDIAN, + (wchar_t *)to, len); + + /* on success, terminate and exit */ + if (i >= 0) + goto success; + /* + * If it fails, fall back to UCS encoding, as this + * function should not return negative values. + * It can currently fail only if the source contains + * invalidly encoded characters. + */ + } + + for (i = 0; len > 0 && *from; i++, from += charlen, len -= charlen) { + charlen = codepage->char2uni(from, len, &wchar_to); + if (charlen < 1) { + /* A question mark */ + wchar_to = 0x003f; + charlen = 1; + } + put_unaligned_le16(wchar_to, &to[i]); + } + +success: + put_unaligned_le16(0, &to[i]); + return i; +} + +/* + * smb_strndup_from_utf16() - copy a string from wire format to the local + * codepage + * @src: source string + * @maxlen: don't walk past this many bytes in the source string + * @is_unicode: is this a unicode string? + * @codepage: destination codepage + * + * Take a string given by the server, convert it to the local codepage and + * put it in a new buffer. Returns a pointer to the new string or an ERR_PTR + * on error. + * + * Return: destination string buffer or error ptr + */ +char *smb_strndup_from_utf16(const char *src, const int maxlen, + const bool is_unicode, + const struct nls_table *codepage) +{ + int len, ret; + char *dst; + + if (is_unicode) { + len = smb_utf16_bytes((__le16 *)src, maxlen, codepage); + len += nls_nullsize(codepage); + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + ret = smb_from_utf16(dst, (__le16 *)src, len, maxlen, codepage, + false); + if (ret < 0) { + kfree(dst); + return ERR_PTR(-EINVAL); + } + } else { + len = strnlen(src, maxlen); + len++; + dst = kmalloc(len, GFP_KERNEL); + if (!dst) + return ERR_PTR(-ENOMEM); + strscpy(dst, src, len); + } + + return dst; +} + +/* + * Convert 16 bit Unicode pathname to wire format from string in current code + * page. Conversion may involve remapping the six characters that are + * only legal in POSIX-like OS (if they are present in the string). 
Path + * names are little endian 16 bit Unicode on the wire + */ +/* + * smbConvertToUTF16() - convert string from local charset to utf16 + * @target: destination buffer + * @source: source buffer + * @srclen: source buffer size (in bytes) + * @cp: codepage to which characters should be converted + * @mapchar: should characters be remapped according to the mapchars option? + * + * Convert 16 bit Unicode pathname to wire format from string in current code + * page. Conversion may involve remapping up the six characters that are + * only legal in POSIX-like OS (if they are present in the string). Path + * names are little endian 16 bit Unicode on the wire + * + * Return: char length after conversion + */ +int smbConvertToUTF16(__le16 *target, const char *source, int srclen, + const struct nls_table *cp, int mapchars) +{ + int i, j, charlen; + char src_char; + __le16 dst_char; + wchar_t tmp; + + if (!mapchars) + return smb_strtoUTF16(target, source, srclen, cp); + + for (i = 0, j = 0; i < srclen; j++) { + src_char = source[i]; + charlen = 1; + switch (src_char) { + case 0: + put_unaligned(0, &target[j]); + return j; + case ':': + dst_char = cpu_to_le16(UNI_COLON); + break; + case '*': + dst_char = cpu_to_le16(UNI_ASTERISK); + break; + case '?': + dst_char = cpu_to_le16(UNI_QUESTION); + break; + case '<': + dst_char = cpu_to_le16(UNI_LESSTHAN); + break; + case '>': + dst_char = cpu_to_le16(UNI_GRTRTHAN); + break; + case '|': + dst_char = cpu_to_le16(UNI_PIPE); + break; + /* + * FIXME: We can not handle remapping backslash (UNI_SLASH) + * until all the calls to build_path_from_dentry are modified, + * as they use backslash as separator. + */ + default: + charlen = cp->char2uni(source + i, srclen - i, &tmp); + dst_char = cpu_to_le16(tmp); + + /* + * if no match, use question mark, which at least in + * some cases serves as wild card + */ + if (charlen < 1) { + dst_char = cpu_to_le16(0x003f); + charlen = 1; + } + } + /* + * character may take more than one byte in the source string, + * but will take exactly two bytes in the target string + */ + i += charlen; + put_unaligned(dst_char, &target[j]); + } + + return j; +} diff --git a/fs/ksmbd/unicode.h b/fs/ksmbd/unicode.h new file mode 100644 index 000000000000..5593024230ae --- /dev/null +++ b/fs/ksmbd/unicode.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Some of the source code in this file came from fs/cifs/cifs_unicode.c + * cifs_unicode: Unicode kernel case support + * + * Function: + * Convert a unicode character to upper or lower case using + * compressed tables. + * + * Copyright (c) International Business Machines Corp., 2000,2009 + * + * + * Notes: + * These APIs are based on the C library functions. The semantics + * should match the C functions but with expanded size operands. + * + * The upper/lower functions are based on a table created by mkupr. + * This is a compressed table of upper and lower case conversion. + * + */ +#ifndef _CIFS_UNICODE_H +#define _CIFS_UNICODE_H + +#include <asm/byteorder.h> +#include <linux/types.h> +#include <linux/nls.h> + +#define UNIUPR_NOLOWER /* Example to not expand lower case tables */ + +/* + * Windows maps these to the user defined 16 bit Unicode range since they are + * reserved symbols (along with \ and /), otherwise illegal to store + * in filenames in NTFS + */ +#define UNI_ASTERISK ((__u16)('*' + 0xF000)) +#define UNI_QUESTION ((__u16)('?' 
+ 0xF000)) +#define UNI_COLON ((__u16)(':' + 0xF000)) +#define UNI_GRTRTHAN ((__u16)('>' + 0xF000)) +#define UNI_LESSTHAN ((__u16)('<' + 0xF000)) +#define UNI_PIPE ((__u16)('|' + 0xF000)) +#define UNI_SLASH ((__u16)('\\' + 0xF000)) + +/* Just define what we want from uniupr.h. We don't want to define the tables + * in each source file. + */ +#ifndef UNICASERANGE_DEFINED +struct UniCaseRange { + wchar_t start; + wchar_t end; + signed char *table; +}; +#endif /* UNICASERANGE_DEFINED */ + +#ifndef UNIUPR_NOUPPER +extern signed char SmbUniUpperTable[512]; +extern const struct UniCaseRange SmbUniUpperRange[]; +#endif /* UNIUPR_NOUPPER */ + +#ifndef UNIUPR_NOLOWER +extern signed char CifsUniLowerTable[512]; +extern const struct UniCaseRange CifsUniLowerRange[]; +#endif /* UNIUPR_NOLOWER */ + +#ifdef __KERNEL__ +int smb_strtoUTF16(__le16 *to, const char *from, int len, + const struct nls_table *codepage); +char *smb_strndup_from_utf16(const char *src, const int maxlen, + const bool is_unicode, + const struct nls_table *codepage); +int smbConvertToUTF16(__le16 *target, const char *source, int srclen, + const struct nls_table *cp, int mapchars); +char *ksmbd_extract_sharename(char *treename); +#endif + +/* + * UniStrcat: Concatenate the second string to the first + * + * Returns: + * Address of the first string + */ +static inline wchar_t *UniStrcat(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save a pointer to start of ucs1 */ + + while (*ucs1++) + /*NULL*/; /* To end of first string */ + ucs1--; /* Return to the null */ + while ((*ucs1++ = *ucs2++)) + /*NULL*/; /* copy string 2 over */ + return anchor; +} + +/* + * UniStrchr: Find a character in a string + * + * Returns: + * Address of first occurrence of character in string + * or NULL if the character is not in the string + */ +static inline wchar_t *UniStrchr(const wchar_t *ucs, wchar_t uc) +{ + while ((*ucs != uc) && *ucs) + ucs++; + + if (*ucs == uc) + return (wchar_t *)ucs; + return NULL; +} + +/* + * UniStrcmp: Compare two strings + * + * Returns: + * < 0: First string is less than second + * = 0: Strings are equal + * > 0: First string is greater than second + */ +static inline int UniStrcmp(const wchar_t *ucs1, const wchar_t *ucs2) +{ + while ((*ucs1 == *ucs2) && *ucs1) { + ucs1++; + ucs2++; + } + return (int)*ucs1 - (int)*ucs2; +} + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t *UniStrcpy(wchar_t *ucs1, const wchar_t *ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)) + /*NULL*/; + return anchor; +} + +/* + * UniStrlen: Return the length of a string (in 16 bit Unicode chars not bytes) + */ +static inline size_t UniStrlen(const wchar_t *ucs1) +{ + int i = 0; + + while (*ucs1++) + i++; + return i; +} + +/* + * UniStrnlen: Return the length (in 16 bit Unicode chars not bytes) of a + * string (length limited) + */ +static inline size_t UniStrnlen(const wchar_t *ucs1, int maxlen) +{ + int i = 0; + + while (*ucs1++) { + i++; + if (i >= maxlen) + break; + } + return i; +} + +/* + * UniStrncat: Concatenate length limited string + */ +static inline wchar_t *UniStrncat(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; /* save pointer to string 1 */ + + while (*ucs1++) + /*NULL*/; + ucs1--; /* point to null terminator of s1 */ + while (n-- && (*ucs1 = *ucs2)) { /* copy s2 after s1 */ + ucs1++; + ucs2++; + } + *ucs1 = 0; /* Null terminate the result */ + return anchor; +} + +/* + * UniStrncmp: Compare length limited string + */ 
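An aside on the UNI_* remapping defined above: the reserved filename characters are parked in the Unicode private-use area by adding 0xF000, which makes the mapping trivially reversible. A small standalone demonstration (illustrative only):

#include <stdio.h>

/* forward map: reserved char -> private-use code point, as UNI_* does */
static unsigned short remap(char c)
{
	return (unsigned short)(c + 0xF000);
}

/* reverse map: private-use code point -> original char */
static char unremap(unsigned short u)
{
	return (char)(u - 0xF000);
}

int main(void)
{
	const char ill[] = "*?:<>|\\"; /* the seven reserved characters */
	int i;

	for (i = 0; ill[i]; i++)
		printf("'%c' <-> 0x%04X\n", ill[i], (unsigned)remap(ill[i]));

	printf("round trip: '%c'\n", unremap(0xF02A)); /* UNI_ASTERISK -> '*' */
	return 0;
}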
+static inline int UniStrncmp(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == *ucs2) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int)*ucs1 - (int)*ucs2; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int +UniStrncmp_le(const wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int)*ucs1 - (int)__le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline wchar_t *UniStrncpy(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncpy_le: Copy length limited string with pad to little-endian + */ +static inline wchar_t *UniStrncpy_le(wchar_t *ucs1, const wchar_t *ucs2, size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrstr: Find a string in a string + * + * Returns: + * Address of first match found + * NULL if no matching string is found + */ +static inline wchar_t *UniStrstr(const wchar_t *ucs1, const wchar_t *ucs2) +{ + const wchar_t *anchor1 = ucs1; + const wchar_t *anchor2 = ucs2; + + while (*ucs1) { + if (*ucs1 == *ucs2) { + /* Partial match found */ + ucs1++; + ucs2++; + } else { + if (!*ucs2) /* Match found */ + return (wchar_t *)anchor1; + ucs1 = ++anchor1; /* No match */ + ucs2 = anchor2; + } + } + + if (!*ucs2) /* Both end together */ + return (wchar_t *)anchor1; /* Match found */ + return NULL; /* No match */ +} + +#ifndef UNIUPR_NOUPPER +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t UniToupper(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(SmbUniUpperTable)) { + /* Latin characters */ + return uc + SmbUniUpperTable[uc]; /* Use base tables */ + } + + rp = SmbUniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + return uc; /* Past last range */ +} + +/* + * UniStrupr: Upper case a unicode string + */ +static inline __le16 *UniStrupr(register __le16 *upin) +{ + register __le16 *up; + + up = upin; + while (*up) { /* For all characters */ + *up = cpu_to_le16(UniToupper(le16_to_cpu(*up))); + up++; + } + return upin; /* Return input pointer */ +} +#endif /* UNIUPR_NOUPPER */ + +#ifndef UNIUPR_NOLOWER +/* + * UniTolower: Convert a unicode character to lower case + */ +static inline wchar_t UniTolower(register wchar_t uc) +{ + register const struct UniCaseRange *rp; + + if (uc < sizeof(CifsUniLowerTable)) { + /* Latin characters */ + return uc + CifsUniLowerTable[uc]; /* Use base tables */ + } + + rp = CifsUniLowerRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Lowercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + return uc; /* Past last range */ +} + +/* + * UniStrlwr: Lower case a unicode string + 
*/ +static inline wchar_t *UniStrlwr(register wchar_t *upin) +{ + register wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniTolower(*up); + up++; + } + return upin; /* Return input pointer */ +} + +#endif + +#endif /* _CIFS_UNICODE_H */ diff --git a/fs/ksmbd/uniupr.h b/fs/ksmbd/uniupr.h new file mode 100644 index 000000000000..26583b776897 --- /dev/null +++ b/fs/ksmbd/uniupr.h @@ -0,0 +1,268 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Some of the source code in this file came from fs/cifs/uniupr.h + * Copyright (c) International Business Machines Corp., 2000,2002 + * + * uniupr.h - Unicode compressed case ranges + * + */ +#ifndef __KSMBD_UNIUPR_H +#define __KSMBD_UNIUPR_H + +#ifndef UNIUPR_NOUPPER +/* + * Latin upper case + */ +signed char SmbUniUpperTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, /* 060-06f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, -32, /* 0e0-0ef */ + -32, -32, -32, -32, -32, -32, -32, 0, -32, -32, + -32, -32, -32, -32, -32, 121, /* 0f0-0ff */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ + 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ + 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ + 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ + -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -79, 0, -1, /* 1d0-1df */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ + 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ +}; + +/* Upper case range - Greek */ +static signed char UniCaseRangeU03a0[47] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -38, -37, -37, -37, /* 3a0-3af */ + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, /* 3b0-3bf */ + -32, -32, -31, -32, -32, -32, -32, -32, -32, -32, -32, -32, -64, + -63, -63, +}; + +/* Upper case range - Cyrillic */ +static signed char UniCaseRangeU0430[48] = { + -32, -32, -32, -32, -32, -32, 
-32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, /* 430-43f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, /* 440-44f */ + 0, -80, -80, -80, -80, -80, -80, -80, -80, -80, -80, + -80, -80, 0, -80, -80, /* 450-45f */ +}; + +/* Upper case range - Extended cyrillic */ +static signed char UniCaseRangeU0490[61] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ + 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, +}; + +/* Upper case range - Extended latin and greek */ +static signed char UniCaseRangeU1e00[509] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ + 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, -59, 0, -1, 0, -1, /* 1e90-1e9f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ + 74, 74, 86, 86, 86, 86, 100, 100, 0, 0, 112, 112, + 126, 126, 0, 0, /* 1f70-1f7f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ + 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ + 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Upper case range - Wide latin */ +static signed char UniCaseRangeUff40[27] = { + 0, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, + -32, -32, -32, -32, -32, /* ff40-ff4f */ + -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, -32, +}; + +/* + * Upper Case Range + */ +const struct UniCaseRange SmbUniUpperRange[] = { + {0x03a0, 0x03ce, UniCaseRangeU03a0}, + {0x0430, 0x045f, UniCaseRangeU0430}, + {0x0490, 0x04cc, UniCaseRangeU0490}, + {0x1e00, 0x1ffc, UniCaseRangeU1e00}, + {0xff40, 0xff5a, UniCaseRangeUff40}, + {0} +}; +#endif + +#ifndef UNIUPR_NOLOWER +/* + * Latin lower case + */ 
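/*
 * Editor's note -- illustrative sketch, not part of the patch. These
 * tables do not store converted characters; each entry is a signed delta
 * that UniToupper()/UniTolower() *add* to the code point ('a' + (-32) =
 * 'A'), with code points past the 512-entry base table handled by the
 * per-block UniCaseRange tables. The miniature below rebuilds just the
 * ASCII slice plus one range entry (modelled on the real 0x100 block,
 * where odd code points upper-case back by one) to show the same
 * two-level lookup. Names here are stand-ins, not the patch's tables.
 */
#include <stdio.h>

struct case_range {
	unsigned short start;
	unsigned short end;
	const signed char *table;	/* deltas indexed by (uc - start) */
};

static const signed char latin_ext_deltas[] = { 0, -1, 0, -1 };

static const struct case_range ranges[] = {
	{ 0x0100, 0x0103, latin_ext_deltas },
	{ 0 }				/* sentinel, like SmbUniUpperRange */
};

static unsigned int to_upper(unsigned int uc)
{
	/* Base table stand-in: only 'a'..'z' carry a -32 delta here. */
	if (uc < 0x80)
		return (uc >= 'a' && uc <= 'z') ? uc - 32 : uc;

	for (const struct case_range *rp = ranges; rp->start; rp++) {
		if (uc < rp->start)
			break;			/* before any further range */
		if (uc <= rp->end)
			return uc + rp->table[uc - rp->start];
	}
	return uc;				/* no mapping: identity */
}

int main(void)
{
	printf("%c -> %c\n", 'q', (char)to_upper('q'));	/* q -> Q */
	printf("U+0101 -> U+%04X\n", to_upper(0x0101));	/* -> U+0100 */
	return 0;
}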
+signed char CifsUniLowerTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, /* 040-04f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 0, 0, + 0, 0, 0, /* 050-05f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 060-06f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, /* 0c0-0cf */ + 32, 32, 32, 32, 32, 32, 32, 0, 32, 32, 32, 32, + 32, 32, 32, 0, /* 0d0-0df */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0e0-0ef */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0f0-0ff */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 100-10f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 110-11f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 120-12f */ + 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, /* 130-13f */ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, /* 140-14f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 150-15f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 160-16f */ + 1, 0, 1, 0, 1, 0, 1, 0, -121, 1, 0, 1, 0, 1, 0, + 0, /* 170-17f */ + 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 79, + 0, /* 180-18f */ + 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, /* 1a0-1af */ + 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 1, 0, 1, /* 1c0-1cf */ + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, /* 1d0-1df */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e0-1ef */ + 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1f0-1ff */ +}; + +/* Lower case range - Greek */ +static signed char UniCaseRangeL0380[44] = { + 0, 0, 0, 0, 0, 0, 38, 0, 37, 37, 37, 0, 64, 0, 63, 63, /* 380-38f */ + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, /* 390-39f */ + 32, 32, 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, +}; + +/* Lower case range - Cyrillic */ +static signed char UniCaseRangeL0400[48] = { + 0, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, + 0, 80, 80, /* 400-40f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, /* 410-41f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, /* 420-42f */ +}; + +/* Lower case range - Extended cyrillic */ +static signed char UniCaseRangeL0490[60] = { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 490-49f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4a0-4af */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 4b0-4bf */ + 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, +}; + +/* Lower case range - Extended latin and greek */ +static signed char UniCaseRangeL1e00[504] = { + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e00-1e0f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e10-1e1f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e20-1e2f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e30-1e3f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e40-1e4f */ + 1, 0, 1, 0, 1, 
0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e50-1e5f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e60-1e6f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e70-1e7f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1e80-1e8f */ + 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, /* 1e90-1e9f */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ea0-1eaf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1eb0-1ebf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ec0-1ecf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ed0-1edf */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, /* 1ee0-1eef */ + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f00-1f0f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f10-1f1f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f20-1f2f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f30-1f3f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, 0, 0, /* 1f40-1f4f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, -8, 0, -8, 0, -8, 0, -8, /* 1f50-1f5f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f60-1f6f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f70-1f7f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f80-1f8f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1f90-1f9f */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -8, -8, -8, -8, -8, -8, /* 1fa0-1faf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -74, -74, -9, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 0, 0, 0, 0, 0, -86, -86, -86, -86, -9, 0, + 0, 0, /* 1fc0-1fcf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -100, -100, 0, 0, 0, 0, /* 1fd0-1fdf */ + 0, 0, 0, 0, 0, 0, 0, 0, -8, -8, -112, -112, -7, 0, + 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Lower case range - Wide latin */ +static signed char UniCaseRangeLff20[27] = { + 0, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, + 32, /* ff20-ff2f */ + 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, +}; + +/* + * Lower Case Range + */ +const struct UniCaseRange CifsUniLowerRange[] = { + {0x0380, 0x03ab, UniCaseRangeL0380}, + {0x0400, 0x042f, UniCaseRangeL0400}, + {0x0490, 0x04cb, UniCaseRangeL0490}, + {0x1e00, 0x1ff7, UniCaseRangeL1e00}, + {0xff20, 0xff3a, UniCaseRangeLff20}, + {0} +}; +#endif + +#endif /* __KSMBD_UNIUPR_H */ diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c new file mode 100644 index 000000000000..aee28ee6b19c --- /dev/null +++ b/fs/ksmbd/vfs.c @@ -0,0 +1,1895 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. 
+ */ + +#include <linux/kernel.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/backing-dev.h> +#include <linux/writeback.h> +#include <linux/xattr.h> +#include <linux/falloc.h> +#include <linux/genhd.h> +#include <linux/fsnotify.h> +#include <linux/dcache.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/sched/xacct.h> +#include <linux/crc32c.h> + +#include "glob.h" +#include "oplock.h" +#include "connection.h" +#include "vfs.h" +#include "vfs_cache.h" +#include "smbacl.h" +#include "ndr.h" +#include "auth.h" +#include "misc.h" + +#include "smb_common.h" +#include "mgmt/share_config.h" +#include "mgmt/tree_connect.h" +#include "mgmt/user_session.h" +#include "mgmt/user_config.h" + +static char *extract_last_component(char *path) +{ + char *p = strrchr(path, '/'); + + if (p && p[1] != '\0') { + *p = '\0'; + p++; + } else { + p = NULL; + pr_err("Invalid path %s\n", path); + } + return p; +} + +static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, + struct inode *parent_inode, + struct inode *inode) +{ + if (!test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_INHERIT_OWNER)) + return; + + i_uid_write(inode, i_uid_read(parent_inode)); +} + +/** + * ksmbd_vfs_lock_parent() - lock parent dentry if it is stable + * + * the parent dentry got by dget_parent or @parent could be + * unstable, we try to lock a parent inode and lookup the + * child dentry again. + * + * the reference count of @parent isn't incremented. + */ +int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) +{ + struct dentry *dentry; + int ret = 0; + + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); + dentry = lookup_one_len(child->d_name.name, parent, + child->d_name.len); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + goto out_err; + } + + if (dentry != child) { + ret = -ESTALE; + dput(dentry); + goto out_err; + } + + dput(dentry); + return 0; +out_err: + inode_unlock(d_inode(parent)); + return ret; +} + +int ksmbd_vfs_may_delete(struct user_namespace *user_ns, + struct dentry *dentry) +{ + struct dentry *parent; + int ret; + + parent = dget_parent(dentry); + ret = ksmbd_vfs_lock_parent(parent, dentry); + if (ret) { + dput(parent); + return ret; + } + + ret = inode_permission(user_ns, d_inode(parent), + MAY_EXEC | MAY_WRITE); + + inode_unlock(d_inode(parent)); + dput(parent); + return ret; +} + +int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, + struct dentry *dentry, __le32 *daccess) +{ + struct dentry *parent; + int ret = 0; + + *daccess = cpu_to_le32(FILE_READ_ATTRIBUTES | READ_CONTROL); + + if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_WRITE)) + *daccess |= cpu_to_le32(WRITE_DAC | WRITE_OWNER | SYNCHRONIZE | + FILE_WRITE_DATA | FILE_APPEND_DATA | + FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES | + FILE_DELETE_CHILD); + + if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_READ)) + *daccess |= FILE_READ_DATA_LE | FILE_READ_EA_LE; + + if (!inode_permission(user_ns, d_inode(dentry), MAY_OPEN | MAY_EXEC)) + *daccess |= FILE_EXECUTE_LE; + + parent = dget_parent(dentry); + ret = ksmbd_vfs_lock_parent(parent, dentry); + if (ret) { + dput(parent); + return ret; + } + + if (!inode_permission(user_ns, d_inode(parent), MAY_EXEC | MAY_WRITE)) + *daccess |= FILE_DELETE_LE; + + inode_unlock(d_inode(parent)); + dput(parent); + return ret; +} + +/** + * ksmbd_vfs_create() - vfs helper for smb create file + * @work: work + * @name: file name + * @mode: file create mode + * + * Return: 0 on success, otherwise 
error + */ +int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) +{ + struct path path; + struct dentry *dentry; + int err; + + dentry = kern_path_create(AT_FDCWD, name, &path, 0); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + if (err != -ENOENT) + pr_err("path create failed for %s, err %d\n", + name, err); + return err; + } + + mode |= S_IFREG; + err = vfs_create(mnt_user_ns(path.mnt), d_inode(path.dentry), + dentry, mode, true); + if (!err) { + ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), + d_inode(dentry)); + } else { + pr_err("File(%s): creation failed (err:%d)\n", name, err); + } + done_path_create(&path, dentry); + return err; +} + +/** + * ksmbd_vfs_mkdir() - vfs helper for smb create directory + * @work: work + * @name: directory name + * @mode: directory create mode + * + * Return: 0 on success, otherwise error + */ +int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) +{ + struct path path; + struct dentry *dentry; + int err; + + dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + if (err != -EEXIST) + ksmbd_debug(VFS, "path create failed for %s, err %d\n", + name, err); + return err; + } + + mode |= S_IFDIR; + err = vfs_mkdir(mnt_user_ns(path.mnt), d_inode(path.dentry), + dentry, mode); + if (err) { + goto out; + } else if (d_unhashed(dentry)) { + struct dentry *d; + + d = lookup_one_len(dentry->d_name.name, dentry->d_parent, + dentry->d_name.len); + if (IS_ERR(d)) { + err = PTR_ERR(d); + goto out; + } + if (unlikely(d_is_negative(d))) { + dput(d); + err = -ENOENT; + goto out; + } + + ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); + dput(d); + } +out: + done_path_create(&path, dentry); + if (err) + pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); + return err; +} + +static ssize_t ksmbd_vfs_getcasexattr(struct user_namespace *user_ns, + struct dentry *dentry, char *attr_name, + int attr_name_len, char **attr_value) +{ + char *name, *xattr_list = NULL; + ssize_t value_len = -ENOENT, xattr_list_len; + + xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + if (xattr_list_len <= 0) + goto out; + + for (name = xattr_list; name - xattr_list < xattr_list_len; + name += strlen(name) + 1) { + ksmbd_debug(VFS, "%s, len %zd\n", name, strlen(name)); + if (strncasecmp(attr_name, name, attr_name_len)) + continue; + + value_len = ksmbd_vfs_getxattr(user_ns, + dentry, + name, + attr_value); + if (value_len < 0) + pr_err("failed to get xattr in file\n"); + break; + } + +out: + kvfree(xattr_list); + return value_len; +} + +static int ksmbd_vfs_stream_read(struct ksmbd_file *fp, char *buf, loff_t *pos, + size_t count) +{ + ssize_t v_len; + char *stream_buf = NULL; + + ksmbd_debug(VFS, "read stream data pos : %llu, count : %zd\n", + *pos, count); + + v_len = ksmbd_vfs_getcasexattr(file_mnt_user_ns(fp->filp), + fp->filp->f_path.dentry, + fp->stream.name, + fp->stream.size, + &stream_buf); + if ((int)v_len <= 0) + return (int)v_len; + + if (v_len <= *pos) { + count = -EINVAL; + goto free_buf; + } + + if (v_len - *pos < count) + count = v_len - *pos; + + memcpy(buf, &stream_buf[*pos], count); + +free_buf: + kvfree(stream_buf); + return count; +} + +/** + * check_lock_range() - vfs helper for smb byte range file locking + * @filp: the file to apply the lock to + * @start: lock start byte offset + * @end: lock end byte offset + * @type: byte range type read/write + * + * Return: 0 on success, otherwise error + */ +static int 
check_lock_range(struct file *filp, loff_t start, loff_t end, + unsigned char type) +{ + struct file_lock *flock; + struct file_lock_context *ctx = file_inode(filp)->i_flctx; + int error = 0; + + if (!ctx || list_empty_careful(&ctx->flc_posix)) + return 0; + + spin_lock(&ctx->flc_lock); + list_for_each_entry(flock, &ctx->flc_posix, fl_list) { + /* check conflict locks */ + if (flock->fl_end >= start && end >= flock->fl_start) { + if (flock->fl_type == F_RDLCK) { + if (type == WRITE) { + pr_err("not allow write by shared lock\n"); + error = 1; + goto out; + } + } else if (flock->fl_type == F_WRLCK) { + /* check owner in lock */ + if (flock->fl_file != filp) { + error = 1; + pr_err("not allow rw access by exclusive lock from other opens\n"); + goto out; + } + } + } + } +out: + spin_unlock(&ctx->flc_lock); + return error; +} + +/** + * ksmbd_vfs_read() - vfs helper for smb file read + * @work: smb work + * @fid: file id of open file + * @count: read byte count + * @pos: file pos + * + * Return: number of read bytes on success, otherwise error + */ +int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, size_t count, + loff_t *pos) +{ + struct file *filp = fp->filp; + ssize_t nbytes = 0; + char *rbuf = work->aux_payload_buf; + struct inode *inode = file_inode(filp); + + if (S_ISDIR(inode->i_mode)) + return -EISDIR; + + if (unlikely(count == 0)) + return 0; + + if (work->conn->connection_type) { + if (!(fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) { + pr_err("no right to read(%pd)\n", + fp->filp->f_path.dentry); + return -EACCES; + } + } + + if (ksmbd_stream_fd(fp)) + return ksmbd_vfs_stream_read(fp, rbuf, pos, count); + + if (!work->tcon->posix_extensions) { + int ret; + + ret = check_lock_range(filp, *pos, *pos + count - 1, READ); + if (ret) { + pr_err("unable to read due to lock\n"); + return -EAGAIN; + } + } + + nbytes = kernel_read(filp, rbuf, count, pos); + if (nbytes < 0) { + pr_err("smb read failed for (%s), err = %zd\n", + fp->filename, nbytes); + return nbytes; + } + + filp->f_pos = *pos; + return nbytes; +} + +static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, + size_t count) +{ + char *stream_buf = NULL, *wbuf; + struct user_namespace *user_ns = file_mnt_user_ns(fp->filp); + size_t size, v_len; + int err = 0; + + ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n", + *pos, count); + + size = *pos + count; + if (size > XATTR_SIZE_MAX) { + size = XATTR_SIZE_MAX; + count = (*pos + count) - XATTR_SIZE_MAX; + } + + v_len = ksmbd_vfs_getcasexattr(user_ns, + fp->filp->f_path.dentry, + fp->stream.name, + fp->stream.size, + &stream_buf); + if ((int)v_len < 0) { + pr_err("not found stream in xattr : %zd\n", v_len); + err = (int)v_len; + goto out; + } + + if (v_len < size) { + wbuf = kvmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!wbuf) { + err = -ENOMEM; + goto out; + } + + if (v_len > 0) + memcpy(wbuf, stream_buf, v_len); + kvfree(stream_buf); + stream_buf = wbuf; + } + + memcpy(&stream_buf[*pos], buf, count); + + err = ksmbd_vfs_setxattr(user_ns, + fp->filp->f_path.dentry, + fp->stream.name, + (void *)stream_buf, + size, + 0); + if (err < 0) + goto out; + + fp->filp->f_pos = *pos; + err = 0; +out: + kvfree(stream_buf); + return err; +} + +/** + * ksmbd_vfs_write() - vfs helper for smb file write + * @work: work + * @fid: file id of open file + * @buf: buf containing data for writing + * @count: read byte count + * @pos: file pos + * @sync: fsync after write + * @written: number of bytes written + * + * Return: 0 on success, 
otherwise error + */ +int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, + char *buf, size_t count, loff_t *pos, bool sync, + ssize_t *written) +{ + struct ksmbd_session *sess = work->sess; + struct file *filp; + loff_t offset = *pos; + int err = 0; + + if (sess->conn->connection_type) { + if (!(fp->daccess & FILE_WRITE_DATA_LE)) { + pr_err("no right to write(%pd)\n", + fp->filp->f_path.dentry); + err = -EACCES; + goto out; + } + } + + filp = fp->filp; + + if (ksmbd_stream_fd(fp)) { + err = ksmbd_vfs_stream_write(fp, buf, pos, count); + if (!err) + *written = count; + goto out; + } + + if (!work->tcon->posix_extensions) { + err = check_lock_range(filp, *pos, *pos + count - 1, WRITE); + if (err) { + pr_err("unable to write due to lock\n"); + err = -EAGAIN; + goto out; + } + } + + /* Do we need to break any of a levelII oplock? */ + smb_break_all_levII_oplock(work, fp, 1); + + err = kernel_write(filp, buf, count, pos); + if (err < 0) { + ksmbd_debug(VFS, "smb write failed, err = %d\n", err); + goto out; + } + + filp->f_pos = *pos; + *written = err; + err = 0; + if (sync) { + err = vfs_fsync_range(filp, offset, offset + *written, 0); + if (err < 0) + pr_err("fsync failed for filename = %pd, err = %d\n", + fp->filp->f_path.dentry, err); + } + +out: + return err; +} + +/** + * ksmbd_vfs_getattr() - vfs helper for smb getattr + * @work: work + * @fid: file id of open file + * @attrs: inode attributes + * + * Return: 0 on success, otherwise error + */ +int ksmbd_vfs_getattr(struct path *path, struct kstat *stat) +{ + int err; + + err = vfs_getattr(path, stat, STATX_BTIME, AT_STATX_SYNC_AS_STAT); + if (err) + pr_err("getattr failed, err %d\n", err); + return err; +} + +/** + * ksmbd_vfs_fsync() - vfs helper for smb fsync + * @work: work + * @fid: file id of open file + * + * Return: 0 on success, otherwise error + */ +int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) +{ + struct ksmbd_file *fp; + int err; + + fp = ksmbd_lookup_fd_slow(work, fid, p_id); + if (!fp) { + pr_err("failed to get filp for fid %llu\n", fid); + return -ENOENT; + } + err = vfs_fsync(fp->filp, 0); + if (err < 0) + pr_err("smb fsync failed, err = %d\n", err); + ksmbd_fd_put(work, fp); + return err; +} + +/** + * ksmbd_vfs_remove_file() - vfs helper for smb rmdir or unlink + * @name: absolute directory or file name + * + * Return: 0 on success, otherwise error + */ +int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) +{ + struct path path; + struct dentry *parent; + int err; + int flags = 0; + + if (ksmbd_override_fsids(work)) + return -ENOMEM; + + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) + flags = LOOKUP_FOLLOW; + + err = kern_path(name, flags, &path); + if (err) { + ksmbd_debug(VFS, "can't get %s, err %d\n", name, err); + ksmbd_revert_fsids(work); + return err; + } + + parent = dget_parent(path.dentry); + err = ksmbd_vfs_lock_parent(parent, path.dentry); + if (err) { + dput(parent); + path_put(&path); + ksmbd_revert_fsids(work); + return err; + } + + if (!d_inode(path.dentry)->i_nlink) { + err = -ENOENT; + goto out_err; + } + + if (S_ISDIR(d_inode(path.dentry)->i_mode)) { + err = vfs_rmdir(mnt_user_ns(path.mnt), d_inode(parent), + path.dentry); + if (err && err != -ENOTEMPTY) + ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name, + err); + } else { + err = vfs_unlink(mnt_user_ns(path.mnt), d_inode(parent), + path.dentry, NULL); + if (err) + ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name, + err); + } + +out_err: + 
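/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * check_lock_range() (earlier in this file) walks the inode's POSIX lock
 * list and fails a read or write that overlaps a conflicting byte range.
 * The closest userspace analog is fcntl(F_GETLK): ask whether a
 * hypothetical lock over [start, start+len) would conflict, without
 * taking it. The file path below is made up for the demo.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int range_is_blocked(int fd, short type, off_t start, off_t len)
{
	struct flock probe = {
		.l_type   = type,	/* F_RDLCK or F_WRLCK we would take */
		.l_whence = SEEK_SET,
		.l_start  = start,
		.l_len    = len,
	};

	if (fcntl(fd, F_GETLK, &probe) < 0)
		return -1;
	/* F_GETLK rewrites l_type to F_UNLCK when nothing conflicts. */
	return probe.l_type != F_UNLCK;
}

int main(void)
{
	int fd = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	printf("write [0,4096) blocked: %d\n",
	       range_is_blocked(fd, F_WRLCK, 0, 4096));
	close(fd);
	return 0;
}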
inode_unlock(d_inode(parent)); + dput(parent); + path_put(&path); + ksmbd_revert_fsids(work); + return err; +} + +/** + * ksmbd_vfs_link() - vfs helper for creating smb hardlink + * @oldname: source file name + * @newname: hardlink name + * + * Return: 0 on success, otherwise error + */ +int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, + const char *newname) +{ + struct path oldpath, newpath; + struct dentry *dentry; + int err; + int flags = 0; + + if (ksmbd_override_fsids(work)) + return -ENOMEM; + + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) + flags = LOOKUP_FOLLOW; + + err = kern_path(oldname, flags, &oldpath); + if (err) { + pr_err("cannot get linux path for %s, err = %d\n", + oldname, err); + goto out1; + } + + dentry = kern_path_create(AT_FDCWD, newname, &newpath, + flags | LOOKUP_REVAL); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + pr_err("path create err for %s, err %d\n", newname, err); + goto out2; + } + + err = -EXDEV; + if (oldpath.mnt != newpath.mnt) { + pr_err("vfs_link failed err %d\n", err); + goto out3; + } + + err = vfs_link(oldpath.dentry, mnt_user_ns(newpath.mnt), + d_inode(newpath.dentry), + dentry, NULL); + if (err) + ksmbd_debug(VFS, "vfs_link failed err %d\n", err); + +out3: + done_path_create(&newpath, dentry); +out2: + path_put(&oldpath); +out1: + ksmbd_revert_fsids(work); + return err; +} + +static int ksmbd_validate_entry_in_use(struct dentry *src_dent) +{ + struct dentry *dst_dent; + + spin_lock(&src_dent->d_lock); + list_for_each_entry(dst_dent, &src_dent->d_subdirs, d_child) { + struct ksmbd_file *child_fp; + + if (d_really_is_negative(dst_dent)) + continue; + + child_fp = ksmbd_lookup_fd_inode(d_inode(dst_dent)); + if (child_fp) { + spin_unlock(&src_dent->d_lock); + ksmbd_debug(VFS, "Forbid rename, sub file/dir is in use\n"); + return -EACCES; + } + } + spin_unlock(&src_dent->d_lock); + + return 0; +} + +static int __ksmbd_vfs_rename(struct ksmbd_work *work, + struct user_namespace *src_user_ns, + struct dentry *src_dent_parent, + struct dentry *src_dent, + struct user_namespace *dst_user_ns, + struct dentry *dst_dent_parent, + struct dentry *trap_dent, + char *dst_name) +{ + struct dentry *dst_dent; + int err; + + if (!work->tcon->posix_extensions) { + err = ksmbd_validate_entry_in_use(src_dent); + if (err) + return err; + } + + if (d_really_is_negative(src_dent_parent)) + return -ENOENT; + if (d_really_is_negative(dst_dent_parent)) + return -ENOENT; + if (d_really_is_negative(src_dent)) + return -ENOENT; + if (src_dent == trap_dent) + return -EINVAL; + + if (ksmbd_override_fsids(work)) + return -ENOMEM; + + dst_dent = lookup_one_len(dst_name, dst_dent_parent, strlen(dst_name)); + err = PTR_ERR(dst_dent); + if (IS_ERR(dst_dent)) { + pr_err("lookup failed %s [%d]\n", dst_name, err); + goto out; + } + + err = -ENOTEMPTY; + if (dst_dent != trap_dent && !d_really_is_positive(dst_dent)) { + struct renamedata rd = { + .old_mnt_userns = src_user_ns, + .old_dir = d_inode(src_dent_parent), + .old_dentry = src_dent, + .new_mnt_userns = dst_user_ns, + .new_dir = d_inode(dst_dent_parent), + .new_dentry = dst_dent, + }; + err = vfs_rename(&rd); + } + if (err) + pr_err("vfs_rename failed err %d\n", err); + if (dst_dent) + dput(dst_dent); +out: + ksmbd_revert_fsids(work); + return err; +} + +int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, + char *newname) +{ + struct path dst_path; + struct dentry *src_dent_parent, *dst_dent_parent; + struct dentry *src_dent, *trap_dent, *src_child; 
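/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * __ksmbd_vfs_rename() above refuses to clobber an existing destination:
 * the target is looked up under lock_rename() and vfs_rename() runs only
 * when the destination dentry is negative (and is not the rename trap).
 * Userspace gets the same fail-instead-of-replace behaviour from
 * renameat2() with RENAME_NOREPLACE (the glibc wrapper exists from glibc
 * 2.28 on); both paths below are made up for the demo.
 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>	/* AT_FDCWD */
#include <stdio.h>	/* renameat2() and RENAME_* with _GNU_SOURCE */
#include <string.h>

int main(void)
{
	if (renameat2(AT_FDCWD, "/tmp/src.txt",
		      AT_FDCWD, "/tmp/dst.txt", RENAME_NOREPLACE) < 0)
		fprintf(stderr, "rename refused: %s\n", strerror(errno));
	else
		puts("renamed without replacing anything");
	return 0;
}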
+ char *dst_name; + int err; + int flags; + + dst_name = extract_last_component(newname); + if (!dst_name) + return -EINVAL; + + src_dent_parent = dget_parent(fp->filp->f_path.dentry); + src_dent = fp->filp->f_path.dentry; + + flags = LOOKUP_DIRECTORY; + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) + flags |= LOOKUP_FOLLOW; + + err = kern_path(newname, flags, &dst_path); + if (err) { + ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err); + goto out; + } + dst_dent_parent = dst_path.dentry; + + trap_dent = lock_rename(src_dent_parent, dst_dent_parent); + dget(src_dent); + dget(dst_dent_parent); + src_child = lookup_one_len(src_dent->d_name.name, src_dent_parent, + src_dent->d_name.len); + if (IS_ERR(src_child)) { + err = PTR_ERR(src_child); + goto out_lock; + } + + if (src_child != src_dent) { + err = -ESTALE; + dput(src_child); + goto out_lock; + } + dput(src_child); + + err = __ksmbd_vfs_rename(work, + file_mnt_user_ns(fp->filp), + src_dent_parent, + src_dent, + mnt_user_ns(dst_path.mnt), + dst_dent_parent, + trap_dent, + dst_name); +out_lock: + dput(src_dent); + dput(dst_dent_parent); + unlock_rename(src_dent_parent, dst_dent_parent); + path_put(&dst_path); +out: + dput(src_dent_parent); + return err; +} + +/** + * ksmbd_vfs_truncate() - vfs helper for smb file truncate + * @work: work + * @name: old filename + * @fid: file id of old file + * @size: truncate to given size + * + * Return: 0 on success, otherwise error + */ +int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name, + struct ksmbd_file *fp, loff_t size) +{ + struct path path; + int err = 0; + + if (name) { + err = kern_path(name, 0, &path); + if (err) { + pr_err("cannot get linux path for %s, err %d\n", + name, err); + return err; + } + err = vfs_truncate(&path, size); + if (err) + pr_err("truncate failed for %s err %d\n", + name, err); + path_put(&path); + } else { + struct file *filp; + + filp = fp->filp; + + /* Do we need to break any of a levelII oplock? 
*/ + smb_break_all_levII_oplock(work, fp, 1); + + if (!work->tcon->posix_extensions) { + struct inode *inode = file_inode(filp); + + if (size < inode->i_size) { + err = check_lock_range(filp, size, + inode->i_size - 1, WRITE); + } else { + err = check_lock_range(filp, inode->i_size, + size - 1, WRITE); + } + + if (err) { + pr_err("failed due to lock\n"); + return -EAGAIN; + } + } + + err = vfs_truncate(&filp->f_path, size); + if (err) + pr_err("truncate failed for filename : %s err %d\n", + fp->filename, err); + } + + return err; +} + +/** + * ksmbd_vfs_listxattr() - vfs helper for smb list extended attributes + * @dentry: dentry of file for listing xattrs + * @list: destination buffer + * @size: destination buffer length + * + * Return: xattr list length on success, otherwise error + */ +ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list) +{ + ssize_t size; + char *vlist = NULL; + + size = vfs_listxattr(dentry, NULL, 0); + if (size <= 0) + return size; + + vlist = kvmalloc(size, GFP_KERNEL | __GFP_ZERO); + if (!vlist) + return -ENOMEM; + + *list = vlist; + size = vfs_listxattr(dentry, vlist, size); + if (size < 0) { + ksmbd_debug(VFS, "listxattr failed\n"); + kvfree(vlist); + *list = NULL; + } + + return size; +} + +static ssize_t ksmbd_vfs_xattr_len(struct user_namespace *user_ns, + struct dentry *dentry, char *xattr_name) +{ + return vfs_getxattr(user_ns, dentry, xattr_name, NULL, 0); +} + +/** + * ksmbd_vfs_getxattr() - vfs helper for smb get extended attributes value + * @user_ns: user namespace + * @dentry: dentry of file for getting xattrs + * @xattr_name: name of xattr name to query + * @xattr_buf: destination buffer xattr value + * + * Return: read xattr value length on success, otherwise error + */ +ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, + struct dentry *dentry, + char *xattr_name, char **xattr_buf) +{ + ssize_t xattr_len; + char *buf; + + *xattr_buf = NULL; + xattr_len = ksmbd_vfs_xattr_len(user_ns, dentry, xattr_name); + if (xattr_len < 0) + return xattr_len; + + buf = kmalloc(xattr_len + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + xattr_len = vfs_getxattr(user_ns, dentry, xattr_name, + (void *)buf, xattr_len); + if (xattr_len > 0) + *xattr_buf = buf; + else + kfree(buf); + return xattr_len; +} + +/** + * ksmbd_vfs_setxattr() - vfs helper for smb set extended attributes value + * @user_ns: user namespace + * @dentry: dentry to set XATTR at + * @name: xattr name for setxattr + * @value: xattr value to set + * @size: size of xattr value + * @flags: destination buffer length + * + * Return: 0 on success, otherwise error + */ +int ksmbd_vfs_setxattr(struct user_namespace *user_ns, + struct dentry *dentry, const char *attr_name, + const void *attr_value, size_t attr_size, int flags) +{ + int err; + + err = vfs_setxattr(user_ns, + dentry, + attr_name, + attr_value, + attr_size, + flags); + if (err) + ksmbd_debug(VFS, "setxattr failed, err %d\n", err); + return err; +} + +/** + * ksmbd_vfs_set_fadvise() - convert smb IO caching options to linux options + * @filp: file pointer for IO + * @options: smb IO options + */ +void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option) +{ + struct address_space *mapping; + + mapping = filp->f_mapping; + + if (!option || !mapping) + return; + + if (option & FILE_WRITE_THROUGH_LE) { + filp->f_flags |= O_SYNC; + } else if (option & FILE_SEQUENTIAL_ONLY_LE) { + filp->f_ra.ra_pages = inode_to_bdi(mapping->host)->ra_pages * 2; + spin_lock(&filp->f_lock); + filp->f_mode &= ~FMODE_RANDOM; + 
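/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * ksmbd_vfs_set_fadvise() here maps SMB2 create options onto Linux I/O
 * behaviour: write-through sets O_SYNC, "sequential only" doubles the
 * readahead window and clears FMODE_RANDOM, "random access" sets
 * FMODE_RANDOM. Userspace expresses the same hints with posix_fadvise();
 * the path below is a placeholder.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/stream.dat", O_RDONLY);
	int err;

	if (fd < 0)
		return 1;
	/* Sequential hint: the kernel roughly doubles readahead. */
	err = posix_fadvise(fd, 0, 0, POSIX_FADV_SEQUENTIAL);
	if (err)
		fprintf(stderr, "fadvise: %s\n", strerror(err));
	/* A random-access hint would instead disable readahead: */
	/* posix_fadvise(fd, 0, 0, POSIX_FADV_RANDOM); */
	close(fd);
	return 0;
}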
spin_unlock(&filp->f_lock); + } else if (option & FILE_RANDOM_ACCESS_LE) { + spin_lock(&filp->f_lock); + filp->f_mode |= FMODE_RANDOM; + spin_unlock(&filp->f_lock); + } +} + +int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, + loff_t off, loff_t len) +{ + smb_break_all_levII_oplock(work, fp, 1); + if (fp->f_ci->m_fattr & ATTR_SPARSE_FILE_LE) + return vfs_fallocate(fp->filp, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + off, len); + + return vfs_fallocate(fp->filp, FALLOC_FL_ZERO_RANGE, off, len); +} + +int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, + struct file_allocated_range_buffer *ranges, + int in_count, int *out_count) +{ + struct file *f = fp->filp; + struct inode *inode = file_inode(fp->filp); + loff_t maxbytes = (u64)inode->i_sb->s_maxbytes, end; + loff_t extent_start, extent_end; + int ret = 0; + + if (start > maxbytes) + return -EFBIG; + + if (!in_count) + return 0; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if (length > maxbytes || (maxbytes - length) < start) + length = maxbytes - start; + + if (start + length > inode->i_size) + length = inode->i_size - start; + + *out_count = 0; + end = start + length; + while (start < end && *out_count < in_count) { + extent_start = f->f_op->llseek(f, start, SEEK_DATA); + if (extent_start < 0) { + if (extent_start != -ENXIO) + ret = (int)extent_start; + break; + } + + if (extent_start >= end) + break; + + extent_end = f->f_op->llseek(f, extent_start, SEEK_HOLE); + if (extent_end < 0) { + if (extent_end != -ENXIO) + ret = (int)extent_end; + break; + } else if (extent_start >= extent_end) { + break; + } + + ranges[*out_count].file_offset = cpu_to_le64(extent_start); + ranges[(*out_count)++].length = + cpu_to_le64(min(extent_end, end) - extent_start); + + start = extent_end; + } + + return ret; +} + +int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, + struct dentry *dentry, char *attr_name) +{ + return vfs_removexattr(user_ns, dentry, attr_name); +} + +int ksmbd_vfs_unlink(struct user_namespace *user_ns, + struct dentry *dir, struct dentry *dentry) +{ + int err = 0; + + err = ksmbd_vfs_lock_parent(dir, dentry); + if (err) + return err; + dget(dentry); + + if (S_ISDIR(d_inode(dentry)->i_mode)) + err = vfs_rmdir(user_ns, d_inode(dir), dentry); + else + err = vfs_unlink(user_ns, d_inode(dir), dentry, NULL); + + dput(dentry); + inode_unlock(d_inode(dir)); + if (err) + ksmbd_debug(VFS, "failed to delete, err %d\n", err); + + return err; +} + +static int __dir_empty(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ksmbd_readdir_data *buf; + + buf = container_of(ctx, struct ksmbd_readdir_data, ctx); + buf->dirent_count++; + + if (buf->dirent_count > 2) + return -ENOTEMPTY; + return 0; +} + +/** + * ksmbd_vfs_empty_dir() - check for empty directory + * @fp: ksmbd file pointer + * + * Return: true if directory empty, otherwise false + */ +int ksmbd_vfs_empty_dir(struct ksmbd_file *fp) +{ + int err; + struct ksmbd_readdir_data readdir_data; + + memset(&readdir_data, 0, sizeof(struct ksmbd_readdir_data)); + + set_ctx_actor(&readdir_data.ctx, __dir_empty); + readdir_data.dirent_count = 0; + + err = iterate_dir(fp->filp, &readdir_data.ctx); + if (readdir_data.dirent_count > 2) + err = -ENOTEMPTY; + else + err = 0; + return err; +} + +static int __caseless_lookup(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ksmbd_readdir_data *buf; 
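/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * ksmbd_vfs_empty_dir() above feeds a counting callback to iterate_dir()
 * and treats anything beyond the two "." and ".." entries as not empty.
 * The same test in userspace with opendir()/readdir():
 */
#include <dirent.h>
#include <stdio.h>

static int dir_is_empty(const char *path)
{
	DIR *d = opendir(path);
	int count = 0;

	if (!d)
		return -1;
	while (readdir(d) && count <= 2)
		count++;		/* "." and ".." always show up */
	closedir(d);
	return count <= 2;		/* anything more means not empty */
}

int main(void)
{
	printf("/tmp empty? %d\n", dir_is_empty("/tmp"));
	return 0;
}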
+ + buf = container_of(ctx, struct ksmbd_readdir_data, ctx); + + if (buf->used != namlen) + return 0; + if (!strncasecmp((char *)buf->private, name, namlen)) { + memcpy((char *)buf->private, name, namlen); + buf->dirent_count = 1; + return -EEXIST; + } + return 0; +} + +/** + * ksmbd_vfs_lookup_in_dir() - lookup a file in a directory + * @dir: path info + * @name: filename to lookup + * @namelen: filename length + * + * Return: 0 on success, otherwise error + */ +static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen) +{ + int ret; + struct file *dfilp; + int flags = O_RDONLY | O_LARGEFILE; + struct ksmbd_readdir_data readdir_data = { + .ctx.actor = __caseless_lookup, + .private = name, + .used = namelen, + .dirent_count = 0, + }; + + dfilp = dentry_open(dir, flags, current_cred()); + if (IS_ERR(dfilp)) + return PTR_ERR(dfilp); + + ret = iterate_dir(dfilp, &readdir_data.ctx); + if (readdir_data.dirent_count > 0) + ret = 0; + fput(dfilp); + return ret; +} + +/** + * ksmbd_vfs_kern_path() - lookup a file and get path info + * @name: name of file for lookup + * @flags: lookup flags + * @path: if lookup succeed, return path info + * @caseless: caseless filename lookup + * + * Return: 0 on success, otherwise error + */ +int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, + bool caseless) +{ + int err; + + if (name[0] != '/') + return -EINVAL; + + err = kern_path(name, flags, path); + if (!err) + return 0; + + if (caseless) { + char *filepath; + struct path parent; + size_t path_len, remain_len; + + filepath = kstrdup(name, GFP_KERNEL); + if (!filepath) + return -ENOMEM; + + path_len = strlen(filepath); + remain_len = path_len - 1; + + err = kern_path("/", flags, &parent); + if (err) + goto out; + + while (d_can_lookup(parent.dentry)) { + char *filename = filepath + path_len - remain_len; + char *next = strchrnul(filename, '/'); + size_t filename_len = next - filename; + bool is_last = !next[0]; + + if (filename_len == 0) + break; + + err = ksmbd_vfs_lookup_in_dir(&parent, filename, + filename_len); + if (err) { + path_put(&parent); + goto out; + } + + path_put(&parent); + next[0] = '\0'; + + err = kern_path(filepath, flags, &parent); + if (err) + goto out; + + if (is_last) { + path->mnt = parent.mnt; + path->dentry = parent.dentry; + goto out; + } + + next[0] = '/'; + remain_len -= filename_len + 1; + } + + path_put(&parent); + err = -EINVAL; +out: + kfree(filepath); + } + return err; +} + +int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, + struct dentry *dentry) +{ + char *name, *xattr_list = NULL; + ssize_t xattr_list_len; + int err = 0; + + xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + if (xattr_list_len < 0) { + goto out; + } else if (!xattr_list_len) { + ksmbd_debug(SMB, "empty xattr in the file\n"); + goto out; + } + + for (name = xattr_list; name - xattr_list < xattr_list_len; + name += strlen(name) + 1) { + ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); + + if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, + sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1) || + !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, + sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1)) { + err = ksmbd_vfs_remove_xattr(user_ns, dentry, name); + if (err) + ksmbd_debug(SMB, + "remove acl xattr failed : %s\n", name); + } + } +out: + kvfree(xattr_list); + return err; +} + +int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, + struct dentry *dentry) +{ + char *name, *xattr_list = NULL; + ssize_t xattr_list_len; + int err = 0; + + 
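/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * remove_acl_xattrs() above and remove_sd_xattrs() here share one idiom:
 * listxattr returns a single buffer of NUL-terminated names packed back
 * to back, so `name += strlen(name) + 1` steps through it. The userspace
 * twin of that walk, on a placeholder path:
 */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
	char list[4096];
	ssize_t len = llistxattr("/tmp", list, sizeof(list));

	if (len < 0) {
		perror("llistxattr");
		return 1;
	}
	for (char *name = list; name - list < len; name += strlen(name) + 1)
		printf("xattr: %s\n", name);
	return 0;
}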
xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + if (xattr_list_len < 0) { + goto out; + } else if (!xattr_list_len) { + ksmbd_debug(SMB, "empty xattr in the file\n"); + goto out; + } + + for (name = xattr_list; name - xattr_list < xattr_list_len; + name += strlen(name) + 1) { + ksmbd_debug(SMB, "%s, len %zd\n", name, strlen(name)); + + if (!strncmp(name, XATTR_NAME_SD, XATTR_NAME_SD_LEN)) { + err = ksmbd_vfs_remove_xattr(user_ns, dentry, name); + if (err) + ksmbd_debug(SMB, "remove xattr failed : %s\n", name); + } + } +out: + kvfree(xattr_list); + return err; +} + +static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespace *user_ns, + struct inode *inode, + int acl_type) +{ + struct xattr_smb_acl *smb_acl = NULL; + struct posix_acl *posix_acls; + struct posix_acl_entry *pa_entry; + struct xattr_acl_entry *xa_entry; + int i; + + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) + return NULL; + + posix_acls = get_acl(inode, acl_type); + if (!posix_acls) + return NULL; + + smb_acl = kzalloc(sizeof(struct xattr_smb_acl) + + sizeof(struct xattr_acl_entry) * posix_acls->a_count, + GFP_KERNEL); + if (!smb_acl) + goto out; + + smb_acl->count = posix_acls->a_count; + pa_entry = posix_acls->a_entries; + xa_entry = smb_acl->entries; + for (i = 0; i < posix_acls->a_count; i++, pa_entry++, xa_entry++) { + switch (pa_entry->e_tag) { + case ACL_USER: + xa_entry->type = SMB_ACL_USER; + xa_entry->uid = from_kuid(user_ns, pa_entry->e_uid); + break; + case ACL_USER_OBJ: + xa_entry->type = SMB_ACL_USER_OBJ; + break; + case ACL_GROUP: + xa_entry->type = SMB_ACL_GROUP; + xa_entry->gid = from_kgid(user_ns, pa_entry->e_gid); + break; + case ACL_GROUP_OBJ: + xa_entry->type = SMB_ACL_GROUP_OBJ; + break; + case ACL_OTHER: + xa_entry->type = SMB_ACL_OTHER; + break; + case ACL_MASK: + xa_entry->type = SMB_ACL_MASK; + break; + default: + pr_err("unknown type : 0x%x\n", pa_entry->e_tag); + goto out; + } + + if (pa_entry->e_perm & ACL_READ) + xa_entry->perm |= SMB_ACL_READ; + if (pa_entry->e_perm & ACL_WRITE) + xa_entry->perm |= SMB_ACL_WRITE; + if (pa_entry->e_perm & ACL_EXECUTE) + xa_entry->perm |= SMB_ACL_EXECUTE; + } +out: + posix_acl_release(posix_acls); + return smb_acl; +} + +int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, + struct user_namespace *user_ns, + struct dentry *dentry, + struct smb_ntsd *pntsd, int len) +{ + int rc; + struct ndr sd_ndr = {0}, acl_ndr = {0}; + struct xattr_ntacl acl = {0}; + struct xattr_smb_acl *smb_acl, *def_smb_acl = NULL; + struct inode *inode = d_inode(dentry); + + acl.version = 4; + acl.hash_type = XATTR_SD_HASH_TYPE_SHA256; + acl.current_time = ksmbd_UnixTimeToNT(current_time(inode)); + + memcpy(acl.desc, "posix_acl", 9); + acl.desc_len = 10; + + pntsd->osidoffset = + cpu_to_le32(le32_to_cpu(pntsd->osidoffset) + NDR_NTSD_OFFSETOF); + pntsd->gsidoffset = + cpu_to_le32(le32_to_cpu(pntsd->gsidoffset) + NDR_NTSD_OFFSETOF); + pntsd->dacloffset = + cpu_to_le32(le32_to_cpu(pntsd->dacloffset) + NDR_NTSD_OFFSETOF); + + acl.sd_buf = (char *)pntsd; + acl.sd_size = len; + + rc = ksmbd_gen_sd_hash(conn, acl.sd_buf, acl.sd_size, acl.hash); + if (rc) { + pr_err("failed to generate hash for ndr acl\n"); + return rc; + } + + smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + ACL_TYPE_ACCESS); + if (S_ISDIR(inode->i_mode)) + def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + ACL_TYPE_DEFAULT); + + rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode, + smb_acl, def_smb_acl); + if (rc) { + pr_err("failed to encode ndr to posix acl\n"); + goto out; + } 
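/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * The security-descriptor xattr is stored together with a hash of the
 * NDR-encoded POSIX ACL; ksmbd_vfs_get_sd_xattr() (below) re-encodes the
 * current ACL, re-hashes it and memcmp()s against the stored value, so a
 * descriptor that predates a local chmod/setfacl is discarded. The toy
 * below keeps that shape but substitutes 32-bit FNV-1a for the patch's
 * SHA-256 helper (ksmbd_gen_sd_hash).
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t fnv1a(const void *buf, size_t len)
{
	const unsigned char *p = buf;
	uint32_t h = 2166136261u;

	while (len--) {
		h ^= *p++;
		h *= 16777619u;
	}
	return h;
}

int main(void)
{
	const char acl_blob[] = "user::rw-,group::r--,other::r--";
	uint32_t stored = fnv1a(acl_blob, sizeof(acl_blob));	/* at set time */

	/* ... later, at get time, over the freshly re-encoded ACL ... */
	uint32_t now = fnv1a(acl_blob, sizeof(acl_blob));

	if (memcmp(&stored, &now, sizeof(now)))
		puts("hash value diff: stale SD, ignore it");
	else
		puts("SD still matches the ACL");
	return 0;
}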
+ + rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset, + acl.posix_acl_hash); + if (rc) { + pr_err("failed to generate hash for ndr acl\n"); + goto out; + } + + rc = ndr_encode_v4_ntacl(&sd_ndr, &acl); + if (rc) { + pr_err("failed to encode ndr to posix acl\n"); + goto out; + } + + rc = ksmbd_vfs_setxattr(user_ns, dentry, + XATTR_NAME_SD, sd_ndr.data, + sd_ndr.offset, 0); + if (rc < 0) + pr_err("Failed to store XATTR ntacl :%d\n", rc); + + kfree(sd_ndr.data); +out: + kfree(acl_ndr.data); + kfree(smb_acl); + kfree(def_smb_acl); + return rc; +} + +int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, + struct user_namespace *user_ns, + struct dentry *dentry, + struct smb_ntsd **pntsd) +{ + int rc; + struct ndr n; + struct inode *inode = d_inode(dentry); + struct ndr acl_ndr = {0}; + struct xattr_ntacl acl; + struct xattr_smb_acl *smb_acl = NULL, *def_smb_acl = NULL; + __u8 cmp_hash[XATTR_SD_HASH_SIZE] = {0}; + + rc = ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_SD, &n.data); + if (rc <= 0) + return rc; + + n.length = rc; + rc = ndr_decode_v4_ntacl(&n, &acl); + if (rc) + goto free_n_data; + + smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + ACL_TYPE_ACCESS); + if (S_ISDIR(inode->i_mode)) + def_smb_acl = ksmbd_vfs_make_xattr_posix_acl(user_ns, inode, + ACL_TYPE_DEFAULT); + + rc = ndr_encode_posix_acl(&acl_ndr, user_ns, inode, smb_acl, + def_smb_acl); + if (rc) { + pr_err("failed to encode ndr to posix acl\n"); + goto out_free; + } + + rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset, cmp_hash); + if (rc) { + pr_err("failed to generate hash for ndr acl\n"); + goto out_free; + } + + if (memcmp(cmp_hash, acl.posix_acl_hash, XATTR_SD_HASH_SIZE)) { + pr_err("hash value diff\n"); + rc = -EINVAL; + goto out_free; + } + + *pntsd = acl.sd_buf; + (*pntsd)->osidoffset = cpu_to_le32(le32_to_cpu((*pntsd)->osidoffset) - + NDR_NTSD_OFFSETOF); + (*pntsd)->gsidoffset = cpu_to_le32(le32_to_cpu((*pntsd)->gsidoffset) - + NDR_NTSD_OFFSETOF); + (*pntsd)->dacloffset = cpu_to_le32(le32_to_cpu((*pntsd)->dacloffset) - + NDR_NTSD_OFFSETOF); + + rc = acl.sd_size; +out_free: + kfree(acl_ndr.data); + kfree(smb_acl); + kfree(def_smb_acl); + if (rc < 0) { + kfree(acl.sd_buf); + *pntsd = NULL; + } + +free_n_data: + kfree(n.data); + return rc; +} + +int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, + struct dentry *dentry, + struct xattr_dos_attrib *da) +{ + struct ndr n; + int err; + + err = ndr_encode_dos_attr(&n, da); + if (err) + return err; + + err = ksmbd_vfs_setxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE, + (void *)n.data, n.offset, 0); + if (err) + ksmbd_debug(SMB, "failed to store dos attribute in xattr\n"); + kfree(n.data); + + return err; +} + +int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, + struct dentry *dentry, + struct xattr_dos_attrib *da) +{ + struct ndr n; + int err; + + err = ksmbd_vfs_getxattr(user_ns, dentry, XATTR_NAME_DOS_ATTRIBUTE, + (char **)&n.data); + if (err > 0) { + n.length = err; + if (ndr_decode_dos_attr(&n, da)) + err = -EINVAL; + kfree(n.data); + } else { + ksmbd_debug(SMB, "failed to load dos attribute in xattr\n"); + } + + return err; +} + +/** + * ksmbd_vfs_init_kstat() - convert unix stat information to smb stat format + * @p: destination buffer + * @ksmbd_kstat: ksmbd kstat wrapper + */ +void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat) +{ + struct file_directory_info *info = (struct file_directory_info *)(*p); + struct kstat *kstat = ksmbd_kstat->kstat; + u64 time; + + info->FileIndex = 0; + 
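/*
 * Editor's note -- illustrative sketch, not part of the patch. The
 * ksmbd_UnixTimeToNT() calls in this function produce Windows FILETIME
 * values: 100-nanosecond ticks since 1601-01-01, i.e. the Unix epoch
 * shifted by 11644473600 seconds and scaled by 10^7. A minimal userspace
 * version of the conversion (constant named locally for the demo):
 */
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NT_EPOCH_OFFSET 11644473600ULL	/* seconds between 1601 and 1970 */

static uint64_t unix_to_nt(struct timespec ts)
{
	return (ts.tv_sec + NT_EPOCH_OFFSET) * 10000000ULL + ts.tv_nsec / 100;
}

int main(void)
{
	struct timespec now;

	clock_gettime(CLOCK_REALTIME, &now);
	printf("NT time now: %llu\n", (unsigned long long)unix_to_nt(now));
	return 0;
}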
info->CreationTime = cpu_to_le64(ksmbd_kstat->create_time); + time = ksmbd_UnixTimeToNT(kstat->atime); + info->LastAccessTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(kstat->mtime); + info->LastWriteTime = cpu_to_le64(time); + time = ksmbd_UnixTimeToNT(kstat->ctime); + info->ChangeTime = cpu_to_le64(time); + + if (ksmbd_kstat->file_attributes & ATTR_DIRECTORY_LE) { + info->EndOfFile = 0; + info->AllocationSize = 0; + } else { + info->EndOfFile = cpu_to_le64(kstat->size); + info->AllocationSize = cpu_to_le64(kstat->blocks << 9); + } + info->ExtFileAttributes = ksmbd_kstat->file_attributes; + + return info; +} + +int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, + struct user_namespace *user_ns, + struct dentry *dentry, + struct ksmbd_kstat *ksmbd_kstat) +{ + u64 time; + int rc; + + generic_fillattr(user_ns, d_inode(dentry), ksmbd_kstat->kstat); + + time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime); + ksmbd_kstat->create_time = time; + + /* + * set default value for the case that store dos attributes is not yes + * or that acl is disable in server's filesystem and the config is yes. + */ + if (S_ISDIR(ksmbd_kstat->kstat->mode)) + ksmbd_kstat->file_attributes = ATTR_DIRECTORY_LE; + else + ksmbd_kstat->file_attributes = ATTR_ARCHIVE_LE; + + if (test_share_config_flag(work->tcon->share_conf, + KSMBD_SHARE_FLAG_STORE_DOS_ATTRS)) { + struct xattr_dos_attrib da; + + rc = ksmbd_vfs_get_dos_attrib_xattr(user_ns, dentry, &da); + if (rc > 0) { + ksmbd_kstat->file_attributes = cpu_to_le32(da.attr); + ksmbd_kstat->create_time = da.create_time; + } else { + ksmbd_debug(VFS, "fail to load dos attribute.\n"); + } + } + + return 0; +} + +ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, + struct dentry *dentry, char *attr_name, + int attr_name_len) +{ + char *name, *xattr_list = NULL; + ssize_t value_len = -ENOENT, xattr_list_len; + + xattr_list_len = ksmbd_vfs_listxattr(dentry, &xattr_list); + if (xattr_list_len <= 0) + goto out; + + for (name = xattr_list; name - xattr_list < xattr_list_len; + name += strlen(name) + 1) { + ksmbd_debug(VFS, "%s, len %zd\n", name, strlen(name)); + if (strncasecmp(attr_name, name, attr_name_len)) + continue; + + value_len = ksmbd_vfs_xattr_len(user_ns, dentry, name); + break; + } + +out: + kvfree(xattr_list); + return value_len; +} + +int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, + size_t *xattr_stream_name_size, int s_type) +{ + char *type, *buf; + + if (s_type == DIR_STREAM) + type = ":$INDEX_ALLOCATION"; + else + type = ":$DATA"; + + buf = kasprintf(GFP_KERNEL, "%s%s%s", + XATTR_NAME_STREAM, stream_name, type); + if (!buf) + return -ENOMEM; + + *xattr_stream_name = buf; + *xattr_stream_name_size = strlen(buf) + 1; + + return 0; +} + +int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, + struct ksmbd_file *src_fp, + struct ksmbd_file *dst_fp, + struct srv_copychunk *chunks, + unsigned int chunk_count, + unsigned int *chunk_count_written, + unsigned int *chunk_size_written, + loff_t *total_size_written) +{ + unsigned int i; + loff_t src_off, dst_off, src_file_size; + size_t len; + int ret; + + *chunk_count_written = 0; + *chunk_size_written = 0; + *total_size_written = 0; + + if (!(src_fp->daccess & (FILE_READ_DATA_LE | FILE_EXECUTE_LE))) { + pr_err("no right to read(%pd)\n", src_fp->filp->f_path.dentry); + return -EACCES; + } + if (!(dst_fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) { + pr_err("no right to write(%pd)\n", dst_fp->filp->f_path.dentry); + return -EACCES; + } + + if 
(ksmbd_stream_fd(src_fp) || ksmbd_stream_fd(dst_fp)) + return -EBADF; + + smb_break_all_levII_oplock(work, dst_fp, 1); + + if (!work->tcon->posix_extensions) { + for (i = 0; i < chunk_count; i++) { + src_off = le64_to_cpu(chunks[i].SourceOffset); + dst_off = le64_to_cpu(chunks[i].TargetOffset); + len = le32_to_cpu(chunks[i].Length); + + if (check_lock_range(src_fp->filp, src_off, + src_off + len - 1, READ)) + return -EAGAIN; + if (check_lock_range(dst_fp->filp, dst_off, + dst_off + len - 1, WRITE)) + return -EAGAIN; + } + } + + src_file_size = i_size_read(file_inode(src_fp->filp)); + + for (i = 0; i < chunk_count; i++) { + src_off = le64_to_cpu(chunks[i].SourceOffset); + dst_off = le64_to_cpu(chunks[i].TargetOffset); + len = le32_to_cpu(chunks[i].Length); + + if (src_off + len > src_file_size) + return -E2BIG; + + ret = vfs_copy_file_range(src_fp->filp, src_off, + dst_fp->filp, dst_off, len, 0); + if (ret < 0) + return ret; + + *chunk_count_written += 1; + *total_size_written += ret; + } + return 0; +} + +void ksmbd_vfs_posix_lock_wait(struct file_lock *flock) +{ + wait_event(flock->fl_wait, !flock->fl_blocker); +} + +int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout) +{ + return wait_event_interruptible_timeout(flock->fl_wait, + !flock->fl_blocker, + timeout); +} + +void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) +{ + locks_delete_block(flock); +} + +int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, + struct inode *inode) +{ + struct posix_acl_state acl_state; + struct posix_acl *acls; + int rc; + + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) + return -EOPNOTSUPP; + + ksmbd_debug(SMB, "Set posix acls\n"); + rc = init_acl_state(&acl_state, 1); + if (rc) + return rc; + + /* Set default owner group */ + acl_state.owner.allow = (inode->i_mode & 0700) >> 6; + acl_state.group.allow = (inode->i_mode & 0070) >> 3; + acl_state.other.allow = inode->i_mode & 0007; + acl_state.users->aces[acl_state.users->n].uid = inode->i_uid; + acl_state.users->aces[acl_state.users->n++].perms.allow = + acl_state.owner.allow; + acl_state.groups->aces[acl_state.groups->n].gid = inode->i_gid; + acl_state.groups->aces[acl_state.groups->n++].perms.allow = + acl_state.group.allow; + acl_state.mask.allow = 0x07; + + acls = posix_acl_alloc(6, GFP_KERNEL); + if (!acls) { + free_acl_state(&acl_state); + return -ENOMEM; + } + posix_state_to_acl(&acl_state, acls->a_entries); + rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + if (rc < 0) + ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", + rc); + else if (S_ISDIR(inode->i_mode)) { + posix_state_to_acl(&acl_state, acls->a_entries); + rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, + acls); + if (rc < 0) + ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", + rc); + } + free_acl_state(&acl_state); + posix_acl_release(acls); + return rc; +} + +int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, + struct inode *inode, struct inode *parent_inode) +{ + struct posix_acl *acls; + struct posix_acl_entry *pace; + int rc, i; + + if (!IS_ENABLED(CONFIG_FS_POSIX_ACL)) + return -EOPNOTSUPP; + + acls = get_acl(parent_inode, ACL_TYPE_DEFAULT); + if (!acls) + return -ENOENT; + pace = acls->a_entries; + + for (i = 0; i < acls->a_count; i++, pace++) { + if (pace->e_tag == ACL_MASK) { + pace->e_perm = 0x07; + break; + } + } + + rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, acls); + if (rc < 0) + ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", + rc); + if (S_ISDIR(inode->i_mode)) { + rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, + acls); + if (rc < 0) + ksmbd_debug(SMB, "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", + rc); + } + posix_acl_release(acls); + return rc; +}
diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h new file mode 100644 index 000000000000..cb0cba0d5d07 --- /dev/null +++ b/fs/ksmbd/vfs.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2018 Samsung Electronics Co., Ltd. + */ + +#ifndef __KSMBD_VFS_H__ +#define __KSMBD_VFS_H__ + +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <uapi/linux/xattr.h> +#include <linux/posix_acl.h> + +#include "smbacl.h" +#include "xattr.h" + +/* + * Enumeration for stream type. + */ +enum { + DATA_STREAM = 1, /* type $DATA */ + DIR_STREAM /* type $INDEX_ALLOCATION */ +}; + +/* CreateOptions */ +/* If the flag is set, it must not be a file; valid for directories only */ +#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001) +#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) +#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) + +/* Should not buffer on the server */ +#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008) +/* MBZ */ +#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) +/* MBZ */ +#define FILE_SYNCHRONOUS_IO_NONALERT_LE cpu_to_le32(0x00000020) + +/* Flag must not be set for a directory */ +#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) + +/* Should be zero */ +#define CREATE_TREE_CONNECTION cpu_to_le32(0x00000080) +#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) +#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) +#define FILE_OPEN_REMOTE_INSTANCE cpu_to_le32(0x00000400) + +/** + * Doc says this is an obsolete "open for recovery" flag; it should be zero + * in any case. + */ +#define CREATE_OPEN_FOR_RECOVERY cpu_to_le32(0x00000400) +#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) +#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) +#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) +#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000) +#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) + +/* Should be zero */ +#define FILE_OPEN_REQUIRING_OPLOCK cpu_to_le32(0x00010000) +#define FILE_DISALLOW_EXCLUSIVE cpu_to_le32(0x00020000) +#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000) +#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) +#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) + +/* Should be zero */ +#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000) +#define CREATE_OPTIONS_MASK cpu_to_le32(0x00FFFFFF) +#define CREATE_OPTION_READONLY 0x10000000 +/* system.
NB not sent over wire */ +#define CREATE_OPTION_SPECIAL 0x20000000 + +struct ksmbd_work; +struct ksmbd_file; +struct ksmbd_conn; + +struct ksmbd_dir_info { + const char *name; + char *wptr; + char *rptr; + int name_len; + int out_buf_len; + int num_entry; + int data_count; + int last_entry_offset; + bool hide_dot_file; + int flags; +}; + +struct ksmbd_readdir_data { + struct dir_context ctx; + union { + void *private; + char *dirent; + }; + + unsigned int used; + unsigned int dirent_count; + unsigned int file_attr; +}; + +/* ksmbd kstat wrapper to get valid create time when reading dir entry */ +struct ksmbd_kstat { + struct kstat *kstat; + unsigned long long create_time; + __le32 file_attributes; +}; + +int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child); +int ksmbd_vfs_may_delete(struct user_namespace *user_ns, struct dentry *dentry); +int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, + struct dentry *dentry, __le32 *daccess); +int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode); +int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode); +int ksmbd_vfs_read(struct ksmbd_work *work, struct ksmbd_file *fp, + size_t count, loff_t *pos); +int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, + char *buf, size_t count, loff_t *pos, bool sync, + ssize_t *written); +int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id); +int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name); +int ksmbd_vfs_link(struct ksmbd_work *work, + const char *oldname, const char *newname); +int ksmbd_vfs_getattr(struct path *path, struct kstat *stat); +int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, + char *newname); +int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name, + struct ksmbd_file *fp, loff_t size); +struct srv_copychunk; +int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, + struct ksmbd_file *src_fp, + struct ksmbd_file *dst_fp, + struct srv_copychunk *chunks, + unsigned int chunk_count, + unsigned int *chunk_count_written, + unsigned int *chunk_size_written, + loff_t *total_size_written); +ssize_t ksmbd_vfs_listxattr(struct dentry *dentry, char **list); +ssize_t ksmbd_vfs_getxattr(struct user_namespace *user_ns, + struct dentry *dentry, + char *xattr_name, + char **xattr_buf); +ssize_t ksmbd_vfs_casexattr_len(struct user_namespace *user_ns, + struct dentry *dentry, char *attr_name, + int attr_name_len); +int ksmbd_vfs_setxattr(struct user_namespace *user_ns, + struct dentry *dentry, const char *attr_name, + const void *attr_value, size_t attr_size, int flags); +int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, + size_t *xattr_stream_name_size, int s_type); +int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, + struct dentry *dentry, char *attr_name); +int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, + bool caseless); +int ksmbd_vfs_empty_dir(struct ksmbd_file *fp); +void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option); +int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, + loff_t off, loff_t len); +struct file_allocated_range_buffer; +int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, + struct file_allocated_range_buffer *ranges, + int in_count, int *out_count); +int ksmbd_vfs_unlink(struct user_namespace *user_ns, + struct dentry *dir, struct dentry *dentry); +void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat); +int 
ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, + struct user_namespace *user_ns, + struct dentry *dentry, + struct ksmbd_kstat *ksmbd_kstat); +void ksmbd_vfs_posix_lock_wait(struct file_lock *flock); +int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout); +void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock); +int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, + struct dentry *dentry); +int ksmbd_vfs_remove_sd_xattrs(struct user_namespace *user_ns, + struct dentry *dentry); +int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, + struct user_namespace *user_ns, + struct dentry *dentry, + struct smb_ntsd *pntsd, int len); +int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, + struct user_namespace *user_ns, + struct dentry *dentry, + struct smb_ntsd **pntsd); +int ksmbd_vfs_set_dos_attrib_xattr(struct user_namespace *user_ns, + struct dentry *dentry, + struct xattr_dos_attrib *da); +int ksmbd_vfs_get_dos_attrib_xattr(struct user_namespace *user_ns, + struct dentry *dentry, + struct xattr_dos_attrib *da); +int ksmbd_vfs_set_init_posix_acl(struct user_namespace *user_ns, + struct inode *inode); +int ksmbd_vfs_inherit_posix_acl(struct user_namespace *user_ns, + struct inode *inode, + struct inode *parent_inode); +#endif /* __KSMBD_VFS_H__ */ diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c new file mode 100644 index 000000000000..92d8c61ffd2a --- /dev/null +++ b/fs/ksmbd/vfs_cache.c @@ -0,0 +1,725 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (C) 2016 Namjae Jeon <linkinjeon@kernel.org> + * Copyright (C) 2019 Samsung Electronics Co., Ltd. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> + +#include "glob.h" +#include "vfs_cache.h" +#include "oplock.h" +#include "vfs.h" +#include "connection.h" +#include "mgmt/tree_connect.h" +#include "mgmt/user_session.h" +#include "smb_common.h" + +#define S_DEL_PENDING 1 +#define S_DEL_ON_CLS 2 +#define S_DEL_ON_CLS_STREAM 8 + +static unsigned int inode_hash_mask __read_mostly; +static unsigned int inode_hash_shift __read_mostly; +static struct hlist_head *inode_hashtable __read_mostly; +static DEFINE_RWLOCK(inode_hash_lock); + +static struct ksmbd_file_table global_ft; +static atomic_long_t fd_limit; +static struct kmem_cache *filp_cache; + +void ksmbd_set_fd_limit(unsigned long limit) +{ + limit = min(limit, get_max_files()); + atomic_long_set(&fd_limit, limit); +} + +static bool fd_limit_depleted(void) +{ + long v = atomic_long_dec_return(&fd_limit); + + if (v >= 0) + return false; + atomic_long_inc(&fd_limit); + return true; +} + +static void fd_limit_close(void) +{ + atomic_long_inc(&fd_limit); +} + +/* + * INODE hash + */ + +static unsigned long inode_hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> inode_hash_shift); + return tmp & inode_hash_mask; +} + +static struct ksmbd_inode *__ksmbd_inode_lookup(struct inode *inode) +{ + struct hlist_head *head = inode_hashtable + + inode_hash(inode->i_sb, inode->i_ino); + struct ksmbd_inode *ci = NULL, *ret_ci = NULL; + + hlist_for_each_entry(ci, head, m_hash) { + if (ci->m_inode == inode) { + if (atomic_inc_not_zero(&ci->m_count)) + ret_ci = ci; + break; + } + } + return ret_ci; +} + +static struct ksmbd_inode *ksmbd_inode_lookup(struct ksmbd_file *fp) +{ + return __ksmbd_inode_lookup(file_inode(fp->filp)); +} + +static struct ksmbd_inode 
*ksmbd_inode_lookup_by_vfsinode(struct inode *inode) +{ + struct ksmbd_inode *ci; + + read_lock(&inode_hash_lock); + ci = __ksmbd_inode_lookup(inode); + read_unlock(&inode_hash_lock); + return ci; +} + +int ksmbd_query_inode_status(struct inode *inode) +{ + struct ksmbd_inode *ci; + int ret = KSMBD_INODE_STATUS_UNKNOWN; + + read_lock(&inode_hash_lock); + ci = __ksmbd_inode_lookup(inode); + if (ci) { + ret = KSMBD_INODE_STATUS_OK; + if (ci->m_flags & S_DEL_PENDING) + ret = KSMBD_INODE_STATUS_PENDING_DELETE; + atomic_dec(&ci->m_count); + } + read_unlock(&inode_hash_lock); + return ret; +} + +bool ksmbd_inode_pending_delete(struct ksmbd_file *fp) +{ + return (fp->f_ci->m_flags & S_DEL_PENDING); +} + +void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp) +{ + fp->f_ci->m_flags |= S_DEL_PENDING; +} + +void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp) +{ + fp->f_ci->m_flags &= ~S_DEL_PENDING; +} + +void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp, + int file_info) +{ + if (ksmbd_stream_fd(fp)) { + fp->f_ci->m_flags |= S_DEL_ON_CLS_STREAM; + return; + } + + fp->f_ci->m_flags |= S_DEL_ON_CLS; +} + +static void ksmbd_inode_hash(struct ksmbd_inode *ci) +{ + struct hlist_head *b = inode_hashtable + + inode_hash(ci->m_inode->i_sb, ci->m_inode->i_ino); + + hlist_add_head(&ci->m_hash, b); +} + +static void ksmbd_inode_unhash(struct ksmbd_inode *ci) +{ + write_lock(&inode_hash_lock); + hlist_del_init(&ci->m_hash); + write_unlock(&inode_hash_lock); +} + +static int ksmbd_inode_init(struct ksmbd_inode *ci, struct ksmbd_file *fp) +{ + ci->m_inode = file_inode(fp->filp); + atomic_set(&ci->m_count, 1); + atomic_set(&ci->op_count, 0); + atomic_set(&ci->sop_count, 0); + ci->m_flags = 0; + ci->m_fattr = 0; + INIT_LIST_HEAD(&ci->m_fp_list); + INIT_LIST_HEAD(&ci->m_op_list); + rwlock_init(&ci->m_lock); + return 0; +} + +static struct ksmbd_inode *ksmbd_inode_get(struct ksmbd_file *fp) +{ + struct ksmbd_inode *ci, *tmpci; + int rc; + + read_lock(&inode_hash_lock); + ci = ksmbd_inode_lookup(fp); + read_unlock(&inode_hash_lock); + if (ci) + return ci; + + ci = kmalloc(sizeof(struct ksmbd_inode), GFP_KERNEL); + if (!ci) + return NULL; + + rc = ksmbd_inode_init(ci, fp); + if (rc) { + pr_err("inode initialization failed\n"); + kfree(ci); + return NULL; + } + + write_lock(&inode_hash_lock); + tmpci = ksmbd_inode_lookup(fp); + if (!tmpci) { + ksmbd_inode_hash(ci); + } else { + kfree(ci); + ci = tmpci; + } + write_unlock(&inode_hash_lock); + return ci; +} + +static void ksmbd_inode_free(struct ksmbd_inode *ci) +{ + ksmbd_inode_unhash(ci); + kfree(ci); +} + +static void ksmbd_inode_put(struct ksmbd_inode *ci) +{ + if (atomic_dec_and_test(&ci->m_count)) + ksmbd_inode_free(ci); +} + +int __init ksmbd_inode_hash_init(void) +{ + unsigned int loop; + unsigned long numentries = 16384; + unsigned long bucketsize = sizeof(struct hlist_head); + unsigned long size; + + inode_hash_shift = ilog2(numentries); + inode_hash_mask = (1 << inode_hash_shift) - 1; + + size = bucketsize << inode_hash_shift; + + /* init master fp hash table */ + inode_hashtable = vmalloc(size); + if (!inode_hashtable) + return -ENOMEM; + + for (loop = 0; loop < (1U << inode_hash_shift); loop++) + INIT_HLIST_HEAD(&inode_hashtable[loop]); + return 0; +} + +void ksmbd_release_inode_hash(void) +{ + vfree(inode_hashtable); +} + +static void __ksmbd_inode_close(struct ksmbd_file *fp) +{ + struct dentry *dir, *dentry; + struct ksmbd_inode *ci = fp->f_ci; + int err; + struct file *filp; + + filp = fp->filp; + if (ksmbd_stream_fd(fp) && (ci->m_flags &
S_DEL_ON_CLS_STREAM)) { + ci->m_flags &= ~S_DEL_ON_CLS_STREAM; + err = ksmbd_vfs_remove_xattr(file_mnt_user_ns(filp), + filp->f_path.dentry, + fp->stream.name); + if (err) + pr_err("remove xattr failed : %s\n", + fp->stream.name); + } + + if (atomic_dec_and_test(&ci->m_count)) { + write_lock(&ci->m_lock); + if (ci->m_flags & (S_DEL_ON_CLS | S_DEL_PENDING)) { + dentry = filp->f_path.dentry; + dir = dentry->d_parent; + ci->m_flags &= ~(S_DEL_ON_CLS | S_DEL_PENDING); + write_unlock(&ci->m_lock); + ksmbd_vfs_unlink(file_mnt_user_ns(filp), dir, dentry); + write_lock(&ci->m_lock); + } + write_unlock(&ci->m_lock); + + ksmbd_inode_free(ci); + } +} + +static void __ksmbd_remove_durable_fd(struct ksmbd_file *fp) +{ + if (!has_file_id(fp->persistent_id)) + return; + + write_lock(&global_ft.lock); + idr_remove(global_ft.idr, fp->persistent_id); + write_unlock(&global_ft.lock); +} + +static void __ksmbd_remove_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) +{ + if (!has_file_id(fp->volatile_id)) + return; + + write_lock(&fp->f_ci->m_lock); + list_del_init(&fp->node); + write_unlock(&fp->f_ci->m_lock); + + write_lock(&ft->lock); + idr_remove(ft->idr, fp->volatile_id); + write_unlock(&ft->lock); +} + +static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) +{ + struct file *filp; + struct ksmbd_lock *smb_lock, *tmp_lock; + + fd_limit_close(); + __ksmbd_remove_durable_fd(fp); + __ksmbd_remove_fd(ft, fp); + + close_id_del_oplock(fp); + filp = fp->filp; + + __ksmbd_inode_close(fp); + if (!IS_ERR_OR_NULL(filp)) + fput(filp); + + /* Because the reference count of fp is 0, it is guaranteed that + * there are no accesses to fp->lock_list. + */ + list_for_each_entry_safe(smb_lock, tmp_lock, &fp->lock_list, flist) { + spin_lock(&fp->conn->llist_lock); + list_del(&smb_lock->clist); + spin_unlock(&fp->conn->llist_lock); + + list_del(&smb_lock->flist); + locks_free_lock(smb_lock->fl); + kfree(smb_lock); + } + + kfree(fp->filename); + if (ksmbd_stream_fd(fp)) + kfree(fp->stream.name); + kmem_cache_free(filp_cache, fp); +} + +static struct ksmbd_file *ksmbd_fp_get(struct ksmbd_file *fp) +{ + if (!atomic_inc_not_zero(&fp->refcount)) + return NULL; + return fp; +} + +static struct ksmbd_file *__ksmbd_lookup_fd(struct ksmbd_file_table *ft, + u64 id) +{ + struct ksmbd_file *fp; + + if (!has_file_id(id)) + return NULL; + + read_lock(&ft->lock); + fp = idr_find(ft->idr, id); + if (fp) + fp = ksmbd_fp_get(fp); + read_unlock(&ft->lock); + return fp; +} + +static void __put_fd_final(struct ksmbd_work *work, struct ksmbd_file *fp) +{ + __ksmbd_close_fd(&work->sess->file_table, fp); + atomic_dec(&work->conn->stats.open_files_count); +} + +static void set_close_state_blocked_works(struct ksmbd_file *fp) +{ + struct ksmbd_work *cancel_work, *ctmp; + + spin_lock(&fp->f_lock); + list_for_each_entry_safe(cancel_work, ctmp, &fp->blocked_works, + fp_entry) { + list_del(&cancel_work->fp_entry); + cancel_work->state = KSMBD_WORK_CLOSED; + cancel_work->cancel_fn(cancel_work->cancel_argv); + } + spin_unlock(&fp->f_lock); +} + +int ksmbd_close_fd(struct ksmbd_work *work, u64 id) +{ + struct ksmbd_file *fp; + struct ksmbd_file_table *ft; + + if (!has_file_id(id)) + return 0; + + ft = &work->sess->file_table; + read_lock(&ft->lock); + fp = idr_find(ft->idr, id); + if (fp) { + set_close_state_blocked_works(fp); + + if (!atomic_dec_and_test(&fp->refcount)) + fp = NULL; + } + read_unlock(&ft->lock); + + if (!fp) + return -EINVAL; + + __put_fd_final(work, fp); + return 0; +} + +void ksmbd_fd_put(struct ksmbd_work
*work, struct ksmbd_file *fp) +{ + if (!fp) + return; + + if (!atomic_dec_and_test(&fp->refcount)) + return; + __put_fd_final(work, fp); +} + +static bool __sanity_check(struct ksmbd_tree_connect *tcon, struct ksmbd_file *fp) +{ + if (!fp) + return false; + if (fp->tcon != tcon) + return false; + return true; +} + +struct ksmbd_file *ksmbd_lookup_foreign_fd(struct ksmbd_work *work, u64 id) +{ + return __ksmbd_lookup_fd(&work->sess->file_table, id); +} + +struct ksmbd_file *ksmbd_lookup_fd_fast(struct ksmbd_work *work, u64 id) +{ + struct ksmbd_file *fp = __ksmbd_lookup_fd(&work->sess->file_table, id); + + if (__sanity_check(work->tcon, fp)) + return fp; + + ksmbd_fd_put(work, fp); + return NULL; +} + +struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id, + u64 pid) +{ + struct ksmbd_file *fp; + + if (!has_file_id(id)) { + id = work->compound_fid; + pid = work->compound_pfid; + } + + fp = __ksmbd_lookup_fd(&work->sess->file_table, id); + if (!__sanity_check(work->tcon, fp)) { + ksmbd_fd_put(work, fp); + return NULL; + } + if (fp->persistent_id != pid) { + ksmbd_fd_put(work, fp); + return NULL; + } + return fp; +} + +struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id) +{ + return __ksmbd_lookup_fd(&global_ft, id); +} + +struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid) +{ + struct ksmbd_file *fp = NULL; + unsigned int id; + + read_lock(&global_ft.lock); + idr_for_each_entry(global_ft.idr, fp, id) { + if (!memcmp(fp->create_guid, + cguid, + SMB2_CREATE_GUID_SIZE)) { + fp = ksmbd_fp_get(fp); + break; + } + } + read_unlock(&global_ft.lock); + + return fp; +} + +struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode) +{ + struct ksmbd_file *lfp; + struct ksmbd_inode *ci; + + ci = ksmbd_inode_lookup_by_vfsinode(inode); + if (!ci) + return NULL; + + read_lock(&ci->m_lock); + list_for_each_entry(lfp, &ci->m_fp_list, node) { + if (inode == file_inode(lfp->filp)) { + atomic_dec(&ci->m_count); + read_unlock(&ci->m_lock); + return lfp; + } + } + atomic_dec(&ci->m_count); + read_unlock(&ci->m_lock); + return NULL; +} + +#define OPEN_ID_TYPE_VOLATILE_ID (0) +#define OPEN_ID_TYPE_PERSISTENT_ID (1) + +static void __open_id_set(struct ksmbd_file *fp, u64 id, int type) +{ + if (type == OPEN_ID_TYPE_VOLATILE_ID) + fp->volatile_id = id; + if (type == OPEN_ID_TYPE_PERSISTENT_ID) + fp->persistent_id = id; +} + +static int __open_id(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + int type) +{ + u64 id = 0; + int ret; + + if (type == OPEN_ID_TYPE_VOLATILE_ID && fd_limit_depleted()) { + __open_id_set(fp, KSMBD_NO_FID, type); + return -EMFILE; + } + + idr_preload(GFP_KERNEL); + write_lock(&ft->lock); + ret = idr_alloc_cyclic(ft->idr, fp, 0, INT_MAX - 1, GFP_NOWAIT); + if (ret >= 0) { + id = ret; + ret = 0; + } else { + id = KSMBD_NO_FID; + fd_limit_close(); + } + + __open_id_set(fp, id, type); + write_unlock(&ft->lock); + idr_preload_end(); + return ret; +} + +unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp) +{ + __open_id(&global_ft, fp, OPEN_ID_TYPE_PERSISTENT_ID); + return fp->persistent_id; +} + +struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) +{ + struct ksmbd_file *fp; + int ret; + + fp = kmem_cache_zalloc(filp_cache, GFP_KERNEL); + if (!fp) { + pr_err("Failed to allocate memory\n"); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&fp->blocked_works); + INIT_LIST_HEAD(&fp->node); + INIT_LIST_HEAD(&fp->lock_list); + spin_lock_init(&fp->f_lock); + atomic_set(&fp->refcount, 1); + + fp->filp = filp; + fp->conn = 
work->sess->conn; + fp->tcon = work->tcon; + fp->volatile_id = KSMBD_NO_FID; + fp->persistent_id = KSMBD_NO_FID; + fp->f_ci = ksmbd_inode_get(fp); + + if (!fp->f_ci) { + ret = -ENOMEM; + goto err_out; + } + + ret = __open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID); + if (ret) { + ksmbd_inode_put(fp->f_ci); + goto err_out; + } + + atomic_inc(&work->conn->stats.open_files_count); + return fp; + +err_out: + kmem_cache_free(filp_cache, fp); + return ERR_PTR(ret); +} + +static int +__close_file_table_ids(struct ksmbd_file_table *ft, + struct ksmbd_tree_connect *tcon, + bool (*skip)(struct ksmbd_tree_connect *tcon, + struct ksmbd_file *fp)) +{ + unsigned int id; + struct ksmbd_file *fp; + int num = 0; + + idr_for_each_entry(ft->idr, fp, id) { + if (skip(tcon, fp)) + continue; + + set_close_state_blocked_works(fp); + + if (!atomic_dec_and_test(&fp->refcount)) + continue; + __ksmbd_close_fd(ft, fp); + num++; + } + return num; +} + +static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon, + struct ksmbd_file *fp) +{ + return fp->tcon != tcon; +} + +static bool session_fd_check(struct ksmbd_tree_connect *tcon, + struct ksmbd_file *fp) +{ + return false; +} + +void ksmbd_close_tree_conn_fds(struct ksmbd_work *work) +{ + int num = __close_file_table_ids(&work->sess->file_table, + work->tcon, + tree_conn_fd_check); + + atomic_sub(num, &work->conn->stats.open_files_count); +} + +void ksmbd_close_session_fds(struct ksmbd_work *work) +{ + int num = __close_file_table_ids(&work->sess->file_table, + work->tcon, + session_fd_check); + + atomic_sub(num, &work->conn->stats.open_files_count); +} + +int ksmbd_init_global_file_table(void) +{ + return ksmbd_init_file_table(&global_ft); +} + +void ksmbd_free_global_file_table(void) +{ + struct ksmbd_file *fp = NULL; + unsigned int id; + + idr_for_each_entry(global_ft.idr, fp, id) { + __ksmbd_remove_durable_fd(fp); + kmem_cache_free(filp_cache, fp); + } + + ksmbd_destroy_file_table(&global_ft); +} + +int ksmbd_file_table_flush(struct ksmbd_work *work) +{ + struct ksmbd_file *fp = NULL; + unsigned int id; + int ret; + + read_lock(&work->sess->file_table.lock); + idr_for_each_entry(work->sess->file_table.idr, fp, id) { + ret = ksmbd_vfs_fsync(work, fp->volatile_id, KSMBD_NO_FID); + if (ret) + break; + } + read_unlock(&work->sess->file_table.lock); + return ret; +} + +int ksmbd_init_file_table(struct ksmbd_file_table *ft) +{ + ft->idr = kzalloc(sizeof(struct idr), GFP_KERNEL); + if (!ft->idr) + return -ENOMEM; + + idr_init(ft->idr); + rwlock_init(&ft->lock); + return 0; +} + +void ksmbd_destroy_file_table(struct ksmbd_file_table *ft) +{ + if (!ft->idr) + return; + + __close_file_table_ids(ft, NULL, session_fd_check); + idr_destroy(ft->idr); + kfree(ft->idr); + ft->idr = NULL; +} + +int ksmbd_init_file_cache(void) +{ + filp_cache = kmem_cache_create("ksmbd_file_cache", + sizeof(struct ksmbd_file), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!filp_cache) + goto out; + + return 0; + +out: + pr_err("failed to allocate file cache\n"); + return -ENOMEM; +} + +void ksmbd_exit_file_cache(void) +{ + kmem_cache_destroy(filp_cache); +} diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h new file mode 100644 index 000000000000..70dfe6a99f13 --- /dev/null +++ b/fs/ksmbd/vfs_cache.h @@ -0,0 +1,178 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2019 Samsung Electronics Co., Ltd. 
+ */ + +#ifndef __VFS_CACHE_H__ +#define __VFS_CACHE_H__ + +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/rwsem.h> +#include <linux/spinlock.h> +#include <linux/idr.h> +#include <linux/workqueue.h> + +#include "vfs.h" + +/* Windows style file permissions for extended response */ +#define FILE_GENERIC_ALL 0x1F01FF +#define FILE_GENERIC_READ 0x120089 +#define FILE_GENERIC_WRITE 0x120116 +#define FILE_GENERIC_EXECUTE 0x1200a0 + +#define KSMBD_START_FID 0 +#define KSMBD_NO_FID (INT_MAX) +#define SMB2_NO_FID (0xFFFFFFFFFFFFFFFFULL) + +struct ksmbd_conn; +struct ksmbd_session; + +struct ksmbd_lock { + struct file_lock *fl; + struct list_head clist; + struct list_head flist; + struct list_head llist; + unsigned int flags; + int cmd; + int zero_len; + unsigned long long start; + unsigned long long end; +}; + +struct stream { + char *name; + ssize_t size; +}; + +struct ksmbd_inode { + rwlock_t m_lock; + atomic_t m_count; + atomic_t op_count; + /* opinfo count for streams */ + atomic_t sop_count; + struct inode *m_inode; + unsigned int m_flags; + struct hlist_node m_hash; + struct list_head m_fp_list; + struct list_head m_op_list; + struct oplock_info *m_opinfo; + __le32 m_fattr; +}; + +struct ksmbd_file { + struct file *filp; + char *filename; + u64 persistent_id; + u64 volatile_id; + + spinlock_t f_lock; + + struct ksmbd_inode *f_ci; + struct ksmbd_inode *f_parent_ci; + struct oplock_info __rcu *f_opinfo; + struct ksmbd_conn *conn; + struct ksmbd_tree_connect *tcon; + + atomic_t refcount; + __le32 daccess; + __le32 saccess; + __le32 coption; + __le32 cdoption; + __u64 create_time; + __u64 itime; + + bool is_nt_open; + bool attrib_only; + + char client_guid[16]; + char create_guid[16]; + char app_instance_id[16]; + + struct stream stream; + struct list_head node; + struct list_head blocked_works; + struct list_head lock_list; + + int durable_timeout; + + /* for SMB1 */ + int pid; + + /* conflict lock fail count for SMB1 */ + unsigned int cflock_cnt; + /* last lock failure start offset for SMB1 */ + unsigned long long llock_fstart; + + int dirent_offset; + + /* if ls is happening on a directory, below is valid */ + struct ksmbd_readdir_data readdir_data; + int dot_dotdot[2]; +}; + +static inline void set_ctx_actor(struct dir_context *ctx, + filldir_t actor) +{ + ctx->actor = actor; +} + +#define KSMBD_NR_OPEN_DEFAULT BITS_PER_LONG + +struct ksmbd_file_table { + rwlock_t lock; + struct idr *idr; +}; + +static inline bool has_file_id(u64 id) +{ + return id < KSMBD_NO_FID; +} + +static inline bool ksmbd_stream_fd(struct ksmbd_file *fp) +{ + return fp->stream.name != NULL; +}
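The fd lifetime rules behind this table follow the classic inc-not-zero/dec-and-test pattern (see __ksmbd_lookup_fd() and ksmbd_fd_put() in vfs_cache.c above). A standalone C11 sketch of that pattern, with illustrative names only:

#include <stdatomic.h>
#include <stdlib.h>

struct obj { atomic_int refcount; };

/* Analogue of atomic_inc_not_zero(): a lookup may only take a reference
 * while the count is still non-zero, so a dying object is never revived. */
static struct obj *obj_get(struct obj *o)
{
	int v = atomic_load(&o->refcount);

	while (v != 0)
		if (atomic_compare_exchange_weak(&o->refcount, &v, v + 1))
			return o;
	return NULL;
}

/* Analogue of atomic_dec_and_test(): whoever drops the last reference
 * is responsible for freeing the object. */
static void obj_put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		free(o);
}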
+int ksmbd_init_file_table(struct ksmbd_file_table *ft); +void ksmbd_destroy_file_table(struct ksmbd_file_table *ft); +int ksmbd_close_fd(struct ksmbd_work *work, u64 id); +struct ksmbd_file *ksmbd_lookup_fd_fast(struct ksmbd_work *work, u64 id); +struct ksmbd_file *ksmbd_lookup_foreign_fd(struct ksmbd_work *work, u64 id); +struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id, + u64 pid); +void ksmbd_fd_put(struct ksmbd_work *work, struct ksmbd_file *fp); +struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id); +struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid); +struct ksmbd_file *ksmbd_lookup_fd_inode(struct inode *inode); +unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp); +struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp); +void ksmbd_close_tree_conn_fds(struct ksmbd_work *work); +void ksmbd_close_session_fds(struct ksmbd_work *work); +int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode); +int ksmbd_init_global_file_table(void); +void ksmbd_free_global_file_table(void); +int ksmbd_file_table_flush(struct ksmbd_work *work); +void ksmbd_set_fd_limit(unsigned long limit); + +/* + * INODE hash + */ +int __init ksmbd_inode_hash_init(void); +void ksmbd_release_inode_hash(void); + +enum KSMBD_INODE_STATUS { + KSMBD_INODE_STATUS_OK, + KSMBD_INODE_STATUS_UNKNOWN, + KSMBD_INODE_STATUS_PENDING_DELETE, +}; + +int ksmbd_query_inode_status(struct inode *inode); +bool ksmbd_inode_pending_delete(struct ksmbd_file *fp); +void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp); +void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp); +void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp, + int file_info); +int ksmbd_init_file_cache(void); +void ksmbd_exit_file_cache(void); +#endif /* __VFS_CACHE_H__ */ diff --git a/fs/ksmbd/xattr.h b/fs/ksmbd/xattr.h new file mode 100644 index 000000000000..8857c01093d9 --- /dev/null +++ b/fs/ksmbd/xattr.h @@ -0,0 +1,122 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) 2021 Samsung Electronics Co., Ltd. + */ + +#ifndef __XATTR_H__ +#define __XATTR_H__ + +/* + * These are on-disk structures that store additional metadata in xattrs to + * reproduce Windows filesystem semantics, and they are encoded with NDR to + * be compatible with Samba's xattr metadata format. Compatibility with + * Samba is important because information (file attributes, creation time, + * ACLs) about existing files could otherwise be lost when switching between + * ksmbd and Samba. + */ + +/* + * DOS attribute flags indicating which fields below are valid. + */ +enum { + XATTR_DOSINFO_ATTRIB = 0x00000001, + XATTR_DOSINFO_EA_SIZE = 0x00000002, + XATTR_DOSINFO_SIZE = 0x00000004, + XATTR_DOSINFO_ALLOC_SIZE = 0x00000008, + XATTR_DOSINFO_CREATE_TIME = 0x00000010, + XATTR_DOSINFO_CHANGE_TIME = 0x00000020, + XATTR_DOSINFO_ITIME = 0x00000040 +}; + +/* + * DOS attribute structure compatible with Samba's. Storing it in the xattr + * named "DOSATTRIB", separately from the inode, allows ksmbd to faithfully + * reproduce Windows filesystem semantics on top of a POSIX filesystem. + */ +struct xattr_dos_attrib { + __u16 version; /* version 3 or version 4 */ + __u32 flags; /* valid flags */ + __u32 attr; /* Dos attribute */ + __u32 ea_size; /* EA size */ + __u64 size; + __u64 alloc_size; + __u64 create_time; /* File creation time */ + __u64 change_time; /* File change time */ + __u64 itime; /* Invented/Initial time */ +}; + +/* + * Enumeration used for computing the posix acl hash. + */ +enum { + SMB_ACL_TAG_INVALID = 0, + SMB_ACL_USER, + SMB_ACL_USER_OBJ, + SMB_ACL_GROUP, + SMB_ACL_GROUP_OBJ, + SMB_ACL_OTHER, + SMB_ACL_MASK +}; + +#define SMB_ACL_READ 4 +#define SMB_ACL_WRITE 2 +#define SMB_ACL_EXECUTE 1 + +struct xattr_acl_entry { + int type; + uid_t uid; + gid_t gid; + mode_t perm; +}; + +/* + * The xattr_smb_acl structure is used for computing the posix acl hash. + */ +struct xattr_smb_acl { + int count; + int next; + struct xattr_acl_entry entries[0]; +}; + +/* The 64-byte hash in xattr_ntacl is computed with SHA-256 */ +#define XATTR_SD_HASH_TYPE_SHA256 0x1 +#define XATTR_SD_HASH_SIZE 64
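As an illustration of how the XATTR_DOSINFO_* bits above are meant to be used, a hedged sketch of populating a version-3 xattr_dos_attrib before it is NDR-encoded; the helper name and the caller-supplied NT time are assumptions for this example, not part of the patch:

/* Sketch only: mark exactly the members that carry meaning via `flags`. */
static void fill_dos_attrib(struct xattr_dos_attrib *da,
			    __u32 attr, __u64 create_time)
{
	da->version = 3;
	da->flags = XATTR_DOSINFO_ATTRIB | XATTR_DOSINFO_CREATE_TIME;
	da->attr = attr;               /* e.g. 0x20, FILE_ATTRIBUTE_ARCHIVE */
	da->create_time = create_time; /* NT time: 100ns ticks since 1601 */
}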
+ +/* + * xattr_ntacl is used for storing the NTACL and hashes. + * The hashes are used to check that the posix acl and NTACL stored in + * the xattrs are valid. + */ +struct xattr_ntacl { + __u16 version; /* version 4 */ + void *sd_buf; + __u32 sd_size; + __u16 hash_type; /* hash type */ + __u8 desc[10]; /* posix_acl description */ + __u16 desc_len; + __u64 current_time; + __u8 hash[XATTR_SD_HASH_SIZE]; /* 64-byte hash for ntacl */ + __u8 posix_acl_hash[XATTR_SD_HASH_SIZE]; /* 64-byte hash for posix acl */ +}; + +/* DOS ATTRIBUTE XATTR PREFIX */ +#define DOS_ATTRIBUTE_PREFIX "DOSATTRIB" +#define DOS_ATTRIBUTE_PREFIX_LEN (sizeof(DOS_ATTRIBUTE_PREFIX) - 1) +#define XATTR_NAME_DOS_ATTRIBUTE (XATTR_USER_PREFIX DOS_ATTRIBUTE_PREFIX) +#define XATTR_NAME_DOS_ATTRIBUTE_LEN \ + (sizeof(XATTR_USER_PREFIX DOS_ATTRIBUTE_PREFIX) - 1) + +/* STREAM XATTR PREFIX */ +#define STREAM_PREFIX "DosStream." +#define STREAM_PREFIX_LEN (sizeof(STREAM_PREFIX) - 1) +#define XATTR_NAME_STREAM (XATTR_USER_PREFIX STREAM_PREFIX) +#define XATTR_NAME_STREAM_LEN (sizeof(XATTR_NAME_STREAM) - 1) + +/* SECURITY DESCRIPTOR(NTACL) XATTR PREFIX */ +#define SD_PREFIX "NTACL" +#define SD_PREFIX_LEN (sizeof(SD_PREFIX) - 1) +#define XATTR_NAME_SD (XATTR_SECURITY_PREFIX SD_PREFIX) +#define XATTR_NAME_SD_LEN \ + (sizeof(XATTR_SECURITY_PREFIX SD_PREFIX) - 1) + +#endif /* __XATTR_H__ */ diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2de048f80eb8..0ab9756ed235 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -584,7 +584,7 @@ static struct ctl_table nlm_sysctls[] = { .data = &nsm_use_hostnames, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dobool, }, { .procname = "nsm_local_state", diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4c10fb5138f1..e10ae2c41279 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -40,12 +40,15 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call.
*/ if (filp != NULL) { - if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) + int mode = lock_to_openmode(&lock->fl); + + error = nlm_lookup_file(rqstp, &file, lock); + if (error) goto no_locks; *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_file = file->f_file; + lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 61d3cc2283dc..e9b85d8fd5fe 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -31,6 +31,7 @@ #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> #include <linux/kthread.h> +#include <linux/exportfs.h> #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -395,28 +396,10 @@ nlmsvc_release_lockowner(struct nlm_lock *lock) nlmsvc_put_lockowner(lock->fl.fl_owner); } -static void nlmsvc_locks_copy_lock(struct file_lock *new, struct file_lock *fl) -{ - struct nlm_lockowner *nlm_lo = (struct nlm_lockowner *)fl->fl_owner; - new->fl_owner = nlmsvc_get_lockowner(nlm_lo); -} - -static void nlmsvc_locks_release_private(struct file_lock *fl) -{ - nlmsvc_put_lockowner((struct nlm_lockowner *)fl->fl_owner); -} - -static const struct file_lock_operations nlmsvc_lock_ops = { - .fl_copy_lock = nlmsvc_locks_copy_lock, - .fl_release_private = nlmsvc_locks_release_private, -}; - void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host, pid_t pid) { fl->fl_owner = nlmsvc_find_lockowner(host, pid); - if (fl->fl_owner != NULL) - fl->fl_ops = &nlmsvc_lock_ops; } /* @@ -488,17 +471,24 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_cookie *cookie, int reclaim) { struct nlm_block *block = NULL; + struct inode *inode = nlmsvc_file_inode(file); int error; + int mode; + int async_block = 0; __be32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + inode->i_sb->s_id, inode->i_ino, lock->fl.fl_type, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, wait); + if (inode->i_sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS) { + async_block = wait; + wait = 0; + } + /* Lock file against concurrent access */ mutex_lock(&file->f_mutex); /* Get existing block (in case client is busy-waiting) @@ -542,7 +532,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + mode = lock_to_openmode(&lock->fl); + error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; dprintk("lockd: vfs_lock_file returned %d\n", error); @@ -558,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, */ if (wait) break; - ret = nlm_lck_denied; + ret = async_block ? 
nlm_lck_blocked : nlm_lck_denied; goto out; case FILE_LOCK_DEFERRED: if (wait) @@ -595,12 +586,13 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_lock *conflock, struct nlm_cookie *cookie) { int error; + int mode; __be32 ret; struct nlm_lockowner *test_owner; dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -613,7 +605,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, /* If there's a conflicting lock, remember to clean up the test lock */ test_owner = (struct nlm_lockowner *)lock->fl.fl_owner; - error = vfs_test_lock(file->f_file, &lock->fl); + mode = lock_to_openmode(&lock->fl); + error = vfs_test_lock(file->f_file[mode], &lock->fl); if (error) { /* We can't currently deal with deferred test requests */ if (error == FILE_LOCK_DEFERRED) @@ -634,7 +627,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, conflock->caller = "somehost"; /* FIXME */ conflock->len = strlen(conflock->caller); conflock->oh.len = 0; /* don't return OH info */ - conflock->svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid; + conflock->svid = lock->fl.fl_pid; conflock->fl.fl_type = lock->fl.fl_type; conflock->fl.fl_start = lock->fl.fl_start; conflock->fl.fl_end = lock->fl.fl_end; @@ -659,11 +652,11 @@ out: __be32 nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { - int error; + int error = 0; dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -672,7 +665,12 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + if (file->f_file[O_RDONLY]) + error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, + &lock->fl, NULL); + if (file->f_file[O_WRONLY]) + error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, + &lock->fl, NULL); return (error < 0)? 
nlm_lck_denied_nolocks : nlm_granted; } @@ -689,10 +687,11 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l { struct nlm_block *block; int status = 0; + int mode; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -704,7 +703,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); if (block != NULL) { - vfs_cancel_lock(block->b_file->f_file, + mode = lock_to_openmode(&lock->fl); + vfs_cancel_lock(block->b_file->f_file[mode], &block->b_call->a_args.lock.fl); status = nlmsvc_unlink_block(block); nlmsvc_release_block(block); @@ -788,9 +788,21 @@ nlmsvc_notify_blocked(struct file_lock *fl) printk(KERN_WARNING "lockd: notification for unknown block!\n"); } +static fl_owner_t nlmsvc_get_owner(fl_owner_t owner) +{ + return nlmsvc_get_lockowner(owner); +} + +static void nlmsvc_put_owner(fl_owner_t owner) +{ + nlmsvc_put_lockowner(owner); +} + const struct lock_manager_operations nlmsvc_lock_operations = { .lm_notify = nlmsvc_notify_blocked, .lm_grant = nlmsvc_grant_deferred, + .lm_get_owner = nlmsvc_get_owner, + .lm_put_owner = nlmsvc_put_owner, }; /* @@ -809,6 +821,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) { struct nlm_file *file = block->b_file; struct nlm_lock *lock = &block->b_call->a_args.lock; + int mode; int error; loff_t fl_start, fl_end; @@ -834,7 +847,8 @@ nlmsvc_grant_blocked(struct nlm_block *block) lock->fl.fl_flags |= FL_SLEEP; fl_start = lock->fl.fl_start; fl_end = lock->fl.fl_end; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + mode = lock_to_openmode(&lock->fl); + error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; lock->fl.fl_start = fl_start; lock->fl.fl_end = fl_end; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 4ae4b63b5392..99696d3f6dd6 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -55,6 +55,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, struct nlm_host *host = NULL; struct nlm_file *file = NULL; struct nlm_lock *lock = &argp->lock; + int mode; __be32 error = 0; /* nfsd callbacks must have been installed for this procedure */ @@ -69,13 +70,14 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call. 
*/ if (filp != NULL) { - error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh)); + error = cast_status(nlm_lookup_file(rqstp, &file, lock)); if (error != 0) goto no_locks; *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_file = file->f_file; + mode = lock_to_openmode(&lock->fl); + lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 028fc152da22..cb3a7512c33e 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) { - struct inode *inode = locks_inode(file->f_file); + struct inode *inode = nlmsvc_file_inode(file); dprintk("lockd: %s %s/%ld\n", msg, inode->i_sb->s_id, inode->i_ino); @@ -71,56 +71,75 @@ static inline unsigned int file_hash(struct nfs_fh *f) return tmp & (FILE_NRHASH - 1); } +int lock_to_openmode(struct file_lock *lock) +{ + return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY; +} + +/* + * Open the file. Note that if we're reexporting, for example, + * this could block the lockd thread for a while. + * + * We have to make sure we have the right credential to open + * the file. + */ +static __be32 nlm_do_fopen(struct svc_rqst *rqstp, + struct nlm_file *file, int mode) +{ + struct file **fp = &file->f_file[mode]; + __be32 nfserr; + + if (*fp) + return 0; + nfserr = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); + if (nfserr) + dprintk("lockd: open failed (error %d)\n", nfserr); + return nfserr; +} + /* * Lookup file info. If it doesn't exist, create a file info struct * and open a (VFS) file for the given inode. - * - * FIXME: - * Note that we open the file O_RDONLY even when creating write locks. - * This is not quite right, but for now, we assume the client performs - * the proper R/W checking. */ __be32 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, - struct nfs_fh *f) + struct nlm_lock *lock) { struct nlm_file *file; unsigned int hash; __be32 nfserr; + int mode; - nlm_debug_print_fh("nlm_lookup_file", f); + nlm_debug_print_fh("nlm_lookup_file", &lock->fh); - hash = file_hash(f); + hash = file_hash(&lock->fh); + mode = lock_to_openmode(&lock->fl); /* Lock file table */ mutex_lock(&nlm_file_mutex); hlist_for_each_entry(file, &nlm_files[hash], f_list) - if (!nfs_compare_fh(&file->f_handle, f)) + if (!nfs_compare_fh(&file->f_handle, &lock->fh)) { + mutex_lock(&file->f_mutex); + nfserr = nlm_do_fopen(rqstp, file, mode); + mutex_unlock(&file->f_mutex); goto found; - - nlm_debug_print_fh("creating file for", f); + } + nlm_debug_print_fh("creating file for", &lock->fh); nfserr = nlm_lck_denied_nolocks; file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) - goto out_unlock; + goto out_free; - memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); + memcpy(&file->f_handle, &lock->fh, sizeof(struct nfs_fh)); mutex_init(&file->f_mutex); INIT_HLIST_NODE(&file->f_list); INIT_LIST_HEAD(&file->f_blocks); - /* Open the file. Note that this must not sleep for too long, else - * we would lock up lockd:-) So no NFS re-exports, folks. - * - * We have to make sure we have the right credential to open - * the file. 
- */ - if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { - dprintk("lockd: open failed (error %d)\n", nfserr); - goto out_free; - } + nfserr = nlm_do_fopen(rqstp, file, mode); + if (nfserr) + goto out_unlock; hlist_add_head(&file->f_list, &nlm_files[hash]); @@ -128,7 +147,6 @@ found: dprintk("lockd: found file %p (count %d)\n", file, file->f_count); *result = file; file->f_count++; - nfserr = 0; out_unlock: mutex_unlock(&nlm_file_mutex); @@ -148,13 +166,34 @@ nlm_delete_file(struct nlm_file *file) nlm_debug_print_file("closing file", file); if (!hlist_unhashed(&file->f_list)) { hlist_del(&file->f_list); - nlmsvc_ops->fclose(file->f_file); + if (file->f_file[O_RDONLY]) + nlmsvc_ops->fclose(file->f_file[O_RDONLY]); + if (file->f_file[O_WRONLY]) + nlmsvc_ops->fclose(file->f_file[O_WRONLY]); kfree(file); } else { printk(KERN_WARNING "lockd: attempt to release unknown file!\n"); } } +static int nlm_unlock_files(struct nlm_file *file) +{ + struct file_lock lock; + struct file *f; + + lock.fl_type = F_UNLCK; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + for (f = file->f_file[0]; f <= file->f_file[1]; f++) { + if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) { + pr_warn("lockd: unlock failure in %s:%d\n", + __FILE__, __LINE__); + return 1; + } + } + return 0; +} + /* * Loop over all locks on the given file and perform the specified * action. @@ -182,17 +221,10 @@ again: lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host; if (match(lockhost, host)) { - struct file_lock lock = *fl; spin_unlock(&flctx->flc_lock); - lock.fl_type = F_UNLCK; - lock.fl_start = 0; - lock.fl_end = OFFSET_MAX; - if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) { - printk("lockd: unlock failure in %s:%d\n", - __FILE__, __LINE__); + if (nlm_unlock_files(file)) return 1; - } goto again; } } @@ -246,6 +278,15 @@ nlm_file_inuse(struct nlm_file *file) return 0; } +static void nlm_close_files(struct nlm_file *file) +{ + struct file *f; + + for (f = file->f_file[0]; f <= file->f_file[1]; f++) + if (f) + nlmsvc_ops->fclose(f); +} + /* * Loop over all files in the file table. */ @@ -276,7 +317,7 @@ nlm_traverse_files(void *data, nlm_host_match_fn_t match, if (list_empty(&file->f_blocks) && !file->f_locks && !file->f_shares && !file->f_count) { hlist_del(&file->f_list); - nlmsvc_ops->fclose(file->f_file); + nlm_close_files(file); kfree(file); } } @@ -410,12 +451,13 @@ nlmsvc_invalidate_all(void) nlm_traverse_files(NULL, nlmsvc_is_client, NULL); } + static int nlmsvc_match_sb(void *datap, struct nlm_file *file) { struct super_block *sb = datap; - return sb == locks_inode(file->f_file)->i_sb; + return sb == nlmsvc_file_inode(file)->i_sb; } /** diff --git a/fs/locks.c b/fs/locks.c index 74b2a1dfe8d8..3d6fb4ae847b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1397,103 +1397,6 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) return error; } -#ifdef CONFIG_MANDATORY_FILE_LOCKING -/** - * locks_mandatory_locked - Check for an active lock - * @file: the file to check - * - * Searches the inode's list of locks to find any POSIX locks which conflict. - * This function is called from locks_verify_locked() only. - */ -int locks_mandatory_locked(struct file *file) -{ - int ret; - struct inode *inode = locks_inode(file); - struct file_lock_context *ctx; - struct file_lock *fl; - - ctx = smp_load_acquire(&inode->i_flctx); - if (!ctx || list_empty_careful(&ctx->flc_posix)) - return 0; - - /* - * Search the lock list for this inode for any POSIX locks. 
- */ - spin_lock(&ctx->flc_lock); - ret = 0; - list_for_each_entry(fl, &ctx->flc_posix, fl_list) { - if (fl->fl_owner != current->files && - fl->fl_owner != file) { - ret = -EAGAIN; - break; - } - } - spin_unlock(&ctx->flc_lock); - return ret; -} - -/** - * locks_mandatory_area - Check for a conflicting lock - * @inode: the file to check - * @filp: how the file was opened (if it was) - * @start: first byte in the file to check - * @end: lastbyte in the file to check - * @type: %F_WRLCK for a write lock, else %F_RDLCK - * - * Searches the inode's list of locks to find any POSIX locks which conflict. - */ -int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, - loff_t end, unsigned char type) -{ - struct file_lock fl; - int error; - bool sleep = false; - - locks_init_lock(&fl); - fl.fl_pid = current->tgid; - fl.fl_file = filp; - fl.fl_flags = FL_POSIX | FL_ACCESS; - if (filp && !(filp->f_flags & O_NONBLOCK)) - sleep = true; - fl.fl_type = type; - fl.fl_start = start; - fl.fl_end = end; - - for (;;) { - if (filp) { - fl.fl_owner = filp; - fl.fl_flags &= ~FL_SLEEP; - error = posix_lock_inode(inode, &fl, NULL); - if (!error) - break; - } - - if (sleep) - fl.fl_flags |= FL_SLEEP; - fl.fl_owner = current->files; - error = posix_lock_inode(inode, &fl, NULL); - if (error != FILE_LOCK_DEFERRED) - break; - error = wait_event_interruptible(fl.fl_wait, - list_empty(&fl.fl_blocked_member)); - if (!error) { - /* - * If we've been sleeping someone might have - * changed the permissions behind our back. - */ - if (__mandatory_lock(inode)) - continue; - } - - break; - } - locks_delete_block(&fl); - - return error; -} -EXPORT_SYMBOL(locks_mandatory_area); -#endif /* CONFIG_MANDATORY_FILE_LOCKING */ - static void lease_clear_pending(struct file_lock *fl, int arg) { switch (arg) { @@ -2486,14 +2389,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; - /* Don't allow mandatory locks on files that may be memory mapped - * and shared. - */ - if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { - error = -EAGAIN; - goto out; - } - error = flock_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2611,21 +2506,12 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct flock64 *flock) { struct file_lock *file_lock = locks_alloc_lock(); - struct inode *inode = locks_inode(filp); struct file *f; int error; if (file_lock == NULL) return -ENOLCK; - /* Don't allow mandatory locks on files that may be memory mapped - * and shared. - */ - if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) { - error = -EAGAIN; - goto out; - } - error = flock64_to_posix_lock(filp, file_lock, flock); if (error) goto out; @@ -2857,8 +2743,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_puts(f, "POSIX "); seq_printf(f, " %s ", - (inode == NULL) ? "*NOINODE*" : - mandatory_lock(inode) ? "MANDATORY" : "ADVISORY "); + (inode == NULL) ? "*NOINODE*" : "ADVISORY "); } else if (IS_FLOCK(fl)) { if (fl->fl_type & LOCK_MAND) { seq_puts(f, "FLOCK MSNFS "); diff --git a/fs/namei.c b/fs/namei.c index bf6d8a738c59..d049d3972695 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -204,6 +204,14 @@ getname_flags(const char __user *filename, int flags, int *empty) } struct filename * +getname_uflags(const char __user *filename, int uflags) +{ + int flags = (uflags & AT_EMPTY_PATH) ? 
LOOKUP_EMPTY : 0; + + return getname_flags(filename, flags, NULL); +} + +struct filename * getname(const char __user * filename) { return getname_flags(filename, 0, NULL); @@ -247,6 +255,9 @@ getname_kernel(const char * filename) void putname(struct filename *name) { + if (IS_ERR_OR_NULL(name)) + return; + BUG_ON(name->refcnt <= 0); if (--name->refcnt > 0) @@ -2456,7 +2467,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path return err; } -int filename_lookup(int dfd, struct filename *name, unsigned flags, +static int __filename_lookup(int dfd, struct filename *name, unsigned flags, struct path *path, struct path *root) { int retval; @@ -2474,6 +2485,14 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags, audit_inode(name, path->dentry, flags & LOOKUP_MOUNTPOINT ? AUDIT_INODE_NOEVAL : 0); restore_nameidata(); + return retval; +} + +int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root) +{ + int retval = __filename_lookup(dfd, name, flags, path, root); + putname(name); return retval; } @@ -2495,7 +2514,7 @@ static int path_parentat(struct nameidata *nd, unsigned flags, return err; } -static struct filename *filename_parentat(int dfd, struct filename *name, +static int __filename_parentat(int dfd, struct filename *name, unsigned int flags, struct path *parent, struct qstr *last, int *type) { @@ -2503,7 +2522,7 @@ static struct filename *filename_parentat(int dfd, struct filename *name, struct nameidata nd; if (IS_ERR(name)) - return name; + return PTR_ERR(name); set_nameidata(&nd, dfd, name, NULL); retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); if (unlikely(retval == -ECHILD)) @@ -2514,29 +2533,34 @@ static struct filename *filename_parentat(int dfd, struct filename *name, *last = nd.last; *type = nd.last_type; audit_inode(name, parent->dentry, AUDIT_INODE_PARENT); - } else { - putname(name); - name = ERR_PTR(retval); } restore_nameidata(); - return name; + return retval; +} + +static int filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type) +{ + int retval = __filename_parentat(dfd, name, flags, parent, last, type); + + putname(name); + return retval; } /* does lookup, returns the object with parent locked */ struct dentry *kern_path_locked(const char *name, struct path *path) { - struct filename *filename; struct dentry *d; struct qstr last; - int type; + int type, error; - filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path, + error = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path, &last, &type); - if (IS_ERR(filename)) - return ERR_CAST(filename); + if (error) + return ERR_PTR(error); if (unlikely(type != LAST_NORM)) { path_put(path); - putname(filename); return ERR_PTR(-EINVAL); } inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); @@ -2545,7 +2569,6 @@ struct dentry *kern_path_locked(const char *name, struct path *path) inode_unlock(path->dentry->d_inode); path_put(path); } - putname(filename); return d; } @@ -2575,8 +2598,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_one_len_common(const char *name, struct dentry *base, - int len, struct qstr *this) +static int lookup_one_common(struct user_namespace *mnt_userns, + const char *name, struct dentry *base, int len, + struct qstr *this) { this->name = name; this->len = len; @@ -2604,7 +2628,7 @@ static int lookup_one_len_common(const char *name, 
struct dentry *base, return err; } - return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC); + return inode_permission(mnt_userns, base->d_inode, MAY_EXEC); } /** @@ -2628,7 +2652,7 @@ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_len_common(name, base, len, &this); + err = lookup_one_common(&init_user_ns, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2655,7 +2679,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_len_common(name, base, len, &this); + err = lookup_one_common(&init_user_ns, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2665,6 +2689,36 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) EXPORT_SYMBOL(lookup_one_len); /** + * lookup_one - filesystem helper to lookup single pathname component + * @mnt_userns: user namespace of the mount the lookup is performed from + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * The caller must hold base->i_mutex. + */ +struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, + struct dentry *base, int len) +{ + struct dentry *dentry; + struct qstr this; + int err; + + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); + + err = lookup_one_common(mnt_userns, name, base, len, &this); + if (err) + return ERR_PTR(err); + + dentry = lookup_dcache(&this, base, 0); + return dentry ? dentry : __lookup_slow(&this, base, 0); +} +EXPORT_SYMBOL(lookup_one); + +/** * lookup_one_len_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from @@ -2683,7 +2737,7 @@ struct dentry *lookup_one_len_unlocked(const char *name, int err; struct dentry *ret; - err = lookup_one_len_common(name, base, len, &this); + err = lookup_one_common(&init_user_ns, name, base, len, &this); if (err) return ERR_PTR(err); @@ -3023,9 +3077,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) /* * Refuse to truncate files with mandatory locks held on them. */ - error = locks_verify_locked(filp); - if (!error) - error = security_path_truncate(path); + error = security_path_truncate(path); if (!error) { error = do_truncate(mnt_userns, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, @@ -3566,7 +3618,7 @@ struct file *do_file_open_root(const struct path *root, return file; } -static struct dentry *filename_create(int dfd, struct filename *name, +static struct dentry *__filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); @@ -3582,9 +3634,9 @@ static struct dentry *filename_create(int dfd, struct filename *name, */ lookup_flags &= LOOKUP_REVAL; - name = filename_parentat(dfd, name, lookup_flags, path, &last, &type); - if (IS_ERR(name)) - return ERR_CAST(name); + error = __filename_parentat(dfd, name, lookup_flags, path, &last, &type); + if (error) + return ERR_PTR(error); /* * Yucky last component or no last component at all? 
@@ -3622,7 +3674,6 @@ static struct dentry *filename_create(int dfd, struct filename *name, error = err2; goto fail; } - putname(name); return dentry; fail: dput(dentry); @@ -3633,10 +3684,18 @@ unlock: mnt_drop_write(path->mnt); out: path_put(path); - putname(name); return dentry; } +static inline struct dentry *filename_create(int dfd, struct filename *name, + struct path *path, unsigned int lookup_flags) +{ + struct dentry *res = __filename_create(dfd, name, path, lookup_flags); + + putname(name); + return res; +} + struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { @@ -3725,7 +3784,7 @@ static int may_mknod(umode_t mode) } } -static long do_mknodat(int dfd, const char __user *filename, umode_t mode, +static int do_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev) { struct user_namespace *mnt_userns; @@ -3736,17 +3795,18 @@ static long do_mknodat(int dfd, const char __user *filename, umode_t mode, error = may_mknod(mode); if (error) - return error; + goto out1; retry: - dentry = user_path_create(dfd, filename, &path, lookup_flags); + dentry = __filename_create(dfd, name, &path, lookup_flags); + error = PTR_ERR(dentry); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + goto out1; if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); error = security_path_mknod(&path, dentry, mode, dev); if (error) - goto out; + goto out2; mnt_userns = mnt_user_ns(path.mnt); switch (mode & S_IFMT) { @@ -3765,24 +3825,26 @@ retry: dentry, mode, 0); break; } -out: +out2: done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } +out1: + putname(name); return error; } SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, unsigned int, dev) { - return do_mknodat(dfd, filename, mode, dev); + return do_mknodat(dfd, getname(filename), mode, dev); } SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { - return do_mknodat(AT_FDCWD, filename, mode, dev); + return do_mknodat(AT_FDCWD, getname(filename), mode, dev); } /** @@ -3827,7 +3889,7 @@ int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, } EXPORT_SYMBOL(vfs_mkdir); -static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode) +int do_mkdirat(int dfd, struct filename *name, umode_t mode) { struct dentry *dentry; struct path path; @@ -3835,9 +3897,10 @@ static long do_mkdirat(int dfd, const char __user *pathname, umode_t mode) unsigned int lookup_flags = LOOKUP_DIRECTORY; retry: - dentry = user_path_create(dfd, pathname, &path, lookup_flags); + dentry = __filename_create(dfd, name, &path, lookup_flags); + error = PTR_ERR(dentry); if (IS_ERR(dentry)) - return PTR_ERR(dentry); + goto out_putname; if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); @@ -3853,17 +3916,19 @@ retry: lookup_flags |= LOOKUP_REVAL; goto retry; } +out_putname: + putname(name); return error; } SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) { - return do_mkdirat(dfd, pathname, mode); + return do_mkdirat(dfd, getname(pathname), mode); } SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { - return do_mkdirat(AT_FDCWD, pathname, mode); + return do_mkdirat(AT_FDCWD, getname(pathname), mode); } /** @@ -3921,62 +3986,62 @@ out: } EXPORT_SYMBOL(vfs_rmdir); -long do_rmdir(int dfd, struct filename *name) +int do_rmdir(int dfd, struct filename *name) { struct user_namespace *mnt_userns; - int error 
= 0; + int error; struct dentry *dentry; struct path path; struct qstr last; int type; unsigned int lookup_flags = 0; retry: - name = filename_parentat(dfd, name, lookup_flags, - &path, &last, &type); - if (IS_ERR(name)) - return PTR_ERR(name); + error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type); + if (error) + goto exit1; switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; - goto exit1; + goto exit2; case LAST_DOT: error = -EINVAL; - goto exit1; + goto exit2; case LAST_ROOT: error = -EBUSY; - goto exit1; + goto exit2; } error = mnt_want_write(path.mnt); if (error) - goto exit1; + goto exit2; inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) - goto exit2; + goto exit3; if (!dentry->d_inode) { error = -ENOENT; - goto exit3; + goto exit4; } error = security_path_rmdir(&path, dentry); if (error) - goto exit3; + goto exit4; mnt_userns = mnt_user_ns(path.mnt); error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry); -exit3: +exit4: dput(dentry); -exit2: +exit3: inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); -exit1: +exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; goto retry; } +exit1: putname(name); return error; } @@ -4059,7 +4124,7 @@ EXPORT_SYMBOL(vfs_unlink); * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ -long do_unlinkat(int dfd, struct filename *name) +int do_unlinkat(int dfd, struct filename *name) { int error; struct dentry *dentry; @@ -4070,17 +4135,17 @@ long do_unlinkat(int dfd, struct filename *name) struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: - name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); - if (IS_ERR(name)) - return PTR_ERR(name); + error = __filename_parentat(dfd, name, lookup_flags, &path, &last, &type); + if (error) + goto exit1; error = -EISDIR; if (type != LAST_NORM) - goto exit1; + goto exit2; error = mnt_want_write(path.mnt); if (error) - goto exit1; + goto exit2; retry_deleg: inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); @@ -4097,11 +4162,11 @@ retry_deleg: ihold(inode); error = security_path_unlink(&path, dentry); if (error) - goto exit2; + goto exit3; mnt_userns = mnt_user_ns(path.mnt); error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, &delegated_inode); -exit2: +exit3: dput(dentry); } inode_unlock(path.dentry->d_inode); @@ -4114,13 +4179,14 @@ exit2: goto retry_deleg; } mnt_drop_write(path.mnt); -exit1: +exit2: path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } +exit1: putname(name); return error; @@ -4131,7 +4197,7 @@ slashes: error = -EISDIR; else error = -ENOTDIR; - goto exit2; + goto exit3; } SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) @@ -4186,23 +4252,22 @@ int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, } EXPORT_SYMBOL(vfs_symlink); -static long do_symlinkat(const char __user *oldname, int newdfd, - const char __user *newname) +int do_symlinkat(struct filename *from, int newdfd, struct filename *to) { int error; - struct filename *from; struct dentry *dentry; struct path path; unsigned int lookup_flags = 0; - from = getname(oldname); - if (IS_ERR(from)) - return PTR_ERR(from); + if (IS_ERR(from)) { + error = PTR_ERR(from); + goto out_putnames; + } 
retry: - dentry = user_path_create(newdfd, newname, &path, lookup_flags); + dentry = __filename_create(newdfd, to, &path, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) - goto out_putname; + goto out_putnames; error = security_path_symlink(&path, dentry, from->name); if (!error) { @@ -4217,7 +4282,8 @@ retry: lookup_flags |= LOOKUP_REVAL; goto retry; } -out_putname: +out_putnames: + putname(to); putname(from); return error; } @@ -4225,12 +4291,12 @@ out_putname: SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, int, newdfd, const char __user *, newname) { - return do_symlinkat(oldname, newdfd, newname); + return do_symlinkat(getname(oldname), newdfd, getname(newname)); } SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { - return do_symlinkat(oldname, AT_FDCWD, newname); + return do_symlinkat(getname(oldname), AT_FDCWD, getname(newname)); } /** @@ -4331,8 +4397,8 @@ EXPORT_SYMBOL(vfs_link); * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ -static int do_linkat(int olddfd, const char __user *oldname, int newdfd, - const char __user *newname, int flags) +int do_linkat(int olddfd, struct filename *old, int newdfd, + struct filename *new, int flags) { struct user_namespace *mnt_userns; struct dentry *new_dentry; @@ -4341,31 +4407,32 @@ static int do_linkat(int olddfd, const char __user *oldname, int newdfd, int how = 0; int error; - if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) - return -EINVAL; + if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0) { + error = -EINVAL; + goto out_putnames; + } /* * To use null names we require CAP_DAC_READ_SEARCH * This ensures that not everyone will be able to create * handlink using the passed filedescriptor. 
*/ - if (flags & AT_EMPTY_PATH) { - if (!capable(CAP_DAC_READ_SEARCH)) - return -ENOENT; - how = LOOKUP_EMPTY; + if (flags & AT_EMPTY_PATH && !capable(CAP_DAC_READ_SEARCH)) { + error = -ENOENT; + goto out_putnames; } if (flags & AT_SYMLINK_FOLLOW) how |= LOOKUP_FOLLOW; retry: - error = user_path_at(olddfd, oldname, how, &old_path); + error = __filename_lookup(olddfd, old, how, &old_path, NULL); if (error) - return error; + goto out_putnames; - new_dentry = user_path_create(newdfd, newname, &new_path, + new_dentry = __filename_create(newdfd, new, &new_path, (how & LOOKUP_REVAL)); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) - goto out; + goto out_putpath; error = -EXDEV; if (old_path.mnt != new_path.mnt) @@ -4393,8 +4460,11 @@ out_dput: how |= LOOKUP_REVAL; goto retry; } -out: +out_putpath: path_put(&old_path); +out_putnames: + putname(old); + putname(new); return error; } @@ -4402,12 +4472,13 @@ out: SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, int, flags) { - return do_linkat(olddfd, oldname, newdfd, newname, flags); + return do_linkat(olddfd, getname_uflags(oldname, flags), + newdfd, getname(newname), flags); } SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { - return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + return do_linkat(AT_FDCWD, getname(oldname), AT_FDCWD, getname(newname), 0); } /** @@ -4602,29 +4673,25 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, int error = -EINVAL; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) - goto put_both; + goto put_names; if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && (flags & RENAME_EXCHANGE)) - goto put_both; + goto put_names; if (flags & RENAME_EXCHANGE) target_flags = 0; retry: - from = filename_parentat(olddfd, from, lookup_flags, &old_path, + error = __filename_parentat(olddfd, from, lookup_flags, &old_path, &old_last, &old_type); - if (IS_ERR(from)) { - error = PTR_ERR(from); - goto put_new; - } + if (error) + goto put_names; - to = filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, + error = __filename_parentat(newdfd, to, lookup_flags, &new_path, &new_last, &new_type); - if (IS_ERR(to)) { - error = PTR_ERR(to); + if (error) goto exit1; - } error = -EXDEV; if (old_path.mnt != new_path.mnt) @@ -4727,12 +4794,9 @@ exit1: lookup_flags |= LOOKUP_REVAL; goto retry; } -put_both: - if (!IS_ERR(from)) - putname(from); -put_new: - if (!IS_ERR(to)) - putname(to); +put_names: + putname(from); + putname(to); return error; } diff --git a/fs/namespace.c b/fs/namespace.c index ab4174a3c802..12852363d90e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1715,18 +1715,14 @@ static inline bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } -#ifdef CONFIG_MANDATORY_FILE_LOCKING -static inline bool may_mandlock(void) +static void warn_mandlock(void) { - return capable(CAP_SYS_ADMIN); + pr_warn_once("=======================================================\n" + "WARNING: The mand mount option has been deprecated and\n" + " is ignored by this kernel. 
Remove the mand\n" + " option from the mount to silence this warning.\n" + "=======================================================\n"); } -#else -static inline bool may_mandlock(void) -{ - pr_warn("VFS: \"mand\" mount option not supported"); - return false; -} -#endif static int can_umount(const struct path *path, int flags) { @@ -1938,6 +1934,20 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } +static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return false; +} + /** * clone_private_mount - create a private clone of a path * @path: path to clone @@ -1953,10 +1963,19 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; + down_read(&namespace_sem); if (IS_MNT_UNBINDABLE(old_mnt)) - return ERR_PTR(-EINVAL); + goto invalid; + + if (!check_mnt(old_mnt)) + goto invalid; + + if (has_locked_children(old_mnt, path->dentry)) + goto invalid; new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); @@ -1964,6 +1983,10 @@ struct vfsmount *clone_private_mount(const struct path *path) new_mnt->mnt_ns = MNT_NS_INTERNAL; return &new_mnt->mnt; + +invalid: + up_read(&namespace_sem); + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -2315,19 +2338,6 @@ static int do_change_type(struct path *path, int ms_flags) return err; } -static bool has_locked_children(struct mount *mnt, struct dentry *dentry) -{ - struct mount *child; - list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { - if (!is_subdir(child->mnt_mountpoint, dentry)) - continue; - - if (child->mnt.mnt_flags & MNT_LOCKED) - return true; - } - return false; -} - static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); @@ -2684,6 +2694,78 @@ out: return ret; } +static int do_set_group(struct path *from_path, struct path *to_path) +{ + struct mount *from, *to; + int err; + + from = real_mount(from_path->mnt); + to = real_mount(to_path->mnt); + + namespace_lock(); + + err = -EINVAL; + /* To and From must be mounted */ + if (!is_mounted(&from->mnt)) + goto out; + if (!is_mounted(&to->mnt)) + goto out; + + err = -EPERM; + /* We should be allowed to modify mount namespaces of both mounts */ + if (!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + /* To and From paths should be mount roots */ + if (from_path->dentry != from_path->mnt->mnt_root) + goto out; + if (to_path->dentry != to_path->mnt->mnt_root) + goto out; + + /* Setting sharing groups is only allowed across same superblock */ + if (from->mnt.mnt_sb != to->mnt.mnt_sb) + goto out; + + /* From mount root should be wider than To mount root */ + if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) + goto out; + + /* From mount should not have locked children in place of To's root */ + if (has_locked_children(from, to->mnt.mnt_root)) + goto out; + + /* Setting sharing groups is only allowed on private mounts */ + if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) + goto out; + + /* From should not be private */ + if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) + goto out; + + if 
(IS_MNT_SLAVE(from)) { + struct mount *m = from->mnt_master; + + list_add(&to->mnt_slave, &m->mnt_slave_list); + to->mnt_master = m; + } + + if (IS_MNT_SHARED(from)) { + to->mnt_group_id = from->mnt_group_id; + list_add(&to->mnt_share, &from->mnt_share); + lock_mount_hash(); + set_mnt_shared(to); + unlock_mount_hash(); + } + + err = 0; +out: + namespace_unlock(); + return err; +} + static int do_move_mount(struct path *old_path, struct path *new_path) { struct mnt_namespace *ns; @@ -3179,8 +3261,8 @@ int path_mount(const char *dev_name, struct path *path, return ret; if (!may_mount()) return -EPERM; - if ((flags & SB_MANDLOCK) && !may_mandlock()) - return -EPERM; + if (flags & SB_MANDLOCK) + warn_mandlock(); /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) @@ -3563,9 +3645,8 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) goto err_unlock; - ret = -EPERM; - if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock()) - goto err_unlock; + if (fc->sb_flags & SB_MANDLOCK) + warn_mandlock(); newmount.mnt = vfs_create_mount(fc); if (IS_ERR(newmount.mnt)) { @@ -3669,7 +3750,10 @@ SYSCALL_DEFINE5(move_mount, if (ret < 0) goto out_to; - ret = do_move_mount(&from_path, &to_path); + if (flags & MOVE_MOUNT_SET_GROUP) + ret = do_set_group(&from_path, &to_path); + else + ret = do_move_mount(&from_path, &to_path); out_to: path_put(&to_path); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 37a1a88df771..d772c20bbfd1 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -180,5 +180,5 @@ const struct export_operations nfs_export_ops = { .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| - EXPORT_OP_NOATOMIC_ATTR, + EXPORT_OP_NOATOMIC_ATTR|EXPORT_OP_SYNC_LOCKS, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 1fef107961bc..aa353fd58240 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -806,9 +806,8 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) nfs_inc_stats(inode, NFSIOS_VFSLOCK); - /* No mandatory locks over NFS */ - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - goto out_err; + if (fl->fl_flags & FL_RECLAIM) + return -ENOGRACE; if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) is_local = 1; diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index c8a192802dda..03a4e679fd99 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -11,7 +11,7 @@ * nfs3acl.c */ #ifdef CONFIG_NFS_V3_ACL -extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); +extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9ec560aa4a50..93de0b58647a 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -44,7 +44,7 @@ static void nfs3_abort_get_acl(struct posix_acl **p) cmpxchg(p, sentinel, ACL_NOT_CACHED); } -struct posix_acl *nfs3_get_acl(struct inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFSACL_MAXPAGES] = { }; @@ -62,6 +62,9 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) }; int status, count; + if (rcu) + return ERR_PTR(-ECHILD); + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) return ERR_PTR(-EOPNOTSUPP); diff --git 
a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 3f5b3d7b62b7..606fa155c28a 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -25,9 +25,11 @@ * Note: we hold the dentry use count while the file is open. */ static __be32 -nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) +nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, + int mode) { __be32 nfserr; + int access; struct svc_fh fh; /* must initialize before using! but maxsize doesn't matter */ @@ -36,7 +38,9 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size); fh.fh_export = NULL; - nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); + access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; + access |= NFSD_MAY_LOCK; + nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp); fh_put(&fh); /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index fa67ecd5fe63..42356416f0a0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2687,9 +2687,9 @@ static void force_expire_client(struct nfs4_client *clp) trace_nfsd_clid_admin_expired(&clp->cl_clientid); - spin_lock(&clp->cl_lock); + spin_lock(&nn->client_lock); clp->cl_time = 0; - spin_unlock(&clp->cl_lock); + spin_unlock(&nn->client_lock); wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0); spin_lock(&nn->client_lock); @@ -5735,16 +5735,6 @@ check_special_stateids(struct net *net, svc_fh *current_fh, stateid_t *stateid, NFS4_SHARE_DENY_READ); } -/* - * Allow READ/WRITE during grace period on recovered state only for files - * that are not able to provide mandatory locking. - */ -static inline int -grace_disallows_io(struct net *net, struct inode *inode) -{ - return opens_in_grace(net) && mandatory_lock(inode); -} - static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session) { /* @@ -6026,7 +6016,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, stateid_t *stateid, int flags, struct nfsd_file **nfp, struct nfs4_stid **cstid) { - struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct nfs4_stid *s = NULL; @@ -6035,9 +6024,6 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, if (nfp) *nfp = NULL; - if (grace_disallows_io(net, ino)) - return nfserr_grace; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { status = check_special_stateids(net, fhp, stateid, flags); goto done; @@ -6835,6 +6821,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; + struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -6856,6 +6843,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } + sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -6901,10 +6889,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!locks_in_grace(net) && lock->lk_reclaim) goto out; + if (lock->lk_reclaim) + fl_flags |= FL_RECLAIM; + fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) && + !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) fl_flags |= FL_SLEEP; fallthrough; case 
NFS4_READ_LT: @@ -6916,7 +6908,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fl_type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) && + !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: @@ -7036,8 +7029,7 @@ out: /* * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, * so we do a temporary open here just to get an open file to pass to - * vfs_test_lock. (Arguably perhaps test_lock should be done with an - * inode operation.) + * vfs_test_lock. */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { @@ -7052,7 +7044,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct NFSD_MAY_READ)); if (err) goto out; + lock->fl_file = nf->nf_file; err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + lock->fl_file = NULL; out: fh_unlock(fhp); nfsd_file_put(nf); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 60d7c59e7935..90fcd6178823 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -881,6 +881,7 @@ nfserrno (int errno) { nfserr_serverfault, -ENFILE }, { nfserr_io, -EUCLEAN }, { nfserr_perm, -ENOKEY }, + { nfserr_no_grace, -ENOGRACE}, }; int i; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index adaec43548d1..538520957a81 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -400,18 +400,16 @@ TRACE_EVENT(nfsd_dirent, TP_STRUCT__entry( __field(u32, fh_hash) __field(u64, ino) - __field(int, len) - __dynamic_array(unsigned char, name, namlen) + __string_len(name, name, namlen) ), TP_fast_assign( __entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0; __entry->ino = ino; - __entry->len = namlen; - memcpy(__get_str(name), name, namlen); + __assign_str_len(name, name, namlen) ), - TP_printk("fh_hash=0x%08x ino=%llu name=%.*s", - __entry->fh_hash, __entry->ino, - __entry->len, __get_str(name)) + TP_printk("fh_hash=0x%08x ino=%llu name=%s", + __entry->fh_hash, __entry->ino, __get_str(name) + ) ) #include "state.h" @@ -608,7 +606,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __array(unsigned char, addr, sizeof(struct sockaddr_in6)) __field(unsigned long, flavor) __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) - __dynamic_array(char, name, clp->cl_name.len + 1) + __string_len(name, name, clp->cl_name.len) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; @@ -618,8 +616,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __entry->flavor = clp->cl_cred.cr_flavor; memcpy(__entry->verifier, (void *)&clp->cl_verifier, NFS4_VERIFIER_SIZE); - memcpy(__get_str(name), clp->cl_name.data, clp->cl_name.len); - __get_str(name)[clp->cl_name.len] = '\0'; + __assign_str_len(name, clp->cl_name.data, clp->cl_name.len); ), TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", __entry->addr, __get_str(name), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index a224a5e23cc1..738d564ca4ce 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -244,7 +244,6 @@ out_nfserr: * returned. Otherwise the covered directory is returned. 
* NOTE: this mountpoint crossing is not supported properly by all * clients and is explicitly disallowed for NFSv3 - * NeilBrown <neilb@cse.unsw.edu.au> */ __be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, @@ -333,7 +332,6 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap) { struct inode *inode = d_inode(fhp->fh_dentry); - int host_err; if (iap->ia_size < inode->i_size) { __be32 err; @@ -343,20 +341,7 @@ nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err) return err; } - - host_err = get_write_access(inode); - if (host_err) - goto out_nfserrno; - - host_err = locks_verify_truncate(inode, NULL, iap->ia_size); - if (host_err) - goto out_put_write_access; - return 0; - -out_put_write_access: - put_write_access(inode); -out_nfserrno: - return nfserrno(host_err); + return nfserrno(get_write_access(inode)); } /* @@ -750,13 +735,6 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, err = nfserr_perm; if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) goto out; - /* - * We must ignore files (but only files) which might have mandatory - * locks on them because there is no way to know if the accesser has - * the lock. - */ - if (S_ISREG((inode)->i_mode) && mandatory_lock(inode)) - goto out; if (!inode->i_fop) goto out; @@ -847,26 +825,16 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct svc_rqst *rqstp = sd->u.data; struct page **pp = rqstp->rq_next_page; struct page *page = buf->page; - size_t size; - - size = sd->len; if (rqstp->rq_res.page_len == 0) { - get_page(page); - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; + svc_rqst_replace_page(rqstp, page); rqstp->rq_res.page_base = buf->offset; - rqstp->rq_res.page_len = size; } else if (page != pp[-1]) { - get_page(page); - if (*rqstp->rq_next_page) - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; - rqstp->rq_res.page_len += size; - } else - rqstp->rq_res.page_len += size; + svc_rqst_replace_page(rqstp, page); + } + rqstp->rq_res.page_len += sd->len; - return size; + return sd->len; } static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4abd928b0bc8..f6b2d280aab5 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1053,7 +1053,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); + sb->s_bdi = bdi_get(sb->s_bdev->bd_disk->bdi); err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 64864fb40b40..6facdf476255 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/fanotify.h> #include <linux/fcntl.h> +#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/anon_inodes.h> @@ -54,22 +55,27 @@ static int fanotify_max_queued_events __read_mostly; #include <linux/sysctl.h> +static long ft_zero = 0; +static long ft_int_max = INT_MAX; + struct ctl_table fanotify_table[] = { { .procname = "max_user_groups", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &ft_zero, + .extra2 = &ft_int_max, 
}, { .procname = "max_user_marks", .data = &init_user_ns.ucount_max[UCOUNT_FANOTIFY_MARKS], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &ft_zero, + .extra2 = &ft_int_max, }, { .procname = "max_queued_events", @@ -104,8 +110,10 @@ struct kmem_cache *fanotify_path_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; #define FANOTIFY_EVENT_ALIGN 4 -#define FANOTIFY_INFO_HDR_LEN \ +#define FANOTIFY_FID_INFO_HDR_LEN \ (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) +#define FANOTIFY_PIDFD_INFO_HDR_LEN \ + sizeof(struct fanotify_event_info_pidfd) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -114,10 +122,11 @@ static int fanotify_fid_info_len(int fh_len, int name_len) if (name_len) info_len += name_len + 1; - return roundup(FANOTIFY_INFO_HDR_LEN + info_len, FANOTIFY_EVENT_ALIGN); + return roundup(FANOTIFY_FID_INFO_HDR_LEN + info_len, + FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(unsigned int fid_mode, +static int fanotify_event_info_len(unsigned int info_mode, struct fanotify_event *event) { struct fanotify_info *info = fanotify_event_info(event); @@ -128,7 +137,8 @@ static int fanotify_event_info_len(unsigned int fid_mode, if (dir_fh_len) { info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); - } else if ((fid_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { + } else if ((info_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { /* * With group flag FAN_REPORT_NAME, if name was not recorded in * event on a directory, we will report the name ".". @@ -136,6 +146,9 @@ static int fanotify_event_info_len(unsigned int fid_mode, dot_len = 1; } + if (info_mode & FAN_REPORT_PIDFD) + info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; + if (fh_len) info_len += fanotify_fid_info_len(fh_len, dot_len); @@ -171,7 +184,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t event_size = FAN_EVENT_METADATA_LEN; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -181,8 +194,8 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, goto out; event = FANOTIFY_E(fsn_event); - if (fid_mode) - event_size += fanotify_event_info_len(fid_mode, event); + if (info_mode) + event_size += fanotify_event_info_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); @@ -303,9 +316,10 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } -static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, - int info_type, const char *name, size_t name_len, - char __user *buf, size_t count) +static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, + int info_type, const char *name, + size_t name_len, + char __user *buf, size_t count) { struct fanotify_event_info_fid info = { }; struct file_handle handle = { }; @@ -398,6 +412,117 @@ static int copy_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, return info_len; } +static int copy_pidfd_info_to_user(int pidfd, + char __user *buf, + size_t count) +{ + struct fanotify_event_info_pidfd info = { }; + size_t info_len = FANOTIFY_PIDFD_INFO_HDR_LEN; + + if (WARN_ON_ONCE(info_len > 
count)) + return -EFAULT; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_PIDFD; + info.hdr.len = info_len; + info.pidfd = pidfd; + + if (copy_to_user(buf, &info, info_len)) + return -EFAULT; + + return info_len; +} + +static int copy_info_records_to_user(struct fanotify_event *event, + struct fanotify_info *info, + unsigned int info_mode, int pidfd, + char __user *buf, size_t count) +{ + int ret, total_bytes = 0, info_type = 0; + unsigned int fid_mode = info_mode & FANOTIFY_FID_BITS; + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; + + /* + * Event info records order is as follows: dir fid + name, child fid. + */ + if (fanotify_event_dir_fh_len(event)) { + info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : + FAN_EVENT_INFO_TYPE_DFID; + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_info_dir_fh(info), + info_type, + fanotify_info_name(info), + info->name_len, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (fanotify_event_object_fh_len(event)) { + const char *dot = NULL; + int dot_len = 0; + + if (fid_mode == FAN_REPORT_FID || info_type) { + /* + * With only group flag FAN_REPORT_FID only type FID is + * reported. Second info record type is always FID. + */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } else if ((fid_mode & FAN_REPORT_NAME) && + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_NAME, if name was not + * recorded in an event on a directory, report the name + * "." with info type DFID_NAME. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; + dot = "."; + dot_len = 1; + } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || + (event->mask & FAN_ONDIR)) { + /* + * With group flag FAN_REPORT_DIR_FID, a single info + * record has type DFID for directory entry modification + * event and for event on a directory. + */ + info_type = FAN_EVENT_INFO_TYPE_DFID; + } else { + /* + * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, + * a single info record has type FID for event on a + * non-directory, when there is no directory to report. + * For example, on FAN_DELETE_SELF event. 
+ */ + info_type = FAN_EVENT_INFO_TYPE_FID; + } + + ret = copy_fid_info_to_user(fanotify_event_fsid(event), + fanotify_event_object_fh(event), + info_type, dot, dot_len, + buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + if (pidfd_mode) { + ret = copy_pidfd_info_to_user(pidfd, buf, count); + if (ret < 0) + return ret; + + buf += ret; + count -= ret; + total_bytes += ret; + } + + return total_bytes; +} + static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event *event, char __user *buf, size_t count) @@ -405,15 +530,15 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event_metadata metadata; struct path *path = fanotify_event_path(event); struct fanotify_info *info = fanotify_event_info(event); - unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); + unsigned int pidfd_mode = info_mode & FAN_REPORT_PIDFD; struct file *f = NULL; - int ret, fd = FAN_NOFD; - int info_type = 0; + int ret, pidfd = FAN_NOPIDFD, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, event); metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(fid_mode, event); + fanotify_event_info_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; @@ -442,6 +567,33 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, } metadata.fd = fd; + if (pidfd_mode) { + /* + * Complain if the FAN_REPORT_PIDFD and FAN_REPORT_TID mutual + * exclusion is ever lifted. At the time of incorporating pidfd + * support within fanotify, the pidfd API only supported the + * creation of pidfds for thread-group leaders. + */ + WARN_ON_ONCE(FAN_GROUP_FLAG(group, FAN_REPORT_TID)); + + /* + * The PIDTYPE_TGID check for an event->pid is performed + * preemptively in an attempt to catch out cases where the event + * listener reads events after the event generating process has + * already terminated. Report FAN_NOPIDFD to the event listener + * in those cases, with all other pidfd creation errors being + * reported as FAN_EPIDFD. + */ + if (metadata.pid == 0 || + !pid_has_task(event->pid, PIDTYPE_TGID)) { + pidfd = FAN_NOPIDFD; + } else { + pidfd = pidfd_create(event->pid, 0); + if (pidfd < 0) + pidfd = FAN_EPIDFD; + } + } + ret = -EFAULT; /* * Sanity check copy size in case get_one_event() and @@ -462,67 +614,11 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (f) fd_install(fd, f); - /* Event info records order is: dir fid + name, child fid */ - if (fanotify_event_dir_fh_len(event)) { - info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : - FAN_EVENT_INFO_TYPE_DFID; - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_info_dir_fh(info), - info_type, fanotify_info_name(info), - info->name_len, buf, count); + if (info_mode) { + ret = copy_info_records_to_user(event, info, info_mode, pidfd, + buf, count); if (ret < 0) goto out_close_fd; - - buf += ret; - count -= ret; - } - - if (fanotify_event_object_fh_len(event)) { - const char *dot = NULL; - int dot_len = 0; - - if (fid_mode == FAN_REPORT_FID || info_type) { - /* - * With only group flag FAN_REPORT_FID only type FID is - * reported. Second info record type is always FID. 
- */ - info_type = FAN_EVENT_INFO_TYPE_FID; - } else if ((fid_mode & FAN_REPORT_NAME) && - (event->mask & FAN_ONDIR)) { - /* - * With group flag FAN_REPORT_NAME, if name was not - * recorded in an event on a directory, report the - * name "." with info type DFID_NAME. - */ - info_type = FAN_EVENT_INFO_TYPE_DFID_NAME; - dot = "."; - dot_len = 1; - } else if ((event->mask & ALL_FSNOTIFY_DIRENT_EVENTS) || - (event->mask & FAN_ONDIR)) { - /* - * With group flag FAN_REPORT_DIR_FID, a single info - * record has type DFID for directory entry modification - * event and for event on a directory. - */ - info_type = FAN_EVENT_INFO_TYPE_DFID; - } else { - /* - * With group flags FAN_REPORT_DIR_FID|FAN_REPORT_FID, - * a single info record has type FID for event on a - * non-directory, when there is no directory to report. - * For example, on FAN_DELETE_SELF event. - */ - info_type = FAN_EVENT_INFO_TYPE_FID; - } - - ret = copy_info_to_user(fanotify_event_fsid(event), - fanotify_event_object_fh(event), - info_type, dot, dot_len, buf, count); - if (ret < 0) - goto out_close_fd; - - buf += ret; - count -= ret; } return metadata.event_len; @@ -532,6 +628,10 @@ out_close_fd: put_unused_fd(fd); fput(f); } + + if (pidfd >= 0) + close_fd(pidfd); + return ret; } @@ -1077,6 +1177,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) #endif return -EINVAL; + /* + * A pidfd can only be returned for a thread-group leader; thus + * FAN_REPORT_PIDFD and FAN_REPORT_TID need to remain mutually + * exclusive. + */ + if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) + return -EINVAL; + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1478,7 +1586,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 10); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 11); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 9); fanotify_mark_cache = KMEM_CACHE(fsnotify_mark, diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 30d422b8c0fc..963e6ce75b96 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -87,15 +87,15 @@ static void fsnotify_unmount_inodes(struct super_block *sb) if (iput_inode) iput(iput_inode); - /* Wait for outstanding inode references from connectors */ - wait_var_event(&sb->s_fsnotify_inode_refs, - !atomic_long_read(&sb->s_fsnotify_inode_refs)); } void fsnotify_sb_delete(struct super_block *sb) { fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); + /* Wait for outstanding object references from connectors */ + wait_var_event(&sb->s_fsnotify_connectors, + !atomic_long_read(&sb->s_fsnotify_connectors)); } /* diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index ff2063ec6b0f..87d8a50ee803 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -27,6 +27,21 @@ static inline struct super_block *fsnotify_conn_sb( return container_of(conn->obj, struct super_block, s_fsnotify_marks); } +static inline struct super_block *fsnotify_connector_sb( + struct fsnotify_mark_connector *conn) +{ + switch (conn->type) { + case FSNOTIFY_OBJ_TYPE_INODE: + return fsnotify_conn_inode(conn)->i_sb; + case FSNOTIFY_OBJ_TYPE_VFSMOUNT: + return fsnotify_conn_mount(conn)->mnt.mnt_sb; + case FSNOTIFY_OBJ_TYPE_SB: + return fsnotify_conn_sb(conn); + default: + return NULL; + } +} + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct 
fsnotify_group *group); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 98f61b31745a..62051247f6d2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -55,22 +55,27 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #include <linux/sysctl.h> +static long it_zero = 0; +static long it_int_max = INT_MAX; + struct ctl_table inotify_table[] = { { .procname = "max_user_instances", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_user_watches", .data = &init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES], - .maxlen = sizeof(int), + .maxlen = sizeof(long), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &it_zero, + .extra2 = &it_int_max, }, { .procname = "max_queued_events", diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d32ab349db74..95006d1d29ab 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -169,6 +169,37 @@ static void fsnotify_connector_destroy_workfn(struct work_struct *work) } } +static void fsnotify_get_inode_ref(struct inode *inode) +{ + ihold(inode); + atomic_long_inc(&inode->i_sb->s_fsnotify_connectors); +} + +static void fsnotify_put_inode_ref(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + iput(inode); + if (atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) + wake_up_var(&sb->s_fsnotify_connectors); +} + +static void fsnotify_get_sb_connectors(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb) + atomic_long_inc(&sb->s_fsnotify_connectors); +} + +static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn) +{ + struct super_block *sb = fsnotify_connector_sb(conn); + + if (sb && atomic_long_dec_and_test(&sb->s_fsnotify_connectors)) + wake_up_var(&sb->s_fsnotify_connectors); +} + static void *fsnotify_detach_connector_from_object( struct fsnotify_mark_connector *conn, unsigned int *type) @@ -182,13 +213,13 @@ static void *fsnotify_detach_connector_from_object( if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { inode = fsnotify_conn_inode(conn); inode->i_fsnotify_mask = 0; - atomic_long_inc(&inode->i_sb->s_fsnotify_inode_refs); } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; } + fsnotify_put_sb_connectors(conn); rcu_assign_pointer(*(conn->obj), NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; @@ -209,19 +240,12 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) /* Drop object reference originally held by a connector */ static void fsnotify_drop_object(unsigned int type, void *objp) { - struct inode *inode; - struct super_block *sb; - if (!objp) return; /* Currently only inode references are passed to be dropped */ if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE)) return; - inode = objp; - sb = inode->i_sb; - iput(inode); - if (atomic_long_dec_and_test(&sb->s_fsnotify_inode_refs)) - wake_up_var(&sb->s_fsnotify_inode_refs); + fsnotify_put_inode_ref(objp); } void fsnotify_put_mark(struct fsnotify_mark *mark) @@ -493,8 +517,12 @@ static int 
fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, conn->fsid.val[0] = conn->fsid.val[1] = 0; conn->flags = 0; } - if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) - inode = igrab(fsnotify_conn_inode(conn)); + if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) { + inode = fsnotify_conn_inode(conn); + fsnotify_get_inode_ref(inode); + } + fsnotify_get_sb_connectors(conn); + /* * cmpxchg() provides the barrier so that readers of *connp can see * only initialized structure @@ -502,7 +530,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, if (cmpxchg(connp, NULL, conn)) { /* Someone else created list structure for us */ if (inode) - iput(inode); + fsnotify_put_inode_ref(inode); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 5c72a7e6d6c5..23a72a423955 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -289,7 +289,7 @@ unlock: return status; } -struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu) { struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; @@ -297,6 +297,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) int had_lock; struct ocfs2_lock_holder oh; + if (rcu) + return ERR_PTR(-ECHILD); + osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index f59d8d0a61fa..95a57c888ab6 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -16,7 +16,7 @@ struct ocfs2_acl_entry { __le32 e_id; }; -struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index fab7c6a4a7d0..73a3854b2afb 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -101,8 +101,6 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if (__mandatory_lock(inode)) - return -ENOLCK; if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || ocfs2_mount_local(osb)) @@ -121,8 +119,6 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) - return -ENOLCK; return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); } diff --git a/fs/open.c b/fs/open.c index 94bef26ff1b6..daa324606a41 100644 --- a/fs/open.c +++ b/fs/open.c @@ -105,9 +105,7 @@ long vfs_truncate(const struct path *path, loff_t length) if (error) goto put_write_and_out; - error = locks_verify_truncate(inode, NULL, length); - if (!error) - error = security_path_truncate(path); + error = security_path_truncate(path); if (!error) error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); @@ -189,9 +187,7 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); - error = locks_verify_truncate(inode, f.file, length); - if (!error) - error = security_path_truncate(&f.file->f_path); + error = security_path_truncate(&f.file->f_path); if (!error) error = do_truncate(file_mnt_user_ns(f.file), dentry, length, ATTR_MTIME | ATTR_CTIME, f.file); diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 
18852b9ed82b..605e5a3506ec 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -10,12 +10,15 @@ #include "orangefs-bufmap.h" #include <linux/posix_acl_xattr.h> -struct posix_acl *orangefs_get_acl(struct inode *inode, int type) +struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu) { struct posix_acl *acl; int ret; char *key = NULL, *value = NULL; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: key = XATTR_NAME_POSIX_ACL_ACCESS; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 16ac617df7d7..c1bb4c4b5d67 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -882,12 +882,7 @@ int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, if (!(request_mask & STATX_SIZE)) stat->result_mask &= ~STATX_SIZE; - stat->attributes_mask = STATX_ATTR_IMMUTABLE | - STATX_ATTR_APPEND; - if (inode->i_flags & S_IMMUTABLE) - stat->attributes |= STATX_ATTR_IMMUTABLE; - if (inode->i_flags & S_APPEND) - stat->attributes |= STATX_ATTR_APPEND; + generic_fill_statx_attr(inode, stat); } return ret; } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 0e6b97682e41..b5940ec1836a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -106,7 +106,7 @@ enum orangefs_vfs_op_states { extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; -extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type); +extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); extern int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 2846b943e80c..4e7d5bfa2949 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/fileattr.h> #include <linux/splice.h> #include <linux/xattr.h> #include <linux/security.h> @@ -62,7 +63,7 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, return list_size; } - buf = kzalloc(list_size, GFP_KERNEL); + buf = kvzalloc(list_size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -105,11 +106,12 @@ retry: if (size > value_size) { void *new; - new = krealloc(value, size, GFP_KERNEL); + new = kvmalloc(size, GFP_KERNEL); if (!new) { error = -ENOMEM; break; } + kvfree(value); value = new; value_size = size; goto retry; @@ -124,12 +126,50 @@ retry: error = 0; } } - kfree(value); + kvfree(value); out: - kfree(buf); + kvfree(buf); return error; } +static int ovl_copy_fileattr(struct inode *inode, struct path *old, + struct path *new) +{ + struct fileattr oldfa = { .flags_valid = true }; + struct fileattr newfa = { .flags_valid = true }; + int err; + + err = ovl_real_fileattr_get(old, &oldfa); + if (err) + return err; + + err = ovl_real_fileattr_get(new, &newfa); + if (err) + return err; + + /* + * We cannot set immutable and append-only flags on upper inode, + * because we would not be able to link upper inode to upper dir + * nor set overlay private xattr on upper inode. + * Store these flags in overlay.protattr xattr instead. 
+ */ + if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) { + err = ovl_set_protattr(inode, new->dentry, &oldfa); + if (err) + return err; + } + + BUILD_BUG_ON(OVL_COPY_FS_FLAGS_MASK & ~FS_COMMON_FL); + newfa.flags &= ~OVL_COPY_FS_FLAGS_MASK; + newfa.flags |= (oldfa.flags & OVL_COPY_FS_FLAGS_MASK); + + BUILD_BUG_ON(OVL_COPY_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON); + newfa.fsx_xflags &= ~OVL_COPY_FSX_FLAGS_MASK; + newfa.fsx_xflags |= (oldfa.fsx_xflags & OVL_COPY_FSX_FLAGS_MASK); + + return ovl_real_fileattr_set(new, &newfa); +} + static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, struct path *new, loff_t len) { @@ -331,8 +371,8 @@ out_err: return ERR_PTR(err); } -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry, - struct dentry *lower, struct dentry *upper) +int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, + struct dentry *upper) { const struct ovl_fh *fh = NULL; int err; @@ -351,7 +391,7 @@ int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry, /* * Do not fail when upper doesn't support xattrs. */ - err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh->buf, + err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf, fh ? fh->fb.len : 0, 0); kfree(fh); @@ -493,20 +533,21 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct inode *inode = d_inode(c->dentry); + struct path upperpath, datapath; int err; + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry != NULL)) + return -EIO; + + upperpath.dentry = temp; + /* * Copy up data first and then xattrs. Writing data after * xattrs will remove security.capability xattr automatically. */ if (S_ISREG(c->stat.mode) && !c->metacopy) { - struct path upperpath, datapath; - - ovl_path_upper(c->dentry, &upperpath); - if (WARN_ON(upperpath.dentry != NULL)) - return -EIO; - upperpath.dentry = temp; - ovl_path_lowerdata(c->dentry, &datapath); err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size); @@ -518,6 +559,16 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) if (err) return err; + if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) { + /* + * Copy the fileattr inode flags that are the source of already + * copied i_flags + */ + err = ovl_copy_fileattr(inode, &c->lowerpath, &upperpath); + if (err) + return err; + } + /* * Store identifier of lower inode in upper inode xattr to * allow lookup of the copy up origin inode. @@ -526,13 +577,13 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) * hard link. 
*/ if (c->origin) { - err = ovl_set_origin(ofs, c->dentry, c->lowerpath.dentry, temp); + err = ovl_set_origin(ofs, c->lowerpath.dentry, temp); if (err) return err; } if (c->metacopy) { - err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY, + err = ovl_check_setxattr(ofs, temp, OVL_XATTR_METACOPY, NULL, 0, -EOPNOTSUPP); if (err) return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 93efe7048a77..1fefb2b8960e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -233,9 +233,10 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr) static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, int xerr) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); int err; - err = ovl_check_setxattr(dentry, upper, OVL_XATTR_OPAQUE, "y", 1, xerr); + err = ovl_check_setxattr(ofs, upper, OVL_XATTR_OPAQUE, "y", 1, xerr); if (!err) ovl_dentry_set_opaque(dentry); @@ -320,6 +321,7 @@ static bool ovl_type_origin(struct dentry *dentry) static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct ovl_cattr *attr) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *udir = upperdir->d_inode; struct dentry *newdentry; @@ -338,7 +340,8 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, if (IS_ERR(newdentry)) goto out_unlock; - if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) { + if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) && + !ovl_allow_offline_changes(ofs)) { /* Setting opaque here is just an optimization, allow to fail */ ovl_set_opaque(dentry, newdentry); } @@ -542,8 +545,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, goto out_cleanup; } err = ovl_instantiate(dentry, inode, newdentry, hardlink); - if (err) - goto out_cleanup; + if (err) { + ovl_cleanup(udir, newdentry); + dput(newdentry); + } out_dput: dput(upper); out_unlock: @@ -1043,6 +1048,7 @@ static bool ovl_need_absolute_redirect(struct dentry *dentry, bool samedir) static int ovl_set_redirect(struct dentry *dentry, bool samedir) { int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); const char *redirect = ovl_dentry_get_redirect(dentry); bool absolute_redirect = ovl_need_absolute_redirect(dentry, samedir); @@ -1053,7 +1059,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) if (IS_ERR(redirect)) return PTR_ERR(redirect); - err = ovl_check_setxattr(dentry, ovl_dentry_upper(dentry), + err = ovl_check_setxattr(ofs, ovl_dentry_upper(dentry), OVL_XATTR_REDIRECT, redirect, strlen(redirect), -EXDEV); if (!err) { diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 41ebf52f1bbc..ebde05c9cf62 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -392,6 +392,7 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, */ take_dentry_name_snapshot(&name, real); this = lookup_one_len(name.name.name, connected, name.name.len); + release_dentry_name_snapshot(&name); err = PTR_ERR(this); if (IS_ERR(this)) { goto fail; @@ -406,7 +407,6 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, } out: - release_dentry_name_snapshot(&name); dput(parent); inode_unlock(dir); return this; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 4d53d3b7e5fe..d081faa55e83 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -392,6 +392,51 @@ out_unlock: return ret; } +/* + * Calling iter_file_splice_write() directly from overlay's f_op may deadlock + * 
due to lock order inversion between pipe->mutex in iter_file_splice_write() + * and file_start_write(real.file) in ovl_write_iter(). + * + * So do everything ovl_write_iter() does and call iter_file_splice_write() on + * the real file. + */ +static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct fd real; + const struct cred *old_cred; + struct inode *inode = file_inode(out); + struct inode *realinode = ovl_inode_real(inode); + ssize_t ret; + + inode_lock(inode); + /* Update mode */ + ovl_copyattr(realinode, inode); + ret = file_remove_privs(out); + if (ret) + goto out_unlock; + + ret = ovl_real_fdget(out, &real); + if (ret) + goto out_unlock; + + old_cred = ovl_override_creds(inode->i_sb); + file_start_write(real.file); + + ret = iter_file_splice_write(pipe, real.file, ppos, len, flags); + + file_end_write(real.file); + /* Update size */ + ovl_copyattr(realinode, inode); + revert_creds(old_cred); + fdput(real); + +out_unlock: + inode_unlock(inode); + + return ret; +} + static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct fd real; @@ -603,7 +648,7 @@ const struct file_operations ovl_file_operations = { .fadvise = ovl_fadvise, .flush = ovl_flush, .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, + .splice_write = ovl_splice_write, .copy_file_range = ovl_copy_file_range, .remap_file_range = ovl_remap_file_range, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 5e828a1c98a8..832b17589733 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -13,6 +13,7 @@ #include <linux/fiemap.h> #include <linux/fileattr.h> #include <linux/security.h> +#include <linux/namei.h> #include "overlayfs.h" @@ -33,12 +34,6 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto out; if (attr->ia_valid & ATTR_SIZE) { - struct inode *realinode = d_inode(ovl_dentry_real(dentry)); - - err = -ETXTBSY; - if (atomic_read(&realinode->i_writecount) < 0) - goto out_drop_write; - /* Truncate should trigger data copy up as well */ full_copy_up = true; } @@ -162,7 +157,8 @@ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, enum ovl_path_type type; struct path realpath; const struct cred *old_cred; - bool is_dir = S_ISDIR(dentry->d_inode->i_mode); + struct inode *inode = d_inode(dentry); + bool is_dir = S_ISDIR(inode->i_mode); int fsid = 0; int err; bool metacopy_blocks = false; @@ -175,6 +171,9 @@ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, if (err) goto out; + /* Report the effective immutable/append-only STATX flags */ + generic_fill_statx_attr(inode, stat); + /* * For non-dir or same fs, we use st_ino of the copy up origin. * This guaranties constant st_dev/st_ino across copy up. 
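The ovl_splice_write() hunk above is reached whenever data is spliced from a pipe into a file on an overlay mount, because that path goes through f_op->splice_write rather than ->write_iter. A minimal userspace sketch that exercises it — illustrative only, assuming /mnt/overlay is an overlayfs mount, with error handling abbreviated:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int pfd[2];
	int out = open("/mnt/overlay/out.txt",
		       O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (out < 0 || pipe(pfd) < 0) {
		perror("setup");
		return 1;
	}

	if (write(pfd[1], "hello\n", 6) != 6)
		return 1;
	close(pfd[1]);

	/*
	 * splice() from a pipe invokes f_op->splice_write on the output
	 * file, i.e. the new ovl_splice_write(), which takes the locks in
	 * a safe order before delegating to iter_file_splice_write() on
	 * the real underlying file.
	 */
	if (splice(pfd[0], NULL, out, NULL, 6, 0) < 0)
		perror("splice");

	close(pfd[0]);
	close(out);
	return 0;
}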
@@ -448,7 +447,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; } -struct posix_acl *ovl_get_acl(struct inode *inode, int type) +struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) { struct inode *realinode = ovl_inode_real(inode); const struct cred *old_cred; @@ -457,6 +456,9 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type) if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; + if (rcu) + return get_cached_acl_rcu(realinode, type); + old_cred = ovl_override_creds(inode->i_sb); acl = get_acl(realinode, type); revert_creds(old_cred); @@ -503,16 +505,14 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Introducing security_inode_fileattr_get/set() hooks would solve this issue * properly. */ -static int ovl_security_fileattr(struct dentry *dentry, struct fileattr *fa, +static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa, bool set) { - struct path realpath; struct file *file; unsigned int cmd; int err; - ovl_path_real(dentry, &realpath); - file = dentry_open(&realpath, O_RDONLY, current_cred()); + file = dentry_open(realpath, O_RDONLY, current_cred()); if (IS_ERR(file)) return PTR_ERR(file); @@ -527,12 +527,24 @@ static int ovl_security_fileattr(struct dentry *dentry, struct fileattr *fa, return err; } +int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa) +{ + int err; + + err = ovl_security_fileattr(realpath, fa, true); + if (err) + return err; + + return vfs_fileattr_set(&init_user_ns, realpath->dentry, fa); +} + int ovl_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); - struct dentry *upperdentry; + struct path upperpath; const struct cred *old_cred; + unsigned int flags; int err; err = ovl_want_write(dentry); @@ -541,31 +553,78 @@ int ovl_fileattr_set(struct user_namespace *mnt_userns, err = ovl_copy_up(dentry); if (!err) { - upperdentry = ovl_dentry_upper(dentry); + ovl_path_real(dentry, &upperpath); old_cred = ovl_override_creds(inode->i_sb); - err = ovl_security_fileattr(dentry, fa, true); + /* + * Store immutable/append-only flags in xattr and clear them + * in upper fileattr (in case they were set by older kernel) + * so children of "ovl-immutable" directories lower aliases of + * "ovl-immutable" hardlinks could be copied up. + * Clear xattr when flags are cleared. 
+ */ + err = ovl_set_protattr(inode, upperpath.dentry, fa); if (!err) - err = vfs_fileattr_set(&init_user_ns, upperdentry, fa); + err = ovl_real_fileattr_set(&upperpath, fa); revert_creds(old_cred); - ovl_copyflags(ovl_inode_real(inode), inode); + + /* + * Merge real inode flags with inode flags read from + * overlay.protattr xattr + */ + flags = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK; + + BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK); + flags |= inode->i_flags & OVL_PROT_I_FLAGS_MASK; + inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK); + + /* Update ctime */ + ovl_copyattr(ovl_inode_real(inode), inode); } ovl_drop_write(dentry); out: return err; } +/* Convert inode protection flags to fileattr flags */ +static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa) +{ + BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL); + BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON); + + if (inode->i_flags & S_APPEND) { + fa->flags |= FS_APPEND_FL; + fa->fsx_xflags |= FS_XFLAG_APPEND; + } + if (inode->i_flags & S_IMMUTABLE) { + fa->flags |= FS_IMMUTABLE_FL; + fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; + } +} + +int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa) +{ + int err; + + err = ovl_security_fileattr(realpath, fa, false); + if (err) + return err; + + return vfs_fileattr_get(realpath->dentry, fa); +} + int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); - struct dentry *realdentry = ovl_dentry_real(dentry); + struct path realpath; const struct cred *old_cred; int err; + ovl_path_real(dentry, &realpath); + old_cred = ovl_override_creds(inode->i_sb); - err = ovl_security_fileattr(dentry, fa, false); - if (!err) - err = vfs_fileattr_get(realdentry, fa); + err = ovl_real_fileattr_get(&realpath, fa); + ovl_fileattr_prot_flags(inode, fa); revert_creds(old_cred); return err; @@ -1118,6 +1177,10 @@ struct inode *ovl_get_inode(struct super_block *sb, } } + /* Check for immutable/append-only inode flags in xattr */ + if (upperdentry) + ovl_check_protattr(inode, upperdentry); + if (inode->i_state & I_NEW) unlock_new_inode(inode); out: diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 210cd6f66e28..1a9b515fc45d 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -392,7 +392,7 @@ invalid: upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); dput(origin); - return -EIO; + return -ESTALE; } static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, @@ -811,7 +811,7 @@ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, if (err) return err; - err = ovl_set_origin(ofs, dentry, lower, upper); + err = ovl_set_origin(ofs, lower, upper); if (!err) err = ovl_set_impure(dentry->d_parent, upper->d_parent); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6ec73db4bf9e..3894f3347955 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -34,6 +34,7 @@ enum ovl_xattr { OVL_XATTR_NLINK, OVL_XATTR_UPPER, OVL_XATTR_METACOPY, + OVL_XATTR_PROTATTR, }; enum ovl_inode_flag { @@ -262,6 +263,18 @@ static inline bool ovl_open_flags_need_copy_up(int flags) return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } +static inline bool ovl_allow_offline_changes(struct ovl_fs *ofs) +{ + /* + * To avoid regressions in existing setups with overlay lower offline + * changes, we allow lower changes only if none of the new features + * are used. 
+ */ + return (!ofs->config.index && !ofs->config.metacopy && + !ofs->config.redirect_dir && ofs->config.xino != OVL_XINO_ON); +} + + /* util.c */ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); @@ -320,7 +333,7 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags); bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry); bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, enum ovl_xattr ox); -int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, +int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr); int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); @@ -485,7 +498,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); -struct posix_acl *ovl_get_acl(struct inode *inode, int type); +struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu); int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); bool ovl_is_private_xattr(struct super_block *sb, const char *name); @@ -518,9 +531,28 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) i_size_write(to, i_size_read(from)); } +/* vfs inode flags copied from real to ovl inode */ +#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE) +/* vfs inode flags read from overlay.protattr xattr to ovl inode */ +#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE) + +/* + * fileattr flags copied from lower to upper inode on copy up. + * We cannot copy up immutable/append-only flags, because that would prevent + * linking temp inode to upper dir, so we store them in xattr instead. 
+ */ +#define OVL_COPY_FS_FLAGS_MASK (FS_SYNC_FL | FS_NOATIME_FL) +#define OVL_COPY_FSX_FLAGS_MASK (FS_XFLAG_SYNC | FS_XFLAG_NOATIME) +#define OVL_PROT_FS_FLAGS_MASK (FS_APPEND_FL | FS_IMMUTABLE_FL) +#define OVL_PROT_FSX_FLAGS_MASK (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE) + +void ovl_check_protattr(struct inode *inode, struct dentry *upper); +int ovl_set_protattr(struct inode *inode, struct dentry *upper, + struct fileattr *fa); + static inline void ovl_copyflags(struct inode *from, struct inode *to) { - unsigned int mask = S_SYNC | S_IMMUTABLE | S_APPEND | S_NOATIME; + unsigned int mask = OVL_COPY_I_FLAGS_MASK; inode_set_flags(to, from->i_flags & mask, mask); } @@ -548,6 +580,8 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); extern const struct file_operations ovl_file_operations; int __init ovl_aio_request_cache_init(void); void ovl_aio_request_cache_destroy(void); +int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa); +int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); int ovl_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); @@ -561,8 +595,8 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, int ovl_set_attr(struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, bool is_upper); -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry, - struct dentry *lower, struct dentry *upper); +int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, + struct dentry *upper); /* export.c */ extern const struct export_operations ovl_export_operations; diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index e8ad2c2c77dd..150fdf3bc68d 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -481,6 +481,8 @@ static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) } this = lookup_one_len(p->name, dir, p->len); if (IS_ERR_OR_NULL(this) || !this->d_inode) { + /* Mark a stale entry */ + p->is_whiteout = true; if (IS_ERR(this)) { err = PTR_ERR(this); this = NULL; @@ -776,6 +778,9 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) if (err) goto out; } + } + /* ovl_cache_update_ino() sets is_whiteout on stale entry */ + if (!p->is_whiteout) { if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b01d4147520d..178daa5e82c9 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1599,9 +1599,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) * user opted-in to one of the new features that require following the * lower inode of non-dir upper. 
*/ - if (!ofs->config.index && !ofs->config.metacopy && - ofs->config.xino != OVL_XINO_ON && - uuid_is_null(uuid)) + if (ovl_allow_offline_changes(ofs) && uuid_is_null(uuid)) return false; for (i = 0; i < ofs->numfs; i++) { diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index b9d03627f364..f48284a2a896 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -10,6 +10,7 @@ #include <linux/cred.h> #include <linux/xattr.h> #include <linux/exportfs.h> +#include <linux/fileattr.h> #include <linux/uuid.h> #include <linux/namei.h> #include <linux/ratelimit.h> @@ -585,6 +586,7 @@ bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, #define OVL_XATTR_NLINK_POSTFIX "nlink" #define OVL_XATTR_UPPER_POSTFIX "upper" #define OVL_XATTR_METACOPY_POSTFIX "metacopy" +#define OVL_XATTR_PROTATTR_POSTFIX "protattr" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ @@ -598,14 +600,14 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK), OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER), OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), }; -int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, +int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr) { int err; - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; if (ofs->noxattr) return xerr; @@ -623,6 +625,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); int err; if (ovl_test_flag(OVL_IMPURE, d_inode(dentry))) @@ -632,14 +635,95 @@ int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) * Do not fail when upper doesn't support xattrs. * Upper inodes won't have origin nor redirect xattr anyway. */ - err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE, - "y", 1, 0); + err = ovl_check_setxattr(ofs, upperdentry, OVL_XATTR_IMPURE, "y", 1, 0); if (!err) ovl_set_flag(OVL_IMPURE, d_inode(dentry)); return err; } + +#define OVL_PROTATTR_MAX 32 /* Reserved for future flags */ + +void ovl_check_protattr(struct inode *inode, struct dentry *upper) +{ + struct ovl_fs *ofs = OVL_FS(inode->i_sb); + u32 iflags = inode->i_flags & OVL_PROT_I_FLAGS_MASK; + char buf[OVL_PROTATTR_MAX+1]; + int res, n; + + res = ovl_do_getxattr(ofs, upper, OVL_XATTR_PROTATTR, buf, + OVL_PROTATTR_MAX); + if (res < 0) + return; + + /* + * Initialize inode flags from overlay.protattr xattr and upper inode + * flags. If upper inode has those fileattr flags set (i.e. from old + * kernel), we do not clear them on ovl_get_inode(), but we will clear + * them on next fileattr_set(). 
+ */ + for (n = 0; n < res; n++) { + if (buf[n] == 'a') + iflags |= S_APPEND; + else if (buf[n] == 'i') + iflags |= S_IMMUTABLE; + else + break; + } + + if (!res || n < res) { + pr_warn_ratelimited("incompatible overlay.protattr format (%pd2, len=%d)\n", + upper, res); + } else { + inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK); + } +} + +int ovl_set_protattr(struct inode *inode, struct dentry *upper, + struct fileattr *fa) +{ + struct ovl_fs *ofs = OVL_FS(inode->i_sb); + char buf[OVL_PROTATTR_MAX]; + int len = 0, err = 0; + u32 iflags = 0; + + BUILD_BUG_ON(HWEIGHT32(OVL_PROT_FS_FLAGS_MASK) > OVL_PROTATTR_MAX); + + if (fa->flags & FS_APPEND_FL) { + buf[len++] = 'a'; + iflags |= S_APPEND; + } + if (fa->flags & FS_IMMUTABLE_FL) { + buf[len++] = 'i'; + iflags |= S_IMMUTABLE; + } + + /* + * Do not allow to set protection flags when upper doesn't support + * xattrs, because we do not set those fileattr flags on upper inode. + * Remove xattr if it exist and all protection flags are cleared. + */ + if (len) { + err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR, + buf, len, -EPERM); + } else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) { + err = ovl_do_removexattr(ofs, upper, OVL_XATTR_PROTATTR); + if (err == -EOPNOTSUPP || err == -ENODATA) + err = 0; + } + if (err) + return err; + + inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK); + + /* Mask out the fileattr flags that should not be set in upper inode */ + fa->flags &= ~OVL_PROT_FS_FLAGS_MASK; + fa->fsx_xflags &= ~OVL_PROT_FSX_FLAGS_MASK; + + return 0; +} + /** * Caller must hold a reference to inode to prevent it from being freed while * it is marked inuse. diff --git a/fs/pipe.c b/fs/pipe.c index 8e6ef62aeb1c..6d4342bad9f1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -363,10 +363,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) * _very_ unlikely case that the pipe was full, but we got * no data. */ - if (unlikely(was_full)) { + if (unlikely(was_full)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); /* * But because we didn't read anything, at this point we can @@ -385,12 +384,11 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) wake_next_reader = false; __pipe_unlock(pipe); - if (was_full) { + if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - } if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); if (ret > 0) file_accessed(filp); return ret; @@ -444,9 +442,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) #endif /* - * Epoll nonsensically wants a wakeup whether the pipe - * was already empty or not. - * * If it wasn't empty we try to merge new data into * the last buffer. * @@ -455,9 +450,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * spanning multiple pages. */ head = pipe->head; - was_empty = true; + was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); - if (chars && !pipe_empty(head, pipe->tail)) { + if (chars && !was_empty) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; @@ -568,10 +563,9 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * become empty while we dropped the lock. 
*/ __pipe_unlock(pipe); - if (was_empty) { + if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); __pipe_lock(pipe); was_empty = pipe_empty(pipe->head, pipe->tail); @@ -590,11 +584,13 @@ out: * This is particularly important for small writes, because of * how (for example) the GNU make jobserver uses small writes to * wake up pending jobs + * + * Epoll nonsensically wants a wakeup whether the pipe + * was already empty or not. */ - if (was_empty) { + if (was_empty || pipe->poll_usage) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); - } + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { @@ -654,6 +650,9 @@ pipe_poll(struct file *filp, poll_table *wait) struct pipe_inode_info *pipe = filp->private_data; unsigned int head, tail; + /* Epoll has some historical nasty semantics, this enables them */ + pipe->poll_usage = 1; + /* * Reading pipe state only -- no need for acquiring the semaphore. * diff --git a/fs/posix_acl.c b/fs/posix_acl.c index f3309a7edb49..f5c25f580dd9 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -22,6 +22,7 @@ #include <linux/xattr.h> #include <linux/export.h> #include <linux/user_namespace.h> +#include <linux/namei.h> static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -56,7 +57,17 @@ EXPORT_SYMBOL(get_cached_acl); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) { - return rcu_dereference(*acl_by_type(inode, type)); + struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type)); + + if (acl == ACL_DONT_CACHE) { + struct posix_acl *ret; + + ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU); + if (!IS_ERR(ret)) + acl = ret; + } + + return acl; } EXPORT_SYMBOL(get_cached_acl_rcu); @@ -138,7 +149,7 @@ struct posix_acl *get_acl(struct inode *inode, int type) set_cached_acl(inode, type, NULL); return NULL; } - acl = inode->i_op->get_acl(inode, type); + acl = inode->i_op->get_acl(inode, type, false); if (IS_ERR(acl)) { /* diff --git a/fs/read_write.c b/fs/read_write.c index 9db7adf160d2..af057c57bdc6 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -365,12 +365,8 @@ out_putf: int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count) { - struct inode *inode; - int retval = -EINVAL; - - inode = file_inode(file); if (unlikely((ssize_t) count < 0)) - return retval; + return -EINVAL; /* * ranged mandatory locking does not apply to streams - it makes sense @@ -381,19 +377,12 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t if (unlikely(pos < 0)) { if (!unsigned_offsets(file)) - return retval; + return -EINVAL; if (count >= -pos) /* both values are in 0..LLONG_MAX */ return -EOVERFLOW; } else if (unlikely((loff_t) (pos + count) < 0)) { if (!unsigned_offsets(file)) - return retval; - } - - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - retval = locks_mandatory_area(inode, file, pos, pos + count - 1, - read_write == READ ? 
F_RDLCK : F_WRLCK); - if (retval < 0) - return retval; + return -EINVAL; } } diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index fd58618da360..d9052b8ce6dd 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -48,7 +48,7 @@ static inline int reiserfs_acl_count(size_t size) } #ifdef CONFIG_REISERFS_FS_POSIX_ACL -struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct inode *inode); diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index a9547144a099..d6fcddc46f5b 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -190,13 +190,16 @@ fail: * inode->i_mutex: down * BKL held [before 2.5.x] */ -struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu) { char *name, *value; struct posix_acl *acl; int size; int retval; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; diff --git a/fs/remap_range.c b/fs/remap_range.c index e4a5fdd7ad7b..6d4a9beaa097 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -99,24 +99,12 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in, static int remap_verify_area(struct file *file, loff_t pos, loff_t len, bool write) { - struct inode *inode = file_inode(file); - if (unlikely(pos < 0 || len < 0)) return -EINVAL; if (unlikely((loff_t) (pos + len) < 0)) return -EINVAL; - if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - loff_t end = len ? pos + len - 1 : OFFSET_MAX; - int retval; - - retval = locks_mandatory_area(inode, file, pos, end, - write ? F_WRLCK : F_RDLCK); - if (retval < 0) - return retval; - } - return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } diff --git a/fs/signalfd.c b/fs/signalfd.c index 167b5889db4b..040e1cf90528 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -114,10 +114,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, break; case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: /* * Fall through to the SIL_FAULT case. SIL_FAULT_BNDERR, - * SIL_FAULT_PKUERR, and SIL_PERF_EVENT are only + * SIL_FAULT_PKUERR, and SIL_FAULT_PERF_EVENT are only * generated by faults that deliver them synchronously to * userspace. In case someone injects one of these signals * and signalfd catches it treat it as SIL_FAULT. 
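The filesystems converted to the three-argument ->get_acl() in this series all follow the same contract: when called under RCU path-walk (rcu == true) the hook must not sleep, so an implementation that has to read the ACL from backing storage returns -ECHILD and the VFS retries in ref-walk mode with rcu == false. A minimal sketch of that contract, using hypothetical foofs_* names (the reiserfs and overlayfs hunks above are real instances):

#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/err.h>

/* Stand-in for a filesystem that reads ACL xattrs from disk. */
static struct posix_acl *foofs_read_acl_from_disk(struct inode *inode,
						  int type)
{
	return NULL;	/* no ACL stored in this sketch */
}

static struct posix_acl *foofs_get_acl(struct inode *inode, int type,
				       bool rcu)
{
	/*
	 * RCU path-walk: reading from backing storage may block, so
	 * ask the VFS to drop out of RCU mode and call again.
	 */
	if (rcu)
		return ERR_PTR(-ECHILD);

	/* Ref-walk: safe to sleep here. */
	return foofs_read_acl_from_disk(inode, type);
}

Filesystems that can serve the request from a cache without blocking (as overlayfs does via get_cached_acl_rcu() above) may instead return a result on the rcu path.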
diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 855f0e87066d..2db8bcf7ff85 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -49,8 +49,7 @@ static int copy_bio_to_actor(struct bio *bio, bytes_to_copy = min_t(int, bytes_to_copy, req_length - copied_bytes); - memcpy(actor_addr + actor_offset, - page_address(bvec->bv_page) + bvec->bv_offset + offset, + memcpy(actor_addr + actor_offset, bvec_virt(bvec) + offset, bytes_to_copy); actor_offset += bytes_to_copy; @@ -177,7 +176,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, goto out_free_bio; } /* Extract the length of the metadata block */ - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length = data[offset]; if (offset < bvec->bv_len - 1) { length |= data[offset + 1] << 8; @@ -186,7 +185,7 @@ int squashfs_read_data(struct super_block *sb, u64 index, int length, res = -EIO; goto out_free_bio; } - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length |= data[0] << 8; } bio_free_pages(bio); diff --git a/fs/squashfs/lz4_wrapper.c b/fs/squashfs/lz4_wrapper.c index 233d5582fbee..b685b6238316 100644 --- a/fs/squashfs/lz4_wrapper.c +++ b/fs/squashfs/lz4_wrapper.c @@ -101,7 +101,7 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/lzo_wrapper.c b/fs/squashfs/lzo_wrapper.c index 97bb7d92ddcd..cb510a631968 100644 --- a/fs/squashfs/lzo_wrapper.c +++ b/fs/squashfs/lzo_wrapper.c @@ -76,7 +76,7 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; diff --git a/fs/squashfs/xz_wrapper.c b/fs/squashfs/xz_wrapper.c index e80419aed862..68f6d09bb3a2 100644 --- a/fs/squashfs/xz_wrapper.c +++ b/fs/squashfs/xz_wrapper.c @@ -146,7 +146,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->buf.in = data + offset; stream->buf.in_size = avail; diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index bcb881ec47f2..a20e9042146b 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -76,7 +76,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; stream->next_in = data + offset; stream->avail_in = avail; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c index b7cb1faa652d..0015cf8b5582 100644 --- a/fs/squashfs/zstd_wrapper.c +++ b/fs/squashfs/zstd_wrapper.c @@ -94,7 +94,7 @@ static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, } avail = min(length, ((int)bvec->bv_len) - offset); - data = page_address(bvec->bv_page) + bvec->bv_offset; + data = bvec_virt(bvec); length -= avail; in_buf.src = data + offset; in_buf.size = avail; diff --git a/fs/stat.c b/fs/stat.c index 
1fa38bdec1a6..28d2020ba1f4 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -60,6 +60,24 @@ void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, EXPORT_SYMBOL(generic_fillattr); /** + * generic_fill_statx_attr - Fill in the statx attributes from the inode flags + * @inode: Inode to use as the source + * @stat: Where to fill in the attribute flags + * + * Fill in the STATX_ATTR_* flags in the kstat structure for properties of the + * inode that are published on i_flags and enforced by the VFS. + */ +void generic_fill_statx_attr(struct inode *inode, struct kstat *stat) +{ + if (inode->i_flags & S_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (inode->i_flags & S_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + stat->attributes_mask |= KSTAT_ATTR_VFS_FLAGS; +} +EXPORT_SYMBOL(generic_fill_statx_attr); + +/** * vfs_getattr_nosec - getattr without security checks * @path: file to get attributes from * @stat: structure to return attributes in diff --git a/fs/super.c b/fs/super.c index 91b7f156735b..bcef3a6f4c4b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1203,7 +1203,7 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) s->s_iflags |= SB_I_STABLE_WRITES; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 9aefa7779b29..d019d6ac6ad0 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -174,8 +174,8 @@ static int sysfs_kf_bin_open(struct kernfs_open_file *of) { struct bin_attribute *battr = of->kn->priv; - if (battr->mapping) - of->file->f_mapping = battr->mapping; + if (battr->f_mapping) + of->file->f_mapping = battr->f_mapping(); return 0; } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 64e6a6698935..f29d62004527 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -446,7 +446,7 @@ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, if (!target) return -ENOENT; - entry = kernfs_find_and_get(target_kobj->sd, target_name); + entry = kernfs_find_and_get(target, target_name); if (!entry) { kernfs_put(target); return -ENOENT; diff --git a/fs/timerfd.c b/fs/timerfd.c index c5509d2448e3..e9c96a0c79f1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -115,6 +115,22 @@ void timerfd_clock_was_set(void) rcu_read_unlock(); } +static void timerfd_resume_work(struct work_struct *work) +{ + timerfd_clock_was_set(); +} + +static DECLARE_WORK(timerfd_work, timerfd_resume_work); + +/* + * Invoked from timekeeping_resume(). Defer the actual update to work so + * timerfd_clock_was_set() runs in task context. 
+ */ +void timerfd_resume(void) +{ + schedule_work(&timerfd_work); +} + static void __timerfd_remove_cancel(struct timerfd_ctx *ctx) { if (ctx->might_cancel) { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2e4e1d159969..5cfa28cd00cd 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1630,6 +1630,17 @@ static const char *ubifs_get_link(struct dentry *dentry, return fscrypt_get_symlink(inode, ui->data, ui->data_len, done); } +static int ubifs_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + ubifs_getattr(mnt_userns, path, stat, request_mask, query_flags); + + if (IS_ENCRYPTED(d_inode(path->dentry))) + return fscrypt_symlink_getattr(path, stat); + return 0; +} + const struct address_space_operations ubifs_file_address_operations = { .readpage = ubifs_readpage, .writepage = ubifs_writepage, @@ -1655,7 +1666,7 @@ const struct inode_operations ubifs_file_inode_operations = { const struct inode_operations ubifs_symlink_inode_operations = { .get_link = ubifs_get_link, .setattr = ubifs_setattr, - .getattr = ubifs_getattr, + .getattr = ubifs_symlink_getattr, .listxattr = ubifs_listxattr, .update_time = ubifs_update_time, }; diff --git a/fs/udf/dir.c b/fs/udf/dir.c index c19dba45aa20..70abdfad2df1 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -35,7 +35,6 @@ #include "udf_i.h" #include "udf_sb.h" - static int udf_readdir(struct file *file, struct dir_context *ctx) { struct inode *dir = file_inode(file); @@ -135,7 +134,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) lfi = cfi.lengthFileIdent; if (fibh.sbh == fibh.ebh) { - nameptr = fi->fileIdent + liu; + nameptr = udf_get_fi_ident(fi); } else { int poffset; /* Unpaded ending offset */ @@ -153,7 +152,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } } nameptr = copy_name; - memcpy(nameptr, fi->fileIdent + liu, + memcpy(nameptr, udf_get_fi_ident(fi), lfi - poffset); memcpy(nameptr + lfi - poffset, fibh.ebh->b_data, poffset); diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h index 185c3e247648..de17a97e8667 100644 --- a/fs/udf/ecma_167.h +++ b/fs/udf/ecma_167.h @@ -307,14 +307,14 @@ struct logicalVolDesc { struct regid impIdent; uint8_t impUse[128]; struct extent_ad integritySeqExt; - uint8_t partitionMaps[0]; + uint8_t partitionMaps[]; } __packed; /* Generic Partition Map (ECMA 167r3 3/10.7.1) */ struct genericPartitionMap { uint8_t partitionMapType; uint8_t partitionMapLength; - uint8_t partitionMapping[0]; + uint8_t partitionMapping[]; } __packed; /* Partition Map Type (ECMA 167r3 3/10.7.1.1) */ @@ -342,7 +342,7 @@ struct unallocSpaceDesc { struct tag descTag; __le32 volDescSeqNum; __le32 numAllocDescs; - struct extent_ad allocDescs[0]; + struct extent_ad allocDescs[]; } __packed; /* Terminating Descriptor (ECMA 167r3 3/10.9) */ @@ -360,9 +360,9 @@ struct logicalVolIntegrityDesc { uint8_t logicalVolContentsUse[32]; __le32 numOfPartitions; __le32 lengthOfImpUse; - __le32 freeSpaceTable[0]; - __le32 sizeTable[0]; - uint8_t impUse[0]; + __le32 freeSpaceTable[]; + /* __le32 sizeTable[]; */ + /* uint8_t impUse[]; */ } __packed; /* Integrity Type (ECMA 167r3 3/10.10.3) */ @@ -471,9 +471,9 @@ struct fileIdentDesc { uint8_t lengthFileIdent; struct long_ad icb; __le16 lengthOfImpUse; - uint8_t impUse[0]; - uint8_t fileIdent[0]; - uint8_t padding[0]; + uint8_t impUse[]; + /* uint8_t fileIdent[]; */ + /* uint8_t padding[]; */ } __packed; /* File Characteristics (ECMA 167r3 4/14.4.3) */ @@ -578,8 +578,8 
@@ struct fileEntry { __le64 uniqueID; __le32 lengthExtendedAttr; __le32 lengthAllocDescs; - uint8_t extendedAttr[0]; - uint8_t allocDescs[0]; + uint8_t extendedAttr[]; + /* uint8_t allocDescs[]; */ } __packed; /* Permissions (ECMA 167r3 4/14.9.5) */ @@ -632,7 +632,7 @@ struct genericFormat { uint8_t attrSubtype; uint8_t reserved[3]; __le32 attrLength; - uint8_t attrData[0]; + uint8_t attrData[]; } __packed; /* Character Set Information (ECMA 167r3 4/14.10.3) */ @@ -643,7 +643,7 @@ struct charSetInfo { __le32 attrLength; __le32 escapeSeqLength; uint8_t charSetType; - uint8_t escapeSeq[0]; + uint8_t escapeSeq[]; } __packed; /* Alternate Permissions (ECMA 167r3 4/14.10.4) */ @@ -682,7 +682,7 @@ struct infoTimesExtAttr { __le32 attrLength; __le32 dataLength; __le32 infoTimeExistence; - uint8_t infoTimes[0]; + uint8_t infoTimes[]; } __packed; /* Device Specification (ECMA 167r3 4/14.10.7) */ @@ -694,7 +694,7 @@ struct deviceSpec { __le32 impUseLength; __le32 majorDeviceIdent; __le32 minorDeviceIdent; - uint8_t impUse[0]; + uint8_t impUse[]; } __packed; /* Implementation Use Extended Attr (ECMA 167r3 4/14.10.8) */ @@ -705,7 +705,7 @@ struct impUseExtAttr { __le32 attrLength; __le32 impUseLength; struct regid impIdent; - uint8_t impUse[0]; + uint8_t impUse[]; } __packed; /* Application Use Extended Attribute (ECMA 167r3 4/14.10.9) */ @@ -716,7 +716,7 @@ struct appUseExtAttr { __le32 attrLength; __le32 appUseLength; struct regid appIdent; - uint8_t appUse[0]; + uint8_t appUse[]; } __packed; #define EXTATTR_CHAR_SET 1 @@ -733,7 +733,7 @@ struct unallocSpaceEntry { struct tag descTag; struct icbtag icbTag; __le32 lengthAllocDescs; - uint8_t allocDescs[0]; + uint8_t allocDescs[]; } __packed; /* Space Bitmap Descriptor (ECMA 167r3 4/14.12) */ @@ -741,7 +741,7 @@ struct spaceBitmapDesc { struct tag descTag; __le32 numOfBits; __le32 numOfBytes; - uint8_t bitmap[0]; + uint8_t bitmap[]; } __packed; /* Partition Integrity Entry (ECMA 167r3 4/14.13) */ @@ -780,7 +780,7 @@ struct pathComponent { uint8_t componentType; uint8_t lengthComponentIdent; __le16 componentFileVersionNum; - dchars componentIdent[0]; + dchars componentIdent[]; } __packed; /* File Entry (ECMA 167r3 4/14.17) */ @@ -809,8 +809,8 @@ struct extendedFileEntry { __le64 uniqueID; __le32 lengthExtendedAttr; __le32 lengthAllocDescs; - uint8_t extendedAttr[0]; - uint8_t allocDescs[0]; + uint8_t extendedAttr[]; + /* uint8_t allocDescs[]; */ } __packed; #endif /* _ECMA_167_H */ diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 4917670860a0..1d6b7a50736b 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -390,8 +390,7 @@ struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, - sfi->fileIdent + - le16_to_cpu(sfi->lengthOfImpUse))) { + udf_get_fi_ident(sfi))) { iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; brelse(dbh); return NULL; diff --git a/fs/udf/misc.c b/fs/udf/misc.c index eab94527340d..1614d308d0f0 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -173,13 +173,22 @@ struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, else offset = le32_to_cpu(eahd->appAttrLocation); - while (offset < iinfo->i_lenEAttr) { + while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) { + uint32_t attrLength; + gaf = (struct genericFormat *)&ea[offset]; + attrLength = le32_to_cpu(gaf->attrLength); + + /* Detect undersized elements and buffer overflows */ + if ((attrLength 
< sizeof(*gaf)) || + (attrLength > (iinfo->i_lenEAttr - offset))) + break; + if (le32_to_cpu(gaf->attrType) == type && gaf->attrSubtype == subtype) return gaf; else - offset += le32_to_cpu(gaf->attrLength); + offset += attrLength; } } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 7c7c9bbbfa57..caeef08efed2 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -74,12 +74,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, if (fileident) { if (adinicb || (offset + lfi < 0)) { - memcpy((uint8_t *)sfi->fileIdent + liu, fileident, lfi); + memcpy(udf_get_fi_ident(sfi), fileident, lfi); } else if (offset >= 0) { memcpy(fibh->ebh->b_data + offset, fileident, lfi); } else { - memcpy((uint8_t *)sfi->fileIdent + liu, fileident, - -offset); + memcpy(udf_get_fi_ident(sfi), fileident, -offset); memcpy(fibh->ebh->b_data, fileident - offset, lfi + offset); } @@ -88,11 +87,11 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, offset += lfi; if (adinicb || (offset + padlen < 0)) { - memset((uint8_t *)sfi->padding + liu + lfi, 0x00, padlen); + memset(udf_get_fi_ident(sfi) + lfi, 0x00, padlen); } else if (offset >= 0) { memset(fibh->ebh->b_data + offset, 0x00, padlen); } else { - memset((uint8_t *)sfi->padding + liu + lfi, 0x00, -offset); + memset(udf_get_fi_ident(sfi) + lfi, 0x00, -offset); memset(fibh->ebh->b_data, 0x00, padlen + offset); } @@ -226,7 +225,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, lfi = cfi->lengthFileIdent; if (fibh->sbh == fibh->ebh) { - nameptr = fi->fileIdent + liu; + nameptr = udf_get_fi_ident(fi); } else { int poffset; /* Unpaded ending offset */ @@ -246,7 +245,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, } } nameptr = copy_name; - memcpy(nameptr, fi->fileIdent + liu, + memcpy(nameptr, udf_get_fi_ident(fi), lfi - poffset); memcpy(nameptr + lfi - poffset, fibh->ebh->b_data, poffset); diff --git a/fs/udf/osta_udf.h b/fs/udf/osta_udf.h index 22bc4fb2feb9..157de0ec0cd5 100644 --- a/fs/udf/osta_udf.h +++ b/fs/udf/osta_udf.h @@ -111,7 +111,7 @@ struct logicalVolIntegrityDescImpUse { __le16 minUDFReadRev; __le16 minUDFWriteRev; __le16 maxUDFWriteRev; - uint8_t impUse[0]; + uint8_t impUse[]; } __packed; /* Implementation Use Volume Descriptor (UDF 2.60 2.2.7) */ @@ -178,15 +178,6 @@ struct metadataPartitionMap { uint8_t reserved2[5]; } __packed; -/* Virtual Allocation Table (UDF 1.5 2.2.10) */ -struct virtualAllocationTable15 { - __le32 vatEntry[0]; - struct regid vatIdent; - __le32 previousVATICBLoc; -} __packed; - -#define ICBTAG_FILE_TYPE_VAT15 0x00U - /* Virtual Allocation Table (UDF 2.60 2.2.11) */ struct virtualAllocationTable20 { __le16 lengthHeader; @@ -199,8 +190,8 @@ struct virtualAllocationTable20 { __le16 minUDFWriteRev; __le16 maxUDFWriteRev; __le16 reserved; - uint8_t impUse[0]; - __le32 vatEntry[0]; + uint8_t impUse[]; + /* __le32 vatEntry[]; */ } __packed; #define ICBTAG_FILE_TYPE_VAT20 0xF8U @@ -217,8 +208,7 @@ struct sparingTable { __le16 reallocationTableLen; __le16 reserved; __le32 sequenceNum; - struct sparingEntry - mapEntry[0]; + struct sparingEntry mapEntry[]; } __packed; /* Metadata File (and Metadata Mirror File) (UDF 2.60 2.2.13.1) */ @@ -241,7 +231,7 @@ struct allocDescImpUse { /* FreeEASpace (UDF 2.60 3.3.4.5.1.1) */ struct freeEaSpace { __le16 headerChecksum; - uint8_t freeEASpace[0]; + uint8_t freeEASpace[]; } __packed; /* DVD Copyright Management Information (UDF 2.60 3.3.4.5.1.2) */ @@ -265,7 +255,7 @@ struct LVExtensionEA { /* FreeAppEASpace (UDF 2.60 3.3.4.6.1) */ struct 
freeAppEASpace { __le16 headerChecksum; - uint8_t freeEASpace[0]; + uint8_t freeEASpace[]; } __packed; /* UDF Defined System Stream (UDF 2.60 3.3.7) */ diff --git a/fs/udf/super.c b/fs/udf/super.c index 2f83c1204e20..b2d7c57d0688 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -108,16 +108,10 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb) return NULL; lvid = (struct logicalVolIntegrityDesc *)UDF_SB(sb)->s_lvid_bh->b_data; partnum = le32_to_cpu(lvid->numOfPartitions); - if ((sb->s_blocksize - sizeof(struct logicalVolIntegrityDescImpUse) - - offsetof(struct logicalVolIntegrityDesc, impUse)) / - (2 * sizeof(uint32_t)) < partnum) { - udf_err(sb, "Logical volume integrity descriptor corrupted " - "(numOfPartitions = %u)!\n", partnum); - return NULL; - } /* The offset is to skip freeSpaceTable and sizeTable arrays */ offset = partnum * 2 * sizeof(uint32_t); - return (struct logicalVolIntegrityDescImpUse *)&(lvid->impUse[offset]); + return (struct logicalVolIntegrityDescImpUse *) + (((uint8_t *)(lvid + 1)) + offset); } /* UDF filesystem type */ @@ -349,10 +343,10 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",lastblock=%u", sbi->s_last_block); if (sbi->s_anchor != 0) seq_printf(seq, ",anchor=%u", sbi->s_anchor); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) - seq_puts(seq, ",utf8"); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP) && sbi->s_nls_map) + if (sbi->s_nls_map) seq_printf(seq, ",iocharset=%s", sbi->s_nls_map->charset); + else + seq_puts(seq, ",iocharset=utf8"); return 0; } @@ -558,19 +552,24 @@ static int udf_parse_options(char *options, struct udf_options *uopt, /* Ignored (never implemented properly) */ break; case Opt_utf8: - uopt->flags |= (1 << UDF_FLAG_UTF8); + if (!remount) { + unload_nls(uopt->nls_map); + uopt->nls_map = NULL; + } break; case Opt_iocharset: if (!remount) { - if (uopt->nls_map) - unload_nls(uopt->nls_map); - /* - * load_nls() failure is handled later in - * udf_fill_super() after all options are - * parsed. 
- */ + unload_nls(uopt->nls_map); + uopt->nls_map = NULL; + } + /* When nls_map is not loaded then UTF-8 is used */ + if (!remount && strcmp(args[0].from, "utf8") != 0) { uopt->nls_map = load_nls(args[0].from); - uopt->flags |= (1 << UDF_FLAG_NLS_MAP); + if (!uopt->nls_map) { + pr_err("iocharset %s not found\n", + args[0].from); + return 0; + } } break; case Opt_uforget: @@ -1542,6 +1541,7 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ struct udf_sb_info *sbi = UDF_SB(sb); struct logicalVolIntegrityDesc *lvid; int indirections = 0; + u32 parts, impuselen; while (++indirections <= UDF_MAX_LVID_NESTING) { final_bh = NULL; @@ -1568,15 +1568,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data; if (lvid->nextIntegrityExt.extLength == 0) - return; + goto check; loc = leea_to_cpu(lvid->nextIntegrityExt); } udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n", UDF_MAX_LVID_NESTING); +out_err: brelse(sbi->s_lvid_bh); sbi->s_lvid_bh = NULL; + return; +check: + parts = le32_to_cpu(lvid->numOfPartitions); + impuselen = le32_to_cpu(lvid->lengthOfImpUse); + if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize || + sizeof(struct logicalVolIntegrityDesc) + impuselen + + 2 * parts * sizeof(u32) > sb->s_blocksize) { + udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), " + "ignoring.\n", parts, impuselen); + goto out_err; + } } /* @@ -2139,21 +2151,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (!udf_parse_options((char *)options, &uopt, false)) goto parse_options_failure; - if (uopt.flags & (1 << UDF_FLAG_UTF8) && - uopt.flags & (1 << UDF_FLAG_NLS_MAP)) { - udf_err(sb, "utf8 cannot be combined with iocharset\n"); - goto parse_options_failure; - } - if ((uopt.flags & (1 << UDF_FLAG_NLS_MAP)) && !uopt.nls_map) { - uopt.nls_map = load_nls_default(); - if (!uopt.nls_map) - uopt.flags &= ~(1 << UDF_FLAG_NLS_MAP); - else - udf_debug("Using default NLS map\n"); - } - if (!(uopt.flags & (1 << UDF_FLAG_NLS_MAP))) - uopt.flags |= (1 << UDF_FLAG_UTF8); - fileset.logicalBlockNum = 0xFFFFFFFF; fileset.partitionReferenceNum = 0xFFFF; @@ -2308,8 +2305,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) error_out: iput(sbi->s_vat_inode); parse_options_failure: - if (uopt.nls_map) - unload_nls(uopt.nls_map); + unload_nls(uopt.nls_map); if (lvid_open) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); @@ -2359,8 +2355,7 @@ static void udf_put_super(struct super_block *sb) sbi = UDF_SB(sb); iput(sbi->s_vat_inode); - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) - unload_nls(sbi->s_nls_map); + unload_nls(sbi->s_nls_map); if (!sb_rdonly(sb)) udf_close_lvid(sb); brelse(sbi->s_lvid_bh); diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h index 758efe557a19..4fa620543d30 100644 --- a/fs/udf/udf_sb.h +++ b/fs/udf/udf_sb.h @@ -20,8 +20,6 @@ #define UDF_FLAG_UNDELETE 6 #define UDF_FLAG_UNHIDE 7 #define UDF_FLAG_VARCONV 8 -#define UDF_FLAG_NLS_MAP 9 -#define UDF_FLAG_UTF8 10 #define UDF_FLAG_UID_FORGET 11 /* save -1 for uid to disk */ #define UDF_FLAG_GID_FORGET 12 #define UDF_FLAG_UID_SET 13 diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 9dd0814f1077..7e258f15b8ef 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -130,6 +130,10 @@ static inline unsigned int udf_dir_entry_len(struct fileIdentDesc *cfi) le16_to_cpu(cfi->lengthOfImpUse) + cfi->lengthFileIdent, UDF_NAME_PAD); } +static inline uint8_t 
*udf_get_fi_ident(struct fileIdentDesc *fi) +{ + return ((uint8_t *)(fi + 1)) + le16_to_cpu(fi->lengthOfImpUse); +} /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index 5fcfa96463eb..622569007b53 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -177,7 +177,7 @@ static int udf_name_from_CS0(struct super_block *sb, return 0; } - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + if (UDF_SB(sb)->s_nls_map) conv_f = UDF_SB(sb)->s_nls_map->uni2char; else conv_f = NULL; @@ -285,7 +285,7 @@ static int udf_name_to_CS0(struct super_block *sb, if (ocu_max_len <= 0) return 0; - if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) + if (UDF_SB(sb)->s_nls_map) conv_f = UDF_SB(sb)->s_nls_map->char2uni; else conv_f = NULL; diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index e986b95d94c9..6f49bf39183c 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -29,67 +29,3 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } - - -/* - * __vmalloc() will allocate data pages and auxiliary structures (e.g. - * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence - * we need to tell memory reclaim that we are in such a context via - * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here - * and potentially deadlocking. - */ -static void * -__kmem_vmalloc(size_t size, xfs_km_flags_t flags) -{ - unsigned nofs_flag = 0; - void *ptr; - gfp_t lflags = kmem_flags_convert(flags); - - if (flags & KM_NOFS) - nofs_flag = memalloc_nofs_save(); - - ptr = __vmalloc(size, lflags); - - if (flags & KM_NOFS) - memalloc_nofs_restore(nofs_flag); - - return ptr; -} - -/* - * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned - * to the @align_mask. We only guarantee alignment up to page size, we'll clamp - * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE - * aligned region. 
- */ -void * -kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags) -{ - void *ptr; - - trace_kmem_alloc_io(size, flags, _RET_IP_); - - if (WARN_ON_ONCE(align_mask >= PAGE_SIZE)) - align_mask = PAGE_SIZE - 1; - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) { - if (!((uintptr_t)ptr & align_mask)) - return ptr; - kfree(ptr); - } - return __kmem_vmalloc(size, flags); -} - -void * -kmem_alloc_large(size_t size, xfs_km_flags_t flags) -{ - void *ptr; - - trace_kmem_alloc_large(size, flags, _RET_IP_); - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) - return ptr; - return __kmem_vmalloc(size, flags); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 38007117697e..54da6d717a06 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -57,8 +57,6 @@ kmem_flags_convert(xfs_km_flags_t flags) } extern void *kmem_alloc(size_t, xfs_km_flags_t); -extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); -extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index ee9ec0c50bec..005abfd9fd34 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -313,7 +313,6 @@ xfs_get_aghdr_buf( if (error) return error; - bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; @@ -469,7 +468,7 @@ xfs_rmaproot_init( rrec->rm_offset = 0; /* account for refc btree root */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { rrec = XFS_RMAP_REC_ADDR(block, 5); rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp)); rrec->rm_blockcount = cpu_to_be32(1); @@ -528,7 +527,7 @@ xfs_agfblock_init( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (xfs_has_rmapbt(mp)) { agf->agf_roots[XFS_BTNUM_RMAPi] = cpu_to_be32(XFS_RMAP_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); @@ -541,9 +540,9 @@ xfs_agfblock_init( tmpsize = id->agsize - mp->m_ag_prealloc_blocks; agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { agf->agf_refcount_root = cpu_to_be32( xfs_refc_block(mp)); agf->agf_refcount_level = cpu_to_be32(1); @@ -569,7 +568,7 @@ xfs_agflblock_init( __be32 *agfl_bno; int bucket; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); agfl->agfl_seqno = cpu_to_be32(id->agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); @@ -599,17 +598,17 @@ xfs_agiblock_init( agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (xfs_has_finobt(mp)) { agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); agi->agi_free_level = cpu_to_be32(1); } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + if (xfs_has_inobtcounts(mp)) { agi->agi_iblocks = cpu_to_be32(1); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) agi->agi_fblocks = 
cpu_to_be32(1); } } @@ -719,14 +718,14 @@ xfs_ag_init_headers( .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, - .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) + .need_init = xfs_has_finobt(mp) }, { /* RMAP root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_rmapbt_buf_ops, .work = &xfs_rmaproot_init, - .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb) + .need_init = xfs_has_rmapbt(mp) }, { /* REFC root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)), @@ -734,7 +733,7 @@ xfs_ag_init_headers( .ops = &xfs_refcountbt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_REFC, - .need_init = xfs_sb_version_hasreflink(&mp->m_sb) + .need_init = xfs_has_reflink(mp) }, { /* NULL terminating block */ .daddr = XFS_BUF_DADDR_NULL, diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 6929157d8d6e..95157f5a5a6c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -51,7 +51,7 @@ xfs_agfl_size( { unsigned int size = mp->m_sb.sb_sectsize; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) size -= sizeof(struct xfs_agfl); return size / sizeof(xfs_agblock_t); @@ -61,9 +61,9 @@ unsigned int xfs_refc_block( struct xfs_mount *mp) { - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) return XFS_RMAP_BLOCK(mp) + 1; - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) return XFS_FIBT_BLOCK(mp) + 1; return XFS_IBT_BLOCK(mp) + 1; } @@ -72,11 +72,11 @@ xfs_extlen_t xfs_prealloc_blocks( struct xfs_mount *mp) { - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) return xfs_refc_block(mp) + 1; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) return XFS_RMAP_BLOCK(mp) + 1; - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) return XFS_FIBT_BLOCK(mp) + 1; return XFS_IBT_BLOCK(mp) + 1; } @@ -126,11 +126,11 @@ xfs_alloc_ag_max_usable( blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ blocks += XFS_ALLOC_AGFL_RESERVE; blocks += 3; /* AGF, AGI btree root blocks */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) blocks++; /* finobt root block */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) blocks++; /* rmap root block */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) blocks++; /* refcount root block */ return mp->m_sb.sb_agblocks - blocks; @@ -598,7 +598,7 @@ xfs_agfl_verify( * AGFL is what the AGF says is active. We can't get to the AGF, so we * can't verify just those entries are valid. */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return NULL; if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) @@ -638,7 +638,7 @@ xfs_agfl_read_verify( * AGFL is what the AGF says is active. We can't get to the AGF, so we * can't verify just those entries are valid. 
*/ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) @@ -659,7 +659,7 @@ xfs_agfl_write_verify( xfs_failaddr_t fa; /* no verification of non-crc AGFLs */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; fa = xfs_agfl_verify(bp); @@ -2264,7 +2264,7 @@ xfs_alloc_min_freelist( min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); /* space needed reverse mapping used space btree */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, mp->m_rmap_maxlevels); @@ -2373,7 +2373,7 @@ xfs_agfl_needs_reset( int active; /* no agfl header on v4 supers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return false; /* @@ -2877,7 +2877,7 @@ xfs_agf_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_agf *agf = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) @@ -2907,12 +2907,12 @@ xfs_agf_verify( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > mp->m_ag_maxlevels) return __this_address; - if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + if (xfs_has_rmapbt(mp) && (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > mp->m_rmap_maxlevels)) return __this_address; - if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + if (xfs_has_rmapbt(mp) && be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) return __this_address; @@ -2925,16 +2925,16 @@ xfs_agf_verify( if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) return __this_address; - if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + if (xfs_has_lazysbcount(mp) && be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) return __this_address; - if (xfs_sb_version_hasreflink(&mp->m_sb) && + if (xfs_has_reflink(mp) && be32_to_cpu(agf->agf_refcount_blocks) > be32_to_cpu(agf->agf_length)) return __this_address; - if (xfs_sb_version_hasreflink(&mp->m_sb) && + if (xfs_has_reflink(mp) && (be32_to_cpu(agf->agf_refcount_level) < 1 || be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) return __this_address; @@ -2950,7 +2950,7 @@ xfs_agf_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -2975,7 +2975,7 @@ xfs_agf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -3073,13 +3073,13 @@ xfs_alloc_read_agf( * counter only tracks non-root blocks. */ allocbt_blks = pag->pagf_btreeblks; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; if (allocbt_blks > 0) atomic64_add(allocbt_blks, &mp->m_allocbt_blks); } #ifdef DEBUG - else if (!XFS_FORCED_SHUTDOWN(mp)) { + else if (!xfs_is_shutdown(mp)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); @@ -3166,7 +3166,7 @@ xfs_alloc_vextent( * the first a.g. fails. 
*/ if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && - (mp->m_flags & XFS_MOUNT_32BITINODES)) { + xfs_is_inode32(mp)) { args->fsbno = XFS_AGB_TO_FSB(mp, ((mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount), 0); @@ -3392,7 +3392,7 @@ struct xfs_alloc_query_range_info { STATIC int xfs_alloc_query_range_helper( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { struct xfs_alloc_query_range_info *query = priv; @@ -3407,8 +3407,8 @@ xfs_alloc_query_range_helper( int xfs_alloc_query_range( struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *low_rec, - struct xfs_alloc_rec_incore *high_rec, + const struct xfs_alloc_rec_incore *low_rec, + const struct xfs_alloc_rec_incore *high_rec, xfs_alloc_query_range_fn fn, void *priv) { diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index e30900b6f8ba..df4aefaf0046 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -220,13 +220,13 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); typedef int (*xfs_alloc_query_range_fn)( - struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *rec, - void *priv); + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *rec, + void *priv); int xfs_alloc_query_range(struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *low_rec, - struct xfs_alloc_rec_incore *high_rec, + const struct xfs_alloc_rec_incore *low_rec, + const struct xfs_alloc_rec_incore *high_rec, xfs_alloc_query_range_fn fn, void *priv); int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); @@ -243,7 +243,7 @@ static inline __be32 * xfs_buf_to_agfl_bno( struct xfs_buf *bp) { - if (xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + if (xfs_has_crc(bp->b_mount)) return bp->b_addr + sizeof(struct xfs_agfl); return bp->b_addr; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 6b363f78cfa2..6746fd735550 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -31,9 +31,9 @@ xfs_allocbt_dup_cursor( STATIC void xfs_allocbt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -50,10 +50,10 @@ xfs_allocbt_set_root( STATIC int xfs_allocbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { int error; xfs_agblock_t bno; @@ -87,7 +87,7 @@ xfs_allocbt_free_block( xfs_agblock_t bno; int error; - bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; @@ -103,11 +103,11 @@ xfs_allocbt_free_block( */ STATIC void xfs_allocbt_update_lastrec( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - union xfs_btree_rec *rec, - int ptr, - int reason) + struct xfs_btree_cur *cur, + const struct xfs_btree_block *block, + const union xfs_btree_rec *rec, + int ptr, + int reason) { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; struct xfs_perag *pag; @@ -177,8 +177,8 @@ xfs_allocbt_get_maxrecs( STATIC void xfs_allocbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + 
union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->alloc.ar_startblock = rec->alloc.ar_startblock; key->alloc.ar_blockcount = rec->alloc.ar_blockcount; @@ -186,10 +186,10 @@ xfs_allocbt_init_key_from_rec( STATIC void xfs_bnobt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - __u32 x; + __u32 x; x = be32_to_cpu(rec->alloc.ar_startblock); x += be32_to_cpu(rec->alloc.ar_blockcount) - 1; @@ -199,8 +199,8 @@ xfs_bnobt_init_high_key_from_rec( STATIC void xfs_cntbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->alloc.ar_blockcount = rec->alloc.ar_blockcount; key->alloc.ar_startblock = 0; @@ -229,23 +229,23 @@ xfs_allocbt_init_ptr_from_cur( STATIC int64_t xfs_bnobt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; - xfs_alloc_key_t *kp = &key->alloc; + struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; + const struct xfs_alloc_rec *kp = &key->alloc; return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } STATIC int64_t xfs_cntbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; - xfs_alloc_key_t *kp = &key->alloc; - int64_t diff; + struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; + const struct xfs_alloc_rec *kp = &key->alloc; + int64_t diff; diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; if (diff) @@ -256,9 +256,9 @@ xfs_cntbt_key_diff( STATIC int64_t xfs_bnobt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) - be32_to_cpu(k2->alloc.ar_startblock); @@ -266,11 +266,11 @@ xfs_bnobt_diff_two_keys( STATIC int64_t xfs_cntbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { - int64_t diff; + int64_t diff; diff = be32_to_cpu(k1->alloc.ar_blockcount) - be32_to_cpu(k2->alloc.ar_blockcount); @@ -295,7 +295,7 @@ xfs_allocbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; @@ -376,9 +376,9 @@ const struct xfs_buf_ops xfs_cntbt_buf_ops = { STATIC int xfs_bnobt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->alloc.ar_startblock) < be32_to_cpu(k2->alloc.ar_startblock); @@ -386,9 +386,9 @@ xfs_bnobt_keys_inorder( STATIC int xfs_bnobt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->alloc.ar_startblock) + be32_to_cpu(r1->alloc.ar_blockcount) <= @@ -397,9 +397,9 @@ xfs_bnobt_recs_inorder( STATIC int xfs_cntbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union 
xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->alloc.ar_blockcount) < be32_to_cpu(k2->alloc.ar_blockcount) || @@ -410,9 +410,9 @@ xfs_cntbt_keys_inorder( STATIC int xfs_cntbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->alloc.ar_blockcount) < be32_to_cpu(r2->alloc.ar_blockcount) || @@ -498,7 +498,7 @@ xfs_allocbt_init_common( atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; return cur; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 9eb4c667a6b8..2f6b816aaf9f 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -20,7 +20,7 @@ struct xbtree_afakeroot; * Btree block header size depends on a superblock flag. */ #define XFS_ALLOC_BLOCK_LEN(mp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + (xfs_has_crc(((mp))) ? \ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 191d51725988..fbc9d816882c 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -146,7 +146,7 @@ xfs_attr_get( XFS_STATS_INC(args->dp->i_mount, xs_attr_get); - if (XFS_FORCED_SHUTDOWN(args->dp->i_mount)) + if (xfs_is_shutdown(args->dp->i_mount)) return -EIO; args->geo = args->dp->i_mount->m_attr_geo; @@ -224,7 +224,7 @@ xfs_attr_try_sf_addname( if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); - if (dp->i_mount->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(dp->i_mount)) xfs_trans_set_sync(args->trans); return error; @@ -335,6 +335,7 @@ xfs_attr_sf_addname( * the attr fork to leaf format and will restart with the leaf * add. */ + trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp); dac->flags |= XFS_DAC_DEFER_FINISH; return -EAGAIN; } @@ -394,6 +395,8 @@ xfs_attr_set_iter( * handling code below */ dac->flags |= XFS_DAC_DEFER_FINISH; + trace_xfs_attr_set_iter_return( + dac->dela_state, args->dp); return -EAGAIN; } else if (error) { return error; @@ -411,6 +414,7 @@ xfs_attr_set_iter( dac->dela_state = XFS_DAS_FOUND_NBLK; } + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; case XFS_DAS_FOUND_LBLK: /* @@ -438,6 +442,8 @@ xfs_attr_set_iter( error = xfs_attr_rmtval_set_blk(dac); if (error) return error; + trace_xfs_attr_set_iter_return(dac->dela_state, + args->dp); return -EAGAIN; } @@ -472,6 +478,7 @@ xfs_attr_set_iter( * series. 
*/ dac->dela_state = XFS_DAS_FLIP_LFLAG; + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; case XFS_DAS_FLIP_LFLAG: /* @@ -488,11 +495,15 @@ xfs_attr_set_iter( /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ dac->dela_state = XFS_DAS_RM_LBLK; if (args->rmtblkno) { - error = __xfs_attr_rmtval_remove(dac); + error = xfs_attr_rmtval_remove(dac); + if (error == -EAGAIN) + trace_xfs_attr_set_iter_return( + dac->dela_state, args->dp); if (error) return error; dac->dela_state = XFS_DAS_RD_LEAF; + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; } @@ -542,6 +553,8 @@ xfs_attr_set_iter( error = xfs_attr_rmtval_set_blk(dac); if (error) return error; + trace_xfs_attr_set_iter_return( + dac->dela_state, args->dp); return -EAGAIN; } @@ -577,6 +590,7 @@ xfs_attr_set_iter( * series */ dac->dela_state = XFS_DAS_FLIP_NFLAG; + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; case XFS_DAS_FLIP_NFLAG: @@ -595,11 +609,16 @@ xfs_attr_set_iter( /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ dac->dela_state = XFS_DAS_RM_NBLK; if (args->rmtblkno) { - error = __xfs_attr_rmtval_remove(dac); + error = xfs_attr_rmtval_remove(dac); + if (error == -EAGAIN) + trace_xfs_attr_set_iter_return( + dac->dela_state, args->dp); + if (error) return error; dac->dela_state = XFS_DAS_CLR_FLAG; + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; } @@ -623,8 +642,8 @@ out: /* * Return EEXIST if attr is found, or ENOATTR if not */ -int -xfs_has_attr( +static int +xfs_attr_lookup( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; @@ -691,7 +710,7 @@ xfs_attr_set( int rmt_blks = 0; unsigned int total; - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + if (xfs_is_shutdown(dp->i_mount)) return -EIO; error = xfs_qm_dqattach(dp); @@ -761,8 +780,8 @@ xfs_attr_set( goto out_trans_cancel; } + error = xfs_attr_lookup(args); if (args->value) { - error = xfs_has_attr(args); if (error == -EEXIST && (args->attr_flags & XATTR_CREATE)) goto out_trans_cancel; if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) @@ -777,7 +796,6 @@ xfs_attr_set( if (!args->trans) goto out_unlock; } else { - error = xfs_has_attr(args); if (error != -EEXIST) goto out_trans_cancel; @@ -790,7 +808,7 @@ xfs_attr_set( * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(args->trans); if (!(args->op_flags & XFS_DA_OP_NOTIME)) @@ -1176,6 +1194,8 @@ xfs_attr_node_addname( * this point. */ dac->flags |= XFS_DAC_DEFER_FINISH; + trace_xfs_attr_node_addname_return( + dac->dela_state, args->dp); return -EAGAIN; } @@ -1421,11 +1441,14 @@ xfs_attr_remove_iter( * May return -EAGAIN. Roll and repeat until all remote * blocks are removed. 
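[Note: the trace_xfs_attr_*_return() calls added above fire each time the attr state machine parks itself and returns -EAGAIN, recording the dela_state it will resume from. The calling convention is a roll-and-retry loop; a minimal sketch of that shape, with illustrative names and the transaction roll elided.]

    #include <errno.h>
    #include <stdio.h>

    enum demo_state { DEMO_UNINIT, DEMO_FOUND_LBLK, DEMO_DONE };

    struct demo_ctx {
            enum demo_state state;
    };

    static int demo_iter(struct demo_ctx *ctx)
    {
            switch (ctx->state) {
            case DEMO_UNINIT:
                    ctx->state = DEMO_FOUND_LBLK;   /* remember where to resume */
                    /* a tracepoint would log (state, inode) here */
                    return -EAGAIN;                 /* ask caller to roll and re-enter */
            case DEMO_FOUND_LBLK:
                    ctx->state = DEMO_DONE;
                    return 0;
            default:
                    return 0;
            }
    }

    int main(void)
    {
            struct demo_ctx ctx = { DEMO_UNINIT };
            int error;

            do {
                    /* a transaction roll would happen between iterations */
                    error = demo_iter(&ctx);
            } while (error == -EAGAIN);
            printf("done: %d\n", error);
            return 0;
    }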
*/ - error = __xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) + error = xfs_attr_rmtval_remove(dac); + if (error == -EAGAIN) { + trace_xfs_attr_remove_iter_return( + dac->dela_state, args->dp); return error; - else if (error) + } else if (error) { goto out; + } /* * Refill the state structure with buffers (the prior @@ -1438,6 +1461,7 @@ xfs_attr_remove_iter( goto out; dac->dela_state = XFS_DAS_RM_NAME; dac->flags |= XFS_DAC_DEFER_FINISH; + trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp); return -EAGAIN; } @@ -1466,6 +1490,8 @@ xfs_attr_remove_iter( dac->flags |= XFS_DAC_DEFER_FINISH; dac->dela_state = XFS_DAS_RM_SHRINK; + trace_xfs_attr_remove_iter_return( + dac->dela_state, args->dp); return -EAGAIN; } @@ -1514,7 +1540,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->disk_blkno = xfs_buf_daddr(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1529,7 +1555,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->disk_blkno = xfs_buf_daddr(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 8de5d1d2733e..5e71f719bdd5 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -490,7 +490,6 @@ int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_has_attr(struct xfs_da_args *args); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_remove_iter(struct xfs_delattr_context *dac); bool xfs_attr_namecheck(const void *name, size_t length); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index b910bd209949..e1d11e314228 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -384,7 +384,7 @@ xfs_attr3_leaf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -406,7 +406,7 @@ xfs_attr3_leaf_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -489,7 +489,7 @@ xfs_attr_copy_value( } if (!args->value) { - args->value = kmem_alloc_large(valuelen, KM_NOLOCKDEP); + args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP); if (!args->value) return -ENOMEM; } @@ -568,7 +568,7 @@ xfs_attr_shortform_bytesfit( * literal area, but for the old format we are done if there is no * space in the fixed attribute fork. */ - if (!(mp->m_flags & XFS_MOUNT_ATTR2)) + if (!xfs_has_attr2(mp)) return 0; dsize = dp->i_df.if_bytes; @@ -576,7 +576,7 @@ xfs_attr_shortform_bytesfit( switch (dp->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: /* - * If there is no attr fork and the data fork is extents, + * If there is no attr fork and the data fork is extents, * determine if creating the default attr fork will result * in the extents form migrating to btree. 
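[Note: also in this stretch, xfs_attr_copy_value() moves from kmem_alloc_large(valuelen, KM_NOLOCKDEP) to kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP). kvmalloc() tries a kmalloc() first and falls back to vmalloc() for sizes the slab cannot satisfy, and __GFP_NOLOCKDEP carries over the old KM_NOLOCKDEP annotation. A sketch of the call-site pattern, reduced to just the allocation logic.]

    #include <linux/errno.h>
    #include <linux/mm.h>
    #include <linux/slab.h>

    static int demo_copy_value(void **valuep, size_t valuelen)
    {
            if (!*valuep) {
                    *valuep = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP);
                    if (!*valuep)
                            return -ENOMEM;
            }
            return 0;
    }

[The matching free must be kvfree(), which handles either backing allocation.]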
If so, the * minimum offset only needs to be the space required for @@ -621,21 +621,27 @@ xfs_attr_shortform_bytesfit( } /* - * Switch on the ATTR2 superblock bit (implies also FEATURES2) + * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: + * - noattr2 mount option is set, + * - on-disk version bit says it is already set, or + * - the attr2 mount option is not set to enable automatic upgrade from attr1. */ STATIC void -xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) +xfs_sbversion_add_attr2( + struct xfs_mount *mp, + struct xfs_trans *tp) { - if ((mp->m_flags & XFS_MOUNT_ATTR2) && - !(xfs_sb_version_hasattr2(&mp->m_sb))) { - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr2(&mp->m_sb)) { - xfs_sb_version_addattr2(&mp->m_sb); - spin_unlock(&mp->m_sb_lock); - xfs_log_sb(tp); - } else - spin_unlock(&mp->m_sb_lock); - } + if (xfs_has_noattr2(mp)) + return; + if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) + return; + if (!xfs_has_attr2(mp)) + return; + + spin_lock(&mp->m_sb_lock); + xfs_add_attr2(mp); + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); } /* @@ -810,8 +816,7 @@ xfs_attr_sf_removename( * Fix up the start offset of the attribute fork */ totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && - (mp->m_flags & XFS_MOUNT_ATTR2) && + if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & XFS_DA_OP_ADDNAME)) { xfs_attr_fork_remove(dp, args->trans); @@ -821,7 +826,7 @@ xfs_attr_sf_removename( ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || (args->op_flags & XFS_DA_OP_ADDNAME) || - !(mp->m_flags & XFS_MOUNT_ATTR2) || + !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); @@ -997,7 +1002,7 @@ xfs_attr_shortform_allfit( bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } - if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && + if (xfs_has_attr2(dp->i_mount) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; @@ -1122,7 +1127,7 @@ xfs_attr3_leaf_to_shortform( goto out; if (forkoff == -1) { - ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); + ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); goto out; @@ -1199,9 +1204,9 @@ xfs_attr3_leaf_to_node( xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); bp2->b_ops = bp1->b_ops; memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; - hdr3->blkno = cpu_to_be64(bp2->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2)); } xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); @@ -1264,12 +1269,12 @@ xfs_attr3_leaf_create( memset(&ichdr, 0, sizeof(ichdr)); ichdr.firstused = args->geo->blksize; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_blkinfo *hdr3 = bp->b_addr; ichdr.magic = XFS_ATTR3_LEAF_MAGIC; - hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 0c8bee3abc3b..83b95be9ded8 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -51,7 +51,7 @@ 
xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); return (attrlen + buflen - 1) / buflen; } @@ -126,11 +126,11 @@ __xfs_attr3_rmt_read_verify( int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return 0; ptr = bp->b_addr; - bno = bp->b_bn; + bno = xfs_buf_daddr(bp); len = BBTOB(bp->b_length); ASSERT(len >= blksize); @@ -191,11 +191,11 @@ xfs_attr3_rmt_write_verify( xfs_daddr_t bno; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; ptr = bp->b_addr; - bno = bp->b_bn; + bno = xfs_buf_daddr(bp); len = BBTOB(bp->b_length); ASSERT(len >= blksize); @@ -246,7 +246,7 @@ xfs_attr3_rmt_hdr_set( { struct xfs_attr3_rmt_hdr *rmt = ptr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return 0; rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); @@ -284,7 +284,7 @@ xfs_attr_rmtval_copyout( uint8_t **dst) { char *src = bp->b_addr; - xfs_daddr_t bno = bp->b_bn; + xfs_daddr_t bno = xfs_buf_daddr(bp); int len = BBTOB(bp->b_length); int blksize = mp->m_attr_geo->blksize; @@ -296,7 +296,7 @@ xfs_attr_rmtval_copyout( byte_cnt = min(*valuelen, byte_cnt); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (xfs_attr3_rmt_hdr_ok(src, ino, *offset, byte_cnt, bno)) { xfs_alert(mp, @@ -332,7 +332,7 @@ xfs_attr_rmtval_copyin( uint8_t **src) { char *dst = bp->b_addr; - xfs_daddr_t bno = bp->b_bn; + xfs_daddr_t bno = xfs_buf_daddr(bp); int len = BBTOB(bp->b_length); int blksize = mp->m_attr_geo->blksize; @@ -672,7 +672,7 @@ xfs_attr_rmtval_invalidate( * routine until it returns something other than -EAGAIN. 
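[Note: xfs_attr3_rmt_blocks(), at the top of this file's changes, divides the remote value over the per-block payload: on a CRC filesystem each block loses a remote-attribute header, so buflen is the block size minus sizeof(struct xfs_attr3_rmt_hdr) and the count rounds up. A worked example with an assumed 56-byte header on 4096-byte blocks.]

    #include <stdio.h>

    int main(void)
    {
            int blocksize = 4096;
            int hdrsize = 56;                       /* assumed remote-header size */
            int buflen = blocksize - hdrsize;       /* 4040 usable bytes per block */
            int attrlen = 10000;                    /* remote value length */

            /* same round-up as (attrlen + buflen - 1) / buflen in the hunk */
            printf("%d blocks\n", (attrlen + buflen - 1) / buflen); /* prints 3 */
            return 0;
    }

[On a pre-CRC filesystem no header is reserved, so the whole block is payload and the count is smaller for the same value length.]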
*/ int -__xfs_attr_rmtval_remove( +xfs_attr_rmtval_remove( struct xfs_delattr_context *dac) { struct xfs_da_args *args = dac->da_args; @@ -696,6 +696,7 @@ __xfs_attr_rmtval_remove( */ if (!done) { dac->flags |= XFS_DAC_DEFER_FINISH; + trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp); return -EAGAIN; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 61b85b918db8..d72eff30ca18 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,7 +12,7 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int __xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 948092babb6a..b48230f1a361 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -242,7 +242,7 @@ xfs_bmap_get_bp( for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { if (!cur->bc_bufs[i]) break; - if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + if (xfs_buf_daddr(cur->bc_bufs[i]) == bno) return cur->bc_bufs[i]; } @@ -251,7 +251,7 @@ xfs_bmap_get_bp( struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip; if (bip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_ADDR(bip->bli_buf) == bno) + xfs_buf_daddr(bip->bli_buf) == bno) return bip->bli_buf; } @@ -739,7 +739,7 @@ xfs_bmap_extents_to_btree( */ abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - xfs_btree_init_block_int(mp, ablock, abp->b_bn, + xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp), XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); @@ -1047,7 +1047,7 @@ xfs_bmap_set_attrforkoff( ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_forkoff) ip->i_forkoff = default_size; - else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version) + else if (xfs_has_attr2(ip->i_mount) && version) *version = 2; break; default: @@ -1115,17 +1115,17 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) goto trans_cancel; - if (!xfs_sb_version_hasattr(&mp->m_sb) || - (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + if (!xfs_has_attr(mp) || + (!xfs_has_attr2(mp) && version == 2)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr(&mp->m_sb)) { - xfs_sb_version_addattr(&mp->m_sb); + if (!xfs_has_attr(mp)) { + xfs_add_attr(mp); log_sb = true; } - if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { - xfs_sb_version_addattr2(&mp->m_sb); + if (!xfs_has_attr2(mp) && version == 2) { + xfs_add_attr2(mp); log_sb = true; } spin_unlock(&mp->m_sb_lock); @@ -3422,7 +3422,7 @@ xfs_bmap_compute_alignments( int stripe_align = 0; /* stripe alignment for allocation is determined by mount parameters */ - if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + if (mp->m_swidth && xfs_has_swalloc(mp)) stripe_align = mp->m_swidth; else if (mp->m_dalign) stripe_align = mp->m_dalign; @@ -3938,7 +3938,7 @@ xfs_bmapi_read( XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) return -EFSCORRUPTED; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapr); @@ -4420,7 +4420,7 @@ xfs_bmapi_write( return -EFSCORRUPTED; } - if 
(XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapw); @@ -4703,7 +4703,7 @@ xfs_bmapi_remap( return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_iread_extents(tp, ip, whichfork); @@ -5361,7 +5361,7 @@ __xfs_bunmapi( ifp = XFS_IFORK_PTR(ip, whichfork); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -5852,7 +5852,7 @@ xfs_bmap_collapse_extents( return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); @@ -5930,7 +5930,7 @@ xfs_bmap_can_insert_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -5967,7 +5967,7 @@ xfs_bmap_insert_extents( return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); @@ -6070,7 +6070,7 @@ xfs_bmap_split_extent( return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* Read in all the extents */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1ceba020940e..72444b8b38a6 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -58,7 +58,7 @@ xfs_bmdr_to_bmbt( void xfs_bmbt_disk_get_all( - struct xfs_bmbt_rec *rec, + const struct xfs_bmbt_rec *rec, struct xfs_bmbt_irec *irec) { uint64_t l0 = get_unaligned_be64(&rec->l0); @@ -78,7 +78,7 @@ xfs_bmbt_disk_get_all( */ xfs_filblks_t xfs_bmbt_disk_get_blockcount( - xfs_bmbt_rec_t *r) + const struct xfs_bmbt_rec *r) { return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21)); } @@ -88,7 +88,7 @@ xfs_bmbt_disk_get_blockcount( */ xfs_fileoff_t xfs_bmbt_disk_get_startoff( - xfs_bmbt_rec_t *r) + const struct xfs_bmbt_rec *r) { return ((xfs_fileoff_t)be64_to_cpu(r->l0) & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; @@ -136,7 +136,7 @@ xfs_bmbt_to_bmdr( xfs_bmbt_key_t *tkp; __be64 *tpp; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)); @@ -193,10 +193,10 @@ xfs_bmbt_update_cursor( STATIC int xfs_bmbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -282,7 +282,7 @@ xfs_bmbt_free_block( struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_trans *tp = cur->bc_tp; - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); struct xfs_owner_info oinfo; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); @@ -352,8 +352,8 @@ xfs_bmbt_get_dmaxrecs( STATIC void xfs_bmbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->bmbt.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); @@ -361,8 +361,8 @@ xfs_bmbt_init_key_from_rec( STATIC void 
xfs_bmbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->bmbt.br_startoff = cpu_to_be64( xfs_bmbt_disk_get_startoff(&rec->bmbt) + @@ -387,8 +387,8 @@ xfs_bmbt_init_ptr_from_cur( STATIC int64_t xfs_bmbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { return (int64_t)be64_to_cpu(key->bmbt.br_startoff) - cur->bc_rec.b.br_startoff; @@ -396,12 +396,12 @@ xfs_bmbt_key_diff( STATIC int64_t xfs_bmbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { - uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); - uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); + uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); + uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); /* * Note: This routine previously casted a and b to int64 and subtracted @@ -428,7 +428,7 @@ xfs_bmbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. @@ -497,9 +497,9 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { STATIC int xfs_bmbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be64_to_cpu(k1->bmbt.br_startoff) < be64_to_cpu(k2->bmbt.br_startoff); @@ -507,9 +507,9 @@ xfs_bmbt_keys_inorder( STATIC int xfs_bmbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return xfs_bmbt_disk_get_startoff(&r1->bmbt) + xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= @@ -563,7 +563,7 @@ xfs_bmbt_init_cursor( cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 72bf74c79fb9..729e3bc569be 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -16,7 +16,7 @@ struct xfs_trans; * Btree block header size depends on a superblock flag. */ #define XFS_BMBT_BLOCK_LEN(mp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + (xfs_has_crc(((mp))) ? 
\ XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) #define XFS_BMBT_REC_ADDR(mp, block, index) \ @@ -88,9 +88,10 @@ extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s); -extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); -extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); -extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); +extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(const struct xfs_bmbt_rec *r); +extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(const struct xfs_bmbt_rec *r); +void xfs_bmbt_disk_get_all(const struct xfs_bmbt_rec *r, + struct xfs_bmbt_irec *s); extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, xfs_bmdr_block_t *, int); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index be74a6b53689..298395481713 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -64,13 +64,13 @@ __xfs_btree_check_lblock( { struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_has_crc(mp); if (crc) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.l.bb_blkno != - cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) return __this_address; if (block->bb_u.l.bb_pad != cpu_to_be32(0)) return __this_address; @@ -129,13 +129,13 @@ __xfs_btree_check_sblock( { struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_has_crc(mp); if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.s.bb_blkno != - cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + cpu_to_be64(bp ? 
xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) return __this_address; } @@ -225,10 +225,10 @@ xfs_btree_check_sptr( */ static int xfs_btree_check_ptr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int index, - int level) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int index, + int level) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), @@ -273,7 +273,7 @@ xfs_btree_lblock_calc_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; - if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); @@ -287,7 +287,7 @@ xfs_btree_lblock_verify_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); @@ -311,7 +311,7 @@ xfs_btree_sblock_calc_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; - if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); @@ -325,7 +325,7 @@ xfs_btree_sblock_verify_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); @@ -374,7 +374,7 @@ xfs_btree_del_cursor( } ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || - XFS_FORCED_SHUTDOWN(cur->bc_mp)); + xfs_is_shutdown(cur->bc_mp)); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) @@ -420,7 +420,7 @@ xfs_btree_dup_cursor( bp = cur->bc_bufs[i]; if (bp) { error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, + xfs_buf_daddr(bp), mp->m_bsize, 0, &bp, cur->bc_ops->buf_ops); if (error) { @@ -935,9 +935,9 @@ xfs_btree_readahead( STATIC int xfs_btree_ptr_to_daddr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - xfs_daddr_t *daddr) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + xfs_daddr_t *daddr) { xfs_fsblock_t fsbno; xfs_agblock_t agbno; @@ -1012,8 +1012,8 @@ xfs_btree_setbuf( bool xfs_btree_ptr_is_null( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return ptr->l == cpu_to_be64(NULLFSBLOCK); @@ -1059,10 +1059,10 @@ xfs_btree_get_sibling( void xfs_btree_set_sibling( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - union xfs_btree_ptr *ptr, - int lr) + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + const union xfs_btree_ptr *ptr, + int lr) { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); @@ -1090,7 +1090,7 @@ xfs_btree_init_block_int( __u64 owner, unsigned int flags) { - int crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_has_crc(mp); __u32 magic = xfs_btree_magic(crc, btnum); buf->bb_magic = cpu_to_be32(magic); @@ -1131,7 +1131,7 @@ xfs_btree_init_block( __u16 numrecs, __u64 owner) { - xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + 
xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp), btnum, level, numrecs, owner, 0); } @@ -1155,9 +1155,9 @@ xfs_btree_init_block_cur( else owner = cur->bc_ag.pag->pag_agno; - xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - cur->bc_btnum, level, numrecs, - owner, cur->bc_flags); + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), + xfs_buf_daddr(bp), cur->bc_btnum, level, + numrecs, owner, cur->bc_flags); } /* @@ -1192,10 +1192,10 @@ xfs_btree_buf_to_ptr( { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, - XFS_BUF_ADDR(bp))); + xfs_buf_daddr(bp))); else { ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, - XFS_BUF_ADDR(bp))); + xfs_buf_daddr(bp))); } } @@ -1229,10 +1229,10 @@ xfs_btree_set_refs( int xfs_btree_get_buf_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - struct xfs_btree_block **block, - struct xfs_buf **bpp) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + struct xfs_btree_block **block, + struct xfs_buf **bpp) { struct xfs_mount *mp = cur->bc_mp; xfs_daddr_t d; @@ -1257,11 +1257,11 @@ xfs_btree_get_buf_block( */ STATIC int xfs_btree_read_buf_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int flags, - struct xfs_btree_block **block, - struct xfs_buf **bpp) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) { struct xfs_mount *mp = cur->bc_mp; xfs_daddr_t d; @@ -1289,10 +1289,10 @@ xfs_btree_read_buf_block( */ void xfs_btree_copy_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *dst_key, - union xfs_btree_key *src_key, - int numkeys) + struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + const union xfs_btree_key *src_key, + int numkeys) { ASSERT(numkeys >= 0); memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); @@ -1713,10 +1713,10 @@ error0: int xfs_btree_lookup_get_block( - struct xfs_btree_cur *cur, /* btree cursor */ - int level, /* level in the btree */ - union xfs_btree_ptr *pp, /* ptr to btree block */ - struct xfs_btree_block **blkp) /* return btree block */ + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in the btree */ + const union xfs_btree_ptr *pp, /* ptr to btree block */ + struct xfs_btree_block **blkp) /* return btree block */ { struct xfs_buf *bp; /* buffer pointer for btree block */ xfs_daddr_t daddr; @@ -1739,7 +1739,7 @@ xfs_btree_lookup_get_block( error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); if (error) return error; - if (bp && XFS_BUF_ADDR(bp) == daddr) { + if (bp && xfs_buf_daddr(bp) == daddr) { *blkp = XFS_BUF_TO_BLOCK(bp); return 0; } @@ -1749,7 +1749,7 @@ xfs_btree_lookup_get_block( return error; /* Check the inode owner since the verifiers don't. 
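[Note: throughout these btree hunks, direct bp->b_bn reads and the XFS_BUF_ADDR() macro give way to an xfs_buf_daddr() accessor. Wrapping the disk address behind a helper keeps callers independent of how struct xfs_buf stores it; a sketch of the accessor idiom, where the field layout is an assumption for illustration.]

    #include <stdint.h>

    typedef int64_t demo_daddr_t;           /* disk address in 512-byte units */

    struct demo_buf {
            demo_daddr_t    b_bn;           /* backing disk address */
    };

    static inline demo_daddr_t demo_buf_daddr(struct demo_buf *bp)
    {
            return bp->b_bn;
    }

[If the buffer later stores its address elsewhere, for example in a mapping array, only the accessor changes rather than the dozens of call sites touched in this diff.]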
*/ - if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + if (xfs_has_crc(cur->bc_mp) && !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != @@ -2923,10 +2923,11 @@ xfs_btree_new_iroot( */ memcpy(cblock, block, xfs_btree_block_len(cur)); if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + cblock->bb_u.l.bb_blkno = bno; else - cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + cblock->bb_u.s.bb_blkno = bno; } be16_add_cpu(&block->bb_level, 1); @@ -3225,7 +3226,7 @@ xfs_btree_insrec( /* Get pointers to the btree buffer and block. */ block = xfs_btree_get_block(cur, level, &bp); - old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL; + old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG @@ -3341,7 +3342,7 @@ xfs_btree_insrec( * some records into the new tree block), so use the regular key * update mechanism. */ - if (bp && bp->b_bn != old_bn) { + if (bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); @@ -4418,11 +4419,11 @@ xfs_btree_lblock_v5hdr_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return __this_address; if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn)) + if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; if (owner != XFS_RMAP_OWN_UNKNOWN && be64_to_cpu(block->bb_u.l.bb_owner) != owner) @@ -4468,11 +4469,11 @@ xfs_btree_sblock_v5hdr_verify( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return __this_address; if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) return __this_address; @@ -4499,7 +4500,7 @@ xfs_btree_sblock_verify( return __this_address; /* sibling pointer verification */ - agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) return __this_address; @@ -4536,8 +4537,8 @@ xfs_btree_compute_maxlevels( STATIC int xfs_btree_simple_query_range( struct xfs_btree_cur *cur, - union xfs_btree_key *low_key, - union xfs_btree_key *high_key, + const union xfs_btree_key *low_key, + const union xfs_btree_key *high_key, xfs_btree_query_range_fn fn, void *priv) { @@ -4627,8 +4628,8 @@ out: STATIC int xfs_btree_overlapped_query_range( struct xfs_btree_cur *cur, - union xfs_btree_key *low_key, - union xfs_btree_key *high_key, + const union xfs_btree_key *low_key, + const union xfs_btree_key *high_key, xfs_btree_query_range_fn fn, void *priv) { @@ -4769,8 +4770,8 @@ out: int xfs_btree_query_range( struct xfs_btree_cur *cur, - union xfs_btree_irec *low_rec, - union xfs_btree_irec *high_rec, + const union xfs_btree_irec *low_rec, + const union xfs_btree_irec 
*high_rec, xfs_btree_query_range_fn fn, void *priv) { @@ -4877,7 +4878,7 @@ xfs_btree_diff_two_ptrs( STATIC int xfs_btree_has_record_helper( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { return -ECANCELED; @@ -4886,12 +4887,12 @@ xfs_btree_has_record_helper( /* Is there a record covering a given range of keys? */ int xfs_btree_has_record( - struct xfs_btree_cur *cur, - union xfs_btree_irec *low, - union xfs_btree_irec *high, - bool *exists) + struct xfs_btree_cur *cur, + const union xfs_btree_irec *low, + const union xfs_btree_irec *high, + bool *exists) { - int error; + int error; error = xfs_btree_query_range(cur, low, high, &xfs_btree_has_record_helper, NULL); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4dbdc659c396..4eaf8517f850 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -106,19 +106,19 @@ struct xfs_btree_ops { /* update btree root pointer */ void (*set_root)(struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, int level_change); + const union xfs_btree_ptr *nptr, int level_change); /* block allocation / freeing */ int (*alloc_block)(struct xfs_btree_cur *cur, - union xfs_btree_ptr *start_bno, + const union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* update last record information */ void (*update_lastrec)(struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - union xfs_btree_rec *rec, + const struct xfs_btree_block *block, + const union xfs_btree_rec *rec, int ptr, int reason); /* records in block/level */ @@ -130,37 +130,37 @@ struct xfs_btree_ops { /* init values of btree structures */ void (*init_key_from_rec)(union xfs_btree_key *key, - union xfs_btree_rec *rec); + const union xfs_btree_rec *rec); void (*init_rec_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec); void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); void (*init_high_key_from_rec)(union xfs_btree_key *key, - union xfs_btree_rec *rec); + const union xfs_btree_rec *rec); /* difference between key value and cursor value */ int64_t (*key_diff)(struct xfs_btree_cur *cur, - union xfs_btree_key *key); + const union xfs_btree_key *key); /* * Difference between key2 and key1 -- positive if key1 > key2, * negative if key1 < key2, and zero if equal. */ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, - union xfs_btree_key *key1, - union xfs_btree_key *key2); + const union xfs_btree_key *key1, + const union xfs_btree_key *key2); const struct xfs_buf_ops *buf_ops; /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2); + const union xfs_btree_key *k1, + const union xfs_btree_key *k2); /* check that r1 is lower than r2 */ int (*recs_inorder)(struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2); + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2); }; /* @@ -423,7 +423,7 @@ void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* * Helpers. 
*/ -static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) +static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_numrecs); } @@ -434,7 +434,7 @@ static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, block->bb_numrecs = cpu_to_be16(numrecs); } -static inline int xfs_btree_get_level(struct xfs_btree_block *block) +static inline int xfs_btree_get_level(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_level); } @@ -471,10 +471,11 @@ unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); * code on its own. */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, void *priv); + const union xfs_btree_rec *rec, void *priv); int xfs_btree_query_range(struct xfs_btree_cur *cur, - union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec, + const union xfs_btree_irec *low_rec, + const union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv); int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, void *priv); @@ -502,10 +503,11 @@ union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n, union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, - union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); + const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); -bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); +bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr); int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur, const union xfs_btree_ptr *a, const union xfs_btree_ptr *b); @@ -516,8 +518,9 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur, struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); -int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, - union xfs_btree_irec *high, bool *exists); +int xfs_btree_has_record(struct xfs_btree_cur *cur, + const union xfs_btree_irec *low, + const union xfs_btree_irec *high, bool *exists); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); @@ -540,10 +543,11 @@ xfs_btree_islastblock( void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); -int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, - struct xfs_btree_block **block, struct xfs_buf **bpp); +int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, + struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, - struct xfs_btree_block *block, union xfs_btree_ptr *ptr, + struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, struct xfs_buf *bp, int level, int numrecs); @@ -551,7 +555,7 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, const union xfs_btree_ptr *src_ptr, int numptrs); void xfs_btree_copy_keys(struct xfs_btree_cur *cur, - union xfs_btree_key *dst_key, union xfs_btree_key *src_key, - int numkeys); + union xfs_btree_key 
*dst_key, + const union xfs_btree_key *src_key, int numkeys); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index aa8dc9521c39..ac9e80152b5c 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -59,10 +59,10 @@ xfs_btree_fakeroot_dup_cursor( */ STATIC int xfs_btree_fakeroot_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start_bno, - union xfs_btree_ptr *new_bno, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat) { ASSERT(0); return -EFSCORRUPTED; @@ -112,9 +112,9 @@ xfs_btree_fakeroot_init_ptr_from_cur( /* Update the btree root information for a per-AG fake root. */ STATIC void xfs_btree_afakeroot_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { struct xbtree_afakeroot *afake = cur->bc_ag.afake; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 747ec77912c3..c062e2c85178 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -129,7 +129,7 @@ xfs_da3_node_hdr_from_disk( struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from; to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); @@ -156,7 +156,7 @@ xfs_da3_node_hdr_to_disk( struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to; ASSERT(from->magic == XFS_DA3_NODE_MAGIC); @@ -191,10 +191,10 @@ xfs_da3_blkinfo_verify( if (!xfs_verify_magic16(bp, hdr->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -253,7 +253,7 @@ xfs_da3_node_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -442,12 +442,12 @@ xfs_da3_node_create( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF); node = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr)); ichdr.magic = XFS_DA3_NODE_MAGIC; - hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->info.owner = cpu_to_be64(args->dp->i_ino); uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { @@ -711,7 +711,7 @@ xfs_da3_root_split( oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; - node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); } xfs_trans_log_buf(tp, bp, 0, size - 1); @@ -1219,7 +1219,7 @@ xfs_da3_root_join( xfs_trans_buf_copy_type(root_blk->bp, bp); if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; - da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp)); } xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); diff 
--git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index b876b44c0204..5a49caa5c9df 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -789,7 +789,7 @@ struct xfs_attr3_rmt_hdr { #define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) #define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + ((bufsize) - (xfs_has_crc((mp)) ? \ sizeof(struct xfs_attr3_rmt_hdr) : 0)) /* Number of bytes in a directory block. */ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 050bdcc4fe73..50546eadaae2 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -115,7 +115,7 @@ xfs_da_mount( dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr); dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr); dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr); @@ -730,7 +730,7 @@ xfs_dir2_hashname( struct xfs_mount *mp, struct xfs_name *name) { - if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) + if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); return xfs_da_hashname(name->name, name->len); } @@ -741,7 +741,7 @@ xfs_dir2_compname( const unsigned char *name, int len) { - if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) + if (unlikely(xfs_has_asciici(args->dp->i_mount))) return xfs_ascii_ci_compname(args, name, len); return xfs_da_compname(args, name, len); } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 75e1421f69c4..df0869bba275 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -53,10 +53,10 @@ xfs_dir3_block_verify( if (!xfs_verify_magic(bp, hdr3->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -71,7 +71,7 @@ xfs_dir3_block_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -96,7 +96,7 @@ xfs_dir3_block_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -121,7 +121,7 @@ xfs_dir3_block_header_check( { struct xfs_mount *mp = dp->i_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (be64_to_cpu(hdr3->owner) != dp->i_ino) @@ -171,10 +171,10 @@ xfs_dir3_block_init( bp->b_ops = &xfs_dir3_block_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); - hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index e67fa086f2c1..dbcf58979a59 100644 --- 
a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -29,7 +29,7 @@ xfs_dir2_data_bestfree_p( struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr) { - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) return ((struct xfs_dir3_data_hdr *)hdr)->best_free; return hdr->bestfree; } @@ -51,7 +51,7 @@ xfs_dir2_data_get_ftype( struct xfs_mount *mp, struct xfs_dir2_data_entry *dep) { - if (xfs_sb_version_hasftype(&mp->m_sb)) { + if (xfs_has_ftype(mp)) { uint8_t ftype = dep->name[dep->namelen]; if (likely(ftype < XFS_DIR3_FT_MAX)) @@ -70,7 +70,7 @@ xfs_dir2_data_put_ftype( ASSERT(ftype < XFS_DIR3_FT_MAX); ASSERT(dep->namelen != 0); - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) dep->name[dep->namelen] = ftype; } @@ -297,10 +297,10 @@ xfs_dir3_data_verify( if (!xfs_verify_magic(bp, hdr3->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -343,7 +343,7 @@ xfs_dir3_data_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -368,7 +368,7 @@ xfs_dir3_data_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -401,7 +401,7 @@ xfs_dir3_data_header_check( { struct xfs_mount *mp = dp->i_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) @@ -717,12 +717,12 @@ xfs_dir3_data_init( * Initialize the header. 
*/ hdr = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); - hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 5369d8bb2593..d9b66306a9a7 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -37,7 +37,7 @@ xfs_dir2_leaf_hdr_from_disk( struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from; to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); @@ -68,7 +68,7 @@ xfs_dir2_leaf_hdr_to_disk( struct xfs_dir2_leaf *to, struct xfs_dir3_icleaf_hdr *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to; ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || @@ -108,7 +108,7 @@ xfs_dir3_leaf1_check( if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp)) return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) return __this_address; @@ -209,7 +209,7 @@ xfs_dir3_leaf_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -234,7 +234,7 @@ xfs_dir3_leaf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -308,7 +308,7 @@ xfs_dir3_leaf_init( ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; memset(leaf3, 0, sizeof(*leaf3)); @@ -316,7 +316,7 @@ xfs_dir3_leaf_init( leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC) ? 
cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); - leaf3->info.blkno = cpu_to_be64(bp->b_bn); + leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); leaf3->info.owner = cpu_to_be64(owner); uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index d0520afb913a..7a03aeb9f4c9 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -68,7 +68,7 @@ xfs_dir3_leafn_check( if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp)) return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) return __this_address; @@ -105,12 +105,12 @@ xfs_dir3_free_verify( if (!xfs_verify_magic(bp, hdr->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -128,7 +128,7 @@ xfs_dir3_free_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -153,7 +153,7 @@ xfs_dir3_free_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -185,7 +185,7 @@ xfs_dir3_free_header_check( firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * maxbests; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; if (be32_to_cpu(hdr3->firstdb) != firstdb) @@ -247,7 +247,7 @@ xfs_dir2_free_hdr_from_disk( struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from; to->magic = be32_to_cpu(from3->hdr.hdr.magic); @@ -274,7 +274,7 @@ xfs_dir2_free_hdr_to_disk( struct xfs_dir2_free *to, struct xfs_dir3_icfree_hdr *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to; ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); @@ -341,12 +341,12 @@ xfs_dir3_free_get_buf( memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); memset(&hdr, 0, sizeof(hdr)); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; hdr.magic = XFS_DIR3_FREE_MAGIC; - hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 94943ce49cab..711709a2aa53 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -196,7 +196,7 @@ xfs_dir2_data_entsize( len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen + sizeof(xfs_dir2_data_off_t) /* tag */; - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) len += sizeof(uint8_t); return round_up(len, XFS_DIR2_DATA_ALIGN); } diff --git 
a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 46d18bf9d5e1..5a97a87eaa20 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -48,7 +48,7 @@ xfs_dir2_sf_entsize( count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ count += hdr->i8count ? XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) count += sizeof(uint8_t); return count; } @@ -76,7 +76,7 @@ xfs_dir2_sf_get_ino( { uint8_t *from = sfep->name + sfep->namelen; - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) from++; if (!hdr->i8count) @@ -95,7 +95,7 @@ xfs_dir2_sf_put_ino( ASSERT(ino <= XFS_MAXINUMBER); - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) to++; if (hdr->i8count) @@ -135,7 +135,7 @@ xfs_dir2_sf_get_ftype( struct xfs_mount *mp, struct xfs_dir2_sf_entry *sfep) { - if (xfs_sb_version_hasftype(&mp->m_sb)) { + if (xfs_has_ftype(mp)) { uint8_t ftype = sfep->name[sfep->namelen]; if (ftype < XFS_DIR3_FT_MAX) @@ -153,7 +153,7 @@ xfs_dir2_sf_put_ftype( { ASSERT(ftype < XFS_DIR3_FT_MAX); - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) sfep->name[sfep->namelen] = ftype; } @@ -192,7 +192,7 @@ xfs_dir2_block_sfsize( * if there is a filetype field, add the extra byte to the namelen * for each entry that we see. */ - has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0; + has_ftype = xfs_has_ftype(mp) ? 1 : 0; count = i8count = namelen = 0; btp = xfs_dir2_block_tail_p(geo, hdr); diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 6766417d5ba4..deeb74becabc 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -70,7 +70,7 @@ xfs_dquot_verify( return __this_address; if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && - !xfs_sb_version_hasbigtime(&mp->m_sb)) + !xfs_has_bigtime(mp)) return __this_address; if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id) @@ -106,7 +106,7 @@ xfs_dqblk_verify( struct xfs_dqblk *dqb, xfs_dqid_t id) /* used only during quotacheck */ { - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -134,7 +134,7 @@ xfs_dqblk_repair( dqb->dd_diskdq.d_type = type; dqb->dd_diskdq.d_id = cpu_to_be32(id); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -151,7 +151,7 @@ xfs_dquot_buf_verify_crc( int ndquots; int i; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return true; /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 76e2461b9e66..2d7057b7984b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -9,7 +9,7 @@ /* * XFS On Disk Format Definitions * - * This header file defines all the on-disk format definitions for + * This header file defines all the on-disk format definitions for * general XFS objects. Directory and attribute related objects are defined in * xfs_da_format.h, which log and log item formats are defined in * xfs_log_format.h. Everything else goes here. @@ -265,7 +265,6 @@ typedef struct xfs_dsb { /* must be padded to 64 bit alignment */ } xfs_dsb_t; - /* * Misc. Flags - warning - these will be cleared by xfs_repair unless * a feature bit is set when the flag is used. 
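/*
 * Editorial aside: the hunks in this file (and throughout this series)
 * apply one mechanical pattern -- superblock version predicates such as
 * xfs_sb_version_hascrc(&mp->m_sb) become per-mount feature predicates
 * such as xfs_has_crc(mp), and raw bp->b_bn accesses become
 * xfs_buf_daddr(bp) calls. Below is a minimal sketch of the predicate
 * side, assuming a feature mask cached on the mount structure; the
 * struct, field, and bit names are illustrative stand-ins, not the
 * kernel's exact definitions.
 */
#include <stdbool.h>
#include <stdint.h>

#define XFS_FEAT_CRC_SKETCH	(1ULL << 0)	/* assumed bit assignment */

struct xfs_mount_sketch {
	uint64_t	m_features;	/* decoded once at mount time */
};

static inline bool
xfs_has_crc_sketch(const struct xfs_mount_sketch *mp)
{
	/* one mask test replaces re-deriving state from sb_versionnum */
	return (mp->m_features & XFS_FEAT_CRC_SKETCH) != 0;
}
/*
 * Caching the feature set at mount time is the design point here:
 * version checks in hot verifier paths shrink to a single branch on an
 * in-core word instead of decoding on-disk version bits every call.
 */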
@@ -280,37 +279,9 @@ typedef struct xfs_dsb { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -/* - * The first XFS version we support is a v4 superblock with V2 directories. - */ -static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) -{ - if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) - return false; - if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) - return false; - - /* check for unknown features in the fs */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - - return true; -} - -static inline bool xfs_sb_good_version(struct xfs_sb *sbp) +static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) { - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) - return true; - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) - return xfs_sb_good_v4_features(sbp); - return false; -} - -static inline bool xfs_sb_version_hasrealtime(struct xfs_sb *sbp) -{ - return sbp->sb_rblocks > 0; + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } /* @@ -322,9 +293,10 @@ static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) { - return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); + return xfs_sb_is_v5(sbp) || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); } static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) @@ -332,87 +304,18 @@ static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; } -static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); -} - static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; } -static inline bool xfs_sb_version_hasalign(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); -} - -static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); -} - -static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); -} - -static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); -} - -static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); -} - -static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); -} - -/* - * sb_features2 bit version macros. 
- */ -static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); -} - -static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); -} - static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; } -static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) -{ - sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; - if (!sbp->sb_features2) - sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; -} - -static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT)); -} - -static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) +static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; @@ -495,106 +398,21 @@ xfs_sb_has_incompat_log_feature( return (sbp->sb_features_log_incompat & feature) != 0; } -/* - * V5 superblock specific feature checks - */ -static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -/* - * v5 file systems support V3 inodes only, earlier file systems support - * v2 and v1 inodes. - */ -static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline bool xfs_dinode_good_version(struct xfs_sb *sbp, - uint8_t version) -{ - if (xfs_sb_version_has_v3inode(sbp)) - return version == 3; - return version == 1 || version == 2; -} - -static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); -} - -static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); -} - -static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); -} - -/* - * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID - * is stored separately from the user-visible UUID; this allows the - * user-visible UUID to be changed on V5 filesystems which have a - * filesystem UUID stamped into every piece of metadata. 
- */ -static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID); -} - -static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); -} - -static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); -} - -static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME); -} - -/* - * Inode btree block counter. We record the number of inobt and finobt blocks - * in the AGI header so that we can skip the finobt walk at mount time when - * setting up per-AG reservations. - */ -static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp) +static inline void +xfs_sb_remove_incompat_log_features( + struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT); + sbp->sb_features_log_incompat &= ~XFS_SB_FEAT_INCOMPAT_LOG_ALL; } -static inline bool xfs_sb_version_needsrepair(struct xfs_sb *sbp) +static inline void +xfs_sb_add_incompat_log_features( + struct xfs_sb *sbp, + unsigned int features) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + sbp->sb_features_log_incompat |= features; } -/* - * end of superblock version macros - */ static inline bool xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) @@ -1062,12 +880,12 @@ enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_DINODE_SIZE(sbp) \ - (xfs_sb_version_has_v3inode(sbp) ? \ +#define XFS_DINODE_SIZE(mp) \ + (xfs_has_v3inodes(mp) ? \ sizeof(struct xfs_dinode) : \ offsetof(struct xfs_dinode, di_crc)) #define XFS_LITINO(mp) \ - ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb)) + ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(mp)) /* * Inode data & attribute fork sizes, per inode. @@ -1454,7 +1272,7 @@ struct xfs_dsymlink_hdr { #define XFS_SYMLINK_MAPS 3 #define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + ((bufsize) - (xfs_has_crc((mp)) ? \ sizeof(struct xfs_dsymlink_hdr) : 0)) @@ -1686,7 +1504,7 @@ struct xfs_rmap_key { typedef __be32 xfs_rmap_ptr_t; #define XFS_RMAP_BLOCK(mp) \ - (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ + (xfs_has_finobt(((mp))) ? \ XFS_FIBT_BLOCK(mp) + 1 : \ XFS_IBT_BLOCK(mp) + 1) @@ -1918,7 +1736,7 @@ struct xfs_acl { * limited only by the maximum size of the xattr that stores the information. */ #define XFS_ACL_MAX_ENTRIES(mp) \ - (xfs_sb_version_hascrc(&mp->m_sb) \ + (xfs_has_crc(mp) \ ? 
(XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ sizeof(struct xfs_acl_entry) \ : 25) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index aaf8805a82df..994ad783d407 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -58,7 +58,7 @@ xfs_inobt_update( union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); - if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + if (xfs_has_sparseinodes(cur->bc_mp)) { rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); rec.inobt.ir_u.sp.ir_count = irec->ir_count; rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; @@ -74,11 +74,11 @@ xfs_inobt_update( void xfs_inobt_btrec_to_irec( struct xfs_mount *mp, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec) { irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) { + if (xfs_has_sparseinodes(mp)) { irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); irec->ir_count = rec->inobt.ir_u.sp.ir_count; irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; @@ -241,7 +241,7 @@ xfs_check_agi_freecount( } } while (i == 1); - if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) + if (!xfs_is_shutdown(cur->bc_mp)) ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); } return 0; @@ -302,7 +302,7 @@ xfs_ialloc_inode_init( * That means for v3 inode we log the entire buffer rather than just the * inode cores. */ - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); @@ -337,7 +337,6 @@ xfs_ialloc_inode_init( xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = XFS_DINODE_SIZE(&mp->m_sb); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); @@ -354,7 +353,7 @@ xfs_ialloc_inode_init( } else if (tp) { /* just log the inode core */ xfs_trans_log_buf(tp, fbuf, ioffset, - ioffset + isize - 1); + ioffset + XFS_DINODE_SIZE(mp) - 1); } } @@ -635,7 +634,7 @@ xfs_ialloc_ag_alloc( #ifdef DEBUG /* randomly do sparse inode allocations */ - if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) && + if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) do_sparse = prandom_u32() & 1; #endif @@ -712,7 +711,7 @@ xfs_ialloc_ag_alloc( */ isaligned = 0; if (igeo->ialloc_align) { - ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); + ASSERT(!xfs_has_noalign(args.mp)); args.alignment = args.mp->m_dalign; isaligned = 1; } else @@ -754,7 +753,7 @@ xfs_ialloc_ag_alloc( * Finally, try a sparse allocation if the filesystem supports it and * the sparse allocation length is smaller than a full chunk. */ - if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && + if (xfs_has_sparseinodes(args.mp) && igeo->ialloc_min_blks < igeo->ialloc_blks && args.fsbno == NULLFSBLOCK) { sparse_alloc: @@ -856,7 +855,7 @@ sparse_alloc: * from the previous call. Set merge false to replace any * existing record with this one. 
*/ - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, XFS_BTNUM_FINO, &rec, false); if (error) @@ -869,7 +868,7 @@ sparse_alloc: if (error) return error; - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, XFS_BTNUM_FINO); if (error) @@ -1448,7 +1447,7 @@ xfs_dialloc_ag( int offset; int i; - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + if (!xfs_has_finobt(mp)) return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop); /* @@ -1784,7 +1783,7 @@ xfs_dialloc( break; } - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { error = -EFSCORRUPTED; break; } @@ -1953,8 +1952,7 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - rec.ir_free == XFS_INOBT_ALL_FREE && + if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { struct xfs_perag *pag = agbp->b_pag; @@ -1994,7 +1992,7 @@ xfs_difree_inobt( goto error0; } - /* + /* * Change the inode free counts and log the ag/sb changes. */ be32_add_cpu(&agi->agi_freecount, 1); @@ -2098,9 +2096,8 @@ xfs_difree_finobt( * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ - if (rec.ir_free == XFS_INOBT_ALL_FREE && - mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK && - !(mp->m_flags & XFS_MOUNT_IKEEP)) { + if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) goto error; @@ -2189,7 +2186,7 @@ xfs_difree( /* * Fix up the free inode btree. */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (xfs_has_finobt(mp)) { error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec); if (error) goto error0; @@ -2478,7 +2475,7 @@ xfs_agi_verify( struct xfs_agi *agi = bp->b_addr; int i; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn))) @@ -2497,7 +2494,7 @@ xfs_agi_verify( be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; - if (xfs_sb_version_hasfinobt(&mp->m_sb) && + if (xfs_has_finobt(mp) && (be32_to_cpu(agi->agi_free_level) < 1 || be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; @@ -2528,7 +2525,7 @@ xfs_agi_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -2553,7 +2550,7 @@ xfs_agi_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -2626,7 +2623,7 @@ xfs_ialloc_read_agi( * we are in the middle of a forced shutdown. 
*/ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); + xfs_is_shutdown(mp)); return 0; } @@ -2716,7 +2713,7 @@ struct xfs_ialloc_count_inodes { STATIC int xfs_ialloc_count_inodes_rec( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { struct xfs_inobt_rec_incore irec; @@ -2773,7 +2770,7 @@ xfs_ialloc_setup_geometry( uint inodes; igeo->new_diflags2 = 0; - if (xfs_sb_version_hasbigtime(&mp->m_sb)) + if (xfs_has_bigtime(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; /* Compute inode btree geometry. */ @@ -2828,7 +2825,7 @@ xfs_ialloc_setup_geometry( * cannot change the behavior. */ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; @@ -2846,7 +2843,7 @@ xfs_ialloc_setup_geometry( igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster); /* Calculate inode cluster alignment. */ - if (xfs_sb_version_hasalign(&mp->m_sb) && + if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster) igeo->cluster_align = mp->m_sb.sb_inoalignmt; else @@ -2894,15 +2891,15 @@ xfs_ialloc_calc_rootino( first_bno += xfs_alloc_min_freelist(mp, NULL); /* ...the free inode btree root... */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) first_bno++; /* ...the reverse mapping btree root... */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) first_bno++; /* ...the reference count btree... */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) first_bno++; /* @@ -2920,9 +2917,9 @@ xfs_ialloc_calc_rootino( * Now round first_bno up to whatever allocation alignment is given * by the filesystem or was passed in. 
*/ - if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + if (xfs_has_dalign(mp) && igeo->ialloc_align > 0) first_bno = roundup(first_bno, sunit); - else if (xfs_sb_version_hasalign(&mp->m_sb) && + else if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt > 1) first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); @@ -2953,7 +2950,7 @@ xfs_ialloc_check_shrink( int has; int error; - if (!xfs_sb_version_hassparseinodes(&mp->m_sb)) + if (!xfs_has_sparseinodes(mp)) return 0; pag = xfs_perag_get(mp, agno); diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 9a2112b4ad5e..8b5c2b709022 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -106,7 +106,8 @@ int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); union xfs_btree_rec; -void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec, +void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, + const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 823a038939f8..27190840c5d8 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -40,9 +40,9 @@ xfs_inobt_dup_cursor( STATIC void xfs_inobt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, - int inc) /* level change */ + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *nptr, + int inc) /* level change */ { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agi *agi = agbp->b_addr; @@ -54,9 +54,9 @@ xfs_inobt_set_root( STATIC void xfs_finobt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, - int inc) /* level change */ + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *nptr, + int inc) /* level change */ { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agi *agi = agbp->b_addr; @@ -76,7 +76,7 @@ xfs_inobt_mod_blockcount( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agi *agi = agbp->b_addr; - if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) + if (!xfs_has_inobtcounts(cur->bc_mp)) return; if (cur->bc_btnum == XFS_BTNUM_FINO) @@ -88,11 +88,11 @@ xfs_inobt_mod_blockcount( STATIC int __xfs_inobt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat, - enum xfs_ag_resv_type resv) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat, + enum xfs_ag_resv_type resv) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -127,20 +127,20 @@ __xfs_inobt_alloc_block( STATIC int xfs_inobt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); } STATIC int xfs_finobt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { if (cur->bc_mp->m_finobt_nores) return xfs_inobt_alloc_block(cur, start, new, stat); @@ -156,7 +156,7 @@ __xfs_inobt_free_block( { xfs_inobt_mod_blockcount(cur, -1); return xfs_free_extent(cur->bc_tp, - XFS_DADDR_TO_FSB(cur->bc_mp, 
XFS_BUF_ADDR(bp)), 1, + XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1, &XFS_RMAP_OINFO_INOBT, resv); } @@ -188,18 +188,18 @@ xfs_inobt_get_maxrecs( STATIC void xfs_inobt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->inobt.ir_startino = rec->inobt.ir_startino; } STATIC void xfs_inobt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - __u32 x; + __u32 x; x = be32_to_cpu(rec->inobt.ir_startino); x += XFS_INODES_PER_CHUNK - 1; @@ -212,7 +212,7 @@ xfs_inobt_init_rec_from_cur( union xfs_btree_rec *rec) { rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); - if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + if (xfs_has_sparseinodes(cur->bc_mp)) { rec->inobt.ir_u.sp.ir_holemask = cpu_to_be16(cur->bc_rec.i.ir_holemask); rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count; @@ -253,8 +253,8 @@ xfs_finobt_init_ptr_from_cur( STATIC int64_t xfs_inobt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { return (int64_t)be32_to_cpu(key->inobt.ir_startino) - cur->bc_rec.i.ir_startino; @@ -262,9 +262,9 @@ xfs_inobt_key_diff( STATIC int64_t xfs_inobt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - be32_to_cpu(k2->inobt.ir_startino); @@ -292,7 +292,7 @@ xfs_inobt_verify( * but beware of the landmine (i.e. need to check pag->pagi_init) if we * ever do. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; @@ -360,9 +360,9 @@ const struct xfs_buf_ops xfs_finobt_buf_ops = { STATIC int xfs_inobt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->inobt.ir_startino) < be32_to_cpu(k2->inobt.ir_startino); @@ -370,9 +370,9 @@ xfs_inobt_keys_inorder( STATIC int xfs_inobt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= be32_to_cpu(r2->inobt.ir_startino); @@ -446,7 +446,7 @@ xfs_inobt_init_common( cur->bc_blocklog = mp->m_sb.sb_blocklog; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; /* take a reference for the cursor */ @@ -511,7 +511,7 @@ xfs_inobt_commit_staged_btree( fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); - if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + if (xfs_has_inobtcounts(cur->bc_mp)) { agi->agi_iblocks = cpu_to_be32(afake->af_blocks); fields |= XFS_AGI_IBLOCKS; } @@ -521,7 +521,7 @@ xfs_inobt_commit_staged_btree( fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); agi->agi_free_level = cpu_to_be32(afake->af_levels); - if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + if (xfs_has_inobtcounts(cur->bc_mp)) { agi->agi_fblocks = cpu_to_be32(afake->af_blocks); fields |= 
XFS_AGI_IBLOCKS; } @@ -737,10 +737,10 @@ xfs_finobt_calc_reserves( xfs_extlen_t tree_len = 0; int error; - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + if (!xfs_has_finobt(mp)) return 0; - if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) + if (xfs_has_inobtcounts(mp)) error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len); else error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index e530c82b2217..8a322d402e61 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -19,7 +19,7 @@ struct xfs_perag; * Btree block header size depends on a superblock flag. */ #define XFS_INOBT_BLOCK_LEN(mp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + (xfs_has_crc(((mp))) ? \ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 84ea2e0af9f0..3932b4ebf903 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -48,7 +48,7 @@ xfs_inode_buf_verify( /* * Validate the magic number and version of every inode in the buffer */ - agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { int di_ok; @@ -58,7 +58,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && - xfs_dinode_good_version(&mp->m_sb, dip->di_version) && + xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { @@ -71,7 +71,7 @@ xfs_inode_buf_verify( #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)bp->b_bn, i, + (unsigned long long)xfs_buf_daddr(bp), i, be16_to_cpu(dip->di_magic)); #endif xfs_buf_verifier_error(bp, -EFSCORRUPTED, @@ -192,7 +192,7 @@ xfs_inode_from_disk( * inode. If the inode is unused, mode is zero and we shouldn't mess * with the uninitialized part of it. 
*/ - if (!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) + if (!xfs_has_v3inodes(ip->i_mount)) ip->i_flushiter = be16_to_cpu(from->di_flushiter); inode->i_generation = be32_to_cpu(from->di_gen); inode->i_mode = be16_to_cpu(from->di_mode); @@ -235,7 +235,7 @@ xfs_inode_from_disk( if (from->di_dmevmask || from->di_dmstate) xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS); - if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (xfs_has_v3inodes(ip->i_mount)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); @@ -313,7 +313,7 @@ xfs_inode_to_disk( to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_flags = cpu_to_be16(ip->i_diflags); - if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (xfs_has_v3inodes(ip->i_mount)) { to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); @@ -413,7 +413,7 @@ xfs_dinode_verify( /* Verify v3 integrity information first */ if (dip->di_version >= 3) { - if (!xfs_sb_version_has_v3inode(&mp->m_sb)) + if (!xfs_has_v3inodes(mp)) return __this_address; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) @@ -515,7 +515,7 @@ xfs_dinode_verify( /* don't allow reflink/cowextsize if we don't have reflink */ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && - !xfs_sb_version_hasreflink(&mp->m_sb)) + !xfs_has_reflink(mp)) return __this_address; /* only regular files get reflink */ @@ -534,7 +534,7 @@ xfs_dinode_verify( /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && - !xfs_sb_version_hasbigtime(&mp->m_sb)) + !xfs_has_bigtime(mp)) return __this_address; return NULL; @@ -550,7 +550,7 @@ xfs_dinode_calc_crc( if (dip->di_version < 3) return; - ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + ASSERT(xfs_has_crc(mp)); crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); @@ -677,7 +677,7 @@ xfs_inode_validate_cowextsize( hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); - if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb)) + if (hint_flag && !xfs_has_reflink(mp)) return __this_address; if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 7f865bb4df84..585ed5a110af 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -21,7 +21,7 @@ struct xfs_imap { int xfs_imap_to_bp(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_imap *imap, struct xfs_buf **bpp); -void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +void xfs_dinode_calc_crc(struct xfs_mount *mp, struct xfs_dinode *dip); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); @@ -42,4 +42,13 @@ static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip, const xfs_timestamp_t ts); +static inline bool +xfs_dinode_good_version(struct xfs_mount *mp, uint8_t version) +{ + if (xfs_has_v3inodes(mp)) + return version == 3; + return version == 1 || version == 2; +} + + #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 2c5bcbc19264..b322db523d65 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ 
b/fs/xfs/libxfs/xfs_log_format.h @@ -41,10 +41,10 @@ typedef uint32_t xlog_tid_t; #define XFS_MIN_LOG_FACTOR 3 #define XLOG_REC_SHIFT(log) \ - BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + BTOBB(1 << (xfs_has_logv2(log->l_mp) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) #define XLOG_TOTAL_REC_SHIFT(log) \ - BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + BTOBB(XLOG_MAX_ICLOGS << (xfs_has_logv2(log->l_mp) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) /* get lsn fields */ @@ -434,7 +434,7 @@ struct xfs_log_dinode { }; #define xfs_log_dinode_size(mp) \ - (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? \ + (xfs_has_v3inodes((mp)) ? \ sizeof(struct xfs_log_dinode) : \ offsetof(struct xfs_log_dinode, di_next_unlinked)) diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 3cca2bfe714c..ff69a0000817 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -122,6 +122,8 @@ void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, const struct xfs_buf_ops *ops); bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); +int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, + struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 7f55eb3f3653..67798ff5e14e 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -92,7 +92,7 @@ xfs_log_calc_minimum_size( if (tres.tr_logcount > 1) max_logres *= tres.tr_logcount; - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) lsunit = BTOBB(mp->m_sb.sb_logsunit); /* diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 0f0af4e35032..a02c5062f9b2 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -60,37 +60,15 @@ typedef uint8_t xfs_dqtype_t; #define XFS_DQUOT_LOGRES(mp) \ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) -#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) -#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) -#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) -#define XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) #define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) #define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) #define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) /* - * Incore only flags for quotaoff - these bits get cleared when quota(s) - * are in the process of getting turned off. These flags are in m_qflags but - * never in sb_qflags. - */ -#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ -#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ -#define XFS_ALL_QUOTA_ACTIVE \ - (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) - -/* - * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees - * quota will be not be switched off as long as that inode lock is held. 
- */ -#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) -#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) -#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) - -/* * Flags to tell various functions what to do. Not all of these are meaningful * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 860a0c9801ba..e5d767a7fc5d 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -91,7 +91,7 @@ xfs_refcount_lookup_eq( /* Convert on-disk record to in-core format. */ void xfs_refcount_btrec_to_irec( - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) { irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); @@ -1253,7 +1253,7 @@ xfs_refcount_increase_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { - if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) + if (!xfs_has_reflink(tp->t_mountp)) return; __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, @@ -1268,7 +1268,7 @@ xfs_refcount_decrease_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { - if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) + if (!xfs_has_reflink(tp->t_mountp)) return; __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, @@ -1617,7 +1617,7 @@ xfs_refcount_alloc_cow_extent( { struct xfs_mount *mp = tp->t_mountp; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return; __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); @@ -1636,7 +1636,7 @@ xfs_refcount_free_cow_extent( { struct xfs_mount *mp = tp->t_mountp; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return; /* Remove rmap entry */ @@ -1654,7 +1654,7 @@ struct xfs_refcount_recovery { STATIC int xfs_refcount_recover_extent( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { struct list_head *debris = priv; diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9f6e9aae4da0..02cb3aa405be 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -78,7 +78,7 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); union xfs_btree_rec; -extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec, +extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 92d336c17e83..1ef9b99962ab 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -31,9 +31,9 @@ xfs_refcountbt_dup_cursor( STATIC void xfs_refcountbt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -51,10 +51,10 @@ xfs_refcountbt_set_root( STATIC int xfs_refcountbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union 
xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -102,7 +102,7 @@ xfs_refcountbt_free_block( struct xfs_mount *mp = cur->bc_mp; struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); int error; trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, @@ -135,18 +135,18 @@ xfs_refcountbt_get_maxrecs( STATIC void xfs_refcountbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->refc.rc_startblock = rec->refc.rc_startblock; } STATIC void xfs_refcountbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - __u32 x; + __u32 x; x = be32_to_cpu(rec->refc.rc_startblock); x += be32_to_cpu(rec->refc.rc_blockcount) - 1; @@ -177,20 +177,20 @@ xfs_refcountbt_init_ptr_from_cur( STATIC int64_t xfs_refcountbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { struct xfs_refcount_irec *rec = &cur->bc_rec.rc; - struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_key *kp = &key->refc; return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; } STATIC int64_t xfs_refcountbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - be32_to_cpu(k2->refc.rc_startblock); @@ -209,7 +209,7 @@ xfs_refcountbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return __this_address; fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) @@ -269,9 +269,9 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = { STATIC int xfs_refcountbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->refc.rc_startblock) < be32_to_cpu(k2->refc.rc_startblock); @@ -279,9 +279,9 @@ xfs_refcountbt_keys_inorder( STATIC int xfs_refcountbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->refc.rc_startblock) + be32_to_cpu(r1->refc.rc_blockcount) <= @@ -462,7 +462,7 @@ xfs_refcountbt_calc_reserves( xfs_extlen_t tree_len; int error; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return 0; error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index d1dfad0204e3..f45929b1b94a 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -179,8 +179,8 @@ done: /* Convert an internal btree record to an rmap record. 
*/ int xfs_rmap_btrec_to_irec( - union xfs_btree_rec *rec, - struct xfs_rmap_irec *irec) + const union xfs_btree_rec *rec, + struct xfs_rmap_irec *irec) { irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); @@ -255,9 +255,9 @@ struct xfs_find_left_neighbor_info { /* For each rmap given, figure out if it matches the key we want. */ STATIC int xfs_rmap_find_left_neighbor_helper( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, - void *priv) + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) { struct xfs_find_left_neighbor_info *info = priv; @@ -331,9 +331,9 @@ xfs_rmap_find_left_neighbor( /* For each rmap given, figure out if it matches the key we want. */ STATIC int xfs_rmap_lookup_le_range_helper( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, - void *priv) + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) { struct xfs_find_left_neighbor_info *info = priv; @@ -705,7 +705,7 @@ xfs_rmap_free( struct xfs_btree_cur *cur; int error; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); @@ -959,7 +959,7 @@ xfs_rmap_alloc( struct xfs_btree_cur *cur; int error; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); @@ -2278,9 +2278,9 @@ struct xfs_rmap_query_range_info { /* Format btree record and pass to our callback. */ STATIC int xfs_rmap_query_range_helper( - struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, - void *priv) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) { struct xfs_rmap_query_range_info *query = priv; struct xfs_rmap_irec irec; @@ -2296,8 +2296,8 @@ xfs_rmap_query_range_helper( int xfs_rmap_query_range( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *low_rec, - struct xfs_rmap_irec *high_rec, + const struct xfs_rmap_irec *low_rec, + const struct xfs_rmap_irec *high_rec, xfs_rmap_query_range_fn fn, void *priv) { @@ -2459,7 +2459,7 @@ xfs_rmap_update_is_needed( struct xfs_mount *mp, int whichfork) { - return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK; + return xfs_has_rmapbt(mp) && whichfork != XFS_COW_FORK; } /* @@ -2707,7 +2707,7 @@ struct xfs_rmap_key_state { STATIC int xfs_rmap_has_other_keys_helper( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xfs_rmap_key_state *rks = priv; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index f2423cf7f1e2..fd67904ed446 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -134,12 +134,13 @@ int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec, int *stat); typedef int (*xfs_rmap_query_range_fn)( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, - void *priv); + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv); int xfs_rmap_query_range(struct xfs_btree_cur *cur, - struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec, + const struct xfs_rmap_irec *low_rec, + const struct xfs_rmap_irec *high_rec, xfs_rmap_query_range_fn fn, void *priv); int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn, void *priv); @@ -192,7 +193,7 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_compare(const struct xfs_rmap_irec *a, const struct xfs_rmap_irec *b); union xfs_btree_rec; -int 
xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec, +int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index f29bc71b9950..b7dbbfb3aeed 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -57,9 +57,9 @@ xfs_rmapbt_dup_cursor( STATIC void xfs_rmapbt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -76,10 +76,10 @@ xfs_rmapbt_set_root( STATIC int xfs_rmapbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -122,7 +122,7 @@ xfs_rmapbt_free_block( xfs_agblock_t bno; int error; - bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); @@ -156,8 +156,8 @@ xfs_rmapbt_get_maxrecs( STATIC void xfs_rmapbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->rmap.rm_startblock = rec->rmap.rm_startblock; key->rmap.rm_owner = rec->rmap.rm_owner; @@ -173,11 +173,11 @@ xfs_rmapbt_init_key_from_rec( */ STATIC void xfs_rmapbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - uint64_t off; - int adj; + uint64_t off; + int adj; adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1; @@ -219,13 +219,13 @@ xfs_rmapbt_init_ptr_from_cur( STATIC int64_t xfs_rmapbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - struct xfs_rmap_irec *rec = &cur->bc_rec.r; - struct xfs_rmap_key *kp = &key->rmap; - __u64 x, y; - int64_t d; + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + const struct xfs_rmap_key *kp = &key->rmap; + __u64 x, y; + int64_t d; d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; if (d) @@ -249,14 +249,14 @@ xfs_rmapbt_key_diff( STATIC int64_t xfs_rmapbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { - struct xfs_rmap_key *kp1 = &k1->rmap; - struct xfs_rmap_key *kp2 = &k2->rmap; - int64_t d; - __u64 x, y; + const struct xfs_rmap_key *kp1 = &k1->rmap; + const struct xfs_rmap_key *kp2 = &k2->rmap; + int64_t d; + __u64 x, y; d = (int64_t)be32_to_cpu(kp1->rm_startblock) - be32_to_cpu(kp2->rm_startblock); @@ -304,7 +304,7 @@ xfs_rmapbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return __this_address; fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) @@ -364,9 +364,9 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = { STATIC int xfs_rmapbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + 
const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { uint32_t x; uint32_t y; @@ -394,9 +394,9 @@ xfs_rmapbt_keys_inorder( STATIC int xfs_rmapbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { uint32_t x; uint32_t y; @@ -558,7 +558,7 @@ xfs_rmapbt_compute_maxlevels( * disallow reflinking when less than 10% of the per-AG metadata * block reservation since the fallback is a regular file copy. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; else mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( @@ -606,7 +606,7 @@ xfs_rmapbt_calc_reserves( xfs_extlen_t tree_len; int error; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 88d8d18788a2..f2eee6572af4 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -59,4 +59,4 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp, extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); -#endif /* __XFS_RMAP_BTREE_H__ */ +#endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 483375c6a735..5740ba664867 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1009,8 +1009,8 @@ xfs_rtfree_extent( int xfs_rtalloc_query_range( struct xfs_trans *tp, - struct xfs_rtalloc_rec *low_rec, - struct xfs_rtalloc_rec *high_rec, + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, xfs_rtalloc_query_range_fn fn, void *priv) { @@ -1018,6 +1018,7 @@ xfs_rtalloc_query_range( struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; + xfs_rtblock_t high_key; int is_free; int error = 0; @@ -1026,12 +1027,12 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - high_rec->ar_startext = min(high_rec->ar_startext, - mp->m_sb.sb_rextents - 1); + + high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 1); /* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; - while (rtstart <= high_rec->ar_startext) { + while (rtstart <= high_key) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, &is_free); @@ -1039,8 +1040,7 @@ xfs_rtalloc_query_range( break; /* How long does the extent go for? */ - error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startext, &rtend); + error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend); if (error) break; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 04f5386446db..e58349be78bd 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -30,13 +30,110 @@ * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ +/* + * We support all XFS versions newer than a v4 superblock with V2 directories. 
+ */ +bool +xfs_sb_good_version( + struct xfs_sb *sbp) +{ + /* all v5 filesystems are supported */ + if (xfs_sb_is_v5(sbp)) + return true; + + /* versions prior to v4 are not supported */ + if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) + return false; + + /* V4 filesystems need v2 directories and unwritten extents */ + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; + if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) + return false; + + /* And must not have any unknown v4 feature bits set */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; + + /* It's a supported v4 filesystem */ + return true; +} + +uint64_t +xfs_sb_version_to_features( + struct xfs_sb *sbp) +{ + uint64_t features = 0; + + /* optional V4 features */ + if (sbp->sb_rblocks > 0) + features |= XFS_FEAT_REALTIME; + if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) + features |= XFS_FEAT_ATTR; + if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) + features |= XFS_FEAT_QUOTA; + if (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT) + features |= XFS_FEAT_ALIGN; + if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT) + features |= XFS_FEAT_LOGV2; + if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT) + features |= XFS_FEAT_DALIGN; + if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT) + features |= XFS_FEAT_EXTFLG; + if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) + features |= XFS_FEAT_SECTOR; + if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT) + features |= XFS_FEAT_ASCIICI; + if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { + if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) + features |= XFS_FEAT_LAZYSBCOUNT; + if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) + features |= XFS_FEAT_ATTR2; + if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) + features |= XFS_FEAT_PROJID32; + if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) + features |= XFS_FEAT_FTYPE; + } + + if (!xfs_sb_is_v5(sbp)) + return features; + + /* Always on V5 features */ + features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | + XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | + XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; + + /* Optional V5 features */ + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT) + features |= XFS_FEAT_FINOBT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT) + features |= XFS_FEAT_RMAPBT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK) + features |= XFS_FEAT_REFLINK; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) + features |= XFS_FEAT_INOBTCNT; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) + features |= XFS_FEAT_FTYPE; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) + features |= XFS_FEAT_SPINODES; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) + features |= XFS_FEAT_META_UUID; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME) + features |= XFS_FEAT_BIGTIME; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) + features |= XFS_FEAT_NEEDSREPAIR; + return features; +} + /* Check all the superblock fields we care about when reading one in. 
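
The two helpers above replace the scattered xfs_sb_version_has*() checks with a single translation pass: xfs_sb_good_version() gates which superblocks are accepted at all, and xfs_sb_version_to_features() folds the v4/v5 version bits into one uint64_t feature mask computed once at mount time, so later feature tests become a cheap mask-and. A minimal standalone model of that idea; the bit assignments and names here are illustrative, not the kernel's actual values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FEAT_RMAPBT   (1ULL << 0)   /* illustrative bit assignments */
#define FEAT_REFLINK  (1ULL << 1)
#define FEAT_CRC      (1ULL << 2)

struct mount {
        uint64_t features;          /* filled once, at mount time */
};

/* Cheap runtime predicate, analogous to xfs_has_rmapbt(mp). */
static inline bool has_rmapbt(const struct mount *mp)
{
        return (mp->features & FEAT_RMAPBT) != 0;
}

int main(void)
{
        struct mount m = { .features = FEAT_CRC | FEAT_RMAPBT };

        printf("rmapbt? %s\n", has_rmapbt(&m) ? "yes" : "no");
        return 0;
}
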
*/ STATIC int xfs_validate_sb_read( struct xfs_mount *mp, struct xfs_sb *sbp) { - if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5) + if (!xfs_sb_is_v5(sbp)) return 0; /* @@ -56,7 +153,7 @@ xfs_validate_sb_read( "Superblock has unknown read-only compatible features (0x%x) enabled.", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (!xfs_is_readonly(mp)) { xfs_warn(mp, "Attempted to mount read-only compatible filesystem read-write."); xfs_warn(mp, @@ -95,7 +192,7 @@ xfs_validate_sb_write( * secondary superblocks, so allow this usage to continue because * we never read counters from such superblocks. */ - if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && + if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && (sbp->sb_fdblocks > sbp->sb_dblocks || !xfs_verify_icount(mp, sbp->sb_icount) || sbp->sb_ifree > sbp->sb_icount)) { @@ -103,7 +200,7 @@ xfs_validate_sb_write( return -EFSCORRUPTED; } - if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5) + if (!xfs_sb_is_v5(sbp)) return 0; /* @@ -162,6 +259,7 @@ xfs_validate_sb_common( struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; + bool has_dalign; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "bad magic number"); @@ -173,12 +271,41 @@ xfs_validate_sb_common( return -EWRONGFS; } - if (xfs_sb_version_has_pquotino(sbp)) { + /* + * Validate feature flags and state + */ + if (xfs_sb_is_v5(sbp)) { + if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { + xfs_notice(mp, +"Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)", + sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE); + return -EFSCORRUPTED; + } + + /* V5 has a separate project quota inode */ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { xfs_notice(mp, "Version 5 of Super block has XFS_OQUOTA bits."); return -EFSCORRUPTED; } + + /* + * Full inode chunks must be aligned to inode chunk size when + * sparse inodes are enabled to support the sparse chunk + * allocation algorithm and prevent overlapping inode records. + */ + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) { + uint32_t align; + + align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize + >> sbp->sb_blocklog; + if (sbp->sb_inoalignmt != align) { + xfs_warn(mp, +"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", + sbp->sb_inoalignmt, align); + return -EINVAL; + } + } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, @@ -186,24 +313,6 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } - /* - * Full inode chunks must be aligned to inode chunk size when - * sparse inodes are enabled to support the sparse chunk - * allocation algorithm and prevent overlapping inode records. - */ - if (xfs_sb_version_hassparseinodes(sbp)) { - uint32_t align; - - align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize - >> sbp->sb_blocklog; - if (sbp->sb_inoalignmt != align) { - xfs_warn(mp, -"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", - sbp->sb_inoalignmt, align); - return -EINVAL; - } - } - if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -303,7 +412,8 @@ xfs_validate_sb_common( * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign) * would imply the image is corrupted. 
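
The expression !!sbp->sb_unit ^ has_dalign in the hunk below flags exactly the two inconsistent states the comment describes: a stripe unit recorded without the dalign feature bit, or the feature bit set with no stripe unit. A small standalone check of the same shape, with illustrative field names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Consistency rule: sb_unit and the dalign feature bit must agree.
 * !!unit normalizes the value to 0/1 so XOR gives:
 *   unit=0, dalign=0 -> ok      unit>0, dalign=1 -> ok
 *   unit=0, dalign=1 -> bad     unit>0, dalign=0 -> bad
 */
static bool stripe_align_sane(uint32_t sb_unit, bool has_dalign)
{
        return !(!!sb_unit ^ has_dalign);
}

int main(void)
{
        printf("%d %d %d %d\n",
               stripe_align_sane(0, false),  /* 1: consistent */
               stripe_align_sane(16, true),  /* 1: consistent */
               stripe_align_sane(16, false), /* 0: corrupt image */
               stripe_align_sane(0, true));  /* 0: corrupt image */
        return 0;
}
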
*/ - if (!!sbp->sb_unit ^ xfs_sb_version_hasdalign(sbp)) { + has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT; + if (!!sbp->sb_unit ^ has_dalign) { xfs_notice(mp, "SB stripe alignment sanity check failed"); return -EFSCORRUPTED; } @@ -312,12 +422,6 @@ xfs_validate_sb_common( XFS_FSB_TO_B(mp, sbp->sb_width), 0, false)) return -EFSCORRUPTED; - if (xfs_sb_version_hascrc(&mp->m_sb) && - sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { - xfs_notice(mp, "v5 SB sanity check failed"); - return -EFSCORRUPTED; - } - /* * Currently only very few inode sizes are supported. */ @@ -361,7 +465,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) * We need to do these manipilations only if we are working * with an older version of on-disk superblock. */ - if (xfs_sb_version_has_pquotino(sbp)) + if (xfs_sb_is_v5(sbp)) return; if (sbp->sb_qflags & XFS_OQUOTA_ENFD) @@ -454,7 +558,8 @@ __xfs_sb_from_disk( * sb_meta_uuid is only on disk if it differs from sb_uuid and the * feature flag is set; if not set we keep it only in memory. */ - if (xfs_sb_version_hasmetauuid(to)) + if (xfs_sb_is_v5(to) && + (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); else uuid_copy(&to->sb_meta_uuid, &from->sb_uuid); @@ -479,7 +584,12 @@ xfs_sb_quota_to_disk( uint16_t qflags = from->sb_qflags; to->sb_uquotino = cpu_to_be64(from->sb_uquotino); - if (xfs_sb_version_has_pquotino(from)) { + + /* + * The in-memory superblock quota state matches the v5 on-disk format so + * just write them out and return + */ + if (xfs_sb_is_v5(from)) { to->sb_qflags = cpu_to_be16(from->sb_qflags); to->sb_gquotino = cpu_to_be64(from->sb_gquotino); to->sb_pquotino = cpu_to_be64(from->sb_pquotino); @@ -487,9 +597,9 @@ xfs_sb_quota_to_disk( } /* - * The in-core version of sb_qflags do not have XFS_OQUOTA_* - * flags, whereas the on-disk version does. So, convert incore - * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. + * For older superblocks (v4), the in-core version of sb_qflags do not + * have XFS_OQUOTA_* flags, whereas the on-disk version does. So, + * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. 
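
The conversion described in the comment above is mechanical: a v4 superblock carries a single OQUOTA enforce/check pair for "the other" quota type, while the in-core flags track project and group quota separately. A standalone sketch of the to-disk direction, consistent with the mask that follows in the hunk; the bit values are made up for illustration:

#include <stdint.h>
#include <stdio.h>

/* Illustrative flag bits, not the kernel's real values. */
#define QF_PQUOTA_ENFD  0x01
#define QF_GQUOTA_ENFD  0x02
#define QF_PQUOTA_CHKD  0x04
#define QF_GQUOTA_CHKD  0x08
#define QF_OQUOTA_ENFD  0x10    /* v4 on-disk: "other" quota enforced */
#define QF_OQUOTA_CHKD  0x20    /* v4 on-disk: "other" quota checked */

/* Fold per-type in-core flags back into the v4 OQUOTA_* pair. */
static uint16_t qflags_to_v4_disk(uint16_t qflags)
{
        uint16_t out = qflags;

        out &= ~(QF_PQUOTA_ENFD | QF_GQUOTA_ENFD |
                 QF_PQUOTA_CHKD | QF_GQUOTA_CHKD);
        if (qflags & (QF_PQUOTA_ENFD | QF_GQUOTA_ENFD))
                out |= QF_OQUOTA_ENFD;
        if (qflags & (QF_PQUOTA_CHKD | QF_GQUOTA_CHKD))
                out |= QF_OQUOTA_CHKD;
        return out;
}

int main(void)
{
        printf("0x%x\n", qflags_to_v4_disk(QF_PQUOTA_ENFD | QF_GQUOTA_CHKD));
        return 0;
}
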
*/ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); @@ -589,19 +699,20 @@ xfs_sb_to_disk( to->sb_features2 = cpu_to_be32(from->sb_features2); to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2); - if (xfs_sb_version_hascrc(from)) { - to->sb_features_compat = cpu_to_be32(from->sb_features_compat); - to->sb_features_ro_compat = - cpu_to_be32(from->sb_features_ro_compat); - to->sb_features_incompat = - cpu_to_be32(from->sb_features_incompat); - to->sb_features_log_incompat = - cpu_to_be32(from->sb_features_log_incompat); - to->sb_spino_align = cpu_to_be32(from->sb_spino_align); - to->sb_lsn = cpu_to_be64(from->sb_lsn); - if (xfs_sb_version_hasmetauuid(from)) - uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); - } + if (!xfs_sb_is_v5(from)) + return; + + to->sb_features_compat = cpu_to_be32(from->sb_features_compat); + to->sb_features_ro_compat = + cpu_to_be32(from->sb_features_ro_compat); + to->sb_features_incompat = + cpu_to_be32(from->sb_features_incompat); + to->sb_features_log_incompat = + cpu_to_be32(from->sb_features_log_incompat); + to->sb_spino_align = cpu_to_be32(from->sb_spino_align); + to->sb_lsn = cpu_to_be64(from->sb_lsn); + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) + uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); } /* @@ -636,8 +747,8 @@ xfs_sb_read_verify( if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { /* Only fail bad secondaries on a known V5 filesystem */ - if (bp->b_bn == XFS_SB_DADDR || - xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_buf_daddr(bp) == XFS_SB_DADDR || + xfs_has_crc(mp)) { error = -EFSBADCRC; goto out_error; } @@ -704,7 +815,7 @@ xfs_sb_write_verify( if (error) goto out_error; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_sb_is_v5(&sb)) return; if (bip) @@ -801,7 +912,7 @@ xfs_log_sb( * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. 
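
With lazy superblock counters, now tested via xfs_has_lazysbcount(), the authoritative counts live in per-CPU counters and are only folded into the in-core superblock when it is about to be logged, which is what the percpu_counter_sum() calls in the hunk below do. A standalone model of that summation step; the types and names are illustrative:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

/*
 * Model: each CPU keeps a local delta; the superblock field is only
 * brought up to date by summing them when we are about to log it.
 */
struct pcp_counter {
        int64_t cpu[NR_CPUS];
};

static int64_t pcp_sum(const struct pcp_counter *c)
{
        int64_t sum = 0;

        for (int i = 0; i < NR_CPUS; i++)
                sum += c->cpu[i];
        return sum;
}

int main(void)
{
        struct pcp_counter icount = { .cpu = { 100, -3, 42, 7 } };
        uint64_t sb_icount;

        sb_icount = (uint64_t)pcp_sum(&icount);  /* fold before logging */
        printf("sb_icount = %llu\n", (unsigned long long)sb_icount);
        return 0;
}
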
*/ - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) { + if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); @@ -950,10 +1061,12 @@ out: void xfs_fs_geometry( - struct xfs_sb *sbp, + struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version) { + struct xfs_sb *sbp = &mp->m_sb; + memset(geo, 0, sizeof(struct xfs_fsop_geom)); geo->blocksize = sbp->sb_blocksize; @@ -984,51 +1097,51 @@ xfs_fs_geometry( geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | XFS_FSOP_GEOM_FLAGS_EXTFLG; - if (xfs_sb_version_hasattr(sbp)) + if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; - if (xfs_sb_version_hasquota(sbp)) + if (xfs_has_quota(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA; - if (xfs_sb_version_hasalign(sbp)) + if (xfs_has_align(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; - if (xfs_sb_version_hasdalign(sbp)) + if (xfs_has_dalign(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; - if (xfs_sb_version_hassector(sbp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; - if (xfs_sb_version_hasasciici(sbp)) + if (xfs_has_asciici(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; - if (xfs_sb_version_haslazysbcount(sbp)) + if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; - if (xfs_sb_version_hasattr2(sbp)) + if (xfs_has_attr2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; - if (xfs_sb_version_hasprojid32bit(sbp)) + if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; - if (xfs_sb_version_hascrc(sbp)) + if (xfs_has_crc(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB; - if (xfs_sb_version_hasftype(sbp)) + if (xfs_has_ftype(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE; - if (xfs_sb_version_hasfinobt(sbp)) + if (xfs_has_finobt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT; - if (xfs_sb_version_hassparseinodes(sbp)) + if (xfs_has_sparseinodes(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES; - if (xfs_sb_version_hasrmapbt(sbp)) + if (xfs_has_rmapbt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; - if (xfs_sb_version_hasreflink(sbp)) + if (xfs_has_reflink(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; - if (xfs_sb_version_hasbigtime(sbp)) + if (xfs_has_bigtime(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; - if (xfs_sb_version_hasinobtcounts(sbp)) + if (xfs_has_inobtcounts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; - if (xfs_sb_version_hassector(sbp)) + if (xfs_has_sector(mp)) { + geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; - else + } else { geo->logsectsize = BBSIZE; + } geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); if (struct_version < 4) return; - if (xfs_sb_version_haslogv2(sbp)) + if (xfs_has_logv2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; geo->logsunit = sbp->sb_logsunit; diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 0c1602d9b53d..a5e14740ec9a 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -20,11 +20,13 @@ extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); +extern bool xfs_sb_good_version(struct xfs_sb *sbp); +extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp); extern int xfs_update_secondary_sbs(struct xfs_mount *mp); #define 
XFS_FS_GEOM_MAX_STRUCT_VER (4) -extern void xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, +extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version); extern int xfs_sb_read_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 594bc447a7dd..f0b38f4aba80 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -42,7 +42,7 @@ xfs_symlink_hdr_set( { struct xfs_dsymlink_hdr *dsl = bp->b_addr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return 0; memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr)); @@ -51,7 +51,7 @@ xfs_symlink_hdr_set( dsl->sl_bytes = cpu_to_be32(size); uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid); dsl->sl_owner = cpu_to_be64(ino); - dsl->sl_blkno = cpu_to_be64(bp->b_bn); + dsl->sl_blkno = cpu_to_be64(xfs_buf_daddr(bp)); bp->b_ops = &xfs_symlink_buf_ops; return sizeof(struct xfs_dsymlink_hdr); @@ -89,13 +89,13 @@ xfs_symlink_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return __this_address; if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + if (xfs_buf_daddr(bp) != be64_to_cpu(dsl->sl_blkno)) return __this_address; if (be32_to_cpu(dsl->sl_offset) + be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN) @@ -116,7 +116,7 @@ xfs_symlink_read_verify( xfs_failaddr_t fa; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) @@ -137,7 +137,7 @@ xfs_symlink_write_verify( xfs_failaddr_t fa; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; fa = xfs_symlink_verify(bp); @@ -173,7 +173,7 @@ xfs_symlink_local_to_remote( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); - if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_has_crc(mp)) { bp->b_ops = NULL; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 16f723ebe8dd..8b5547073379 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -136,7 +136,7 @@ xfs_trans_log_inode( * to upgrade this inode to bigtime format, do so now. 
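
The xfs_trans_log_inode() hunk that follows gates an in-place format upgrade on the new xfs_has_bigtime() predicate: when an inode's core or timestamps are being logged on a bigtime filesystem and the inode is still in the old timestamp format, the flag is set and the core is forced into the log. A standalone sketch of that upgrade-on-write pattern, with illustrative flag values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LOG_CORE        0x1     /* illustrative log flags */
#define LOG_TIMESTAMP   0x2
#define DIFLAG2_BIGTIME 0x8

struct inode_model {
        uint64_t diflags2;
};

static bool fs_has_bigtime = true;  /* stands in for xfs_has_bigtime(mp) */

/* Upgrade lazily: only when we are logging the fields anyway. */
static unsigned int log_inode(struct inode_model *ip, unsigned int flags)
{
        if ((flags & (LOG_CORE | LOG_TIMESTAMP)) && fs_has_bigtime &&
            !(ip->diflags2 & DIFLAG2_BIGTIME)) {
                ip->diflags2 |= DIFLAG2_BIGTIME;
                flags |= LOG_CORE;  /* the upgraded core must hit the log */
        }
        return flags;
}

int main(void)
{
        struct inode_model ip = { .diflags2 = 0 };

        printf("flags=0x%x diflags2=0x%llx\n",
               log_inode(&ip, LOG_TIMESTAMP),
               (unsigned long long)ip.diflags2);
        return 0;
}
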
*/ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && - xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) && + xfs_has_bigtime(ip->i_mount) && !xfs_inode_has_bigtime(ip)) { ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; flags |= XFS_ILOG_CORE; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index d1a0848cb52e..5e300daa2559 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -71,9 +71,9 @@ xfs_allocfree_log_count( uint blocks; blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; @@ -155,7 +155,7 @@ STATIC uint xfs_calc_finobt_res( struct xfs_mount *mp) { - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + if (!xfs_has_finobt(mp)) return 0; return xfs_calc_inobt_res(mp); @@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res( XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ - if (xfs_sb_version_has_v3inode(&mp->m_sb)) + if (xfs_has_v3inodes(mp)) return res; size = XFS_FSB_TO_B(mp, 1); } @@ -268,7 +268,7 @@ xfs_calc_write_reservation( xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); - if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + if (xfs_has_realtime(mp)) { t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + @@ -317,7 +317,7 @@ xfs_calc_itruncate_reservation( t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); - if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + if (xfs_has_realtime(mp)) { t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); @@ -799,29 +799,6 @@ xfs_calc_qm_dqalloc_reservation( } /* - * Turning off quotas. - * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 - * the superblock for the quota flags: sector size - */ -STATIC uint -xfs_calc_qm_quotaoff_reservation( - struct xfs_mount *mp) -{ - return sizeof(struct xfs_qoff_logitem) * 2 + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); -} - -/* - * End of turning off quotas. - * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 - */ -STATIC uint -xfs_calc_qm_quotaoff_end_reservation(void) -{ - return sizeof(struct xfs_qoff_logitem) * 2; -} - -/* * Syncing the incore super block changes to disk. * the super block to reflect the changes: sector size */ @@ -842,14 +819,14 @@ xfs_trans_resv_calc( * require a permanent reservation on space. 
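
xfs_allocfree_log_count() above sizes the reservation as num_ops * 2 * (2 * m_ag_maxlevels - 1) blocks for the two free-space btrees, then adds 2 * maxlevels - 1 blocks per operation for each of the rmap and refcount btrees when those features are present. A quick worked example; the tree heights are hypothetical:

#include <stdio.h>

int main(void)
{
        unsigned int num_ops = 2;
        unsigned int ag_maxlevels = 5;      /* hypothetical tree heights */
        unsigned int rmap_maxlevels = 6;
        unsigned int refc_maxlevels = 4;
        unsigned int blocks;

        /* bnobt + cntbt: full split path of both trees, per op */
        blocks = num_ops * 2 * (2 * ag_maxlevels - 1);     /* 2*2*9 = 36 */
        blocks += num_ops * (2 * rmap_maxlevels - 1);      /* + 2*11 = 22 */
        blocks += num_ops * (2 * refc_maxlevels - 1);      /* + 2*7  = 14 */

        printf("reservation: %u blocks\n", blocks);        /* 72 */
        return 0;
}
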
*/ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; else resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT_REFLINK; else @@ -910,7 +887,7 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; else resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; @@ -923,13 +900,6 @@ xfs_trans_resv_calc( resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(); resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; - resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); - resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; - - resp->tr_qm_equotaoff.tr_logres = - xfs_calc_qm_quotaoff_end_reservation(); - resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; - resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT; diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 7241ab28cf84..fc4e9b369a3a 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -46,8 +46,6 @@ struct xfs_trans_resv { struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ - struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ - struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ }; diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 7ad3659c5d2a..50332be34388 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -57,8 +57,7 @@ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ (M_IGEO(mp)->ialloc_blks + \ - ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ - M_IGEO(mp)->inobt_maxlevels)) + ((xfs_has_finobt(mp) ? 2 : 1) * M_IGEO(mp)->inobt_maxlevels)) /* * Space reservation values for various transactions. @@ -94,8 +93,7 @@ #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) #define XFS_IFREE_SPACE_RES(mp) \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? \ - M_IGEO(mp)->inobt_maxlevels : 0) + (xfs_has_finobt(mp) ? 
M_IGEO(mp)->inobt_maxlevels : 0) #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index e8f4abee7892..e810d23f2d97 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -169,7 +169,7 @@ xfs_internal_inum( xfs_ino_t ino) { return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || - (xfs_sb_version_hasquota(&mp->m_sb) && + (xfs_has_quota(mp) && xfs_is_quota_inode(&mp->m_sb, ino)); } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 0870ef6f933d..b6da06b40989 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -87,6 +87,11 @@ typedef void * xfs_failaddr_t; #define XFS_ATTR_FORK 1 #define XFS_COW_FORK 2 +#define XFS_WHICHFORK_STRINGS \ + { XFS_DATA_FORK, "data" }, \ + { XFS_ATTR_FORK, "attr" }, \ + { XFS_COW_FORK, "cow" } + /* * Min numbers of data/attr fork btree root pointers. */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index be1a7e1e65f7..ae3c9f6e2c69 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -36,7 +36,7 @@ xchk_superblock_xref( agbno = XFS_SB_BLOCK(mp); - error = xchk_ag_init(sc, agno, &sc->sa); + error = xchk_ag_init_existing(sc, agno, &sc->sa); if (!xchk_xref_process_error(sc, agno, agbno, &error)) return; @@ -63,6 +63,7 @@ xchk_superblock( struct xfs_mount *mp = sc->mp; struct xfs_buf *bp; struct xfs_dsb *sb; + struct xfs_perag *pag; xfs_agnumber_t agno; uint32_t v2_ok; __be32 features_mask; @@ -73,6 +74,15 @@ xchk_superblock( if (agno == 0) return 0; + /* + * Grab an active reference to the perag structure. If we can't get + * it, we're racing with something that's tearing down the AG, so + * signal that the AG no longer exists. + */ + pag = xfs_perag_get(mp, agno); + if (!pag) + return -ENOENT; + error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp); /* * The superblock verifier can return several different error codes @@ -92,7 +102,7 @@ xchk_superblock( break; } if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) - return error; + goto out_pag; sb = bp->b_addr; @@ -248,7 +258,7 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); } else { v2_ok = XFS_SB_VERSION2_OKBITS; - if (XFS_SB_VERSION_NUM(&mp->m_sb) >= XFS_SB_VERSION_5) + if (xfs_sb_is_v5(&mp->m_sb)) v2_ok |= XFS_SB_VERSION2_CRCBIT; if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok))) @@ -273,7 +283,7 @@ xchk_superblock( (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) xchk_block_set_corrupt(sc, bp); - if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_has_crc(mp)) { /* all v5 fields must be zero */ if (memchr_inv(&sb->sb_features_compat, 0, sizeof(struct xfs_dsb) - @@ -324,7 +334,7 @@ xchk_superblock( /* Don't care about sb_lsn */ } - if (xfs_sb_version_hasmetauuid(&mp->m_sb)) { + if (xfs_has_metauuid(mp)) { /* The metadata UUID must be the same for all supers */ if (!uuid_equal(&sb->sb_meta_uuid, &mp->m_sb.sb_meta_uuid)) xchk_block_set_corrupt(sc, bp); @@ -336,7 +346,8 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); xchk_superblock_xref(sc, bp); - +out_pag: + xfs_perag_put(pag); return error; } @@ -346,7 +357,7 @@ xchk_superblock( STATIC int xchk_agf_record_bno_lengths( struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *rec, + const struct xfs_alloc_rec_incore *rec, void *priv) { xfs_extlen_t *blocks = priv; @@ -419,7 +430,7 @@ xchk_agf_xref_btreeblks( int error; /* agf_btreeblks didn't exist before lazysbcount */ - if (!xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + if (!xfs_has_lazysbcount(sc->mp)) return; /* Check agf_rmap_blocks; 
set up for agf_btreeblks check */ @@ -438,7 +449,7 @@ xchk_agf_xref_btreeblks( * No rmap cursor; we can't xref if we have the rmapbt feature. * We also can't do it if we're missing the free space btree cursors. */ - if ((xfs_sb_version_hasrmapbt(&mp->m_sb) && !sc->sa.rmap_cur) || + if ((xfs_has_rmapbt(mp) && !sc->sa.rmap_cur) || !sc->sa.bno_cur || !sc->sa.cnt_cur) return; @@ -527,6 +538,7 @@ xchk_agf( xchk_buffer_recheck(sc, sc->sa.agf_bp); agf = sc->sa.agf_bp->b_addr; + pag = sc->sa.pag; /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); @@ -550,7 +562,7 @@ xchk_agf( if (level <= 0 || level > XFS_BTREE_MAXLEVELS) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (xfs_has_rmapbt(mp)) { agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); if (!xfs_verify_agbno(mp, agno, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); @@ -560,7 +572,7 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); } - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { agbno = be32_to_cpu(agf->agf_refcount_root); if (!xfs_verify_agbno(mp, agno, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); @@ -582,15 +594,13 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Do the incore counters match? */ - pag = xfs_perag_get(mp, agno); if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb) && + if (xfs_has_lazysbcount(sc->mp) && pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - xfs_perag_put(pag); xchk_agf_xref(sc); out: @@ -630,7 +640,7 @@ xchk_agfl_block( { struct xchk_agfl_info *sai = priv; struct xfs_scrub *sc = sai->sc; - xfs_agnumber_t agno = sc->sa.agno; + xfs_agnumber_t agno = sc->sa.pag->pag_agno; if (xfs_verify_agbno(mp, agno, agbno) && sai->nr_entries < sai->sz_entries) @@ -787,7 +797,7 @@ xchk_agi_xref_fiblocks( xfs_agblock_t blocks; int error = 0; - if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb)) + if (!xfs_has_inobtcounts(sc->mp)) return; if (sc->sa.ino_cur) { @@ -857,6 +867,7 @@ xchk_agi( xchk_buffer_recheck(sc, sc->sa.agi_bp); agi = sc->sa.agi_bp->b_addr; + pag = sc->sa.pag; /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); @@ -872,7 +883,7 @@ xchk_agi( if (level <= 0 || level > XFS_BTREE_MAXLEVELS) xchk_block_set_corrupt(sc, sc->sa.agi_bp); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (xfs_has_finobt(mp)) { agbno = be32_to_cpu(agi->agi_free_root); if (!xfs_verify_agbno(mp, agno, agbno)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); @@ -909,12 +920,10 @@ xchk_agi( xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Do the incore counters match? 
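
In these AGF/AGI hunks, scrub now reuses the perag reference already held in sc->sa.pag instead of taking and dropping its own with xfs_perag_get()/xfs_perag_put(), then cross-checks the cached in-core counters against the on-disk header fields. The comparison itself is straightforward; a standalone model with illustrative struct layouts:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct perag_model {            /* in-core cached counters */
        uint32_t pagf_freeblks;
        uint32_t pagf_flcount;
};

struct agf_model {              /* on-disk header, already CPU-endian here */
        uint32_t agf_freeblks;
        uint32_t agf_flcount;
};

/* Any disagreement marks the buffer corrupt, as xchk_agf() does. */
static bool agf_counters_match(const struct perag_model *pag,
                               const struct agf_model *agf)
{
        return pag->pagf_freeblks == agf->agf_freeblks &&
               pag->pagf_flcount == agf->agf_flcount;
}

int main(void)
{
        struct perag_model pag = { 1000, 6 };
        struct agf_model agf = { 1000, 7 };

        printf("match? %d\n", agf_counters_match(&pag, &agf));  /* 0 */
        return 0;
}
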
*/ - pag = xfs_perag_get(mp, agno); if (pag->pagi_count != be32_to_cpu(agi->agi_count)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); - xfs_perag_put(pag); xchk_agi_xref(sc); out: diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index e95f8c98f0f7..0f8deee66f15 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -70,7 +70,7 @@ struct xrep_agf_allocbt { STATIC int xrep_agf_walk_allocbt( struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *rec, + const struct xfs_alloc_rec_incore *rec, void *priv) { struct xrep_agf_allocbt *raa = priv; @@ -94,7 +94,7 @@ xrep_agf_check_agfl_block( { struct xfs_scrub *sc = priv; - if (!xfs_verify_agbno(mp, sc->sa.agno, agbno)) + if (!xfs_verify_agbno(mp, sc->sa.pag->pag_agno, agbno)) return -EFSCORRUPTED; return 0; } @@ -164,7 +164,7 @@ xrep_agf_find_btrees( return -EFSCORRUPTED; /* We must find the refcountbt root if that feature is enabled. */ - if (xfs_sb_version_hasreflink(&sc->mp->m_sb) && + if (xfs_has_reflink(sc->mp) && !xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT])) return -EFSCORRUPTED; @@ -188,12 +188,13 @@ xrep_agf_init_header( memset(agf, 0, BBTOB(agf_bp->b_length)); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(sc->sa.agno); - agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno)); + agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno); + agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, + sc->sa.pag->pag_agno)); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); /* Mark the incore AGF data stale until we're done fixing things. */ @@ -223,7 +224,7 @@ xrep_agf_set_roots( agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(fab[XREP_AGF_RMAPBT].height); - if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) { + if (xfs_has_reflink(sc->mp)) { agf->agf_refcount_root = cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root); agf->agf_refcount_level = @@ -280,7 +281,7 @@ xrep_agf_calc_from_btrees( agf->agf_btreeblks = cpu_to_be32(btreeblks); /* Update the AGF counters from the refcountbt. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); @@ -363,16 +364,16 @@ xrep_agf( int error; /* We require the rmapbt to rebuild anything. */ - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; - xchk_perag_get(sc->mp, &sc->sa); /* * Make sure we have the AGF buffer, as scrub might have decided it * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)), + XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL); if (error) return error; @@ -388,7 +389,7 @@ xrep_agf( * btrees rooted in the AGF. If the AGFL contents are obviously bad * then we'll bail out. 
*/ - error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp); + error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.pag->pag_agno, &agfl_bp); if (error) return error; @@ -442,7 +443,7 @@ struct xrep_agfl { STATIC int xrep_agfl_walk_rmap( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xrep_agfl *ra = priv; @@ -586,7 +587,7 @@ xrep_agfl_init_header( agfl = XFS_BUF_TO_AGFL(agfl_bp); memset(agfl, 0xFF, BBTOB(agfl_bp->b_length)); agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); - agfl->agfl_seqno = cpu_to_be32(sc->sa.agno); + agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); /* @@ -599,7 +600,8 @@ xrep_agfl_init_header( for_each_xbitmap_extent(br, n, agfl_extents) { agbno = XFS_FSB_TO_AGBNO(mp, br->start); - trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len); + trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno, + br->len); while (br->len > 0 && fl_off < flcount) { agfl_bno[fl_off] = cpu_to_be32(agbno); @@ -638,10 +640,9 @@ xrep_agfl( int error; /* We require the rmapbt to rebuild anything. */ - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; - xchk_perag_get(sc->mp, &sc->sa); xbitmap_init(&agfl_extents); /* @@ -649,7 +650,8 @@ xrep_agfl( * nothing wrong with the AGF, but all the AG header repair functions * have this chicken-and-egg problem. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); + error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, + &agf_bp); if (error) return error; @@ -658,7 +660,8 @@ xrep_agfl( * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGFL_DADDR(mp)), + XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL); if (error) return error; @@ -723,7 +726,8 @@ xrep_agi_find_btrees( int error; /* Read the AGF. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); + error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, + &agf_bp); if (error) return error; @@ -737,7 +741,7 @@ xrep_agi_find_btrees( return -EFSCORRUPTED; /* We must find the finobt root if that feature is enabled. */ - if (xfs_sb_version_hasfinobt(&mp->m_sb) && + if (xfs_has_finobt(mp) && !xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT])) return -EFSCORRUPTED; @@ -761,11 +765,12 @@ xrep_agi_init_header( memset(agi, 0, BBTOB(agi_bp->b_length)); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(sc->sa.agno); - agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno)); + agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno); + agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, + sc->sa.pag->pag_agno)); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); /* We don't know how to fix the unlinked list yet. 
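
Since scrub cannot yet reconstruct the per-AG unlinked-inode hash buckets, the rebuilt AGI presumably carries the old bucket heads forward unchanged; the code doing so is elided from this hunk, so the sketch below is a guess at that step, with illustrative types (0xffffffff standing in for NULLAGINO):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define UNLINKED_BUCKETS 64         /* AGI hash bucket count on disk */
#define NULLAGINO 0xffffffffU

struct agi_model {
        uint32_t unlinked[UNLINKED_BUCKETS];
};

int main(void)
{
        struct agi_model old_agi, new_agi;

        memset(&old_agi, 0xff, sizeof(old_agi));  /* all NULLAGINO */
        old_agi.unlinked[3] = 1234;               /* one live bucket */

        /* Rebuild everything else, but keep the unlinked list as-is. */
        memcpy(new_agi.unlinked, old_agi.unlinked, sizeof(new_agi.unlinked));

        printf("bucket 0 empty: %d, bucket 3 preserved: %u\n",
               new_agi.unlinked[0] == NULLAGINO, new_agi.unlinked[3]);
        return 0;
}
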
*/ @@ -787,7 +792,7 @@ xrep_agi_set_roots( agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root); agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height); - if (xfs_sb_version_hasfinobt(&sc->mp->m_sb)) { + if (xfs_has_finobt(sc->mp)) { agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root); agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height); } @@ -811,7 +816,7 @@ xrep_agi_calc_from_btrees( error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; - if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + if (xfs_has_inobtcounts(mp)) { xfs_agblock_t blocks; error = xfs_btree_count_blocks(cur, &blocks); @@ -824,8 +829,7 @@ xrep_agi_calc_from_btrees( agi->agi_count = cpu_to_be32(count); agi->agi_freecount = cpu_to_be32(freecount); - if (xfs_sb_version_hasfinobt(&mp->m_sb) && - xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { xfs_agblock_t blocks; cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, @@ -893,16 +897,16 @@ xrep_agi( int error; /* We require the rmapbt to rebuild anything. */ - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; - xchk_perag_get(sc->mp, &sc->sa); /* * Make sure we have the AGI buffer, as scrub might have decided it * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGI_DADDR(mp)), + XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL); if (error) return error; diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index d5741980094a..87518e1292f8 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -91,7 +91,7 @@ xchk_allocbt_xref( STATIC int xchk_allocbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 552af0cf8482..b6f0c9f3f124 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -25,11 +25,11 @@ * reallocating the buffer if necessary. Buffer contents are not preserved * across a reallocation. */ -int +static int xchk_setup_xattr_buf( struct xfs_scrub *sc, size_t value_size, - xfs_km_flags_t flags) + gfp_t flags) { size_t sz; struct xchk_xattr_buf *ab = sc->buf; @@ -57,7 +57,7 @@ xchk_setup_xattr_buf( * Don't zero the buffer upon allocation to avoid runtime overhead. * All users must be careful never to read uninitialized contents. */ - ab = kmem_alloc_large(sizeof(*ab) + sz, flags); + ab = kvmalloc(sizeof(*ab) + sz, flags); if (!ab) return -ENOMEM; @@ -79,7 +79,7 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL); if (error) return error; } @@ -138,7 +138,8 @@ xchk_xattr_listent( * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. */ - error = xchk_setup_xattr_buf(sx->sc, valuelen, KM_MAYFAIL); + error = xchk_setup_xattr_buf(sx->sc, valuelen, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -323,7 +324,8 @@ xchk_xattr_block( return 0; /* Allocate memory for block usage checking. 
*/ - error = xchk_setup_xattr_buf(ds->sc, 0, KM_MAYFAIL); + error = xchk_setup_xattr_buf(ds->sc, 0, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (error == -ENOMEM) return -EDEADLOCK; if (error) @@ -334,7 +336,7 @@ xchk_xattr_block( bitmap_zero(usedmap, mp->m_attr_geo->blksize); /* Check all the padding. */ - if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb)) { + if (xfs_has_crc(ds->sc->mp)) { struct xfs_attr3_leafblock *leaf = bp->b_addr; if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 || diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 13a1d2e8424d..1719e1c4da59 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -65,7 +65,4 @@ xchk_xattr_dstmap( BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); } -int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size, - xfs_km_flags_t flags); - #endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 813b5f219113..d6d24c866bc4 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -260,7 +260,7 @@ xbitmap_set_btcur_path( xfs_btree_get_block(cur, i, &bp); if (!bp) continue; - fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); + fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); error = xbitmap_set(bitmap, fsb, 1); if (error) return error; @@ -284,7 +284,7 @@ xbitmap_collect_btblock( if (!bp) return 0; - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xbitmap_set(bitmap, fsbno, 1); } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 1d146c9d9de1..017da9ceaee9 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -260,10 +260,10 @@ xchk_bmap_iextent_xref( agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock); len = irec->br_blockcount; - error = xchk_ag_init(info->sc, agno, &info->sc->sa); + error = xchk_ag_init_existing(info->sc, agno, &info->sc->sa); if (!xchk_fblock_process_error(info->sc, info->whichfork, irec->br_startoff, &error)) - return; + goto out_free; xchk_xref_is_used_space(info->sc, agbno, len); xchk_xref_is_not_inode_chunk(info->sc, agbno, len); @@ -283,6 +283,7 @@ xchk_bmap_iextent_xref( break; } +out_free: xchk_ag_free(info->sc, &info->sc->sa); } @@ -383,7 +384,7 @@ xchk_bmap_iextent( STATIC int xchk_bmapbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_bmbt_irec irec; struct xfs_bmbt_irec iext_irec; @@ -400,7 +401,7 @@ xchk_bmapbt_rec( * Check the owners of the btree blocks up to the level below * the root since the verifiers don't do that. */ - if (xfs_sb_version_hascrc(&bs->cur->bc_mp->m_sb) && + if (xfs_has_crc(bs->cur->bc_mp) && bs->cur->bc_ptrs[0] == 1) { for (i = 0; i < bs->cur->bc_nlevels - 1; i++) { block = xfs_btree_get_block(bs->cur, i, &bp); @@ -473,10 +474,11 @@ struct xchk_bmap_check_rmap_info { STATIC int xchk_bmap_check_rmap( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xfs_bmbt_irec irec; + struct xfs_rmap_irec check_rec; struct xchk_bmap_check_rmap_info *sbcri = priv; struct xfs_ifork *ifp; struct xfs_scrub *sc = sbcri->sc; @@ -510,28 +512,30 @@ xchk_bmap_check_rmap( * length, so we have to loop through the bmbt to make sure that the * entire rmap is covered by bmbt records. 
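
Because the rmap record callbacks now take a const pointer, xchk_bmap_check_rmap() below can no longer consume the record in place; it copies it into a local check_rec and decrements that copy as each bmbt extent is matched. The pattern in isolation, with illustrative names:

#include <stdint.h>
#include <stdio.h>

struct rec_model {
        uint64_t start;
        uint64_t count;
};

/* Walk off the front of a const record without touching the caller's copy. */
static void consume(const struct rec_model *rec, uint64_t step)
{
        struct rec_model check = *rec;  /* mutable local copy */

        while (check.count > 0) {
                uint64_t n = step < check.count ? step : check.count;

                printf("covered [%llu, +%llu)\n",
                       (unsigned long long)check.start,
                       (unsigned long long)n);
                check.start += n;
                check.count -= n;
        }
}

int main(void)
{
        struct rec_model rec = { .start = 100, .count = 10 };

        consume(&rec, 4);
        return 0;
}
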
*/ + check_rec = *rec; while (have_map) { - if (irec.br_startoff != rec->rm_offset) + if (irec.br_startoff != check_rec.rm_offset) xchk_fblock_set_corrupt(sc, sbcri->whichfork, - rec->rm_offset); + check_rec.rm_offset); if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_ag.pag->pag_agno, rec->rm_startblock)) + cur->bc_ag.pag->pag_agno, + check_rec.rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, - rec->rm_offset); - if (irec.br_blockcount > rec->rm_blockcount) + check_rec.rm_offset); + if (irec.br_blockcount > check_rec.rm_blockcount) xchk_fblock_set_corrupt(sc, sbcri->whichfork, - rec->rm_offset); + check_rec.rm_offset); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) break; - rec->rm_startblock += irec.br_blockcount; - rec->rm_offset += irec.br_blockcount; - rec->rm_blockcount -= irec.br_blockcount; - if (rec->rm_blockcount == 0) + check_rec.rm_startblock += irec.br_blockcount; + check_rec.rm_offset += irec.br_blockcount; + check_rec.rm_blockcount -= irec.br_blockcount; + if (check_rec.rm_blockcount == 0) break; have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec); if (!have_map) xchk_fblock_set_corrupt(sc, sbcri->whichfork, - rec->rm_offset); + check_rec.rm_offset); } out: @@ -581,7 +585,7 @@ xchk_bmap_check_rmaps( bool zero_size; int error; - if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) || + if (!xfs_has_rmapbt(sc->mp) || whichfork == XFS_COW_FORK || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return 0; @@ -659,8 +663,7 @@ xchk_bmap( } break; case XFS_ATTR_FORK: - if (!xfs_sb_version_hasattr(&mp->m_sb) && - !xfs_sb_version_hasattr2(&mp->m_sb)) + if (!xfs_has_attr(mp) && !xfs_has_attr2(mp)) xchk_ino_set_corrupt(sc, sc->ip->i_ino); break; default: diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index bd1172358964..eccb855dc904 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -374,10 +374,10 @@ xchk_btree_check_block_owner( init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS; if (init_sa) { - error = xchk_ag_init(bs->sc, agno, &bs->sc->sa); + error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, level, &error)) - return error; + goto out_free; } xchk_xref_is_used_space(bs->sc, agbno, 1); @@ -393,6 +393,7 @@ xchk_btree_check_block_owner( if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) bs->cur = NULL; +out_free: if (init_sa) xchk_ag_free(bs->sc, &bs->sc->sa); @@ -435,12 +436,12 @@ xchk_btree_check_owner( if (!co) return -ENOMEM; co->level = level; - co->daddr = XFS_BUF_ADDR(bp); + co->daddr = xfs_buf_daddr(bp); list_add_tail(&co->list, &bs->to_check); return 0; } - return xchk_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp)); + return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp)); } /* Decide if we want to check minrecs of a btree block in the inode root. 
*/ diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index 5572e475f8ed..b7d2fc01fbf9 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -26,8 +26,8 @@ void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc, struct xchk_btree; typedef int (*xchk_btree_rec_fn)( - struct xchk_btree *bs, - union xfs_btree_rec *rec); + struct xchk_btree *bs, + const union xfs_btree_rec *rec); struct xchk_btree { /* caller-provided scrub state */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 8558ca05e11d..bf1f3607d0b6 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -186,7 +186,7 @@ xchk_block_set_preen( struct xfs_buf *bp) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; - trace_xchk_block_preen(sc, bp->b_bn, __return_address); + trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address); } /* @@ -219,7 +219,7 @@ xchk_block_set_corrupt( struct xfs_buf *bp) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; - trace_xchk_block_error(sc, bp->b_bn, __return_address); + trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); } /* Record a corruption while cross-referencing. */ @@ -229,7 +229,7 @@ xchk_block_xref_set_corrupt( struct xfs_buf *bp) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; - trace_xchk_block_error(sc, bp->b_bn, __return_address); + trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); } /* @@ -324,7 +324,7 @@ struct xchk_rmap_ownedby_info { STATIC int xchk_count_rmap_ownedby_irec( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xchk_rmap_ownedby_info *sroi = priv; @@ -394,11 +394,11 @@ want_ag_read_header_failure( } /* - * Grab all the headers for an AG. + * Grab the perag structure and all the headers for an AG. * - * The headers should be released by xchk_ag_free, but as a fail - * safe we attach all the buffers we grab to the scrub transaction so - * they'll all be freed when we cancel it. + * The headers should be released by xchk_ag_free, but as a fail safe we attach + * all the buffers we grab to the scrub transaction so they'll all be freed + * when we cancel it. Returns ENOENT if we can't grab the perag structure. */ int xchk_ag_read_headers( @@ -409,22 +409,24 @@ xchk_ag_read_headers( struct xfs_mount *mp = sc->mp; int error; - sa->agno = agno; + ASSERT(!sa->pag); + sa->pag = xfs_perag_get(mp, agno); + if (!sa->pag) + return -ENOENT; error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) - goto out; + return error; error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) - goto out; + return error; error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) - goto out; - error = 0; -out: - return error; + return error; + + return 0; } /* Release all the AG btree cursors. */ @@ -461,7 +463,6 @@ xchk_ag_btcur_init( { struct xfs_mount *mp = sc->mp; - xchk_perag_get(sc->mp, sa); if (sa->agf_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { /* Set up a bnobt cursor for cross-referencing. */ @@ -484,21 +485,21 @@ xchk_ag_btcur_init( } /* Set up a finobt cursor for cross-referencing. 
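
Earlier in this patch, xchk_ag_read_headers() pins the perag structure before touching any AG headers; failing to get it means the AG is being torn down concurrently, reported as -ENOENT, and callers that are checking an on-disk reference to the AG promote that to -EFSCORRUPTED via the new xchk_ag_init_existing() wrapper. A compact model of that two-level error policy; the errno values and helper names are illustrative:

#include <stdio.h>

#define ENOENT 2
#define EFSCORRUPTED 117            /* illustrative errno value */

/* Model: returns NULL if the AG is being torn down concurrently. */
static void *perag_get(unsigned int agno)
{
        (void)agno;
        return NULL;                /* simulate racing with AG removal */
}

static int ag_init(unsigned int agno)
{
        if (!perag_get(agno))
                return -ENOENT;     /* AG no longer exists */
        /* ... read AGI, AGF, AGFL headers ... */
        return 0;
}

/* An on-disk reference to the AG means it must exist: promote the error. */
static int ag_init_existing(unsigned int agno)
{
        int error = ag_init(agno);

        return error == -ENOENT ? -EFSCORRUPTED : error;
}

int main(void)
{
        printf("ag_init: %d, ag_init_existing: %d\n",
               ag_init(5), ag_init_existing(5));
        return 0;
}
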
*/ - if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) && + if (sa->agi_bp && xfs_has_finobt(mp) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, sa->pag, XFS_BTNUM_FINO); } /* Set up a rmapbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) && + if (sa->agf_bp && xfs_has_rmapbt(mp) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, sa->pag); } /* Set up a refcountbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) && + if (sa->agf_bp && xfs_has_reflink(mp) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, sa->agf_bp, sa->pag); @@ -528,15 +529,14 @@ xchk_ag_free( xfs_perag_put(sa->pag); sa->pag = NULL; } - sa->agno = NULLAGNUMBER; } /* - * For scrub, grab the AGI and the AGF headers, in that order. Locking - * order requires us to get the AGI before the AGF. We use the - * transaction to avoid deadlocking on crosslinked metadata buffers; - * either the caller passes one in (bmap scrub) or we have to create a - * transaction ourselves. + * For scrub, grab the perag structure, the AGI, and the AGF headers, in that + * order. Locking order requires us to get the AGI before the AGF. We use the + * transaction to avoid deadlocking on crosslinked metadata buffers; either the + * caller passes one in (bmap scrub) or we have to create a transaction + * ourselves. Returns ENOENT if the perag struct cannot be grabbed. */ int xchk_ag_init( @@ -554,19 +554,6 @@ xchk_ag_init( return 0; } -/* - * Grab the per-ag structure if we haven't already gotten it. Teardown of the - * xchk_ag will release it for us. - */ -void -xchk_perag_get( - struct xfs_mount *mp, - struct xchk_ag *sa) -{ - if (!sa->pag) - sa->pag = xfs_perag_get(mp, sa->agno); -} - /* Per-scrubber setup functions */ /* @@ -797,7 +784,7 @@ xchk_buffer_recheck( if (!fa) return; sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; - trace_xchk_block_error(sc, bp->b_bn, fa); + trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa); } /* @@ -842,7 +829,7 @@ xchk_metadata_inode_forks( return error; /* Look for incorrect shared blocks. */ - if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) { + if (xfs_has_reflink(sc->mp)) { error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, &shared); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, @@ -884,6 +871,7 @@ xchk_stop_reaping( { sc->flags |= XCHK_REAPING_DISABLED; xfs_blockgc_stop(sc->mp); + xfs_inodegc_stop(sc->mp); } /* Restart background reaping of resources. */ @@ -891,6 +879,13 @@ void xchk_start_reaping( struct xfs_scrub *sc) { - xfs_blockgc_start(sc->mp); + /* + * Readonly filesystems do not perform inactivation or speculative + * preallocation, so there's no need to restart the workers. 
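
xchk_stop_reaping()/xchk_start_reaping() here quiesce both background workers, blockgc and the new inodegc, for the duration of a scrub, and the restart side skips read-only mounts since those never started the workers. A sketch of the paired guard; the helper names are placeholders for the xfs_blockgc_*/xfs_inodegc_* calls in the hunk:

#include <stdbool.h>
#include <stdio.h>

static bool reaping_disabled;
static bool fs_readonly;            /* stands in for xfs_is_readonly(mp) */

static void blockgc_stop(void)  { puts("blockgc stopped"); }
static void inodegc_stop(void)  { puts("inodegc stopped"); }
static void blockgc_start(void) { puts("blockgc restarted"); }
static void inodegc_start(void) { puts("inodegc restarted"); }

static void stop_reaping(void)
{
        reaping_disabled = true;
        blockgc_stop();
        inodegc_stop();
}

static void start_reaping(void)
{
        /* Read-only mounts never ran the workers; nothing to restart. */
        if (!fs_readonly) {
                inodegc_start();
                blockgc_start();
        }
        reaping_disabled = false;
}

int main(void)
{
        stop_reaping();
        /* ... scrub runs with background reaping quiesced ... */
        start_reaping();
        printf("reaping disabled: %d\n", reaping_disabled);
        return 0;
}
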
+ */ + if (!xfs_is_readonly(sc->mp)) { + xfs_inodegc_start(sc->mp); + xfs_blockgc_start(sc->mp); + } sc->flags &= ~XCHK_REAPING_DISABLED; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 0410faf7d735..454145db10e7 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -107,7 +107,23 @@ int xchk_setup_fscounters(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); -void xchk_perag_get(struct xfs_mount *mp, struct xchk_ag *sa); + +/* + * Grab all AG resources, treating the inability to grab the perag structure as + * a fs corruption. This is intended for callers checking an ondisk reference + * to a given AG, which means that the AG must still exist. + */ +static inline int +xchk_ag_init_existing( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + struct xchk_ag *sa) +{ + int error = xchk_ag_init(sc, agno, sa); + + return error == -ENOENT ? -EFSCORRUPTED : error; +} + int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); void xchk_ag_btcur_free(struct xchk_ag *sa); diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 9f0dbb47c82c..8a52514bc1ff 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -367,11 +367,11 @@ xchk_da_btree_block( pmaxrecs = &ds->maxrecs[level]; /* We only started zeroing the header on v5 filesystems. */ - if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb) && hdr3->hdr.pad) + if (xfs_has_crc(ds->sc->mp) && hdr3->hdr.pad) xchk_da_set_corrupt(ds, level); /* Check the owner. */ - if (xfs_sb_version_hascrc(&ip->i_mount->m_sb)) { + if (xfs_has_crc(ip->i_mount)) { owner = be64_to_cpu(hdr3->owner); if (owner != ip->i_ino) xchk_da_set_corrupt(ds, level); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 28dda391d5df..200a63f58fe7 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -51,7 +51,7 @@ xchk_dir_check_ftype( int ino_dtype; int error = 0; - if (!xfs_sb_version_hasftype(&mp->m_sb)) { + if (!xfs_has_ftype(mp)) { if (dtype != DT_UNKNOWN && dtype != DT_DIR) xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); @@ -140,7 +140,7 @@ xchk_dir_actor( if (!strncmp(".", name, namelen)) { /* If this is "." then check that the inum matches the dir. */ - if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) + if (xfs_has_ftype(mp) && type != DT_DIR) xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); checked_ftype = true; @@ -152,7 +152,7 @@ xchk_dir_actor( * If this is ".." in the root inode, check that the inum * matches this dir. 
*/ - if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) + if (xfs_has_ftype(mp) && type != DT_DIR) xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); checked_ftype = true; @@ -526,7 +526,7 @@ xchk_directory_leaf1_bestfree( bestcount = be32_to_cpu(ltp->bestcount); bestp = xfs_dir2_leaf_bests_p(ltp); - if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { + if (xfs_has_crc(sc->mp)) { struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; if (hdr3->pad != cpu_to_be32(0)) @@ -623,7 +623,7 @@ xchk_directory_free_bestfree( return error; xchk_buffer_recheck(sc, bp); - if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { + if (xfs_has_crc(sc->mp)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; if (hdr3->pad != cpu_to_be32(0)) diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index fd7941e04ae1..48a6cbdf95d0 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -148,9 +148,9 @@ xchk_fscount_btreeblks( xfs_extlen_t blocks; int error; - error = xchk_ag_init(sc, agno, &sc->sa); + error = xchk_ag_init_existing(sc, agno, &sc->sa); if (error) - return error; + goto out_free; error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); if (error) @@ -207,7 +207,7 @@ retry: /* Add up the free/freelist/bnobt/cntbt blocks */ fsc->fdblocks += pag->pagf_freeblks; fsc->fdblocks += pag->pagf_flcount; - if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) { + if (xfs_has_lazysbcount(sc->mp)) { fsc->fdblocks += pag->pagf_btreeblks; } else { error = xchk_fscount_btreeblks(sc, fsc, agno); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 30e568596b79..00848ee542fb 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -418,7 +418,7 @@ xchk_iallocbt_rec_alignment( STATIC int xchk_iallocbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; struct xchk_iallocbt *iabt = bs->private; @@ -517,7 +517,7 @@ xchk_iallocbt_xref_rmap_btreeblks( int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || - (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) || + (xfs_has_finobt(sc->mp) && !sc->sa.fino_cur) || xchk_skip_xref(sc->sm)) return; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 76fbc7ca4cec..2405b09d03d0 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -181,7 +181,7 @@ xchk_inode_flags2( /* reflink flag requires reflink feature */ if ((flags2 & XFS_DIFLAG2_REFLINK) && - !xfs_sb_version_hasreflink(&mp->m_sb)) + !xfs_has_reflink(mp)) goto bad; /* cowextsize flag is checked w.r.t. 
mode separately */ @@ -199,8 +199,7 @@ xchk_inode_flags2( goto bad; /* no bigtime iflag without the bigtime feature */ - if (xfs_dinode_has_bigtime(dip) && - !xfs_sb_version_hasbigtime(&mp->m_sb)) + if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) goto bad; return; @@ -278,7 +277,7 @@ xchk_dinode( xchk_ino_set_corrupt(sc, ino); if (dip->di_projid_hi != 0 && - !xfs_sb_version_hasprojid32bit(&mp->m_sb)) + !xfs_has_projid32(mp)) xchk_ino_set_corrupt(sc, ino); break; default: @@ -532,9 +531,9 @@ xchk_inode_xref( agno = XFS_INO_TO_AGNO(sc->mp, ino); agbno = XFS_INO_TO_AGBNO(sc->mp, ino); - error = xchk_ag_init(sc, agno, &sc->sa); + error = xchk_ag_init_existing(sc, agno, &sc->sa); if (!xchk_xref_process_error(sc, agno, agbno, &error)) - return; + goto out_free; xchk_xref_is_used_space(sc, agbno, 1); xchk_inode_xref_finobt(sc, ino); @@ -542,6 +541,7 @@ xchk_inode_xref( xchk_xref_is_not_shared(sc, agbno, 1); xchk_inode_xref_bmap(sc, dip); +out_free: xchk_ag_free(sc, &sc->sa); } @@ -560,7 +560,7 @@ xchk_inode_check_reflink_iflag( bool has_shared; int error; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return; error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index acbb9839d42f..d6c1b00a4fc8 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -42,7 +42,7 @@ xchk_setup_quota( xfs_dqtype_t dqtype; int error; - if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp)) + if (!XFS_IS_QUOTA_ON(sc->mp)) return -ENOENT; dqtype = xchk_quota_to_dqtype(sc); @@ -127,7 +127,7 @@ xchk_quota_item( * a reflink filesystem we're allowed to exceed physical space * if there are no quota limits. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 7014b7408bad..2744eecdbaf0 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -91,7 +91,7 @@ struct xchk_refcnt_check { STATIC int xchk_refcountbt_rmap_check( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xchk_refcnt_check *refchk = priv; @@ -330,7 +330,7 @@ xchk_refcountbt_xref( STATIC int xchk_refcountbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index ebb0e245aa72..8f3cba14ada3 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -248,19 +248,19 @@ xrep_calc_ag_resblks( * bnobt/cntbt or inobt/finobt as pairs. */ bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen); - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) + if (xfs_has_sparseinodes(mp)) inobt_sz = xfs_iallocbt_calc_size(mp, icount / XFS_INODES_PER_HOLEMASK_BIT); else inobt_sz = xfs_iallocbt_calc_size(mp, icount / XFS_INODES_PER_CHUNK); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) inobt_sz *= 2; - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen); else refcbt_sz = 0; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (xfs_has_rmapbt(mp)) { /* * Guess how many blocks we need to rebuild the rmapbt. 
* For non-reflink filesystems we can't have more records than @@ -269,7 +269,7 @@ xrep_calc_ag_resblks( * many rmaps there could be in the AG, so we start off with * what we hope is a generous over-estimation. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) rmapbt_sz = xfs_rmapbt_calc_size(mp, (unsigned long long)aglen * 2); else @@ -306,9 +306,9 @@ xrep_alloc_ag_block( return -ENOSPC; xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); - *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno); + *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno); if (resv == XFS_AG_RESV_RMAPBT) - xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno); + xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno); return 0; default: break; @@ -317,7 +317,7 @@ xrep_alloc_ag_block( args.tp = sc->tp; args.mp = sc->mp; args.oinfo = *oinfo; - args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0); + args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0); args.minlen = 1; args.maxlen = 1; args.prod = 1; @@ -352,14 +352,14 @@ xrep_init_btblock( trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), btnum); - ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno); + ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0, &bp); if (error) return error; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); + xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); bp->b_ops = ops; @@ -481,7 +481,7 @@ xrep_fix_freelist( args.mp = sc->mp; args.tp = sc->tp; - args.agno = sc->sa.agno; + args.agno = sc->sa.pag->pag_agno; args.alignment = 1; args.pag = sc->sa.pag; @@ -611,11 +611,11 @@ xrep_reap_extents( xfs_fsblock_t fsbno; int error = 0; - ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); + ASSERT(xfs_has_rmapbt(sc->mp)); for_each_xbitmap_block(fsbno, bmr, n, bitmap) { ASSERT(sc->ip != NULL || - XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno); + XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); trace_xrep_dispose_btree_extent(sc->mp, XFS_FSB_TO_AGNO(sc->mp, fsbno), XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); @@ -690,7 +690,7 @@ xrep_findroot_block( int block_level; int error = 0; - daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno); + daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno); /* * Blocks in the AGFL have stale contents that might just happen to @@ -819,7 +819,7 @@ xrep_findroot_block( else fab->root = NULLAGBLOCK; - trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno, + trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno, be32_to_cpu(btblock->bb_magic), fab->height - 1); out: xfs_trans_brelse(ri->sc->tp, bp); @@ -833,7 +833,7 @@ out: STATIC int xrep_findroot_rmap( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xrep_findroot *ri = priv; diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index fc306573f0ac..8dae0345c7df 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -88,7 +88,7 @@ xchk_rmapbt_xref( STATIC int xchk_rmapbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 37c0e2266c85..8fa012057405 100644 --- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c @@ -41,7 +41,7 @@ xchk_setup_rt( STATIC int xchk_rtbitmap_rec( struct xfs_trans *tp, - struct xfs_rtalloc_rec *rec, + const struct xfs_rtalloc_rec *rec, void *priv) { struct xfs_scrub *sc = priv; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 0e542636227c..51e4c61916d2 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -239,21 +239,21 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, .scrub = xchk_finobt, - .has = xfs_sb_version_hasfinobt, + .has = xfs_has_finobt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, .setup = xchk_setup_ag_rmapbt, .scrub = xchk_rmapbt, - .has = xfs_sb_version_hasrmapbt, + .has = xfs_has_rmapbt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ .type = ST_PERAG, .setup = xchk_setup_ag_refcountbt, .scrub = xchk_refcountbt, - .has = xfs_sb_version_hasreflink, + .has = xfs_has_reflink, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ @@ -308,14 +308,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rt, .scrub = xchk_rtbitmap, - .has = xfs_sb_version_hasrealtime, + .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xchk_setup_rt, .scrub = xchk_rtsummary, - .has = xfs_sb_version_hasrealtime, + .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ @@ -383,7 +383,7 @@ xchk_validate_inputs( if (ops->setup == NULL || ops->scrub == NULL) goto out; /* Does this fs even support this type of metadata? */ - if (ops->has && !ops->has(&mp->m_sb)) + if (ops->has && !ops->has(mp)) goto out; error = -EINVAL; @@ -415,11 +415,11 @@ xchk_validate_inputs( */ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { error = -EOPNOTSUPP; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) goto out; error = -EROFS; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) goto out; } @@ -464,9 +464,6 @@ xfs_scrub_metadata( struct xfs_scrub sc = { .file = file, .sm = sm, - .sa = { - .agno = NULLAGNUMBER, - }, }; struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; int error = 0; @@ -480,10 +477,10 @@ xfs_scrub_metadata( /* Forbidden if we are shut down or mounted norecovery. */ error = -ESHUTDOWN; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) goto out; error = -ENOTRECOVERABLE; - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) goto out; error = xchk_validate_inputs(mp, sm); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 08a483cb46e2..80e5026bba44 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -27,7 +27,7 @@ struct xchk_meta_ops { int (*repair)(struct xfs_scrub *); /* Decide if we even have this piece of metadata. */ - bool (*has)(struct xfs_sb *); + bool (*has)(struct xfs_mount *); /* type describing required/allowed inputs */ enum xchk_type type; @@ -35,7 +35,6 @@ struct xchk_meta_ops { /* Buffer pointers and btree cursors for an entire AG. 
*/ struct xchk_ag { - xfs_agnumber_t agno; struct xfs_perag *pag; /* AG btree roots */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 03882a605a3c..c0ef53fe6611 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -22,11 +22,11 @@ xchk_btree_cur_fsbno( int level) { if (level < cur->bc_nlevels && cur->bc_bufs[level]) - return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn); - else if (level == cur->bc_nlevels - 1 && - cur->bc_flags & XFS_BTREE_LONG_PTRS) + return XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(cur->bc_bufs[level])); + if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); - else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) + if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0); return NULLFSBLOCK; } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index e46f5cef90da..a7bbb84f91a7 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2,6 +2,10 @@ /* * Copyright (C) 2017 Oracle. All Rights Reserved. * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * NOTE: none of these tracepoints shall be considered a stable kernel ABI + * as they can change at any time. See xfs_trace.h for documentation of + * specific units found in tracepoint output. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xfs_scrub @@ -79,6 +83,16 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } +#define XFS_SCRUB_FLAG_STRINGS \ + { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ + { XFS_SCRUB_OFLAG_CORRUPT, "corrupt" }, \ + { XFS_SCRUB_OFLAG_PREEN, "preen" }, \ + { XFS_SCRUB_OFLAG_XFAIL, "xfail" }, \ + { XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \ + { XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \ + { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ + { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" } + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -103,14 +117,14 @@ DECLARE_EVENT_CLASS(xchk_class, __entry->flags = sm->sm_flags; __entry->error = error; ), - TP_printk("dev %d:%d ino 0x%llx type %s agno %u inum %llu gen %u flags 0x%x error %d", + TP_printk("dev %d:%d ino 0x%llx type %s agno 0x%x inum 0x%llx gen 0x%x flags (%s) error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, __entry->inum, __entry->gen, - __entry->flags, + __print_flags(__entry->flags, "|", XFS_SCRUB_FLAG_STRINGS), __entry->error) ) #define DEFINE_SCRUB_EVENT(name) \ @@ -145,7 +159,7 @@ TRACE_EVENT(xchk_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, @@ -176,10 +190,10 @@ TRACE_EVENT(xchk_file_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->whichfork, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->offset, __entry->error, @@ -193,29 +207,21 @@ 
DECLARE_EVENT_CLASS(xchk_block_error_class, __field(dev_t, dev) __field(unsigned int, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, bno) + __field(xfs_agblock_t, agbno) __field(void *, ret_ip) ), TP_fast_assign( - xfs_fsblock_t fsbno; - xfs_agnumber_t agno; - xfs_agblock_t bno; - - fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr); - agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); - bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); - __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->agno = agno; - __entry->bno = bno; + __entry->agno = xfs_daddr_to_agno(sc->mp, daddr); + __entry->agbno = xfs_daddr_to_agbno(sc->mp, daddr); __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, - __entry->bno, + __entry->agbno, __entry->ret_ip) ) @@ -281,10 +287,10 @@ DECLARE_EVENT_CLASS(xchk_fblock_error_class, __entry->offset = offset; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->whichfork, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->offset, __entry->ret_ip) @@ -346,7 +352,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), @@ -389,10 +395,10 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->whichfork, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, @@ -428,7 +434,7 @@ TRACE_EVENT(xchk_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), @@ -468,10 +474,10 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->whichfork, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, 
XFS_BTNUM_STRINGS), __entry->level, @@ -507,7 +513,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_ptrs[level]; ), - TP_printk("dev %d:%d type %s btree %s agno %u agbno %u level %d nlevels %d ptr %d", + TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), @@ -580,7 +586,7 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __entry->holemask = holemask; __entry->cluster_ino = cluster_ino; ), - TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u", + TP_printk("dev %d:%d agno 0x%x startino 0x%x daddr 0x%llx bbcount 0x%x chunkino 0x%x nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startino, @@ -670,7 +676,7 @@ DECLARE_EVENT_CLASS(xrep_extent_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno %u agbno %u len %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -707,7 +713,7 @@ DECLARE_EVENT_CLASS(xrep_rmap_class, __entry->offset = offset; __entry->flags = flags; ), - TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -745,7 +751,7 @@ TRACE_EVENT(xrep_refcount_extent_fn, __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -769,7 +775,7 @@ TRACE_EVENT(xrep_init_btblock, __entry->agbno = agbno; __entry->btnum = btnum; ), - TP_printk("dev %d:%d agno %u agbno %u btree %s", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -793,7 +799,7 @@ TRACE_EVENT(xrep_findroot_block, __entry->magic = magic; __entry->level = level; ), - TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x magic 0x%x level %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -821,7 +827,7 @@ TRACE_EVENT(xrep_calc_ag_resblks, __entry->freelen = freelen; __entry->usedlen = usedlen; ), - TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u", + TP_printk("dev %d:%d agno 0x%x icount %u aglen %u freelen %u usedlen %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->icount, @@ -850,7 +856,7 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __entry->rmapbt_sz = rmapbt_sz; __entry->refcbt_sz = refcbt_sz; ), - TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u", + TP_printk("dev %d:%d agno 0x%x bnobt %u inobt %u rmapbt %u refcountbt %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->bnobt_sz, @@ -894,7 +900,7 @@ TRACE_EVENT(xrep_ialloc_insert, __entry->freecount = freecount; __entry->freemask = freemask; ), - TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx", + TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u 
freecount %u freemask 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startino, diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index d02bef24b32b..5c52ee869272 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -125,7 +125,7 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) } struct posix_acl * -xfs_get_acl(struct inode *inode, int type) +xfs_get_acl(struct inode *inode, int type, bool rcu) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -137,6 +137,9 @@ xfs_get_acl(struct inode *inode, int type) }; int error; + if (rcu) + return ERR_PTR(-ECHILD); + trace_xfs_get_acl(ip); switch (type) { @@ -232,7 +235,7 @@ xfs_acl_set_mode( inode->i_ctime = current_time(inode); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 7bdb3a4ed798..bb6abdcb265d 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -10,13 +10,13 @@ struct inode; struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL -extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); +extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu); extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); #else -static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) +static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu) { return NULL; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index cb4e0fcf4c76..34fc6148032a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -97,7 +97,7 @@ xfs_end_ioend( /* * Just clean up the in-memory structures if the fs has been shut down. 
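The xfs_acl.c and xfs_acl.h hunks above add a bool rcu argument to the ->get_acl() path: when the VFS asks for the ACL during an RCU-walk permission check, a filesystem that would have to sleep bails out with -ECHILD so the lookup can be retried in ref-walk mode. A kernel-context sketch of that contract, where example_get_acl and read_acl_from_disk are hypothetical names rather than functions from this patch:

struct posix_acl *example_get_acl(struct inode *inode, int type, bool rcu)
{
	/*
	 * RCU path walk holds no references and must not sleep; fetching
	 * an ACL may require I/O, so ask the VFS to retry in ref-walk.
	 */
	if (rcu)
		return ERR_PTR(-ECHILD);

	return read_acl_from_disk(inode, type);	/* hypothetical slow path */
}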
*/ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (xfs_is_shutdown(ip->i_mount)) { error = -EIO; goto done; } @@ -260,7 +260,7 @@ xfs_map_blocks( int retries = 0; int error = 0; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* @@ -440,7 +440,7 @@ xfs_discard_page( xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff); int error; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) goto out_invalidate; xfs_alert_ratelimited(mp, @@ -449,7 +449,7 @@ xfs_discard_page( error = xfs_bmap_punch_delalloc_range(ip, start_fsb, i_blocks_per_page(inode, page) - pageoff_fsb); - if (error && !XFS_FORCED_SHUTDOWN(mp)) + if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff); @@ -462,22 +462,6 @@ static const struct iomap_writeback_ops xfs_writeback_ops = { }; STATIC int -xfs_vm_writepage( - struct page *page, - struct writeback_control *wbc) -{ - struct xfs_writepage_ctx wpc = { }; - - if (WARN_ON_ONCE(current->journal_info)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops); -} - -STATIC int xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) @@ -559,7 +543,6 @@ xfs_iomap_swapfile_activate( const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readahead = xfs_vm_readahead, - .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = iomap_releasepage, diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index aaa7e66c42d7..2b5da6218977 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -151,7 +151,7 @@ xfs_attr3_node_inactive( } xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr); - parent_blkno = bp->b_bn; + parent_blkno = xfs_buf_daddr(bp); if (!ichdr.count) { xfs_trans_brelse(*trans, bp); return 0; @@ -177,7 +177,7 @@ xfs_attr3_node_inactive( return error; /* save for re-read later */ - child_blkno = XFS_BUF_ADDR(child_bp); + child_blkno = xfs_buf_daddr(child_bp); /* * Invalidate the subtree, however we have to. @@ -271,7 +271,7 @@ xfs_attr3_root_inactive( error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK); if (error) return error; - blkno = bp->b_bn; + blkno = xfs_buf_daddr(bp); /* * Invalidate the tree, even if the "tree" is only a single leaf block. diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 25dcc98d50e6..2d1e5134cebe 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -529,7 +529,7 @@ xfs_attr_list( XFS_STATS_INC(dp->i_mount, xs_attr_list); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + if (xfs_is_shutdown(dp->i_mount)) return -EIO; lock_mode = xfs_ilock_attr_map_shared(dp); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e3a691937e92..03159970133f 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -24,7 +24,6 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" -#include "xfs_quota.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; @@ -487,18 +486,10 @@ xfs_bui_item_recover( XFS_ATTR_FORK : XFS_DATA_FORK; bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - /* Grab the inode. 
*/ - error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip); + error = xlog_recover_iget(mp, bmap->me_owner, &ip); if (error) return error; - error = xfs_qm_dqattach(ip); - if (error) - goto err_rele; - - if (VFS_I(ip)->i_nlink == 0) - xfs_iflags_set(ip, XFS_IRECOVERY); - /* Allocate transaction and do the work. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); @@ -522,6 +513,9 @@ xfs_bui_item_recover( error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, &count, state); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap, + sizeof(*bmap)); if (error) goto err_cancel; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 213a97a921bb..73a36b7be3bd 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -731,7 +731,7 @@ xfs_free_eofblocks( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); return error; } @@ -789,7 +789,7 @@ xfs_alloc_file_space( trace_xfs_alloc_file_space(ip); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_qm_dqattach(ip); @@ -1282,7 +1282,7 @@ xfs_swap_extents_check_format( * If we have to use the (expensive) rmap swap method, we can * handle any number of extents and any format. */ - if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb)) + if (xfs_has_rmapbt(ip->i_mount)) return 0; /* @@ -1516,7 +1516,7 @@ xfs_swap_extent_forks( * event of a crash. Set the owner change log flags now and leave the * bmbt scan as the last step. */ - if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (xfs_has_v3inodes(ip->i_mount)) { if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE) @@ -1553,7 +1553,7 @@ xfs_swap_extent_forks( (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || + ASSERT(!xfs_has_v3inodes(ip->i_mount) || (*src_log_flags & XFS_ILOG_DOWNER)); (*src_log_flags) |= XFS_ILOG_DBROOT; break; @@ -1565,7 +1565,7 @@ xfs_swap_extent_forks( break; case XFS_DINODE_FMT_BTREE: (*target_log_flags) |= XFS_ILOG_DBROOT; - ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || + ASSERT(!xfs_has_v3inodes(ip->i_mount) || (*target_log_flags & XFS_ILOG_DOWNER)); break; } @@ -1626,7 +1626,6 @@ xfs_swap_extents( struct xfs_bstat *sbp = &sxp->sx_stat; int src_log_flags, target_log_flags; int error = 0; - int lock_flags; uint64_t f; int resblks = 0; unsigned int flags = 0; @@ -1638,8 +1637,8 @@ xfs_swap_extents( * do the rest of the checks. */ lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); - lock_flags = XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, XFS_MMAPLOCK_EXCL, tip, XFS_MMAPLOCK_EXCL); + filemap_invalidate_lock_two(VFS_I(ip)->i_mapping, + VFS_I(tip)->i_mapping); /* Verify that both files have the same format */ if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) { @@ -1679,7 +1678,7 @@ xfs_swap_extents( * a block reservation because it's really just a remap operation * performed with log redo items! */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (xfs_has_rmapbt(mp)) { int w = XFS_DATA_FORK; uint32_t ipnext = ip->i_df.if_nextents; uint32_t tipnext = tip->i_df.if_nextents; @@ -1711,7 +1710,6 @@ xfs_swap_extents( * or cancel will unlock the inodes from this point onwards. 
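In the xfs_bmap_item.c hunk above, the open-coded iget/dqattach/IRECOVERY sequence is collapsed into a single xlog_recover_iget() call. The helper's body is not part of this section, but the removed lines suggest roughly the following consolidation; the error-path detail is an assumption based on the old goto err_rele:

int
xlog_recover_iget(
	struct xfs_mount	*mp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp)
{
	int			error;

	error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
	if (error)
		return error;

	error = xfs_qm_dqattach(*ipp);
	if (error) {
		xfs_irele(*ipp);
		return error;
	}

	/* Unlinked inodes touched by recovery are marked for later reaping. */
	if (VFS_I(*ipp)->i_nlink == 0)
		xfs_iflags_set(*ipp, XFS_IRECOVERY);
	return 0;
}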
*/ xfs_lock_two_inodes(ip, XFS_ILOCK_EXCL, tip, XFS_ILOCK_EXCL); - lock_flags |= XFS_ILOCK_EXCL; xfs_trans_ijoin(tp, ip, 0); xfs_trans_ijoin(tp, tip, 0); @@ -1761,7 +1759,7 @@ xfs_swap_extents( src_log_flags = XFS_ILOG_CORE; target_log_flags = XFS_ILOG_CORE; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) error = xfs_swap_extent_rmap(&tp, ip, tip); else error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags, @@ -1780,7 +1778,7 @@ xfs_swap_extents( } /* Swap the cow forks. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { ASSERT(!ip->i_cowfp || ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); ASSERT(!tip->i_cowfp || @@ -1822,7 +1820,7 @@ xfs_swap_extents( * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); @@ -1830,13 +1828,16 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock_ilock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_iunlock(tip, XFS_ILOCK_EXCL); out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); + filemap_invalidate_unlock_two(VFS_I(ip)->i_mapping, + VFS_I(tip)->i_mapping); unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - goto out_unlock; + goto out_unlock_ilock; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 8ff42b3585e0..5fa6cd947dd4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -251,7 +251,7 @@ _xfs_buf_alloc( return error; } - bp->b_bn = map[0].bm_bn; + bp->b_rhash_key = map[0].bm_bn; bp->b_length = 0; for (i = 0; i < nmaps; i++) { bp->b_maps[i].bm_bn = map[i].bm_bn; @@ -315,7 +315,6 @@ xfs_buf_alloc_kmem( struct xfs_buf *bp, xfs_buf_flags_t flags) { - int align_mask = xfs_buftarg_dma_alignment(bp->b_target); xfs_km_flags_t kmflag_mask = KM_NOFS; size_t size = BBTOB(bp->b_length); @@ -323,7 +322,7 @@ xfs_buf_alloc_kmem( if (!(flags & XBF_READ)) kmflag_mask |= KM_ZERO; - bp->b_addr = kmem_alloc_io(size, align_mask, kmflag_mask); + bp->b_addr = kmem_alloc(size, kmflag_mask); if (!bp->b_addr) return -ENOMEM; @@ -460,7 +459,7 @@ _xfs_buf_obj_cmp( */ BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); - if (bp->b_bn != map->bm_bn) + if (bp->b_rhash_key != map->bm_bn) return 1; if (unlikely(bp->b_length != map->bm_len)) { @@ -482,7 +481,7 @@ static const struct rhashtable_params xfs_buf_hash_params = { .min_size = 32, /* empty AGs have minimal footprint */ .nelem_hint = 16, .key_len = sizeof(xfs_daddr_t), - .key_offset = offsetof(struct xfs_buf, b_bn), + .key_offset = offsetof(struct xfs_buf, b_rhash_key), .head_offset = offsetof(struct xfs_buf, b_rhash_head), .automatic_shrinking = true, .obj_cmpfn = _xfs_buf_obj_cmp, @@ -814,7 +813,7 @@ xfs_buf_read_map( * buffer. */ if (error) { - if (!XFS_FORCED_SHUTDOWN(target->bt_mount)) + if (!xfs_is_shutdown(target->bt_mount)) xfs_buf_ioerror_alert(bp, fa); bp->b_flags &= ~XBF_DONE; @@ -844,7 +843,7 @@ xfs_buf_readahead_map( { struct xfs_buf *bp; - if (bdi_read_congested(target->bt_bdev->bd_bdi)) + if (bdi_read_congested(target->bt_bdev->bd_disk->bdi)) return; xfs_buf_read_map(target, map, nmaps, @@ -854,7 +853,9 @@ xfs_buf_readahead_map( /* * Read an uncached buffer from disk. Allocates and returns a locked - * buffer containing the disk contents or nothing. + * buffer containing the disk contents or nothing. 
Uncached buffers always have + * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer + * is cached or uncached during fault diagnosis. */ int xfs_buf_read_uncached( @@ -876,7 +877,7 @@ xfs_buf_read_uncached( /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); - bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ + bp->b_rhash_key = XFS_BUF_DADDR_NULL; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; bp->b_ops = ops; @@ -1145,7 +1146,7 @@ xfs_buf_ioerror_permanent( return true; /* At unmount we may treat errors differently */ - if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + if (xfs_is_unmounting(mp) && mp->m_fail_unmount) return true; return false; @@ -1179,7 +1180,7 @@ xfs_buf_ioend_handle_error( * If we've already decided to shutdown the filesystem because of I/O * errors, there's no point in giving this a retry. */ - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) goto out_stale; xfs_buf_ioerror_alert_ratelimited(bp); @@ -1336,7 +1337,7 @@ xfs_buf_ioerror_alert( { xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", - func, (uint64_t)XFS_BUF_ADDR(bp), + func, (uint64_t)xfs_buf_daddr(bp), bp->b_length, -bp->b_error); } @@ -1514,17 +1515,18 @@ _xfs_buf_ioapply( SHUTDOWN_CORRUPT_INCORE); return; } - } else if (bp->b_bn != XFS_BUF_DADDR_NULL) { + } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { struct xfs_mount *mp = bp->b_mount; /* * non-crc filesystems don't attach verifiers during * log recovery, so don't warn for such filesystems. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { xfs_warn(mp, "%s: no buf ops on daddr 0x%llx len %d", - __func__, bp->b_bn, bp->b_length); + __func__, xfs_buf_daddr(bp), + bp->b_length); xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); dump_stack(); @@ -1592,7 +1594,7 @@ __xfs_buf_submit( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); /* on shutdown we stale and complete the buffer immediately */ - if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { + if (xfs_is_shutdown(bp->b_mount)) { xfs_buf_ioend_fail(bp); return -EIO; } @@ -1794,7 +1796,7 @@ xfs_buftarg_drain( xfs_buf_alert_ratelimited(bp, "XFS: Corruption Alert", "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", - (long long)bp->b_bn); + (long long)xfs_buf_daddr(bp)); } xfs_buf_rele(bp); } @@ -1809,7 +1811,7 @@ xfs_buftarg_drain( * down the fs. */ if (write_fail) { - ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount)); + ASSERT(xfs_is_shutdown(btp->bt_mount)); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); } @@ -2302,7 +2304,7 @@ xfs_verify_magic( struct xfs_mount *mp = bp->b_mount; int idx; - idx = xfs_sb_version_hascrc(&mp->m_sb); + idx = xfs_has_crc(mp); if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; @@ -2320,7 +2322,7 @@ xfs_verify_magic16( struct xfs_mount *mp = bp->b_mount; int idx; - idx = xfs_sb_version_hascrc(&mp->m_sb); + idx = xfs_has_crc(mp); if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 464dc548fa23..6b0200b8007d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -133,7 +133,8 @@ struct xfs_buf { * fast-path on locking. 
*/ struct rhash_head b_rhash_head; /* pag buffer hash node */ - xfs_daddr_t b_bn; /* block number of buffer */ + + xfs_daddr_t b_rhash_key; /* buffer cache index */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ @@ -296,18 +297,10 @@ extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -/* - * These macros use the IO block map rather than b_bn. b_bn is now really - * just for the buffer cache index for cached buffers. As IO does not use b_bn - * anymore, uncached buffers do not use b_bn at all and hence must modify the IO - * map directly. Uncached buffers are not allowed to be discontiguous, so this - * is safe to do. - * - * In future, uncached buffers will pass the block number directly to the io - * request function and hence these macros will go away at that point. - */ -#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) +static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) +{ + return bp->b_maps[0].bm_bn; +} void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); @@ -355,12 +348,6 @@ extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -static inline int -xfs_buftarg_dma_alignment(struct xfs_buftarg *bt) -{ - return queue_dma_alignment(bt->bt_bdev->bd_disk->queue); -} - int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2828ce45b701..b1ab100c09e1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -428,7 +428,7 @@ xfs_buf_item_format( * occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) || + if (xfs_has_v3inodes(lip->li_mountp) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; @@ -581,7 +581,7 @@ xfs_buf_item_push( if (bp->b_flags & XBF_WRITE_FAIL) { xfs_buf_alert_ratelimited(bp, "XFS: Failing async write", "Failing async write on buffer block 0x%llx. Retrying async write.", - (long long)bp->b_bn); + (long long)xfs_buf_daddr(bp)); } if (!xfs_buf_delwri_queue(bp, buffer_list)) @@ -616,7 +616,7 @@ xfs_buf_item_put( * that case, the bli is freed on buffer writeback completion. */ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || - XFS_FORCED_SHUTDOWN(lip->li_mountp); + xfs_is_shutdown(lip->li_mountp); dirty = bip->bli_flags & XFS_BLI_DIRTY; if (dirty && !aborted) return false; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 4775485b4062..a476c7ef5d53 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -219,7 +219,7 @@ xlog_recover_validate_buf_type( * inconsistent state resulting in verification failures. 
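The xfs_buf.h hunk above completes the b_bn split: the rhashtable key (b_rhash_key) is now purely a cache index, while I/O placement always comes from the block map through the new xfs_buf_daddr() accessor. A short kernel-context sketch of the resulting invariant; xfs_buf_is_cached() and xfs_buf_io_daddr() are hypothetical helpers, not part of the patch:

/*
 * Cached buffers index the per-AG rhashtable by disk address; uncached
 * buffers keep XFS_BUF_DADDR_NULL as their key but still carry a real
 * disk address in the map for I/O submission.
 */
static inline bool xfs_buf_is_cached(struct xfs_buf *bp)
{
	return bp->b_rhash_key != XFS_BUF_DADDR_NULL;
}

static inline xfs_daddr_t xfs_buf_io_daddr(struct xfs_buf *bp)
{
	/* I/O always targets the map address, never the cache key. */
	return xfs_buf_daddr(bp);	/* == bp->b_maps[0].bm_bn */
}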
Hence for now * just avoid the verification stage for non-crc filesystems */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); @@ -497,7 +497,7 @@ xlog_recover_do_reg_buffer( if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", - fa, bp->b_bn); + fa, xfs_buf_daddr(bp)); goto next; } } @@ -597,7 +597,7 @@ xlog_recover_do_inode_buffer( * Post recovery validation only works properly on CRC enabled * filesystems. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; @@ -710,7 +710,7 @@ xlog_recover_get_buf_lsn( uint16_t blft; /* v4 filesystems always recover immediately */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) goto recover_immediately; /* @@ -787,7 +787,7 @@ xlog_recover_get_buf_lsn( * the relevant UUID in the superblock. */ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); - if (xfs_sb_version_hasmetauuid(&mp->m_sb)) + if (xfs_has_metauuid(mp)) uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; else uuid = &((struct xfs_dsb *)blk)->sb_uuid; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index da1cc683560c..8310005af00f 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -32,7 +32,7 @@ xfs_dir3_get_dtype( struct xfs_mount *mp, uint8_t filetype) { - if (!xfs_sb_version_hasftype(&mp->m_sb)) + if (!xfs_has_ftype(mp)) return DT_UNKNOWN; if (filetype >= XFS_DIR3_FT_MAX) @@ -512,7 +512,7 @@ xfs_readdir( trace_xfs_readdir(dp); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + if (xfs_is_shutdown(dp->i_mount)) return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 736df5660f1f..0191de8ce9ce 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -169,7 +169,7 @@ xfs_ioc_trim( * We haven't recovered the log, so we cannot use our bnobt-guided * storage zapping commands. */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) return -EROFS; if (copy_from_user(&range, urange, sizeof(range))) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index ecd5059d6928..c15d61d47a06 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -223,9 +223,9 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_type = type; - if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb)) + if (curid > 0 && xfs_has_bigtime(mp)) d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -526,7 +526,7 @@ xfs_dquot_check_type( * expect an exact match for user dquots and for non-root group and * project dquots. 
*/ - if (xfs_sb_version_hascrc(&dqp->q_mount->m_sb) || + if (xfs_has_crc(dqp->q_mount) || dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0) return ddqp_type == dqp_type; @@ -847,9 +847,6 @@ xfs_qm_dqget_checks( struct xfs_mount *mp, xfs_dqtype_t type) { - if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp))) - return -ESRCH; - switch (type) { case XFS_DQTYPE_USER: if (!XFS_IS_UQUOTA_ON(mp)) @@ -1222,7 +1219,7 @@ xfs_qm_dqflush_check( /* bigtime flag should never be set on root dquots */ if (dqp->q_type & XFS_DQTYPE_BIGTIME) { - if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb)) + if (!xfs_has_bigtime(dqp->q_mount)) return __this_address; if (dqp->q_id == 0) return __this_address; @@ -1301,7 +1298,7 @@ xfs_qm_dqflush( * buffer always has a valid CRC. This ensures there is no possibility * of a dquot without an up-to-date CRC getting to disk. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index f642884a6834..6b5e3cf40c8b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -54,6 +54,16 @@ struct xfs_dquot_res { xfs_qwarncnt_t warnings; }; +static inline bool +xfs_dquot_res_over_limits( + const struct xfs_dquot_res *qres) +{ + if ((qres->softlimit && qres->softlimit < qres->reserved) || + (qres->hardlimit && qres->hardlimit < qres->reserved)) + return true; + return false; +} + /* * The incore dquot structure */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 8ed47b739b6c..6a1aae799cf1 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -218,137 +218,3 @@ xfs_qm_dquot_logitem_init( &xfs_dquot_item_ops); lp->qli_dquot = dqp; } - -/*------------------ QUOTAOFF LOG ITEMS -------------------*/ - -static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) -{ - return container_of(lip, struct xfs_qoff_logitem, qql_item); -} - - -/* - * This returns the number of iovecs needed to log the given quotaoff item. - * We only need 1 iovec for an quotaoff item. It just logs the - * quotaoff_log_format structure. - */ -STATIC void -xfs_qm_qoff_logitem_size( - struct xfs_log_item *lip, - int *nvecs, - int *nbytes) -{ - *nvecs += 1; - *nbytes += sizeof(struct xfs_qoff_logitem); -} - -STATIC void -xfs_qm_qoff_logitem_format( - struct xfs_log_item *lip, - struct xfs_log_vec *lv) -{ - struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); - struct xfs_log_iovec *vecp = NULL; - struct xfs_qoff_logformat *qlf; - - qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); - qlf->qf_type = XFS_LI_QUOTAOFF; - qlf->qf_size = 1; - qlf->qf_flags = qflip->qql_flags; - xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); -} - -/* - * There isn't much you can do to push a quotaoff item. It is simply - * stuck waiting for the log to be flushed to disk. 
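A possible caller of the xfs_dquot_res_over_limits() predicate introduced above, flagging a dquot whose reservation exceeds either the soft or hard limit of any resource. The function itself is illustrative, and the q_ino/q_rtb field names are assumed by analogy with the q_blk usage visible elsewhere in this patch:

static bool
example_dquot_over_any_limit(
	struct xfs_dquot	*dqp)
{
	return xfs_dquot_res_over_limits(&dqp->q_blk) ||
	       xfs_dquot_res_over_limits(&dqp->q_ino) ||
	       xfs_dquot_res_over_limits(&dqp->q_rtb);
}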
- */ -STATIC uint -xfs_qm_qoff_logitem_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) -{ - return XFS_ITEM_LOCKED; -} - -STATIC xfs_lsn_t -xfs_qm_qoffend_logitem_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); - struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; - - xfs_qm_qoff_logitem_relse(qfs); - - kmem_free(lip->li_lv_shadow); - kmem_free(qfe); - return (xfs_lsn_t)-1; -} - -STATIC void -xfs_qm_qoff_logitem_release( - struct xfs_log_item *lip) -{ - struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip); - - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { - if (qoff->qql_start_lip) - xfs_qm_qoff_logitem_relse(qoff->qql_start_lip); - xfs_qm_qoff_logitem_relse(qoff); - } -} - -static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { - .iop_size = xfs_qm_qoff_logitem_size, - .iop_format = xfs_qm_qoff_logitem_format, - .iop_committed = xfs_qm_qoffend_logitem_committed, - .iop_push = xfs_qm_qoff_logitem_push, - .iop_release = xfs_qm_qoff_logitem_release, -}; - -static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { - .iop_size = xfs_qm_qoff_logitem_size, - .iop_format = xfs_qm_qoff_logitem_format, - .iop_push = xfs_qm_qoff_logitem_push, - .iop_release = xfs_qm_qoff_logitem_release, -}; - -/* - * Delete the quotaoff intent from the AIL and free it. On success, - * this should only be called for the start item. It can be used for - * either on shutdown or abort. - */ -void -xfs_qm_qoff_logitem_relse( - struct xfs_qoff_logitem *qoff) -{ - struct xfs_log_item *lip = &qoff->qql_item; - - ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || - test_bit(XFS_LI_ABORTED, &lip->li_flags) || - XFS_FORCED_SHUTDOWN(lip->li_mountp)); - xfs_trans_ail_delete(lip, 0); - kmem_free(lip->li_lv_shadow); - kmem_free(qoff); -} - -/* - * Allocate and initialize an quotaoff item of the correct quota type(s). - */ -struct xfs_qoff_logitem * -xfs_qm_qoff_logitem_init( - struct xfs_mount *mp, - struct xfs_qoff_logitem *start, - uint flags) -{ - struct xfs_qoff_logitem *qf; - - qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0); - - xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? 
- &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); - qf->qql_item.li_mountp = mp; - qf->qql_start_lip = start; - qf->qql_flags = flags; - return qf; -} diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 2b86a43d7ce2..794710c24474 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -9,7 +9,6 @@ struct xfs_dquot; struct xfs_trans; struct xfs_mount; -struct xfs_qoff_logitem; struct xfs_dq_logitem { struct xfs_log_item qli_item; /* common portion */ @@ -17,22 +16,6 @@ struct xfs_dq_logitem { xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ }; -struct xfs_qoff_logitem { - struct xfs_log_item qql_item; /* common portion */ - struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ - unsigned int qql_flags; -}; - - void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); -struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp, - struct xfs_qoff_logitem *start, - uint flags); -void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *); -struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp, - struct xfs_qoff_logitem *startqoff, - uint flags); -void xfs_trans_log_quotaoff_item(struct xfs_trans *tp, - struct xfs_qoff_logitem *qlp); #endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 5875c7e1bd28..8966ba842395 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -136,7 +136,7 @@ xlog_recover_dquot_commit_pass2( * If the dquot has an LSN in it, recover the dquot only if it's less * than the lsn of the transaction we are replaying. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); @@ -146,7 +146,7 @@ xlog_recover_dquot_commit_pass2( } memcpy(ddq, recddq, item->ri_buf[1].i_len); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ce3bc1b291a1..81c445e9489b 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -371,7 +371,7 @@ xfs_buf_corruption_error( xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, %s block 0x%llx", - fa, bp->b_ops->name, bp->b_bn); + fa, bp->b_ops->name, xfs_buf_daddr(bp)); xfs_alert(mp, "Unmount and run xfs_repair"); @@ -402,7 +402,7 @@ xfs_buf_verifier_error( xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? 
"CRC error" : "corruption", - fa, bp->b_ops->name, bp->b_bn, name); + fa, bp->b_ops->name, xfs_buf_daddr(bp), name); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 1717b7508356..5735d5ea87ee 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -75,4 +75,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 #define XFS_PTAG_VERIFIER_ERROR 0x00000100 +#define XFS_PTAG_STRINGS \ + { XFS_NO_PTAG, "none" }, \ + { XFS_PTAG_IFLUSH, "iflush" }, \ + { XFS_PTAG_LOGRES, "logres" }, \ + { XFS_PTAG_AILDELETE, "aildelete" }, \ + { XFS_PTAG_ERROR_REPORT , "error_report" }, \ + { XFS_PTAG_SHUTDOWN_CORRUPT, "corrupt" }, \ + { XFS_PTAG_SHUTDOWN_IOERROR, "ioerror" }, \ + { XFS_PTAG_SHUTDOWN_LOGERROR, "logerror" }, \ + { XFS_PTAG_FSBLOCK_ZERO, "fsb_zero" }, \ + { XFS_PTAG_VERIFIER_ERROR, "verifier" } + #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1da59bdff245..1064c2342876 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -44,6 +44,7 @@ xfs_fs_encode_fh( int *max_len, struct inode *parent) { + struct xfs_mount *mp = XFS_M(inode->i_sb); struct fid *fid = (struct fid *)fh; struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; int fileid_type; @@ -63,8 +64,7 @@ xfs_fs_encode_fh( * large enough filesystem may contain them, thus the slightly * confusing looking conditional below. */ - if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || - (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) + if (!xfs_has_small_inums(mp) || xfs_is_inode32(mp)) fileid_type |= XFS_FILEID_TYPE_64FLAG; /* diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 2424230ca2c3..3f8a0713573a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -629,6 +629,9 @@ xfs_efi_item_recover( error = xfs_trans_free_extent(tp, efdp, extp->ext_start, extp->ext_len, &XFS_RMAP_OINFO_ANY_OWNER, false); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + extp, sizeof(*extp)); if (error) goto abort_error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index cc3cfb12df53..7aa943edfc02 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -185,7 +185,7 @@ xfs_file_fsync( if (error) return error; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; xfs_iflags_clear(ip, XFS_ITRUNCATED); @@ -318,7 +318,7 @@ xfs_file_read_iter( XFS_STATS_INC(mp, xs_read_calls); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (IS_DAX(inode)) @@ -462,7 +462,7 @@ xfs_dio_write_end_io( trace_xfs_end_io_direct_write(ip, offset, size); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return -EIO; if (error) @@ -814,7 +814,7 @@ xfs_file_write_iter( if (ocount == 0) return 0; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return -EIO; if (IS_DAX(inode)) @@ -1122,7 +1122,7 @@ static inline bool xfs_file_sync_writes(struct file *filp) { struct xfs_inode *ip = XFS_I(file_inode(filp)); - if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(ip->i_mount)) return true; if (filp->f_flags & (__O_SYNC | O_DSYNC)) return true; @@ -1153,10 +1153,10 @@ xfs_file_remap_range( if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return -EOPNOTSUPP; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* Prepare and then clone 
file data. */ @@ -1205,7 +1205,7 @@ xfs_file_open( { if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EFBIG; - if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) + if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return 0; @@ -1277,7 +1277,7 @@ xfs_file_llseek( { struct inode *inode = file->f_mapping->host; - if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount)) + if (xfs_is_shutdown(XFS_I(inode)->i_mount)) return -EIO; switch (whence) { @@ -1302,7 +1302,7 @@ xfs_file_llseek( * * mmap_lock (MM) * sb_start_pagefault(vfs, freeze) - * i_mmaplock (XFS - truncate serialisation) + * invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation) * page_lock (MM) * i_lock (XFS - extent map serialisation) */ @@ -1323,24 +1323,27 @@ __xfs_filemap_fault( file_update_time(vmf->vma->vm_file); } - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { pfn_t pfn; + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, (write_fault && !vmf->cow_page) ? &xfs_direct_write_iomap_ops : &xfs_read_iomap_ops); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); } else { - if (write_fault) + if (write_fault) { + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); - else + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + } else { ret = filemap_fault(vmf); + } } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (write_fault) sb_end_pagefault(inode->i_sb); diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index eed6ca5f8f91..6a3ce0f6dc9e 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -295,7 +295,7 @@ xfs_filestream_lookup_ag( * Set the starting AG using the rotor for inode32, otherwise * use the directory inode's AG. */ - if (mp->m_flags & XFS_MOUNT_32BITINODES) { + if (xfs_is_inode32(mp)) { xfs_agnumber_t rotorstep = xfs_rotorstep; startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; mp->m_agfrotor = (mp->m_agfrotor + 1) % diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 3af963743e4d..403226ebb80b 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -21,7 +21,7 @@ static inline int xfs_inode_is_filestream( struct xfs_inode *ip) { - return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || + return xfs_has_filestreams(ip->i_mount) || (ip->i_diflags & XFS_DIFLAG_FILESTREAM); } diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 7d0b09c1366e..48287caad28b 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -61,7 +61,7 @@ xfs_fsmap_to_internal( static int xfs_fsmap_owner_to_rmap( struct xfs_rmap_irec *dest, - struct xfs_fsmap *src) + const struct xfs_fsmap *src) { if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) { dest->rm_owner = src->fmr_owner; @@ -111,8 +111,8 @@ xfs_fsmap_owner_to_rmap( /* Convert an rmapbt owner into an fsmap owner. 
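The __xfs_filemap_fault() rework earlier in this hunk narrows XFS_MMAPLOCK_SHARED to the paths that actually need it: DAX faults and page_mkwrite. Plain read faults now call filemap_fault() directly, because the VFS invalidate_lock, which the updated locking comment maps onto XFS_MMAPLOCK, already serializes them against truncate. Condensed restatement of the new branch structure, with ops standing for the iomap ops selection shown in full in the hunk above:

if (IS_DAX(inode)) {
	/* DAX maps file offsets straight to pfns; hold off truncate. */
	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, ops);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else if (write_fault) {
	/* page_mkwrite may allocate blocks; same serialisation needed. */
	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
} else {
	/* Plain read fault: the VFS invalidate_lock is protection enough. */
	ret = filemap_fault(vmf);
}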
*/ static int xfs_fsmap_owner_from_rmap( - struct xfs_fsmap *dest, - struct xfs_rmap_irec *src) + struct xfs_fsmap *dest, + const struct xfs_rmap_irec *src) { dest->fmr_flags = 0; if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) { @@ -171,7 +171,7 @@ struct xfs_getfsmap_info { struct xfs_getfsmap_dev { u32 dev; int (*fn)(struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info); }; @@ -192,7 +192,7 @@ STATIC int xfs_getfsmap_is_shared( struct xfs_trans *tp, struct xfs_getfsmap_info *info, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, bool *stat) { struct xfs_mount *mp = tp->t_mountp; @@ -202,7 +202,7 @@ xfs_getfsmap_is_shared( int error; *stat = false; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return 0; /* rt files will have no perag structure */ if (!info->pag) @@ -245,7 +245,7 @@ STATIC int xfs_getfsmap_helper( struct xfs_trans *tp, struct xfs_getfsmap_info *info, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, xfs_daddr_t rec_daddr) { struct xfs_fsmap fmr; @@ -347,7 +347,7 @@ out: STATIC int xfs_getfsmap_datadev_helper( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xfs_mount *mp = cur->bc_mp; @@ -365,7 +365,7 @@ xfs_getfsmap_datadev_helper( STATIC int xfs_getfsmap_datadev_bnobt_helper( struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *rec, + const struct xfs_alloc_rec_incore *rec, void *priv) { struct xfs_mount *mp = cur->bc_mp; @@ -389,7 +389,7 @@ xfs_getfsmap_datadev_bnobt_helper( static void xfs_getfsmap_set_irec_flags( struct xfs_rmap_irec *irec, - struct xfs_fsmap *fmr) + const struct xfs_fsmap *fmr) { irec->rm_flags = 0; if (fmr->fmr_flags & FMR_OF_ATTR_FORK) @@ -404,7 +404,7 @@ xfs_getfsmap_set_irec_flags( STATIC int xfs_getfsmap_logdev( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { struct xfs_mount *mp = tp->t_mountp; @@ -451,7 +451,7 @@ xfs_getfsmap_logdev( STATIC int xfs_getfsmap_rtdev_rtbitmap_helper( struct xfs_trans *tp, - struct xfs_rtalloc_rec *rec, + const struct xfs_rtalloc_rec *rec, void *priv) { struct xfs_mount *mp = tp->t_mountp; @@ -473,7 +473,7 @@ xfs_getfsmap_rtdev_rtbitmap_helper( STATIC int __xfs_getfsmap_rtdev( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, int (*query_fn)(struct xfs_trans *, struct xfs_getfsmap_info *), struct xfs_getfsmap_info *info) @@ -481,16 +481,14 @@ __xfs_getfsmap_rtdev( struct xfs_mount *mp = tp->t_mountp; xfs_fsblock_t start_fsb; xfs_fsblock_t end_fsb; - xfs_daddr_t eofs; + uint64_t eofs; int error = 0; eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); if (keys[0].fmr_physical >= eofs) return 0; - if (keys[1].fmr_physical >= eofs) - keys[1].fmr_physical = eofs - 1; start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); - end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical); + end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); /* Set up search keys */ info->low.rm_startblock = start_fsb; @@ -523,27 +521,37 @@ xfs_getfsmap_rtdev_rtbitmap_query( { struct xfs_rtalloc_rec alow = { 0 }; struct xfs_rtalloc_rec ahigh = { 0 }; + struct xfs_mount *mp = tp->t_mountp; int error; - xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED); + /* + * Set up query parameters to return free rtextents covering the range + * we want. 
+ */ alow.ar_startext = info->low.rm_startblock; ahigh.ar_startext = info->high.rm_startblock; - do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize); - if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize)) + do_div(alow.ar_startext, mp->m_sb.sb_rextsize); + if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; error = xfs_rtalloc_query_range(tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) goto err; - /* Report any gaps at the end of the rtbitmap */ + /* + * Report any gaps at the end of the rtbitmap by simulating a null + * rmap starting at the block after the end of the query range. + */ info->last = true; + ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext); + error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); if (error) goto err; err: - xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED); return error; } @@ -551,7 +559,7 @@ err: STATIC int xfs_getfsmap_rtdev_rtbitmap( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { info->missing_owner = XFS_FMR_OWN_UNKNOWN; @@ -564,7 +572,7 @@ xfs_getfsmap_rtdev_rtbitmap( STATIC int __xfs_getfsmap_datadev( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info, int (*query_fn)(struct xfs_trans *, struct xfs_getfsmap_info *, @@ -579,16 +587,14 @@ __xfs_getfsmap_datadev( xfs_fsblock_t end_fsb; xfs_agnumber_t start_ag; xfs_agnumber_t end_ag; - xfs_daddr_t eofs; + uint64_t eofs; int error = 0; eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (keys[0].fmr_physical >= eofs) return 0; - if (keys[1].fmr_physical >= eofs) - keys[1].fmr_physical = eofs - 1; start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical); - end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical); + end_fsb = XFS_DADDR_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); /* * Convert the fsmap low/high keys to AG based keys. Initialize @@ -716,7 +722,7 @@ xfs_getfsmap_datadev_rmapbt_query( STATIC int xfs_getfsmap_datadev_rmapbt( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { info->missing_owner = XFS_FMR_OWN_FREE; @@ -751,7 +757,7 @@ xfs_getfsmap_datadev_bnobt_query( STATIC int xfs_getfsmap_datadev_bnobt( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { struct xfs_alloc_rec_incore akeys[2]; @@ -859,7 +865,7 @@ xfs_getfsmap( return -EINVAL; use_rmap = capable(CAP_SYS_ADMIN) && - xfs_sb_version_hasrmapbt(&mp->m_sb); + xfs_has_rmapbt(mp); head->fmh_entries = 0; /* Set up our device handlers. */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 6ed29b158312..33e26690a8c4 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_trace.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -177,7 +178,7 @@ xfs_growfs_data_private( * particularly important for shrink because the write verifier * will fail if sb_fdblocks is ever larger than sb_dblocks. */ - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) xfs_log_sb(tp); xfs_trans_set_sync(tp); @@ -511,6 +512,11 @@ xfs_fs_goingdown( * consistent. We don't do an unmount here; just shutdown the shop, make sure * that absolutely nothing persistent happens to this filesystem after this * point. 
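The rewritten xfs_do_force_shutdown() below gates all of this on an atomic test_and_set_bit(), so only the first caller does any work. A minimal userspace analogue of that first-caller-wins idiom, using C11 atomics in place of the kernel bitops (all names here are made up for illustration):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool fs_shut_down;	/* stands in for the XFS_OPSTATE_SHUTDOWN bit */

static void force_shutdown(const char *why)
{
	/* atomic_exchange() returns the previous value, so exactly one
	 * caller observes false and runs the shutdown body. */
	if (atomic_exchange(&fs_shut_down, true))
		return;				/* lost the race: already shutting down */
	printf("shutting down: %s\n", why);	/* executes exactly once */
}

int main(void)
{
	force_shutdown("log I/O error");
	force_shutdown("metadata I/O error");	/* silent no-op, no log spam */
	return 0;
}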
+ * + * The shutdown state change is atomic, resulting in the first and only the + * first shutdown call processing the shutdown. This means we only shutdown the + * log once as it requires, and we don't spam the logs when multiple concurrent + * shutdowns race to set the shutdown flags. */ void xfs_do_force_shutdown( @@ -519,48 +525,37 @@ xfs_do_force_shutdown( char *fname, int lnnum) { - bool logerror = flags & SHUTDOWN_LOG_IO_ERROR; - - /* - * No need to duplicate efforts. - */ - if (XFS_FORCED_SHUTDOWN(mp) && !logerror) - return; - - /* - * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't - * queue up anybody new on the log reservations, and wakes up - * everybody who's sleeping on log reservations to tell them - * the bad news. - */ - if (xfs_log_force_umount(mp, logerror)) - return; + int tag; + const char *why; - if (flags & SHUTDOWN_FORCE_UMOUNT) { - xfs_alert(mp, -"User initiated shutdown (0x%x) received. Shutting down filesystem", - flags); + if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) return; - } - - if (flags & SHUTDOWN_CORRUPT_INCORE) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, -"Corruption of in-memory data (0x%x) detected at %pS (%s:%d). Shutting down filesystem", - flags, __return_address, fname, lnnum); - if (XFS_ERRLEVEL_HIGH <= xfs_error_level) - xfs_stack_trace(); - } else if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, -"Log I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", - flags, __return_address, fname, lnnum); + if (mp->m_sb_bp) + mp->m_sb_bp->b_flags |= XBF_DONE; + + if (flags & SHUTDOWN_FORCE_UMOUNT) + xfs_alert(mp, "User initiated shutdown received."); + + if (xlog_force_shutdown(mp->m_log, flags)) { + tag = XFS_PTAG_SHUTDOWN_LOGERROR; + why = "Log I/O Error"; + } else if (flags & SHUTDOWN_CORRUPT_INCORE) { + tag = XFS_PTAG_SHUTDOWN_CORRUPT; + why = "Corruption of in-memory data"; } else { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, -"I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", - flags, __return_address, fname, lnnum); + tag = XFS_PTAG_SHUTDOWN_IOERROR; + why = "Metadata I/O Error"; } + trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum); + + xfs_alert_tag(mp, tag, +"%s (0x%x) detected at %pS (%s:%d). Shutting down filesystem.", + why, flags, __return_address, fname, lnnum); xfs_alert(mp, "Please unmount the filesystem and rectify the problem(s)"); + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); } /* diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index eb10eacabc8f..72a075bb2c10 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -30,7 +30,7 @@ xfs_health_unmount( unsigned int checked = 0; bool warn = false; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return; /* Measure AG corruption levels. */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 6007683482c6..f2210d927481 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -38,23 +38,11 @@ * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. */ enum xfs_icwalk_goal { - /* Goals that are not related to tags; these must be < 0. */ - XFS_ICWALK_DQRELE = -1, - /* Goals directly associated with tagged inodes. */ XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG, }; -#define XFS_ICWALK_NULL_TAG (-1U) - -/* Compute the inode radix tree tag for this goal. */ -static inline unsigned int -xfs_icwalk_tag(enum xfs_icwalk_goal goal) -{ - return goal < 0 ? 
XFS_ICWALK_NULL_TAG : goal; -} - static int xfs_icwalk(struct xfs_mount *mp, enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); static int xfs_icwalk_ag(struct xfs_perag *pag, @@ -64,9 +52,6 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, * Private inode cache walk flags for struct xfs_icwalk. Must not * coincide with XFS_ICWALK_FLAGS_VALID. */ -#define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) -#define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) -#define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29) /* Stop scanning after icw_scan_limit inodes. */ #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) @@ -74,10 +59,7 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, #define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) #define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */ -#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ - XFS_ICWALK_FLAG_DROP_GDQUOT | \ - XFS_ICWALK_FLAG_DROP_PDQUOT | \ - XFS_ICWALK_FLAG_SCAN_LIMIT | \ +#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \ XFS_ICWALK_FLAG_RECLAIM_SICK | \ XFS_ICWALK_FLAG_UNION) @@ -102,8 +84,9 @@ xfs_inode_alloc( return NULL; } - /* VFS doesn't initialise i_mode! */ + /* VFS doesn't initialise i_mode or i_state! */ VFS_I(ip)->i_mode = 0; + VFS_I(ip)->i_state = 0; XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -220,9 +203,14 @@ static inline void xfs_blockgc_queue( struct xfs_perag *pag) { + struct xfs_mount *mp = pag->pag_mount; + + if (!xfs_is_blockgc_enabled(mp)) + return; + rcu_read_lock(); if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_gc_workqueue, + queue_delayed_work(pag->pag_mount->m_blockgc_wq, &pag->pag_blockgc_work, msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); @@ -301,31 +289,6 @@ xfs_perag_clear_inode_tag( trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); } -/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. - */ -void -xfs_inode_mark_reclaimable( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); -} - static inline void xfs_inew_wait( struct xfs_inode *ip) @@ -484,6 +447,21 @@ xfs_iget_check_free_state( return 0; } +/* Make all pending inactivation work start immediately. */ +static void +xfs_inodegc_queue_all( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) + queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + } +} + /* * Check the validity of the inode we just found in the cache */ @@ -516,13 +494,30 @@ xfs_iget_cache_hit( * reclaimable state, wait for the initialisation to complete * before continuing. * + * If we're racing with the inactivation worker we also want to wait. + * If we're creating a new file, it's possible that the worker + * previously marked the inode as free on disk but hasn't finished + * updating the incore state yet. The AGI buffer will be dirty and + * locked to the icreate transaction, so a synchronous push of the + * inodegc workers would result in deadlock.
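To avoid that deadlock, the out_inodegc_flush path added below kicks the workers and fails the lookup with -EAGAIN instead of waiting; xfs_iget() reacts to -EAGAIN by sleeping briefly and retrying. A minimal userspace sketch of that backoff contract (the lookup stub and retry budget are invented for illustration):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static int attempts;

/* Stand-in for a cache lookup that fails until background workers
 * have cleared the inode's NEED_INACTIVE state. */
static int cache_lookup(void)
{
	return ++attempts < 3 ? -EAGAIN : 0;
}

int main(void)
{
	while (cache_lookup() == -EAGAIN)
		usleep(1000);	/* brief backoff, never a synchronous flush */
	printf("lookup succeeded after %d attempts\n", attempts);
	return 0;
}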
For a regular iget, the + * worker is running already, so we might as well wait. + * * XXX(hch): eventually we should do something equivalent to * wait_on_inode to wait for these flags to be cleared * instead of polling for it. */ - if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM)) + if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING)) goto out_skip; + if (ip->i_flags & XFS_NEED_INACTIVE) { + /* Unlinked inodes cannot be re-grabbed. */ + if (VFS_I(ip)->i_nlink == 0) { + error = -ENOENT; + goto out_error; + } + goto out_inodegc_flush; + } + /* * Check the inode free state is valid. This also detects lookup * racing with unlinks. @@ -570,6 +565,17 @@ out_error: spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); return error; + +out_inodegc_flush: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + /* + * Do not wait for the workers, because the caller could hold an AGI + * buffer lock. We're just going to sleep in a loop anyway. + */ + if (xfs_is_inodegc_enabled(mp)) + xfs_inodegc_queue_all(mp); + return -EAGAIN; } static int @@ -597,7 +603,7 @@ xfs_iget_cache_miss( /* * For version 5 superblocks, if we are initialising a new inode and we - * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can + * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can * simply build the new inode core with a random generation number. * * For version 4 (and older) superblocks, log recovery is dependent on @@ -605,8 +611,8 @@ xfs_iget_cache_miss( * value and hence we must also read the inode off disk even when * initializing new inodes. */ - if (xfs_sb_version_has_v3inode(&mp->m_sb) && - (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { + if (xfs_has_v3inodes(mp) && + (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { VFS_I(ip)->i_generation = prandom_u32(); } else { struct xfs_buf *bp; @@ -817,97 +823,6 @@ xfs_icache_inode_is_allocated( return 0; } -#ifdef CONFIG_XFS_QUOTA -/* Decide if we want to grab this inode to drop its dquots. */ -static bool -xfs_dqrele_igrab( - struct xfs_inode *ip) -{ - bool ret = false; - - ASSERT(rcu_read_lock_held()); - - /* Check for stale RCU freed inode */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino) - goto out_unlock; - - /* - * Skip inodes that are anywhere in the reclaim machinery because we - * drop dquots before tagging an inode for reclamation. - */ - if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE)) - goto out_unlock; - - /* - * The inode looks alive; try to grab a VFS reference so that it won't - * get destroyed. If we got the reference, return true to say that - * we grabbed the inode. - * - * If we can't get the reference, then we know the inode had its VFS - * state torn down and hasn't yet entered the reclaim machinery. Since - * we also know that dquots are detached from an inode before it enters - * reclaim, we can skip the inode. - */ - ret = igrab(VFS_I(ip)) != NULL; - -out_unlock: - spin_unlock(&ip->i_flags_lock); - return ret; -} - -/* Drop this inode's dquots. 
*/ -static void -xfs_dqrele_inode( - struct xfs_inode *ip, - struct xfs_icwalk *icw) -{ - if (xfs_iflags_test(ip, XFS_INEW)) - xfs_inew_wait(ip); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { - xfs_qm_dqrele(ip->i_udquot); - ip->i_udquot = NULL; - } - if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { - xfs_qm_dqrele(ip->i_gdquot); - ip->i_gdquot = NULL; - } - if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { - xfs_qm_dqrele(ip->i_pdquot); - ip->i_pdquot = NULL; - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); -} - -/* - * Detach all dquots from incore inodes if we can. The caller must already - * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will - * not get reattached. - */ -int -xfs_dqrele_all_inodes( - struct xfs_mount *mp, - unsigned int qflags) -{ - struct xfs_icwalk icw = { .icw_flags = 0 }; - - if (qflags & XFS_UQUOTA_ACCT) - icw.icw_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; - if (qflags & XFS_GQUOTA_ACCT) - icw.icw_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; - if (qflags & XFS_PQUOTA_ACCT) - icw.icw_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; - - return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &icw); -} -#else -# define xfs_dqrele_igrab(ip) (false) -# define xfs_dqrele_inode(ip, priv) ((void)0) -#endif /* CONFIG_XFS_QUOTA */ - /* * Grab the inode for reclaim exclusively. * @@ -976,7 +891,7 @@ xfs_reclaim_inode( if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (xfs_is_shutdown(ip->i_mount)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip); goto reclaim; @@ -988,6 +903,7 @@ xfs_reclaim_inode( xfs_iflags_clear(ip, XFS_IFLUSHING); reclaim: + trace_xfs_inode_reclaiming(ip); /* * Because we use RCU freeing we need to ensure the inode always appears @@ -1052,9 +968,8 @@ static inline bool xfs_want_reclaim_sick( struct xfs_mount *mp) { - return (mp->m_flags & XFS_MOUNT_UNMOUNTING) || - (mp->m_flags & XFS_MOUNT_NORECOVERY) || - XFS_FORCED_SHUTDOWN(mp); + return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) || + xfs_is_shutdown(mp); } void @@ -1447,8 +1362,12 @@ xfs_blockgc_stop( struct xfs_perag *pag; xfs_agnumber_t agno; - for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + if (!xfs_clear_blockgc_enabled(mp)) + return; + + for_each_perag(mp, agno, pag) cancel_delayed_work_sync(&pag->pag_blockgc_work); + trace_xfs_blockgc_stop(mp, __return_address); } /* Enable post-EOF and CoW block auto-reclamation. */ @@ -1459,12 +1378,18 @@ xfs_blockgc_start( struct xfs_perag *pag; xfs_agnumber_t agno; + if (xfs_set_blockgc_enabled(mp)) + return; + + trace_xfs_blockgc_start(mp, __return_address); for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) xfs_blockgc_queue(pag); } /* Don't try to run block gc on an inode that's in any of these states. */ #define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \ + XFS_NEED_INACTIVE | \ + XFS_INACTIVATING | \ XFS_IRECLAIMABLE | \ XFS_IRECLAIM) /* @@ -1490,7 +1415,7 @@ xfs_blockgc_igrab( spin_unlock(&ip->i_flags_lock); /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return false; /* If we can't grab the inode, it must be on its way to reclaim.
*/ @@ -1536,27 +1461,62 @@ xfs_blockgc_worker( struct xfs_mount *mp = pag->pag_mount; int error; - if (!sb_start_write_trylock(mp->m_super)) - return; + trace_xfs_blockgc_worker(mp, __return_address); + error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", pag->pag_agno, error); - sb_end_write(mp->m_super); xfs_blockgc_queue(pag); } /* - * Try to free space in the filesystem by purging eofblocks and cowblocks. + * Try to free space in the filesystem by purging inactive inodes, eofblocks + * and cowblocks. */ int xfs_blockgc_free_space( struct xfs_mount *mp, struct xfs_icwalk *icw) { + int error; + trace_xfs_blockgc_free_space(mp, icw, _RET_IP_); - return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); + error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); + if (error) + return error; + + xfs_inodegc_flush(mp); + return 0; +} + +/* + * Reclaim all the free space that we can by scheduling the background blockgc + * and inodegc workers immediately and waiting for them all to clear. + */ +void +xfs_blockgc_flush_all( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + trace_xfs_blockgc_flush_all(mp, __return_address); + + /* + * For each blockgc worker, move its queue time up to now. If it + * wasn't queued, it will not be requeued. Then flush whatever's + * left. + */ + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + mod_delayed_work(pag->pag_mount->m_blockgc_wq, + &pag->pag_blockgc_work, 0); + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + flush_delayed_work(&pag->pag_blockgc_work); + + xfs_inodegc_flush(mp); } /* @@ -1647,8 +1607,6 @@ xfs_icwalk_igrab( struct xfs_icwalk *icw) { switch (goal) { - case XFS_ICWALK_DQRELE: - return xfs_dqrele_igrab(ip); case XFS_ICWALK_BLOCKGC: return xfs_blockgc_igrab(ip); case XFS_ICWALK_RECLAIM: @@ -1672,9 +1630,6 @@ xfs_icwalk_process_inode( int error = 0; switch (goal) { - case XFS_ICWALK_DQRELE: - xfs_dqrele_inode(ip, icw); - break; case XFS_ICWALK_BLOCKGC: error = xfs_blockgc_scan_inode(ip, icw); break; @@ -1712,22 +1667,14 @@ restart: nr_found = 0; do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - unsigned int tag = xfs_icwalk_tag(goal); int error = 0; int i; rcu_read_lock(); - if (tag == XFS_ICWALK_NULL_TAG) - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH); - else - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **) batch, first_index, - XFS_LOOKUP_BATCH, tag); - + nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, goal); if (!nr_found) { done = true; rcu_read_unlock(); @@ -1805,20 +1752,6 @@ restart: return last_error; } -/* Fetch the next (possibly tagged) per-AG structure. */ -static inline struct xfs_perag * -xfs_icwalk_get_perag( - struct xfs_mount *mp, - xfs_agnumber_t agno, - enum xfs_icwalk_goal goal) -{ - unsigned int tag = xfs_icwalk_tag(goal); - - if (tag == XFS_ICWALK_NULL_TAG) - return xfs_perag_get(mp, agno); - return xfs_perag_get_tag(mp, agno, tag); -} - /* Walk all incore inodes to achieve a given goal. 
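The rewritten walker below iterates with for_each_perag_tag(), which holds a reference on the current pag and drops it only when advancing to the next AG; a body that breaks out early therefore still owns the final reference and must drop it itself, which is why the EFSCORRUPTED case calls xfs_perag_put() before break. A schematic of that contract (kernel-style sketch, not compilable on its own; visit() is hypothetical):

	for_each_perag_tag(mp, agno, pag, tag) {
		error = visit(pag);
		if (error) {
			xfs_perag_put(pag);	/* we still own this iteration's reference */
			break;			/* the loop only drops it on normal advance */
		}
	}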
*/ static int xfs_icwalk( @@ -1829,18 +1762,465 @@ xfs_icwalk( struct xfs_perag *pag; int error = 0; int last_error = 0; - xfs_agnumber_t agno = 0; + xfs_agnumber_t agno; - while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { - agno = pag->pag_agno + 1; + for_each_perag_tag(mp, agno, pag, goal) { error = xfs_icwalk_ag(pag, goal, icw); - xfs_perag_put(pag); if (error) { last_error = error; - if (error == -EFSCORRUPTED) + if (error == -EFSCORRUPTED) { + xfs_perag_put(pag); break; + } } } return last_error; BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); } + +#ifdef DEBUG +static void +xfs_check_delalloc( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + + if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) + return; + do { + if (isnullstartblock(got.br_startblock)) { + xfs_warn(ip->i_mount, + "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", + ip->i_ino, + whichfork == XFS_DATA_FORK ? "data" : "cow", + got.br_startoff, got.br_blockcount); + } + } while (xfs_iext_next_extent(ifp, &icur, &got)); +} +#else +#define xfs_check_delalloc(ip, whichfork) do { } while (0) +#endif + +/* Schedule the inode for reclaim. */ +static void +xfs_inodegc_set_reclaimable( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) { + xfs_check_delalloc(ip, XFS_DATA_FORK); + xfs_check_delalloc(ip, XFS_COW_FORK); + ASSERT(0); + } + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + trace_xfs_inode_set_reclaimable(ip); + ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING); + ip->i_flags |= XFS_IRECLAIMABLE; + xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +/* + * Free all speculative preallocations and possibly even the inode itself. + * This is the last chance to make changes to an otherwise unreferenced file + * before incore reclamation happens. + */ +static void +xfs_inodegc_inactivate( + struct xfs_inode *ip) +{ + trace_xfs_inode_inactivating(ip); + xfs_inactive(ip); + xfs_inodegc_set_reclaimable(ip); +} + +void +xfs_inodegc_worker( + struct work_struct *work) +{ + struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, + work); + struct llist_node *node = llist_del_all(&gc->list); + struct xfs_inode *ip, *n; + + WRITE_ONCE(gc->items, 0); + + if (!node) + return; + + ip = llist_entry(node, struct xfs_inode, i_gclist); + trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); + + WRITE_ONCE(gc->shrinker_hits, 0); + llist_for_each_entry_safe(ip, n, node, i_gclist) { + xfs_iflags_set(ip, XFS_INACTIVATING); + xfs_inodegc_inactivate(ip); + } +} + +/* + * Force all currently queued inode inactivation work to run immediately, and + * wait for the work to finish. Two pass - queue all the work first pass, wait + * for it in a second pass. 
+ */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + if (!xfs_is_inodegc_enabled(mp)) + return; + + trace_xfs_inodegc_flush(mp, __return_address); + + xfs_inodegc_queue_all(mp); + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + flush_work(&gc->work); + } +} + +/* + * Flush all the pending work and then disable the inode inactivation background + * workers and wait for them to stop. + */ +void +xfs_inodegc_stop( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + if (!xfs_clear_inodegc_enabled(mp)) + return; + + xfs_inodegc_queue_all(mp); + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + cancel_work_sync(&gc->work); + } + trace_xfs_inodegc_stop(mp, __return_address); +} + +/* + * Enable the inode inactivation background workers and schedule deferred inode + * inactivation work if there is any. + */ +void +xfs_inodegc_start( + struct xfs_mount *mp) +{ + if (xfs_set_inodegc_enabled(mp)) + return; + + trace_xfs_inodegc_start(mp, __return_address); + xfs_inodegc_queue_all(mp); +} + +#ifdef CONFIG_XFS_RT +static inline bool +xfs_inodegc_want_queue_rt_file( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint64_t freertx; + + if (!XFS_IS_REALTIME_INODE(ip)) + return false; + + freertx = READ_ONCE(mp->m_sb.sb_frextents); + return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT]; +} +#else +# define xfs_inodegc_want_queue_rt_file(ip) (false) +#endif /* CONFIG_XFS_RT */ + +/* + * Schedule the inactivation worker when: + * + * - We've accumulated more than one inode cluster buffer's worth of inodes. + * - There is less than 5% free space left. + * - Any of the quotas for this inode are near an enforcement limit. + */ +static inline bool +xfs_inodegc_want_queue_work( + struct xfs_inode *ip, + unsigned int items) +{ + struct xfs_mount *mp = ip->i_mount; + + if (items > mp->m_ino_geo.inodes_per_cluster) + return true; + + if (__percpu_counter_compare(&mp->m_fdblocks, + mp->m_low_space[XFS_LOWSP_5_PCNT], + XFS_FDBLOCKS_BATCH) < 0) + return true; + + if (xfs_inodegc_want_queue_rt_file(ip)) + return true; + + if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER)) + return true; + + if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP)) + return true; + + if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ)) + return true; + + return false; +} + +/* + * Upper bound on the number of inodes in each AG that can be queued for + * inactivation at any given time, to avoid monopolizing the workqueue. + */ +#define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK) + +/* + * Make the frontend wait for inactivations when: + * + * - Memory shrinkers queued the inactivation worker and it hasn't finished. + * - The queue depth exceeds the maximum allowable percpu backlog. + * + * Note: If the current thread is running a transaction, we don't ever want to + * wait for other transactions because that could introduce a deadlock. + */ +static inline bool +xfs_inodegc_want_flush_work( + struct xfs_inode *ip, + unsigned int items, + unsigned int shrinker_hits) +{ + if (current->journal_info) + return false; + + if (shrinker_hits > 0) + return true; + + if (items > XFS_INODEGC_MAX_BACKLOG) + return true; + + return false; +} + +/* + * Queue a background inactivation worker if there are inodes that need to be + * inactivated and higher level xfs code hasn't disabled the background + * workers. 
+ */ +static void +xfs_inodegc_queue( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_inodegc *gc; + int items; + unsigned int shrinker_hits; + + trace_xfs_inode_set_need_inactive(ip); + spin_lock(&ip->i_flags_lock); + ip->i_flags |= XFS_NEED_INACTIVE; + spin_unlock(&ip->i_flags_lock); + + gc = get_cpu_ptr(mp->m_inodegc); + llist_add(&ip->i_gclist, &gc->list); + items = READ_ONCE(gc->items); + WRITE_ONCE(gc->items, items + 1); + shrinker_hits = READ_ONCE(gc->shrinker_hits); + put_cpu_ptr(gc); + + if (!xfs_is_inodegc_enabled(mp)) + return; + + if (xfs_inodegc_want_queue_work(ip, items)) { + trace_xfs_inodegc_queue(mp, __return_address); + queue_work(mp->m_inodegc_wq, &gc->work); + } + + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { + trace_xfs_inodegc_throttle(mp, __return_address); + flush_work(&gc->work); + } +} + +/* + * Fold the dead CPU inodegc queue into the current CPU's queue. + */ +void +xfs_inodegc_cpu_dead( + struct xfs_mount *mp, + unsigned int dead_cpu) +{ + struct xfs_inodegc *dead_gc, *gc; + struct llist_node *first, *last; + unsigned int count = 0; + + dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); + cancel_work_sync(&dead_gc->work); + + if (llist_empty(&dead_gc->list)) + return; + + first = dead_gc->list.first; + last = first; + while (last->next) { + last = last->next; + count++; + } + dead_gc->list.first = NULL; + dead_gc->items = 0; + + /* Add pending work to current CPU */ + gc = get_cpu_ptr(mp->m_inodegc); + llist_add_batch(first, last, &gc->list); + count += READ_ONCE(gc->items); + WRITE_ONCE(gc->items, count); + put_cpu_ptr(gc); + + if (xfs_is_inodegc_enabled(mp)) { + trace_xfs_inodegc_queue(mp, __return_address); + queue_work(mp->m_inodegc_wq, &gc->work); + } +} + +/* + * We set the inode flag atomically with the radix tree tag. Once we get tag + * lookups on the radix tree, this inode flag can go away. + * + * We always use background reclaim here because even if the inode is clean, it + * still may be under IO and hence we have to wait for IO completion to occur + * before we can reclaim the inode. The background reclaim path handles this + * more efficiently than we can here, so simply let background reclaim tear down + * all inodes. + */ +void +xfs_inode_mark_reclaimable( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + bool need_inactive; + + XFS_STATS_INC(mp, vn_reclaim); + + /* + * We should never get here with any of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS)); + + need_inactive = xfs_inode_needs_inactive(ip); + if (need_inactive) { + xfs_inodegc_queue(ip); + return; + } + + /* Going straight to reclaim, so drop the dquots. */ + xfs_qm_dqdetach(ip); + xfs_inodegc_set_reclaimable(ip); +} + +/* + * Register a phony shrinker so that we can run background inodegc sooner when + * there's memory pressure. Inactivation does not itself free any memory but + * it does make inodes reclaimable, which eventually frees memory. + * + * The count function, seek value, and batch value are crafted to trigger the + * scan function during the second round of scanning. Hopefully this means + * that we reclaimed enough memory that initiating metadata transactions won't + * make things worse.
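A worked reading of those numbers, assuming DEF_PRIORITY is 12 and that do_shrink_slab() credits a shrinker with .seeks == 0 at freeable / 2 objects per reclaim round (both held in the kernels this series targets, but treat them as assumptions here):

#include <stdio.h>

#define DEF_PRIORITY			12
#define XFS_INODEGC_SHRINKER_COUNT	(1UL << DEF_PRIORITY)			/* 4096 */
#define XFS_INODEGC_SHRINKER_BATCH	((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)	/* 2049 */

int main(void)
{
	unsigned long freeable = XFS_INODEGC_SHRINKER_COUNT;
	unsigned long total_scan = 0;
	int round;

	for (round = 1; round <= 2; round++) {
		total_scan += freeable / 2;	/* the .seeks == 0 contribution */
		printf("round %d: total_scan=%lu, batch=%lu -> %s\n",
		       round, total_scan,
		       (unsigned long)XFS_INODEGC_SHRINKER_BATCH,
		       total_scan >= XFS_INODEGC_SHRINKER_BATCH ? "scan" : "defer");
	}
	return 0;
}

Round one accrues 2048 objects, one short of the batch, so the work is deferred; round two reaches 4096 and ->scan_objects runs, which is exactly the "second round of scanning" behaviour the comment above describes.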
+ */ +#define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY) +#define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1) + +static unsigned long +xfs_inodegc_shrinker_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_mount *mp = container_of(shrink, struct xfs_mount, + m_inodegc_shrinker); + struct xfs_inodegc *gc; + int cpu; + + if (!xfs_is_inodegc_enabled(mp)) + return 0; + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) + return XFS_INODEGC_SHRINKER_COUNT; + } + + return 0; +} + +static unsigned long +xfs_inodegc_shrinker_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_mount *mp = container_of(shrink, struct xfs_mount, + m_inodegc_shrinker); + struct xfs_inodegc *gc; + int cpu; + bool no_items = true; + + if (!xfs_is_inodegc_enabled(mp)) + return SHRINK_STOP; + + trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address); + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) { + unsigned int h = READ_ONCE(gc->shrinker_hits); + + WRITE_ONCE(gc->shrinker_hits, h + 1); + queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + no_items = false; + } + } + + /* + * If there are no inodes to inactivate, we don't want the shrinker + * to think there's deferred work to call us back about. + */ + if (no_items) + return LONG_MAX; + + return SHRINK_STOP; +} + +/* Register a shrinker so we can accelerate inodegc and throttle queuing. */ +int +xfs_inodegc_register_shrinker( + struct xfs_mount *mp) +{ + struct shrinker *shrink = &mp->m_inodegc_shrinker; + + shrink->count_objects = xfs_inodegc_shrinker_count; + shrink->scan_objects = xfs_inodegc_shrinker_scan; + shrink->seeks = 0; + shrink->flags = SHRINKER_NONSLAB; + shrink->batch = XFS_INODEGC_SHRINKER_BATCH; + + return register_shrinker(shrink); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index c751cc32dc46..2e4cfddf8b8e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -59,6 +59,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, unsigned int iwalk_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm); +void xfs_blockgc_flush_all(struct xfs_mount *mp); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); @@ -68,16 +69,17 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); void xfs_blockgc_worker(struct work_struct *work); -#ifdef CONFIG_XFS_QUOTA -int xfs_dqrele_all_inodes(struct xfs_mount *mp, unsigned int qflags); -#else -# define xfs_dqrele_all_inodes(mp, qflags) (0) -#endif - int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); +void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_flush(struct xfs_mount *mp); +void xfs_inodegc_stop(struct xfs_mount *mp); +void xfs_inodegc_start(struct xfs_mount *mp); +void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); +int xfs_inodegc_register_shrinker(struct xfs_mount *mp); + #endif diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 9b3994b9c716..017904a34c02 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -201,7 +201,7 @@ xlog_recover_icreate_commit_pass2( if (length != igeo->ialloc_blks && length != 
igeo->ialloc_min_blks) { xfs_warn(log->l_mp, - "%s: unsupported chunk length", __FUNCTION__); + "%s: unsupported chunk length", __func__); return -EINVAL; } @@ -209,7 +209,7 @@ xlog_recover_icreate_commit_pass2( if ((count >> mp->m_sb.sb_inopblog) != length) { xfs_warn(log->l_mp, "%s: inconsistent inode count and chunk length", - __FUNCTION__); + __func__); return -EINVAL; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 990b72ae3635..a4f6f034fb81 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -132,7 +132,7 @@ xfs_ilock_attr_map_shared( /* * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 - * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * multi-reader locks: invalidate_lock and the i_lock. This routine allows * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, @@ -140,23 +140,23 @@ xfs_ilock_attr_map_shared( * * Basic locking order: * - * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> invalidate_lock -> page_lock -> i_ilock * * mmap_lock locking order: * * i_rwsem -> page lock -> mmap_lock - * mmap_lock -> i_mmap_lock -> page_lock + * mmap_lock -> invalidate_lock -> page_lock * * The difference in mmap_lock locking order mean that we cannot hold the - * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can - * fault in pages during copy in/out (for buffered IO) or require the mmap_lock - * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because - * page faults already hold the mmap_lock. + * invalidate_lock over syscall based read(2)/write(2) based IO. These IO paths + * can fault in pages during copy in/out (for buffered IO) or require the + * mmap_lock in get_user_pages() to map the user pages into the kernel address + * space for direct IO. Similarly the i_rwsem cannot be taken inside a page + * fault because page faults already hold the mmap_lock. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both - * taken in places where we need to invalidate the page cache in a race + * take both the i_rwsem and the invalidate_lock. These locks should *only* be + * both taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). 
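Concretely, an extent-manipulating path takes both locks through xfs_ilock() before touching the page cache; a schematic of the pattern (kernel-style sketch under the locking rules above, not a complete implementation):

	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
	/* i_rwsem blocks new read()/write() callers and invalidate_lock
	 * blocks page faults, so the invalidation cannot race with either. */
	truncate_pagecache_range(VFS_I(ip), start, end);
	/* ... manipulate extents, taking XFS_ILOCK_EXCL per transaction ... */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);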
*/ @@ -188,10 +188,13 @@ xfs_ilock( XFS_IOLOCK_DEP(lock_flags)); } - if (lock_flags & XFS_MMAPLOCK_EXCL) - mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_MMAPLOCK_SHARED) - mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_mapping->invalidate_lock, + XFS_MMAPLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( } if (lock_flags & XFS_MMAPLOCK_EXCL) { - if (!mrtryupdate(&ip->i_mmaplock)) + if (!down_write_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } else if (lock_flags & XFS_MMAPLOCK_SHARED) { - if (!mrtryaccess(&ip->i_mmaplock)) + if (!down_read_trylock(&VFS_I(ip)->i_mapping->invalidate_lock)) goto out_undo_iolock; } @@ -258,9 +261,9 @@ xfs_ilock_nowait( out_undo_mmaplock: if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) up_write(&VFS_I(ip)->i_rwsem); @@ -307,9 +310,9 @@ xfs_iunlock( up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrunlock_excl(&ip->i_mmaplock); + up_write(&VFS_I(ip)->i_mapping->invalidate_lock); else if (lock_flags & XFS_MMAPLOCK_SHARED) - mrunlock_shared(&ip->i_mmaplock); + up_read(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); @@ -335,7 +338,7 @@ xfs_ilock_demote( if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); if (lock_flags & XFS_MMAPLOCK_EXCL) - mrdemote(&ip->i_mmaplock); + downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock); if (lock_flags & XFS_IOLOCK_EXCL) downgrade_write(&VFS_I(ip)->i_rwsem); @@ -343,9 +346,29 @@ xfs_ilock_demote( } #if defined(DEBUG) || defined(XFS_WARN) -int +static inline bool +__xfs_rwsem_islocked( + struct rw_semaphore *rwsem, + bool shared) +{ + if (!debug_locks) + return rwsem_is_locked(rwsem); + + if (!shared) + return lockdep_is_held_type(rwsem, 0); + + /* + * We are checking that the lock is held at least in shared + * mode but don't care that it might be held exclusively + * (i.e. shared | excl). Hence we check if the lock is held + * in any mode rather than an explicit shared mode. 
+ */ + return lockdep_is_held_type(rwsem, -1); +} + +bool xfs_isilocked( - xfs_inode_t *ip, + struct xfs_inode *ip, uint lock_flags) { if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { @@ -355,20 +378,17 @@ xfs_isilocked( } if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { - if (!(lock_flags & XFS_MMAPLOCK_SHARED)) - return !!ip->i_mmaplock.mr_writer; - return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock, + (lock_flags & XFS_MMAPLOCK_SHARED)); } - if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { - if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !debug_locks || - lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); - return rwsem_is_locked(&VFS_I(ip)->i_rwsem); + if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) { + return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem, + (lock_flags & XFS_IOLOCK_SHARED)); } ASSERT(0); - return 0; + return false; } #endif @@ -532,12 +552,10 @@ again: } /* - * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - - * the mmaplock or the ilock, but not more than one type at a time. If we lock - * more than one at a time, lockdep will report false positives saying we have - * violated locking orders. The iolock must be double-locked separately since - * we use i_rwsem for that. We now support taking one lock EXCL and the other - * SHARED. + * xfs_lock_two_inodes() can only be used to lock the ilock. The iolock and + * mmaplock must be double-locked separately since we use i_rwsem and + * invalidate_lock for that. We now support taking one lock EXCL and the + * other SHARED. */ void xfs_lock_two_inodes( @@ -555,15 +573,8 @@ xfs_lock_two_inodes( ASSERT(hweight32(ip1_mode) == 1); ASSERT(!(ip0_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); ASSERT(!(ip1_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); - ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip0_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) || - !(ip1_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - + ASSERT(!(ip0_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(ip1_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -663,7 +674,7 @@ xfs_lookup( trace_xfs_lookup(dp, name); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + if (xfs_is_shutdown(dp->i_mount)) return -EIO; error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); @@ -705,7 +716,7 @@ xfs_inode_inherit_flags( di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(mode)) { if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && - xfs_sb_version_hasrealtime(&ip->i_mount->m_sb) + xfs_has_realtime(ip->i_mount)) di_flags |= XFS_DIFLAG_REALTIME; if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSIZE; @@ -826,8 +837,7 @@ xfs_init_new_inode( inode->i_rdev = rdev; ip->i_projid = prid; - if (dir && !(dir->i_mode & S_ISGID) && - (mp->m_flags & XFS_MOUNT_GRPID)) { + if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { inode_fsuid_set(inode, mnt_userns); inode->i_gid = dir->i_gid; inode->i_mode = mode; @@ -857,7 +867,7 @@ xfs_init_new_inode( ip->i_extsize = 0; ip->i_diflags = 0; - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { inode_set_iversion(inode, 1);
ip->i_cowextsize = 0; ip->i_crtime = tv; @@ -897,7 +907,7 @@ xfs_init_new_inode( * this saves us from needing to run a separate transaction to set the * fork offset in the immediate future. */ - if (init_xattrs && xfs_sb_version_hasattr(&mp->m_sb)) { + if (init_xattrs && xfs_has_attr(mp)) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); } @@ -976,7 +986,7 @@ xfs_create( trace_xfs_create(dp, name); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; prid = xfs_get_initial_prid(dp); @@ -1068,7 +1078,7 @@ xfs_create( * create transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); /* @@ -1130,7 +1140,7 @@ xfs_create_tmpfile( uint resblks; xfs_ino_t ino; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; prid = xfs_get_initial_prid(dp); @@ -1160,7 +1170,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); /* @@ -1220,7 +1230,7 @@ xfs_link( ASSERT(!S_ISDIR(VFS_I(sip)->i_mode)); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_qm_dqattach(sip); @@ -1294,7 +1304,7 @@ xfs_link( * link transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); @@ -1435,10 +1445,10 @@ xfs_release( return 0; /* If this is a read-only mount, don't do this (would generate I/O) */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) return 0; - if (!XFS_FORCED_SHUTDOWN(mp)) { + if (!xfs_is_shutdown(mp)) { int truncated; /* @@ -1521,7 +1531,7 @@ xfs_inactive_truncate( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1592,7 +1602,7 @@ xfs_inactive_ifree( "Failed to remove inode(s) from unlinked list. " "Please free space, unmount and run xfs_repair."); } else { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); } return error; } @@ -1628,7 +1638,7 @@ xfs_inactive_ifree( * might do that, we need to make sure. Otherwise the * inode might be lost for a long time or forever. */ - if (!XFS_FORCED_SHUTDOWN(mp)) { + if (!xfs_is_shutdown(mp)) { xfs_notice(mp, "%s: xfs_ifree returned error %d", __func__, error); xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); @@ -1655,6 +1665,59 @@ xfs_inactive_ifree( } /* + * Returns true if we need to update the on-disk metadata before we can free + * the memory used by this inode. Updates include freeing post-eof + * preallocations; freeing COW staging extents; and marking the inode free in + * the inobt if it is on the unlinked list. + */ +bool +xfs_inode_needs_inactive( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (VFS_I(ip)->i_mode == 0) + return false; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (xfs_is_readonly(mp)) + return false; + + /* If the log isn't running, push inodes straight to reclaim. 
*/ + if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp)) + return false; + + /* Metadata inodes require explicit resource cleanup. */ + if (xfs_is_metadata_inode(ip)) + return false; + + /* Want to clean out the cow blocks if there are any. */ + if (cow_ifp && cow_ifp->if_bytes > 0) + return true; + + /* Unlinked files must be freed. */ + if (VFS_I(ip)->i_nlink == 0) + return true; + + /* + * This file isn't being freed, so check if there are post-eof blocks + * to free. @force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with broken + * free space accounting. + * + * Note: don't bother with iolock here since lockdep complains about + * acquiring it in reclaim context. We have the only reference to the + * inode at this point anyways. + */ + return xfs_can_free_eofblocks(ip, true); +} + +/* * xfs_inactive * * This is called when the vnode reference count for the vnode @@ -1683,7 +1746,7 @@ xfs_inactive( ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); /* If this is a read-only mount, don't do this (would generate I/O) */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) goto out; /* Metadata inodes require explicit resource cleanup. */ @@ -1958,7 +2021,7 @@ xfs_iunlink_destroy( rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, xfs_iunlink_free_item, &freed_anything); - ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); + ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount)); } /* @@ -2703,7 +2766,7 @@ xfs_remove( trace_xfs_remove(dp, name); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_qm_dqattach(dp); @@ -2802,7 +2865,7 @@ xfs_remove( * remove transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); @@ -2879,7 +2942,7 @@ xfs_finish_rename( * If this is a synchronous mount, make sure that the rename transaction * goes to disk before returning to the user. */ - if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); @@ -3462,7 +3525,7 @@ xfs_iflush( * happen but we need to still do it to ensure backwards compatibility * with old kernels that predate logging all inode changes. */ - if (!xfs_sb_version_has_v3inode(&mp->m_sb)) + if (!xfs_has_v3inodes(mp)) ip->i_flushiter++; /* @@ -3484,7 +3547,7 @@ xfs_iflush( xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); /* Wrap, we never let the log put out DI_MAX_FLUSH */ - if (!xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (!xfs_has_v3inodes(mp)) { if (ip->i_flushiter == DI_MAX_FLUSH) ip->i_flushiter = 0; } @@ -3603,7 +3666,7 @@ xfs_iflush_cluster( * AIL, leaving a dirty/unpinned inode attached to the buffer * that otherwise looks like it should be flushed. 
*/ - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); @@ -3741,11 +3804,8 @@ xfs_ilock2_io_mmap( ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2)); if (ret) return ret; - if (ip1 == ip2) - xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); - else - xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL, - ip2, XFS_MMAPLOCK_EXCL); + filemap_invalidate_lock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); return 0; } @@ -3755,12 +3815,9 @@ xfs_iunlock2_io_mmap( struct xfs_inode *ip1, struct xfs_inode *ip2) { - bool same_inode = (ip1 == ip2); - - xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); - if (!same_inode) - xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); + filemap_invalidate_unlock_two(VFS_I(ip1)->i_mapping, + VFS_I(ip2)->i_mapping); inode_unlock(VFS_I(ip2)); - if (!same_inode) + if (ip1 != ip2) inode_unlock(VFS_I(ip1)); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 4b6703dbffb8..b21b177832d1 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -40,8 +40,8 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ + struct llist_node i_gclist; /* deferred inactivation list */ /* * Bitsets of inode metadata that have been checked and/or are sick. @@ -240,6 +240,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ +#define XFS_NEED_INACTIVE (1 << 10) /* see XFS_INACTIVATING below */ /* * If this unlinked inode is in the middle of recovery, don't let drop_inode * truncate and free the inode. This can happen if we iget the inode during @@ -249,13 +250,29 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ /* + * If we need to update on-disk metadata before this IRECLAIMABLE inode can be + * freed, then NEED_INACTIVE will be set. Once we start the updates, the + * INACTIVATING bit will be set to keep iget away from this inode. After the + * inactivation completes, both flags will be cleared and the inode is a + * plain old IRECLAIMABLE inode. + */ +#define XFS_INACTIVATING (1 << 13) + +/* All inode state flags related to inode reclaim. */ +#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ + XFS_IRECLAIM | \ + XFS_NEED_INACTIVE | \ + XFS_INACTIVATING) + +/* * Per-lifetime flags need to be reset when re-using a reclaimable inode during * inode lookup. This prevents unintended behaviour on the new inode from * occurring. */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ - XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ + XFS_INACTIVATING) /* * Flags for inode locking. @@ -382,8 +399,7 @@ enum layout_break_reason { * new subdirectory gets S_ISGID bit from parent.
*/ #define XFS_INHERIT_GID(pip) \ - (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ - (VFS_I(pip)->i_mode & S_ISGID)) + (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID)) int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); @@ -410,7 +426,7 @@ void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); -int xfs_isilocked(xfs_inode_t *, uint); +bool xfs_isilocked(struct xfs_inode *, uint); uint xfs_ilock_data_map_shared(struct xfs_inode *); uint xfs_ilock_attr_map_shared(struct xfs_inode *); @@ -493,6 +509,8 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 +bool xfs_inode_needs_inactive(struct xfs_inode *ip); + int xfs_iunlink_init(struct xfs_perag *pag); void xfs_iunlink_destroy(struct xfs_perag *pag); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 35de30849fcc..0659d19c211e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -396,7 +396,7 @@ xfs_inode_to_log_dinode( /* log a dummy value to ensure log structure is fully initialised */ to->di_next_unlinked = NULLAGINO; - if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (xfs_has_v3inodes(ip->i_mount)) { to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index e0072a6cd2d3..239dd2e3384e 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -295,7 +295,7 @@ xlog_recover_inode_commit_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_sb_version_has_v3inode(&mp->m_sb) && + if (!xfs_has_v3inodes(mp) && ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 16039ea10ac9..0c795dc093ef 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -756,7 +756,7 @@ xfs_ioc_fsbulkstat( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq))) @@ -927,7 +927,7 @@ xfs_ioc_bulkstat( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) @@ -977,7 +977,7 @@ xfs_ioc_inumbers( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) @@ -1010,7 +1010,7 @@ xfs_ioc_fsgeometry( struct xfs_fsop_geom fsgeo; size_t len; - xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version); + xfs_fs_geometry(mp, &fsgeo, struct_version); if (struct_version <= 3) len = sizeof(struct xfs_fsop_geom_v1); @@ -1213,7 +1213,7 @@ xfs_ioctl_setattr_xflags( /* diflags2 only valid for v3 inodes. 
*/ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); - if (i_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb)) + if (i_flags2 && !xfs_has_v3inodes(mp)) return -EINVAL; ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); @@ -1237,8 +1237,7 @@ xfs_ioctl_setattr_prepare_dax( if (S_ISDIR(inode->i_mode)) return; - if ((mp->m_flags & XFS_MOUNT_DAX_ALWAYS) || - (mp->m_flags & XFS_MOUNT_DAX_NEVER)) + if (xfs_has_dax_always(mp) || xfs_has_dax_never(mp)) return; if (((fa->fsx_xflags & FS_XFLAG_DAX) && @@ -1263,10 +1262,10 @@ xfs_ioctl_setattr_get_trans( struct xfs_trans *tp; int error = -EROFS; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) goto out_error; error = -EIO; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) goto out_error; error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, @@ -1274,7 +1273,7 @@ xfs_ioctl_setattr_get_trans( if (error) goto out_error; - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); return tp; @@ -1362,9 +1361,9 @@ xfs_ioctl_setattr_check_projid( if (!fa->fsx_valid) return 0; - /* Disallow 32bit project ids if projid32bit feature is not enabled. */ + /* Disallow 32bit project ids if 32bit IDs are not enabled. */ if (fa->fsx_projid > (uint16_t)-1 && - !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) + !xfs_has_projid32(ip->i_mount)) return -EINVAL; return 0; } @@ -1450,7 +1449,7 @@ xfs_fileattr_set( /* Change the ownerships and register project quota modifications */ if (ip->i_projid != fa->fsx_projid) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { + if (XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } @@ -1467,7 +1466,7 @@ xfs_fileattr_set( else ip->i_extsize = 0; - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) ip->i_cowextsize = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); else @@ -1792,7 +1791,7 @@ xfs_ioc_swapext( goto out_put_tmp_file; } - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (xfs_is_shutdown(ip->i_mount)) { error = -EIO; goto out_put_tmp_file; } @@ -2081,7 +2080,7 @@ xfs_file_ioctl( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) return -EROFS; if (copy_from_user(&inout, arg, sizeof(inout))) @@ -2198,7 +2197,7 @@ xfs_file_ioctl( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) return -EROFS; if (copy_from_user(&eofb, arg, sizeof(eofb))) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index e6506773ba55..8783af203cfc 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -50,7 +50,7 @@ xfs_compat_ioc_fsgeometry_v1( { struct xfs_fsop_geom fsgeo; - xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); + xfs_fs_geometry(mp, &fsgeo, 3); /* The 32-bit variant simply has some padding at the end */ if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) return -EFAULT; @@ -254,7 +254,7 @@ xfs_compat_ioc_fsbulkstat( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (get_user(addr, &p32->lastip)) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d8cd2583dedb..093758440ad5 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -132,7 +132,7 @@ xfs_eof_alignment( * If mounted with the "-o swalloc" option the alignment is * increased from the strip unit size to the stripe width. 
*/ - if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + if (mp->m_swidth && xfs_has_swalloc(mp)) align = mp->m_swidth; else if (mp->m_dalign) align = mp->m_dalign; @@ -734,7 +734,7 @@ xfs_direct_write_iomap_begin( ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* @@ -874,7 +874,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* we can't use delayed allocations when using extent size hints */ @@ -994,7 +994,7 @@ xfs_buffered_write_iomap_begin( * Determine the initial size of the preallocation. * We clean up any extra preallocation when the file is closed. */ - if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + if (xfs_has_allocsize(mp)) prealloc_blocks = mp->m_allocsize_blocks; else prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, @@ -1064,11 +1064,11 @@ found_cow: error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); if (error) return error; - } else { - xfs_trim_extent(&cmap, offset_fsb, - imap.br_startoff - offset_fsb); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + + xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1127,7 +1127,7 @@ xfs_buffered_write_iomap_end( error = xfs_bmap_punch_delalloc_range(ip, start_fsb, end_fsb - start_fsb); - if (error && !XFS_FORCED_SHUTDOWN(mp)) { + if (error && !xfs_is_shutdown(mp)) { xfs_alert(mp, "%s: unable to clean up ino %lld", __func__, ip->i_ino); return error; @@ -1162,7 +1162,7 @@ xfs_read_iomap_begin( ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_ilock_for_iomap(ip, flags, &lockmode); @@ -1203,7 +1203,7 @@ xfs_seek_iomap_begin( int error = 0; unsigned lockmode; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; lockmode = xfs_ilock_data_map_shared(ip); @@ -1285,7 +1285,7 @@ xfs_xattr_iomap_begin( int nimaps = 1, error = 0; unsigned lockmode; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; lockmode = xfs_ilock_attr_map_shared(ip); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 93c082db04b7..a607d6aca5c4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -393,7 +393,7 @@ xfs_vn_unlink( * but still hashed. This is incompatible with case-insensitive * mode, so invalidate (unhash) the dentry in CI-mode. */ - if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + if (xfs_has_asciici(XFS_M(dir->i_sb))) d_invalidate(dentry); return 0; } @@ -558,10 +558,10 @@ xfs_stat_blksize( * default buffered I/O size, return that, otherwise return the compat * default. 
*/ - if (mp->m_flags & XFS_MOUNT_LARGEIO) { + if (xfs_has_large_iosize(mp)) { if (mp->m_swidth) return XFS_FSB_TO_B(mp, mp->m_swidth); - if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + if (xfs_has_allocsize(mp)) return 1U << mp->m_allocsize_log; } @@ -582,7 +582,7 @@ xfs_vn_getattr( trace_xfs_getattr(ip); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; stat->size = XFS_ISIZE(ip); @@ -597,7 +597,7 @@ xfs_vn_getattr( stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = ip->i_crtime; @@ -673,10 +673,10 @@ xfs_vn_change_ok( { struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) return -EROFS; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; return setattr_prepare(mnt_userns, dentry, iattr); @@ -778,7 +778,7 @@ xfs_setattr_nonsize( * in the transaction. */ if (!uid_eq(iuid, uid)) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { + if (XFS_IS_UQUOTA_ON(mp)) { ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = xfs_qm_vop_chown(tp, ip, @@ -787,8 +787,8 @@ xfs_setattr_nonsize( inode->i_uid = uid; } if (!gid_eq(igid, gid)) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + if (XFS_IS_GQUOTA_ON(mp)) { + ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); @@ -808,7 +808,7 @@ xfs_setattr_nonsize( XFS_STATS_INC(mp, xs_ig_attrchg); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); @@ -1037,7 +1037,7 @@ xfs_setattr_size( XFS_STATS_INC(mp, xs_ig_attrchg); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); @@ -1287,11 +1287,11 @@ xfs_inode_should_enable_dax( { if (!IS_ENABLED(CONFIG_FS_DAX)) return false; - if (ip->i_mount->m_flags & XFS_MOUNT_DAX_NEVER) + if (xfs_has_dax_never(ip->i_mount)) return false; if (!xfs_inode_supports_dax(ip)) return false; - if (ip->i_mount->m_flags & XFS_MOUNT_DAX_ALWAYS) + if (xfs_has_dax_always(ip->i_mount)) return true; if (ip->i_diflags2 & XFS_DIFLAG2_DAX) return true; @@ -1344,7 +1344,7 @@ xfs_setup_inode( gfp_t gfp_mask; inode->i_ino = ip->i_ino; - inode->i_state = I_NEW; + inode->i_state |= I_NEW; inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ @@ -1401,7 +1401,7 @@ xfs_setup_iops( inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: - if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + if (xfs_has_asciici(XFS_M(inode->i_sb))) inode->i_op = &xfs_dir_ci_inode_operations; else inode->i_op = &xfs_dir_inode_operations; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f331975a16de..c08c79d9e311 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -19,6 +19,7 @@ #include "xfs_error.h" #include "xfs_icache.h" #include "xfs_health.h" +#include "xfs_trans.h" /* * Bulk Stat @@ -107,7 +108,7 @@ xfs_bulkstat_one_int( buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { buf->bs_btime = ip->i_crtime.tv_sec; buf->bs_btime_nsec = ip->i_crtime.tv_nsec; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) @@ -163,6 +164,7 @@ xfs_bulkstat_one( .formatter = 
formatter, .breq = breq, }; + struct xfs_trans *tp; int error; if (breq->mnt_userns != &init_user_ns) { @@ -178,9 +180,18 @@ xfs_bulkstat_one( if (!bc.buf) return -ENOMEM; - error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, NULL, - breq->startino, &bc); + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(breq->mp, &tp); + if (error) + goto out; + error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, tp, + breq->startino, &bc); + xfs_trans_cancel(tp); +out: kmem_free(bc.buf); /* @@ -244,6 +255,7 @@ xfs_bulkstat( .formatter = formatter, .breq = breq, }; + struct xfs_trans *tp; int error; if (breq->mnt_userns != &init_user_ns) { @@ -259,9 +271,18 @@ xfs_bulkstat( if (!bc.buf) return -ENOMEM; - error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags, - xfs_bulkstat_iwalk, breq->icount, &bc); + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(breq->mp, &tp); + if (error) + goto out; + error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags, + xfs_bulkstat_iwalk, breq->icount, &bc); + xfs_trans_cancel(tp); +out: kmem_free(bc.buf); /* @@ -374,13 +395,24 @@ xfs_inumbers( .formatter = formatter, .breq = breq, }; + struct xfs_trans *tp; int error = 0; if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; - error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags, + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(breq->mp, &tp); + if (error) + goto out; + + error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags, xfs_inumbers_walk, breq->icount, &ic); + xfs_trans_cancel(tp); +out: /* * We found some inode groups, so clear the error status and return diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 917d51eefee3..7558486f4937 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -83,6 +83,9 @@ struct xfs_iwalk_ag { /* Skip empty inobt records? */ unsigned int skip_empty:1; + + /* Drop the (hopefully empty) transaction when calling iwalk_fn. */ + unsigned int drop_trans:1; }; /* @@ -352,7 +355,6 @@ xfs_iwalk_run_callbacks( int *has_more) { struct xfs_mount *mp = iwag->mp; - struct xfs_trans *tp = iwag->tp; struct xfs_inobt_rec_incore *irec; xfs_agino_t next_agino; int error; @@ -362,10 +364,15 @@ xfs_iwalk_run_callbacks( ASSERT(iwag->nr_recs > 0); /* Delete cursor but remember the last record we cached... */ - xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); + xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0); irec = &iwag->recs[iwag->nr_recs - 1]; ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK); + if (iwag->drop_trans) { + xfs_trans_cancel(iwag->tp); + iwag->tp = NULL; + } + error = xfs_iwalk_ag_recs(iwag); if (error) return error; @@ -376,8 +383,15 @@ xfs_iwalk_run_callbacks( if (!has_more) return 0; + if (iwag->drop_trans) { + error = xfs_trans_alloc_empty(mp, &iwag->tp); + if (error) + return error; + } + /* ...and recreate the cursor just past where we left off. 
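The bulkstat, inumbers and iwalk hunks above all repeat one pattern: allocate an empty transaction, hand it to the walk so the buffer cache's recursive locking can detect inobt cycles without deadlocking, then cancel it. Condensed into a single hedged sketch (the wrapper name is hypothetical; the xfs_* calls and their arguments are the ones used in the hunks above):

/* Hypothetical wrapper condensing the pattern above; not actual kernel
 * code. The empty transaction exists only for its recursive buffer
 * locking, so cancelling it afterwards is always safe. */
static int iwalk_with_empty_trans(struct xfs_mount *mp,
                struct xfs_ibulk *breq, struct xfs_bstat_chunk *bc)
{
        struct xfs_trans        *tp;
        int                     error;

        error = xfs_trans_alloc_empty(mp, &tp);
        if (error)
                return error;

        error = xfs_iwalk(mp, tp, breq->startino, breq->flags,
                        xfs_bulkstat_iwalk, breq->icount, bc);

        xfs_trans_cancel(tp);           /* nothing to commit */
        return error;
}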
*/ - error = xfs_inobt_cur(mp, tp, iwag->pag, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp, + agi_bpp); if (error) return error; @@ -390,7 +404,6 @@ xfs_iwalk_ag( struct xfs_iwalk_ag *iwag) { struct xfs_mount *mp = iwag->mp; - struct xfs_trans *tp = iwag->tp; struct xfs_perag *pag = iwag->pag; struct xfs_buf *agi_bp = NULL; struct xfs_btree_cur *cur = NULL; @@ -469,7 +482,7 @@ xfs_iwalk_ag( error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more); out: - xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); + xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error); return error; } @@ -599,8 +612,18 @@ xfs_iwalk_ag_work( error = xfs_iwalk_alloc(iwag); if (error) goto out; + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(mp, &iwag->tp); + if (error) + goto out; + iwag->drop_trans = 1; error = xfs_iwalk_ag(iwag); + if (iwag->tp) + xfs_trans_cancel(iwag->tp); xfs_iwalk_free(iwag); out: xfs_perag_put(iwag->pag); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 60ac5fd63f1e..f6cd2d4aa770 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -41,6 +41,8 @@ xlog_dealloc_log( /* local state machine functions */ STATIC void xlog_state_done_syncing( struct xlog_in_core *iclog); +STATIC void xlog_state_do_callback( + struct xlog *log); STATIC int xlog_state_get_iclog_space( struct xlog *log, @@ -50,11 +52,6 @@ xlog_state_get_iclog_space( int *continued_write, int *logoffsetp); STATIC void -xlog_state_switch_iclogs( - struct xlog *log, - struct xlog_in_core *iclog, - int eventual_size); -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); @@ -246,7 +243,7 @@ xlog_grant_head_wait( list_add_tail(&tic->t_queue, &head->waiters); do { - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto shutdown; xlog_grant_push_ail(log, need_bytes); @@ -260,7 +257,7 @@ xlog_grant_head_wait( trace_xfs_log_grant_wake(log, tic); spin_lock(&head->lock); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto shutdown; } while (xlog_space_left(log, &head->grant) < need_bytes); @@ -298,7 +295,7 @@ xlog_grant_head_check( int free_bytes; int error = 0; - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); /* * If there are other waiters on the queue then give them a chance at @@ -359,13 +356,13 @@ xfs_log_writable( * mounts allow internal writes for log recovery and unmount purposes, * so don't restrict that case. */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) return false; if (xfs_readonly_buftarg(mp->m_ddev_targp)) return false; if (xfs_readonly_buftarg(mp->m_log->l_targ)) return false; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xlog_is_shutdown(mp->m_log)) return false; return true; } @@ -382,7 +379,7 @@ xfs_log_regrant( int need_bytes; int error = 0; - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); @@ -450,7 +447,7 @@ xfs_log_reserve( ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); @@ -487,6 +484,42 @@ out_error: } /* + * Run all the pending iclog callbacks and wake log force waiters and iclog + * space waiters so they can process the newly set shutdown state. 
We really + * don't care what order we process callbacks here because the log is shut down + * and so state cannot change on disk anymore. + * + * We avoid processing actively referenced iclogs so that we don't run callbacks + * while the iclog owner might still be preparing the iclog for IO submssion. + * These will be caught by xlog_state_iclog_release() and call this function + * again to process any callbacks that may have been added to that iclog. + */ +static void +xlog_state_shutdown_callbacks( + struct xlog *log) +{ + struct xlog_in_core *iclog; + LIST_HEAD(cb_list); + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + do { + if (atomic_read(&iclog->ic_refcnt)) { + /* Reference holder will re-run iclog callbacks. */ + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); + wake_up_all(&iclog->ic_write_wait); + wake_up_all(&iclog->ic_force_wait); + } while ((iclog = iclog->ic_next) != log->l_iclog); + + wake_up_all(&log->l_flush_wait); + spin_unlock(&log->l_icloglock); + + xlog_cil_process_committed(&cb_list); +} + +/* * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. * @@ -520,12 +553,11 @@ xlog_state_release_iclog( xfs_lsn_t old_tail_lsn) { xfs_lsn_t tail_lsn; + bool last_ref; + lockdep_assert_held(&log->l_icloglock); trace_xlog_iclog_release(iclog, _RET_IP_); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - /* * Grabbing the current log tail needs to be atomic w.r.t. the writing * of the tail LSN into the iclog so we guarantee that the log tail does @@ -543,7 +575,23 @@ xlog_state_release_iclog( iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); } - if (!atomic_dec_and_test(&iclog->ic_refcnt)) + last_ref = atomic_dec_and_test(&iclog->ic_refcnt); + + if (xlog_is_shutdown(log)) { + /* + * If there are no more references to this iclog, process the + * pending iclog callbacks that were waiting on the release of + * this iclog. + */ + if (last_ref) { + spin_unlock(&log->l_icloglock); + xlog_state_shutdown_callbacks(log); + spin_lock(&log->l_icloglock); + } + return -EIO; + } + + if (!last_ref) return 0; if (iclog->ic_state != XLOG_STATE_WANT_SYNC) { @@ -580,25 +628,27 @@ xfs_log_mount( xfs_daddr_t blk_offset, int num_bblks) { - bool fatal = xfs_sb_version_hascrc(&mp->m_sb); + struct xlog *log; + bool fatal = xfs_has_crc(mp); int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + if (!xfs_has_norecovery(mp)) { xfs_notice(mp, "Mounting V%d Filesystem", XFS_SB_VERSION_NUM(&mp->m_sb)); } else { xfs_notice(mp, "Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", XFS_SB_VERSION_NUM(&mp->m_sb)); - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + ASSERT(xfs_is_readonly(mp)); } - mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); - if (IS_ERR(mp->m_log)) { - error = PTR_ERR(mp->m_log); + log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (IS_ERR(log)) { + error = PTR_ERR(log); goto out; } + mp->m_log = log; /* * Validate the given log space and drop a critical message via syslog @@ -663,51 +713,51 @@ xfs_log_mount( xfs_warn(mp, "AIL initialisation failed: error %d", error); goto out_free_log; } - mp->m_log->l_ailp = mp->m_ail; + log->l_ailp = mp->m_ail; /* * skip log recovery on a norecovery mount. pretend it all * just worked. 
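The new xlog_state_shutdown_callbacks() above uses the classic splice-then-run shape: detach each iclog's callback list while holding the icloglock, and only invoke the callbacks after the lock has been dropped. Reduced to its skeleton (a sketch of the shape, not the complete function):

/* Skeleton of the splice-then-run shape used above. Detaching with
 * list_splice_init() under the spinlock means the callbacks never run
 * inside the lock, and the source list is left empty and reusable. */
LIST_HEAD(cb_list);

spin_lock(&log->l_icloglock);
list_splice_init(&iclog->ic_callbacks, &cb_list);
wake_up_all(&iclog->ic_write_wait);
wake_up_all(&iclog->ic_force_wait);
spin_unlock(&log->l_icloglock);

xlog_cil_process_committed(&cb_list);   /* runs with no locks held */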
 */
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
-  int readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
-
-  if (readonly)
-   mp->m_flags &= ~XFS_MOUNT_RDONLY;
-
-  error = xlog_recover(mp->m_log);
-
+ if (!xfs_has_norecovery(mp)) {
+  /*
+   * log recovery ignores readonly state and so we need to clear
+   * mount-based read only state so it can write to disk.
+   */
+  bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
+    &mp->m_opstate);
+  error = xlog_recover(log);
   if (readonly)
-   mp->m_flags |= XFS_MOUNT_RDONLY;
+   set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
   if (error) {
    xfs_warn(mp, "log mount/recovery failed: error %d",
     error);
-   xlog_recover_cancel(mp->m_log);
+   xlog_recover_cancel(log);
    goto out_destroy_ail;
   }
  }
 
- error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
+ error = xfs_sysfs_init(&log->l_kobj, &xfs_log_ktype, &mp->m_kobj,
    "log");
  if (error)
   goto out_destroy_ail;
 
  /* Normal transactions can now occur */
- mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+ clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
 
  /*
   * Now the log has been fully initialised and we know where our
   * space grant counters are, we can initialise the permanent ticket
   * needed for delayed logging to work.
   */
- xlog_cil_init_post_recovery(mp->m_log);
+ xlog_cil_init_post_recovery(log);
 
  return 0;
 
 out_destroy_ail:
  xfs_trans_ail_destroy(mp);
 out_free_log:
- xlog_dealloc_log(mp->m_log);
+ xlog_dealloc_log(log);
 out:
  return error;
 }
@@ -726,19 +776,22 @@ int
 xfs_log_mount_finish(
  struct xfs_mount *mp)
 {
- int   error = 0;
- bool   readonly = (mp->m_flags & XFS_MOUNT_RDONLY);
- bool   recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED;
+ struct xlog  *log = mp->m_log;
+ bool   readonly;
+ int   error = 0;
 
- if (mp->m_flags & XFS_MOUNT_NORECOVERY) {
-  ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ if (xfs_has_norecovery(mp)) {
+  ASSERT(xfs_is_readonly(mp));
   return 0;
- } else if (readonly) {
-  /* Allow unlinked processing to proceed */
-  mp->m_flags &= ~XFS_MOUNT_RDONLY;
  }
 
  /*
+  * log recovery ignores readonly state and so we need to clear
+  * mount-based read only state so it can write to disk.
+  */
+ readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+
+ /*
   * During the second phase of log recovery, we need iget and
   * iput to behave like they do for an active filesystem.
   * xfs_fs_drop_inode needs to be able to prevent the deletion
@@ -759,7 +812,8 @@ xfs_log_mount_finish(
   * mount failure occurs.
   */
  mp->m_super->s_flags |= SB_ACTIVE;
- error = xlog_recover_finish(mp->m_log);
+ if (xlog_recovery_needed(log))
+  error = xlog_recover_finish(log);
  if (!error)
   xfs_log_work_queue(mp);
  mp->m_super->s_flags &= ~SB_ACTIVE;
@@ -774,17 +828,24 @@ xfs_log_mount_finish(
   * Don't push in the error case because the AIL may have pending intents
   * that aren't removed until recovery is cancelled.
   */
- if (!error && recovered) {
-  xfs_log_force(mp, XFS_LOG_SYNC);
-  xfs_ail_push_all_sync(mp->m_ail);
+ if (xlog_recovery_needed(log)) {
+  if (!error) {
+   xfs_log_force(mp, XFS_LOG_SYNC);
+   xfs_ail_push_all_sync(mp->m_ail);
+  }
+  xfs_notice(mp, "Ending recovery (logdev: %s)",
+    mp->m_logname ? mp->m_logname : "internal");
+ } else {
+  xfs_info(mp, "Ending clean mount");
  }
  xfs_buftarg_drain(mp->m_ddev_targp);
 
+ clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
  if (readonly)
-  mp->m_flags |= XFS_MOUNT_RDONLY;
+  set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
 
 /* Make sure the log is dead if we're returning failure.
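Both functions above stash and restore the readonly bit around recovery with the same three steps. If that were ever factored out, a helper could take the shape below (the helper name is invented for illustration; test_and_clear_bit() and set_bit() are the primitives actually used above):

/* Invented helper shape, not kernel code. It shows why
 * test_and_clear_bit() is the right primitive here: the read of the old
 * value and the clear happen as a single atomic operation. */
static int run_with_readonly_cleared(struct xfs_mount *mp,
                int (*op)(struct xlog *), struct xlog *log)
{
        bool    readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
                                              &mp->m_opstate);
        int     error = op(log);

        if (readonly)
                set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
        return error;
}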
*/ - ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR)); + ASSERT(!error || xlog_is_shutdown(log)); return error; } @@ -830,7 +891,7 @@ xlog_wait_on_iclog( struct xlog *log = iclog->ic_log; trace_xlog_iclog_wait_on(iclog, _RET_IP_); - if (!XLOG_FORCED_SHUTDOWN(log) && + if (!xlog_is_shutdown(log) && iclog->ic_state != XLOG_STATE_ACTIVE && iclog->ic_state != XLOG_STATE_DIRTY) { XFS_STATS_INC(log->l_mp, xs_log_force_sleep); @@ -839,7 +900,7 @@ xlog_wait_on_iclog( spin_unlock(&log->l_icloglock); } - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; return 0; } @@ -870,7 +931,7 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf); - return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); + return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS); } /* @@ -893,7 +954,7 @@ xlog_unmount_write( error = xlog_write_unmount_record(log, tic); /* * At this point, we're umounting anyway, so there's no point in - * transitioning log state to IOERROR. Just continue... + * transitioning log state to shutdown. Just continue... */ out_err: if (error) @@ -940,7 +1001,7 @@ xfs_log_unmount_write( xfs_log_force(mp, XFS_LOG_SYNC); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return; /* @@ -972,6 +1033,20 @@ int xfs_log_quiesce( struct xfs_mount *mp) { + /* + * Clear log incompat features since we're quiescing the log. Report + * failures, though it's not fatal to have a higher log feature + * protection level than the log contents actually require. + */ + if (xfs_clear_incompat_log_features(mp)) { + int error; + + error = xfs_sync_sb(mp, false); + if (error) + xfs_warn(mp, + "Failed to clear log incompat features on quiesce"); + } + cancel_delayed_work_sync(&mp->m_log->l_work); xfs_log_force(mp, XFS_LOG_SYNC); @@ -1049,11 +1124,11 @@ xfs_log_space_wake( struct xlog *log = mp->m_log; int free_bytes; - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return; if (!list_empty_careful(&log->l_write_head.waiters)) { - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_write_head.lock); free_bytes = xlog_space_left(log, &log->l_write_head.grant); @@ -1062,7 +1137,7 @@ xfs_log_space_wake( } if (!list_empty_careful(&log->l_reserve_head.waiters)) { - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_reserve_head.lock); free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); @@ -1140,7 +1215,7 @@ xfs_log_cover( ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) && !xfs_ail_min_lsn(mp->m_log->l_ailp)) || - XFS_FORCED_SHUTDOWN(mp)); + xlog_is_shutdown(mp->m_log)); if (!xfs_log_writable(mp)) return 0; @@ -1157,7 +1232,7 @@ xfs_log_cover( * handles this for us. */ need_covered = xfs_log_need_covered(mp); - if (!need_covered && !xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (!need_covered && !xfs_has_lazysbcount(mp)) return 0; /* @@ -1230,16 +1305,18 @@ xlog_assign_tail_lsn( * wrap the tail, we should blow up. Rather than catch this case here, * we depend on other ASSERTions in other parts of the code. XXXmiken * - * This code also handles the case where the reservation head is behind - * the tail. The details of this case are described below, but the end - * result is that we return the size of the log as the amount of space left. + * If reservation head is behind the tail, we have a problem. Warn about it, + * but then treat it as if the log is empty. 
+ * + * If the log is shut down, the head and tail may be invalid or out of whack, so + * shortcut invalidity asserts in this case so that we don't trigger them + * falsely. */ STATIC int xlog_space_left( struct xlog *log, atomic64_t *head) { - int free_bytes; int tail_bytes; int tail_cycle; int head_cycle; @@ -1249,29 +1326,30 @@ xlog_space_left( xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); tail_bytes = BBTOB(tail_bytes); if (tail_cycle == head_cycle && head_bytes >= tail_bytes) - free_bytes = log->l_logsize - (head_bytes - tail_bytes); - else if (tail_cycle + 1 < head_cycle) + return log->l_logsize - (head_bytes - tail_bytes); + if (tail_cycle + 1 < head_cycle) return 0; - else if (tail_cycle < head_cycle) { + + /* Ignore potential inconsistency when shutdown. */ + if (xlog_is_shutdown(log)) + return log->l_logsize; + + if (tail_cycle < head_cycle) { ASSERT(tail_cycle == (head_cycle - 1)); - free_bytes = tail_bytes - head_bytes; - } else { - /* - * The reservation head is behind the tail. - * In this case we just want to return the size of the - * log as the amount of space left. - */ - xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); - xfs_alert(log->l_mp, - " tail_cycle = %d, tail_bytes = %d", - tail_cycle, tail_bytes); - xfs_alert(log->l_mp, - " GH cycle = %d, GH bytes = %d", - head_cycle, head_bytes); - ASSERT(0); - free_bytes = log->l_logsize; + return tail_bytes - head_bytes; } - return free_bytes; + + /* + * The reservation head is behind the tail. In this case we just want to + * return the size of the log as the amount of space left. + */ + xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); + xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d", + tail_cycle, tail_bytes); + xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d", + head_cycle, head_bytes); + ASSERT(0); + return log->l_logsize; } @@ -1349,6 +1427,32 @@ xfs_log_work_queue( } /* + * Clear the log incompat flags if we have the opportunity. + * + * This only happens if we're about to log the second dummy transaction as part + * of covering the log and we can get the log incompat feature usage lock. + */ +static inline void +xlog_clear_incompat( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; + + if (!xfs_sb_has_incompat_log_feature(&mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_ALL)) + return; + + if (log->l_covered_state != XLOG_STATE_COVER_DONE2) + return; + + if (!down_write_trylock(&log->l_incompat_users)) + return; + + xfs_clear_incompat_log_features(mp); + up_write(&log->l_incompat_users); +} + +/* * Every sync period we need to unpin all items in the AIL and push them to * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. @@ -1374,6 +1478,7 @@ xfs_log_worker( * synchronously log the superblock instead to ensure the * superblock is immediately unpinned and can be written back. 
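The rewritten xlog_space_left() above is pure (cycle, bytes) arithmetic over a circular log, so it can be modelled and unit-tested in userspace. A sketch under that assumption (the shutdown special case and the alert path are represented by plain returns):

#include <assert.h>

/* Userspace model of the arithmetic in xlog_space_left(): positions are
 * (cycle, bytes) pairs into a circular log of 'logsize' bytes. */
static int space_left(int logsize, int head_cycle, int head_bytes,
                      int tail_cycle, int tail_bytes)
{
        if (tail_cycle == head_cycle && head_bytes >= tail_bytes)
                return logsize - (head_bytes - tail_bytes);
        if (tail_cycle + 1 < head_cycle)
                return 0;                       /* head overran the tail */
        if (tail_cycle < head_cycle) {
                assert(tail_cycle == head_cycle - 1);
                return tail_bytes - head_bytes; /* head wrapped, tail has not */
        }
        return logsize; /* head behind tail: broken, claim it all free */
}

int main(void)
{
        assert(space_left(1024, 2, 100, 1, 900) == 800);        /* wrapped */
        assert(space_left(1024, 1, 300, 1, 100) == 824);        /* same cycle */
        return 0;
}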
*/ + xlog_clear_incompat(log); xfs_sync_sb(mp, true); } else xfs_log_force(mp, 0); @@ -1417,7 +1522,7 @@ xlog_alloc_log( log->l_logBBstart = blk_offset; log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; - log->l_flags |= XLOG_ACTIVE_RECOVERY; + set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; @@ -1426,7 +1531,7 @@ xlog_alloc_log( xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) log->l_iclog_roundoff = mp->m_sb.sb_logsunit; else log->l_iclog_roundoff = BBSIZE; @@ -1435,7 +1540,7 @@ xlog_alloc_log( xlog_grant_head_init(&log->l_write_head); error = -EFSCORRUPTED; - if (xfs_sb_version_hassector(&mp->m_sb)) { + if (xfs_has_sector(mp)) { log2_size = mp->m_sb.sb_logsectlog; if (log2_size < BBSHIFT) { xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", @@ -1452,7 +1557,7 @@ xlog_alloc_log( /* for larger sector sizes, must have v2 or external log */ if (log2_size && log->l_logBBstart > 0 && - !xfs_sb_version_haslogv2(&mp->m_sb)) { + !xfs_has_logv2(mp)) { xfs_warn(mp, "log sector size (0x%x) invalid for configuration.", log2_size); @@ -1461,6 +1566,8 @@ xlog_alloc_log( } log->l_sectBBsize = 1 << log2_size; + init_rwsem(&log->l_incompat_users); + xlog_get_iclog_buffer_size(mp, log); spin_lock_init(&log->l_icloglock); @@ -1476,7 +1583,6 @@ xlog_alloc_log( */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { - int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); @@ -1488,8 +1594,8 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, - KM_MAYFAIL | KM_ZERO); + iclog->ic_data = kvzalloc(log->l_iclog_size, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG @@ -1499,7 +1605,7 @@ xlog_alloc_log( memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); head->h_version = cpu_to_be32( - xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + xfs_has_logv2(log->l_mp) ? 2 : 1); head->h_size = cpu_to_be32(log->l_iclog_size); /* new fields */ head->h_fmt = cpu_to_be32(XLOG_FMT); @@ -1551,37 +1657,6 @@ out: } /* xlog_alloc_log */ /* - * Write out the commit record of a transaction associated with the given - * ticket to close off a running log write. Return the lsn of the commit record. - */ -int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *lsn) -{ - struct xfs_log_iovec reg = { - .i_addr = NULL, - .i_len = 0, - .i_type = XLOG_REG_TYPE_COMMIT, - }; - struct xfs_log_vec vec = { - .lv_niovecs = 1, - .lv_iovecp = ®, - }; - int error; - - if (XLOG_FORCED_SHUTDOWN(log)) - return -EIO; - - error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS); - if (error) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - return error; -} - -/* * Compute the LSN that we'd need to push the log tail towards in order to have * (a) enough on-disk log space to log the number of bytes specified, (b) at * least 25% of the log space free, and (c) at least 256 blocks free. 
If the @@ -1653,7 +1728,7 @@ xlog_grant_push_ail( xfs_lsn_t threshold_lsn; threshold_lsn = xlog_grant_push_threshold(log, need_bytes); - if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) + if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log)) return; /* @@ -1689,7 +1764,7 @@ xlog_pack_data( dp += BBSIZE; } - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { xlog_in_core_2_t *xhdr = iclog->ic_data; for ( ; i < BTOBB(size); i++) { @@ -1726,7 +1801,7 @@ xlog_cksum( offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; int i; int xheads; @@ -1795,7 +1870,7 @@ xlog_write_iclog( * across the log IO to archieve that. */ down(&iclog->ic_sema); - if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) { + if (xlog_is_shutdown(log)) { /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of @@ -1953,7 +2028,7 @@ xlog_sync( /* real byte length */ size = iclog->ic_offset; - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) + if (xfs_has_logv2(log->l_mp)) size += roundoff; iclog->ic_header.h_len = cpu_to_be32(size); @@ -2303,8 +2378,7 @@ xlog_write_copy_finish( int *data_cnt, int *partial_copy, int *partial_copy_len, - int log_offset, - struct xlog_in_core **commit_iclog) + int log_offset) { int error; @@ -2323,27 +2397,20 @@ xlog_write_copy_finish( *partial_copy = 0; *partial_copy_len = 0; - if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { - /* no more space in this iclog - push it. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; - - if (iclog->ic_state == XLOG_STATE_ACTIVE) - xlog_state_switch_iclogs(log, iclog, 0); - else - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR); - if (!commit_iclog) - goto release_iclog; - spin_unlock(&log->l_icloglock); - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } + if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t)) + return 0; - return 0; + /* no more space in this iclog - push it. */ + spin_lock(&log->l_icloglock); + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + xlog_is_shutdown(log)); release_iclog: error = xlog_state_release_iclog(log, iclog, 0); spin_unlock(&log->l_icloglock); @@ -2393,10 +2460,9 @@ release_iclog: int xlog_write( struct xlog *log, + struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, uint optype) { struct xlog_in_core *iclog = NULL; @@ -2426,8 +2492,6 @@ xlog_write( } len = xlog_write_calc_vec_length(ticket, log_vector, optype); - if (start_lsn) - *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2440,9 +2504,15 @@ xlog_write( ASSERT(log_offset <= iclog->ic_size - 1); ptr = iclog->ic_datap + log_offset; - /* Start_lsn is the first lsn written to. 
*/ - if (start_lsn && !*start_lsn) - *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + /* + * If we have a context pointer, pass it the first iclog we are + * writing to so it can record state needed for iclog write + * ordering. + */ + if (ctx) { + xlog_cil_set_ctx_write_state(ctx, iclog); + ctx = NULL; + } /* * This loop writes out as many regions as can fit in the amount @@ -2521,8 +2591,7 @@ xlog_write( &record_cnt, &data_cnt, &partial_copy, &partial_copy_len, - log_offset, - commit_iclog); + log_offset); if (error) return error; @@ -2560,12 +2629,7 @@ next_lv: spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (commit_iclog) { - ASSERT(optype & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } else { - error = xlog_state_release_iclog(log, iclog, 0); - } + error = xlog_state_release_iclog(log, iclog, 0); spin_unlock(&log->l_icloglock); return error; @@ -2751,8 +2815,7 @@ xlog_state_set_callback( static bool xlog_state_iodone_process_iclog( struct xlog *log, - struct xlog_in_core *iclog, - bool *ioerror) + struct xlog_in_core *iclog) { xfs_lsn_t lowest_lsn; xfs_lsn_t header_lsn; @@ -2764,15 +2827,6 @@ xlog_state_iodone_process_iclog( * Skip all iclogs in the ACTIVE & DIRTY states: */ return false; - case XLOG_STATE_IOERROR: - /* - * Between marking a filesystem SHUTDOWN and stopping the log, - * we do flush all iclogs to disk (if there wasn't a log I/O - * error). So, we do want things to go smoothly in case of just - * a SHUTDOWN w/o a LOG_IO_ERROR. - */ - *ioerror = true; - return false; case XLOG_STATE_DONE_SYNC: /* * Now that we have an iclog that is in the DONE_SYNC state, do @@ -2796,72 +2850,75 @@ xlog_state_iodone_process_iclog( } } -STATIC void -xlog_state_do_callback( +/* + * Loop over all the iclogs, running attached callbacks on them. Return true if + * we ran any callbacks, indicating that we dropped the icloglock. We don't need + * to handle transient shutdown state here at all because + * xlog_state_shutdown_callbacks() will be run to do the necessary shutdown + * cleanup of the callbacks. + */ +static bool +xlog_state_do_iclog_callbacks( struct xlog *log) + __releases(&log->l_icloglock) + __acquires(&log->l_icloglock) { - struct xlog_in_core *iclog; - struct xlog_in_core *first_iclog; - bool cycled_icloglock; - bool ioerror; - int flushcnt = 0; - int repeats = 0; + struct xlog_in_core *first_iclog = log->l_iclog; + struct xlog_in_core *iclog = first_iclog; + bool ran_callback = false; - spin_lock(&log->l_icloglock); do { - /* - * Scan all iclogs starting with the one pointed to by the - * log. Reset this starting point each time the log is - * unlocked (during callbacks). - * - * Keep looping through iclogs until one full pass is made - * without running any callbacks. 
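The new xlog_state_do_iclog_callbacks() above is annotated __releases()/__acquires() because it is entered and exited with the icloglock held but drops it in the middle. A minimal kernel-style sketch of how such a lock-juggling helper is annotated for sparse (illustrative only, not compilable standalone and not the XFS function):

/* Sketch: the caller holds 'lock' on entry and on return, but it is
 * dropped while the detached work runs. The annotations let sparse
 * check that lock balance instead of flagging it as an error. */
static bool run_detached_work(spinlock_t *lock, struct list_head *work)
        __releases(lock)
        __acquires(lock)
{
        if (list_empty(work))
                return false;
        spin_unlock(lock);
        /* ... process 'work' without the lock held ... */
        spin_lock(lock);
        return true;
}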
- */ - first_iclog = log->l_iclog; - iclog = log->l_iclog; - cycled_icloglock = false; - ioerror = false; - repeats++; + LIST_HEAD(cb_list); - do { - LIST_HEAD(cb_list); + if (xlog_state_iodone_process_iclog(log, iclog)) + break; + if (iclog->ic_state != XLOG_STATE_CALLBACK) { + iclog = iclog->ic_next; + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); - if (xlog_state_iodone_process_iclog(log, iclog, - &ioerror)) - break; + trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); + xlog_cil_process_committed(&cb_list); + trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); + ran_callback = true; - if (iclog->ic_state != XLOG_STATE_CALLBACK && - iclog->ic_state != XLOG_STATE_IOERROR) { - iclog = iclog->ic_next; - continue; - } - list_splice_init(&iclog->ic_callbacks, &cb_list); - spin_unlock(&log->l_icloglock); + spin_lock(&log->l_icloglock); + xlog_state_clean_iclog(log, iclog); + iclog = iclog->ic_next; + } while (iclog != first_iclog); + + return ran_callback; +} - trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); - xlog_cil_process_committed(&cb_list); - trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); - cycled_icloglock = true; - spin_lock(&log->l_icloglock); - if (XLOG_FORCED_SHUTDOWN(log)) - wake_up_all(&iclog->ic_force_wait); - else - xlog_state_clean_iclog(log, iclog); - iclog = iclog->ic_next; - } while (first_iclog != iclog); +/* + * Loop running iclog completion callbacks until there are no more iclogs in a + * state that can run callbacks. + */ +STATIC void +xlog_state_do_callback( + struct xlog *log) +{ + int flushcnt = 0; + int repeats = 0; + + spin_lock(&log->l_icloglock); + while (xlog_state_do_iclog_callbacks(log)) { + if (xlog_is_shutdown(log)) + break; - if (repeats > 5000) { + if (++repeats > 5000) { flushcnt += repeats; repeats = 0; xfs_warn(log->l_mp, "%s: possible infinite loop (%d iterations)", __func__, flushcnt); } - } while (!ioerror && cycled_icloglock); + } - if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || - log->l_iclog->ic_state == XLOG_STATE_IOERROR) + if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE) wake_up_all(&log->l_flush_wait); spin_unlock(&log->l_icloglock); @@ -2871,13 +2928,6 @@ xlog_state_do_callback( /* * Finish transitioning this iclog to the dirty state. * - * Make sure that we completely execute this routine only when this is - * the last call to the iclog. There is a good chance that iclog flushes, - * when we reach the end of the physical log, get turned into 2 separate - * calls to bwrite. Hence, one iclog flush could generate two calls to this - * routine. By using the reference count bwritecnt, we guarantee that only - * the second completion goes through. - * * Callbacks could take time, so they are done outside the scope of the * global state machine log lock. */ @@ -2896,7 +2946,7 @@ xlog_state_done_syncing( * split log writes, on the second, we shut down the file system and * no iclogs should ever be attempted to be written to disk again. */ - if (!XLOG_FORCED_SHUTDOWN(log)) { + if (!xlog_is_shutdown(log)) { ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); iclog->ic_state = XLOG_STATE_DONE_SYNC; } @@ -2944,7 +2994,7 @@ xlog_state_get_iclog_space( restart: spin_lock(&log->l_icloglock); - if (XLOG_FORCED_SHUTDOWN(log)) { + if (xlog_is_shutdown(log)) { spin_unlock(&log->l_icloglock); return -EIO; } @@ -3122,7 +3172,7 @@ xfs_log_ticket_ungrant( * This routine will mark the current iclog in the ring as WANT_SYNC and move * the current iclog pointer to the next iclog in the ring. 
*/ -STATIC void +void xlog_state_switch_iclogs( struct xlog *log, struct xlog_in_core *iclog, @@ -3237,10 +3287,10 @@ xfs_log_force( xlog_cil_force(log); spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state == XLOG_STATE_IOERROR) + if (xlog_is_shutdown(log)) goto out_error; + iclog = log->l_iclog; trace_xlog_iclog_force(iclog, _RET_IP_); if (iclog->ic_state == XLOG_STATE_DIRTY || @@ -3294,6 +3344,20 @@ out_error: return -EIO; } +/* + * Force the log to a specific LSN. + * + * If an iclog with that lsn can be found: + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a wait queue. All callers trying + * to force a given lsn to disk must wait on the queue attached to the + * specific in-core log. When given in-core log finally completes its write + * to disk, that thread will wake up all threads waiting on the queue. + */ static int xlog_force_lsn( struct xlog *log, @@ -3306,10 +3370,10 @@ xlog_force_lsn( bool completed; spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state == XLOG_STATE_IOERROR) + if (xlog_is_shutdown(log)) goto out_error; + iclog = log->l_iclog; while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; @@ -3379,18 +3443,13 @@ out_error: } /* - * Force the in-core log to disk for a specific LSN. - * - * Find in-core log with lsn. - * If it is in the DIRTY state, just return. - * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC - * state and go to sleep or return. - * If it is in any other state, go to sleep or return. + * Force the log to a specific checkpoint sequence. * - * Synchronous forces are implemented with a wait queue. All callers trying - * to force a given lsn to disk must wait on the queue attached to the - * specific in-core log. When given in-core log finally completes its write - * to disk, that thread will wake up all threads waiting on the queue. + * First force the CIL so that all the required changes have been flushed to the + * iclogs. If the CIL force completed it will return a commit LSN that indicates + * the iclog that needs to be flushed to stable storage. If the caller needs + * a synchronous log force, we will wait on the iclog with the LSN returned by + * xlog_cil_force_seq() to be completed. */ int xfs_log_force_seq( @@ -3619,17 +3678,15 @@ xlog_verify_grant_tail( xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); if (tail_cycle != cycle) { if (cycle - 1 != tail_cycle && - !(log->l_flags & XLOG_TAIL_WARN)) { + !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "%s: cycle - 1 != tail_cycle", __func__); - log->l_flags |= XLOG_TAIL_WARN; } if (space > BBTOB(tail_blocks) && - !(log->l_flags & XLOG_TAIL_WARN)) { + !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "%s: space > BBTOB(tail_blocks)", __func__); - log->l_flags |= XLOG_TAIL_WARN; } } } @@ -3765,105 +3822,66 @@ xlog_verify_iclog( #endif /* - * Mark all iclogs IOERROR. l_icloglock is held by the caller. - */ -STATIC int -xlog_state_ioerror( - struct xlog *log) -{ - xlog_in_core_t *iclog, *ic; - - iclog = log->l_iclog; - if (iclog->ic_state != XLOG_STATE_IOERROR) { - /* - * Mark all the incore logs IOERROR. - * From now on, no log flushes will result. 
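The xlog_verify_grant_tail() hunk above converts its warn-once logic from a racy check-then-set on l_flags into test_and_set_bit(), making the test and the set a single atomic step. The shape in isolation:

/* Warn-once shape from the hunk above: at most one caller can win the
 * test_and_set_bit() race, so the alert fires exactly once per mount. */
if (space > BBTOB(tail_blocks) &&
    !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) {
        xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES,
                        "%s: space > BBTOB(tail_blocks)", __func__);
}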
- */ - ic = iclog; - do { - ic->ic_state = XLOG_STATE_IOERROR; - ic = ic->ic_next; - } while (ic != iclog); - return 0; - } - /* - * Return non-zero, if state transition has already happened. - */ - return 1; -} - -/* - * This is called from xfs_force_shutdown, when we're forcibly - * shutting down the filesystem, typically because of an IO error. + * Perform a forced shutdown on the log. This should be called once and once + * only by the high level filesystem shutdown code to shut the log subsystem + * down cleanly. + * * Our main objectives here are to make sure that: - * a. if !logerror, flush the logs to disk. Anything modified - * after this is ignored. - * b. the filesystem gets marked 'SHUTDOWN' for all interested - * parties to find out, 'atomically'. - * c. those who're sleeping on log reservations, pinned objects and - * other resources get woken up, and be told the bad news. - * d. nothing new gets queued up after (b) and (c) are done. + * a. if the shutdown was not due to a log IO error, flush the logs to + * disk. Anything modified after this is ignored. + * b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested + * parties to find out. Nothing new gets queued after this is done. + * c. Tasks sleeping on log reservations, pinned objects and + * other resources get woken up. * - * Note: for the !logerror case we need to flush the regions held in memory out - * to disk first. This needs to be done before the log is marked as shutdown, - * otherwise the iclog writes will fail. + * Return true if the shutdown cause was a log IO error and we actually shut the + * log down. */ -int -xfs_log_force_umount( - struct xfs_mount *mp, - int logerror) +bool +xlog_force_shutdown( + struct xlog *log, + int shutdown_flags) { - struct xlog *log; - int retval; - - log = mp->m_log; + bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); /* - * If this happens during log recovery, don't worry about - * locking; the log isn't open for business yet. + * If this happens during log recovery then we aren't using the runtime + * log mechanisms yet so there's nothing to shut down. */ - if (!log || - log->l_flags & XLOG_ACTIVE_RECOVERY) { - mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - if (mp->m_sb_bp) - mp->m_sb_bp->b_flags |= XBF_DONE; - return 0; - } + if (!log || xlog_in_recovery(log)) + return false; - /* - * Somebody could've already done the hard work for us. - * No need to get locks for this. - */ - if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) { - ASSERT(XLOG_FORCED_SHUTDOWN(log)); - return 1; - } + ASSERT(!xlog_is_shutdown(log)); /* * Flush all the completed transactions to disk before marking the log - * being shut down. We need to do it in this order to ensure that - * completed operations are safely on disk before we shut down, and that - * we don't have to issue any buffer IO after the shutdown flags are set - * to guarantee this. + * being shut down. We need to do this first as shutting down the log + * before the force will prevent the log force from flushing the iclogs + * to disk. + * + * Re-entry due to a log IO error shutdown during the log force is + * prevented by the atomicity of higher level shutdown code. */ - if (!logerror) - xfs_log_force(mp, XFS_LOG_SYNC); + if (!log_error) + xfs_log_force(log->l_mp, XFS_LOG_SYNC); /* - * mark the filesystem and the as in a shutdown state and wake - * everybody up to tell them the bad news. + * Atomically set the shutdown state. 
If the shutdown state is already + * set, there someone else is performing the shutdown and so we are done + * here. This should never happen because we should only ever get called + * once by the first shutdown caller. + * + * Much of the log state machine transitions assume that shutdown state + * cannot change once they hold the log->l_icloglock. Hence we need to + * hold that lock here, even though we use the atomic test_and_set_bit() + * operation to set the shutdown state. */ spin_lock(&log->l_icloglock); - mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - if (mp->m_sb_bp) - mp->m_sb_bp->b_flags |= XBF_DONE; - - /* - * Mark the log and the iclogs with IO error flags to prevent any - * further log IO from being issued or completed. - */ - log->l_flags |= XLOG_IO_ERROR; - retval = xlog_state_ioerror(log); + if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { + spin_unlock(&log->l_icloglock); + ASSERT(0); + return false; + } spin_unlock(&log->l_icloglock); /* @@ -3883,12 +3901,12 @@ xfs_log_force_umount( * avoid races. */ spin_lock(&log->l_cilp->xc_push_lock); + wake_up_all(&log->l_cilp->xc_start_wait); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); - xlog_state_do_callback(log); + xlog_state_shutdown_callbacks(log); - /* return non-zero if log IOERROR transition had already happened */ - return retval; + return log_error; } STATIC int @@ -3926,7 +3944,7 @@ xfs_log_check_lsn( * resets the in-core LSN. We can't validate in this mode, but * modifications are not allowed anyways so just return true. */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) return true; /* @@ -3952,11 +3970,22 @@ xfs_log_check_lsn( return valid; } -bool -xfs_log_in_recovery( - struct xfs_mount *mp) +/* + * Notify the log that we're about to start using a feature that is protected + * by a log incompat feature flag. This will prevent log covering from + * clearing those flags. + */ +void +xlog_use_incompat_feat( + struct xlog *log) { - struct xlog *log = mp->m_log; + down_read(&log->l_incompat_users); +} - return log->l_flags & XLOG_ACTIVE_RECOVERY; +/* Notify the log that we've finished using log incompat features. 
*/ +void +xlog_drop_incompat_feat( + struct xlog *log) +{ + up_read(&log->l_incompat_users); } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 813b972e9788..dc1b77b92fc1 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -104,6 +104,7 @@ struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; +struct xlog; int xfs_log_force(struct xfs_mount *mp, uint flags); int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags, @@ -125,7 +126,6 @@ int xfs_log_reserve(struct xfs_mount *mp, bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); void xfs_log_unmount(struct xfs_mount *mp); -int xfs_log_force_umount(struct xfs_mount *mp, int logerror); bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); @@ -138,8 +138,11 @@ void xfs_log_work_queue(struct xfs_mount *mp); int xfs_log_quiesce(struct xfs_mount *mp); void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); -bool xfs_log_in_recovery(struct xfs_mount *); xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); +bool xlog_force_shutdown(struct xlog *log, int shutdown_flags); + +void xlog_use_incompat_feat(struct xlog *log); +void xlog_drop_incompat_feat(struct xlog *log); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4c44bc3786c0..6c93c8ada6f3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -48,6 +48,34 @@ xlog_cil_ticket_alloc( } /* + * Unavoidable forward declaration - xlog_cil_push_work() calls + * xlog_cil_ctx_alloc() itself. + */ +static void xlog_cil_push_work(struct work_struct *work); + +static struct xfs_cil_ctx * +xlog_cil_ctx_alloc(void) +{ + struct xfs_cil_ctx *ctx; + + ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + INIT_WORK(&ctx->push_work, xlog_cil_push_work); + return ctx; +} + +static void +xlog_cil_ctx_switch( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) +{ + ctx->sequence = ++cil->xc_current_sequence; + ctx->cil = cil; + cil->xc_ctx = ctx; +} + +/* * After the first stage of log recovery is done, we know where the head and * tail of the log are. We need this log initialisation done before we can * initialise the first CIL checkpoint context. @@ -185,7 +213,15 @@ xlog_cil_alloc_shadow_bufs( */ kmem_free(lip->li_lv_shadow); - lv = kmem_alloc_large(buf_size, KM_NOFS); + /* + * We are in transaction context, which means this + * allocation will pick up GFP_NOFS from the + * memalloc_nofs_save/restore context the transaction + * holds. This means we can use GFP_KERNEL here so the + * generic kvmalloc() code will run vmalloc on + * contiguous page allocation failure as we require. + */ + lv = kvmalloc(buf_size, GFP_KERNEL); memset(lv, 0, xlog_cil_iovec_space(niovecs)); lv->lv_item = lip; @@ -535,7 +571,7 @@ xlog_discard_busy_extents( struct blk_plug plug; int error = 0; - ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + ASSERT(xfs_has_discard(mp)); blk_start_plug(&plug); list_for_each_entry(busyp, list, list) { @@ -576,7 +612,7 @@ xlog_cil_committed( struct xfs_cil_ctx *ctx) { struct xfs_mount *mp = ctx->cil->xc_log->l_mp; - bool abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log); + bool abort = xlog_is_shutdown(ctx->cil->xc_log); /* * If the I/O failed, we're aborting the commit and already shutdown. 
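The kvmalloc() comment in the xlog_cil_alloc_shadow_bufs() hunk above leans on the scoped-NOFS API: because transaction allocation enters a memalloc_nofs scope, a plain GFP_KERNEL request made inside it is transparently degraded to GFP_NOFS. Sketched below under that assumption; the function name is invented, and in XFS the save/restore actually lives in the transaction allocation and commit/cancel paths, not at the call site:

/* Sketch of the scoped-NOFS mechanism, not actual XFS call sites. */
static struct xfs_log_vec *alloc_shadow(size_t buf_size)
{
        unsigned int    nofs_flags = memalloc_nofs_save();
        struct xfs_log_vec *lv;

        /* GFP_KERNEL cannot recurse into the filesystem here: the NOFS
         * scope masks __GFP_FS, and kvmalloc() may still fall back to
         * vmalloc() if the contiguous allocation fails. */
        lv = kvmalloc(buf_size, GFP_KERNEL);

        memalloc_nofs_restore(nofs_flags);
        return lv;
}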
@@ -587,6 +623,7 @@ xlog_cil_committed( */ if (abort) { spin_lock(&ctx->cil->xc_push_lock); + wake_up_all(&ctx->cil->xc_start_wait); wake_up_all(&ctx->cil->xc_commit_wait); spin_unlock(&ctx->cil->xc_push_lock); } @@ -596,7 +633,7 @@ xlog_cil_committed( xfs_extent_busy_sort(&ctx->busy_extents); xfs_extent_busy_clear(mp, &ctx->busy_extents, - (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); list_del(&ctx->committing); @@ -624,6 +661,180 @@ xlog_cil_process_committed( } /* +* Record the LSN of the iclog we were just granted space to start writing into. +* If the context doesn't have a start_lsn recorded, then this iclog will +* contain the start record for the checkpoint. Otherwise this write contains +* the commit record for the checkpoint. +*/ +void +xlog_cil_set_ctx_write_state( + struct xfs_cil_ctx *ctx, + struct xlog_in_core *iclog) +{ + struct xfs_cil *cil = ctx->cil; + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + + ASSERT(!ctx->commit_lsn); + if (!ctx->start_lsn) { + spin_lock(&cil->xc_push_lock); + /* + * The LSN we need to pass to the log items on transaction + * commit is the LSN reported by the first log vector write, not + * the commit lsn. If we use the commit record lsn then we can + * move the tail beyond the grant write head. + */ + ctx->start_lsn = lsn; + wake_up_all(&cil->xc_start_wait); + spin_unlock(&cil->xc_push_lock); + return; + } + + /* + * Take a reference to the iclog for the context so that we still hold + * it when xlog_write is done and has released it. This means the + * context controls when the iclog is released for IO. + */ + atomic_inc(&iclog->ic_refcnt); + + /* + * xlog_state_get_iclog_space() guarantees there is enough space in the + * iclog for an entire commit record, so we can attach the context + * callbacks now. This needs to be done before we make the commit_lsn + * visible to waiters so that checkpoints with commit records in the + * same iclog order their IO completion callbacks in the same order that + * the commit records appear in the iclog. + */ + spin_lock(&cil->xc_log->l_icloglock); + list_add_tail(&ctx->iclog_entry, &iclog->ic_callbacks); + spin_unlock(&cil->xc_log->l_icloglock); + + /* + * Now we can record the commit LSN and wake anyone waiting for this + * sequence to have the ordered commit record assigned to a physical + * location in the log. + */ + spin_lock(&cil->xc_push_lock); + ctx->commit_iclog = iclog; + ctx->commit_lsn = lsn; + wake_up_all(&cil->xc_commit_wait); + spin_unlock(&cil->xc_push_lock); +} + + +/* + * Ensure that the order of log writes follows checkpoint sequence order. This + * relies on the context LSN being zero until the log write has guaranteed the + * LSN that the log write will start at via xlog_state_get_iclog_space(). + */ +enum _record_type { + _START_RECORD, + _COMMIT_RECORD, +}; + +static int +xlog_cil_order_write( + struct xfs_cil *cil, + xfs_csn_t sequence, + enum _record_type record) +{ + struct xfs_cil_ctx *ctx; + +restart: + spin_lock(&cil->xc_push_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (xlog_is_shutdown(cil->xc_log)) { + spin_unlock(&cil->xc_push_lock); + return -EIO; + } + + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for our own sequence, either. 
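The ordering loop introduced above (xlog_cil_order_write()) shows the canonical wait-under-spinlock shape: xlog_wait() drops xc_push_lock while sleeping, so after any wakeup the committing list must be rescanned from the top. Condensed here to the start-record case only (a sketch of the hunk above, with the commit-record branch omitted):

/* Condensed from xlog_cil_order_write() above: start-record case only. */
static int wait_for_earlier_starts(struct xfs_cil *cil, xfs_csn_t sequence)
{
        struct xfs_cil_ctx *ctx;

restart:
        spin_lock(&cil->xc_push_lock);
        list_for_each_entry(ctx, &cil->xc_committing, committing) {
                if (xlog_is_shutdown(cil->xc_log)) {
                        spin_unlock(&cil->xc_push_lock);
                        return -EIO;
                }
                if (ctx->sequence >= sequence)
                        continue;       /* only older checkpoints gate us */
                if (!ctx->start_lsn) {
                        /* xlog_wait() drops xc_push_lock while sleeping,
                         * so the list cursor is stale: rescan from the top. */
                        xlog_wait(&cil->xc_start_wait, &cil->xc_push_lock);
                        goto restart;
                }
        }
        spin_unlock(&cil->xc_push_lock);
        return 0;
}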
+ */ + if (ctx->sequence >= sequence) + continue; + + /* Wait until the LSN for the record has been recorded. */ + switch (record) { + case _START_RECORD: + if (!ctx->start_lsn) { + xlog_wait(&cil->xc_start_wait, &cil->xc_push_lock); + goto restart; + } + break; + case _COMMIT_RECORD: + if (!ctx->commit_lsn) { + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); + goto restart; + } + break; + } + } + spin_unlock(&cil->xc_push_lock); + return 0; +} + +/* + * Write out the log vector change now attached to the CIL context. This will + * write a start record that needs to be strictly ordered in ascending CIL + * sequence order so that log recovery will always use in-order start LSNs when + * replaying checkpoints. + */ +static int +xlog_cil_write_chain( + struct xfs_cil_ctx *ctx, + struct xfs_log_vec *chain) +{ + struct xlog *log = ctx->cil->xc_log; + int error; + + error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); + if (error) + return error; + return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS); +} + +/* + * Write out the commit record of a checkpoint transaction to close off a + * running log write. These commit records are strictly ordered in ascending CIL + * sequence order so that log recovery will always replay the checkpoints in the + * correct order. + */ +static int +xlog_cil_write_commit_record( + struct xfs_cil_ctx *ctx) +{ + struct xlog *log = ctx->cil->xc_log; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = &reg, + }; + int error; + + if (xlog_is_shutdown(log)) + return -EIO; + + error = xlog_cil_order_write(ctx->cil, ctx->sequence, _COMMIT_RECORD); + if (error) + return error; + + error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); + if (error) + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + return error; +} + +/* * Push the Committed Item List to the log. * * If the current sequence is the same as xc_push_seq we need to do a flush. If @@ -641,13 +852,12 @@ static void xlog_cil_push_work( struct work_struct *work) { - struct xfs_cil *cil = - container_of(work, struct xfs_cil, xc_push_work); + struct xfs_cil_ctx *ctx = + container_of(work, struct xfs_cil_ctx, push_work); + struct xfs_cil *cil = ctx->cil; struct xlog *log = cil->xc_log; struct xfs_log_vec *lv; - struct xfs_cil_ctx *ctx; struct xfs_cil_ctx *new_ctx; - struct xlog_in_core *commit_iclog; struct xlog_ticket *tic; int num_iovecs; int error = 0; @@ -655,20 +865,21 @@ xlog_cil_push_work( struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; xfs_lsn_t preflush_tail_lsn; - xfs_lsn_t commit_lsn; xfs_csn_t push_seq; struct bio bio; DECLARE_COMPLETION_ONSTACK(bdev_flush); + bool push_commit_stable; - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); + new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log); down_write(&cil->xc_ctx_lock); - ctx = cil->xc_ctx; spin_lock(&cil->xc_push_lock); push_seq = cil->xc_push_seq; ASSERT(push_seq <= ctx->sequence); + push_commit_stable = cil->xc_push_commit_stable; + cil->xc_push_commit_stable = false; /* * As we are about to switch to a new, empty CIL context, we no longer @@ -694,7 +905,7 @@ /* check for a previously pushed sequence */ - if (push_seq < cil->xc_ctx->sequence) { + if (push_seq < ctx->sequence) { spin_unlock(&cil->xc_push_lock); goto out_skip; } @@ -767,19 +978,7 @@ } /* - * initialise the new context and attach it to the CIL.
Then attach - the current context to the CIL committing list so it can be found - during log forces to extract the commit lsn of the sequence that - needs to be forced. - */ - INIT_LIST_HEAD(&new_ctx->committing); - INIT_LIST_HEAD(&new_ctx->busy_extents); - new_ctx->sequence = ctx->sequence + 1; - new_ctx->cil = cil; - cil->xc_ctx = new_ctx; - - /* - * The switch is now done, so we can drop the context lock and move out + * Switch the contexts so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, * though - we need to synchronise with previous and future commits so * that the commit records are correctly ordered in the log to ensure @@ -804,7 +1003,7 @@ * dereferencing a freed context pointer. */ spin_lock(&cil->xc_push_lock); - cil->xc_current_sequence = new_ctx->sequence; + xlog_cil_ctx_switch(cil, new_ctx); spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); @@ -837,78 +1036,17 @@ */ wait_for_completion(&bdev_flush); - error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, - XLOG_START_TRANS); + error = xlog_cil_write_chain(ctx, &lvhdr); if (error) goto out_abort_free_ticket; - /* - * now that we've written the checkpoint into the log, strictly - * order the commit records so replay will get them in the right order. - */ -restart: - spin_lock(&cil->xc_push_lock); - list_for_each_entry(new_ctx, &cil->xc_committing, committing) { - /* - * Avoid getting stuck in this loop because we were woken by the - * shutdown, but then went back to sleep once already in the - * shutdown state. - */ - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_unlock(&cil->xc_push_lock); - goto out_abort_free_ticket; - } - - /* - * Higher sequences will wait for this one so skip them. - * Don't wait for our own sequence, either. - */ - if (new_ctx->sequence >= ctx->sequence) - continue; - if (!new_ctx->commit_lsn) { - /* - * It is still being pushed! Wait for the push to - * complete, then start again from the beginning. - */ - xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); - goto restart; - } - } - spin_unlock(&cil->xc_push_lock); - - error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn); + error = xlog_cil_write_commit_record(ctx); if (error) goto out_abort_free_ticket; xfs_log_ticket_ungrant(log, tic); /* - * Once we attach the ctx to the iclog, a shutdown can process the - * iclog, run the callbacks and free the ctx. The only thing preventing - * this potential UAF situation here is that we are holding the - * icloglock. Hence we cannot access the ctx once we have attached the - * callbacks and dropped the icloglock. - */ - spin_lock(&log->l_icloglock); - if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - goto out_abort; - } - ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE || - commit_iclog->ic_state == XLOG_STATE_WANT_SYNC); - list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks); - - /* - * now the checkpoint commit is complete and we've attached the - * callbacks to the iclog we can assign the commit LSN to the context - * and wake up anyone who is waiting for the commit to complete. - */ - spin_lock(&cil->xc_push_lock); - ctx->commit_lsn = commit_lsn; - wake_up_all(&cil->xc_commit_wait); - spin_unlock(&cil->xc_push_lock); - - /* * If the checkpoint spans multiple iclogs, wait for all previous iclogs * to complete before we submit the commit_iclog.
We can't use state * checks for this - ACTIVE can be either a past completed iclog or a @@ -919,21 +1057,19 @@ restart: * wakeup until this commit_iclog is written to disk. Hence we use the * iclog header lsn and compare it to the commit lsn to determine if we * need to wait on iclogs or not. - * - * NOTE: It is not safe to reference the ctx after this check as we drop - * the icloglock if we have to wait for completion of other iclogs. */ - if (ctx->start_lsn != commit_lsn) { + spin_lock(&log->l_icloglock); + if (ctx->start_lsn != ctx->commit_lsn) { xfs_lsn_t plsn; - plsn = be64_to_cpu(commit_iclog->ic_prev->ic_header.h_lsn); - if (plsn && XFS_LSN_CMP(plsn, commit_lsn) < 0) { + plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn); + if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) { /* * Waiting on ic_force_wait orders the completion of * iclogs older than ic_prev. Hence we only need to wait * on the most recent older iclog here. */ - xlog_wait_on_iclog(commit_iclog->ic_prev); + xlog_wait_on_iclog(ctx->commit_iclog->ic_prev); spin_lock(&log->l_icloglock); } @@ -941,16 +1077,27 @@ restart: * We need to issue a pre-flush so that the ordering for this * checkpoint is correctly preserved down to stable storage. */ - commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; } /* * The commit iclog must be written to stable storage to guarantee * journal IO vs metadata writeback IO is correctly ordered on stable * storage. + * + * If the push caller needs the commit to be immediately stable and the + * commit_iclog is not yet marked as XLOG_STATE_WANT_SYNC to indicate it + * will be written when released, switch its state to WANT_SYNC right + * now. */ - commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; - xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn); + ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; + if (push_commit_stable && + ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); + xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn); + + /* Not safe to reference ctx now! */ + spin_unlock(&log->l_icloglock); return; @@ -962,9 +1109,15 @@ out_skip: out_abort_free_ticket: xfs_log_ticket_ungrant(log, tic); -out_abort: - ASSERT(XLOG_FORCED_SHUTDOWN(log)); - xlog_cil_committed(ctx); + ASSERT(xlog_is_shutdown(log)); + if (!ctx->commit_iclog) { + xlog_cil_committed(ctx); + return; + } + spin_lock(&log->l_icloglock); + xlog_state_release_iclog(log, ctx->commit_iclog, 0); + /* Not safe to reference ctx now! */ + spin_unlock(&log->l_icloglock); } /* @@ -998,7 +1151,7 @@ xlog_cil_push_background( spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; - queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); } /* @@ -1034,13 +1187,26 @@ xlog_cil_push_background( /* * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence * number that is passed. When it returns, the work will be queued for - * @push_seq, but it won't be completed. The caller is expected to do any - * waiting for push_seq to complete if it is required. + * @push_seq, but it won't be completed. + + * If the caller is performing a synchronous force, we will flush the workqueue + * to get previously queued work moving to minimise the wait time they will + * undergo waiting for all outstanding pushes to complete.
The caller is + * expected to do the required waiting for push_seq to complete. + * + * If the caller is performing an async push, we need to ensure that the + * checkpoint is fully flushed out of the iclogs when we finish the push. If we + * don't do this, then the commit record may remain sitting in memory in an + * ACTIVE iclog. This then requires another full log force to push to disk, + * which defeats the purpose of having an async, non-blocking CIL force + * mechanism. Hence in this case we need to pass a flag to the push work to + * indicate it needs to flush the commit record itself. */ static void xlog_cil_push_now( struct xlog *log, - xfs_lsn_t push_seq) + xfs_lsn_t push_seq, + bool async) { struct xfs_cil *cil = log->l_cilp; @@ -1050,7 +1216,8 @@ xlog_cil_push_now( ASSERT(push_seq && push_seq <= cil->xc_current_sequence); /* start on any pending background push to minimise wait time on it */ - flush_work(&cil->xc_push_work); + if (!async) + flush_workqueue(cil->xc_push_wq); /* * If the CIL is empty or we've already pushed the sequence then @@ -1063,7 +1230,8 @@ xlog_cil_push_now( } cil->xc_push_seq = push_seq; - queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + cil->xc_push_commit_stable = async; + queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); spin_unlock(&cil->xc_push_lock); } @@ -1116,7 +1284,7 @@ xlog_cil_commit( xlog_cil_insert_items(log, tp); - if (regrant && !XLOG_FORCED_SHUTDOWN(log)) + if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); else xfs_log_ticket_ungrant(log, tp->t_ticket); @@ -1148,11 +1316,26 @@ xlog_cil_commit( } /* + * Flush the CIL to stable storage but don't wait for it to complete. This + * requires the CIL push to ensure the commit record for the push hits the disk, + * but otherwise is no different to a push done from a log force. + */ +void +xlog_cil_flush( + struct xlog *log) +{ + xfs_csn_t seq = log->l_cilp->xc_current_sequence; + + trace_xfs_log_force(log->l_mp, seq, _RET_IP_); + xlog_cil_push_now(log, seq, true); +} + +/* * Conditionally push the CIL based on the sequence passed in. * - * We only need to push if we haven't already pushed the sequence - * number given. Hence the only time we will trigger a push here is - * if the push sequence is the same as the current context. + * We only need to push if we haven't already pushed the sequence number given. + * Hence the only time we will trigger a push here is if the push sequence is + * the same as the current context. * * We return the current commit lsn to allow the callers to determine if a * iclog flush is necessary following this call. @@ -1168,13 +1351,17 @@ xlog_cil_force_seq( ASSERT(sequence <= cil->xc_current_sequence); + if (!sequence) + sequence = cil->xc_current_sequence; + trace_xfs_log_force(log->l_mp, sequence, _RET_IP_); + /* * check to see if we need to force out the current context. * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ restart: - xlog_cil_push_now(log, sequence); + xlog_cil_push_now(log, sequence, false); /* * See if we can find a previous sequence still committing. @@ -1189,7 +1376,7 @@ restart: * shutdown, but then went back to sleep once already in the * shutdown state. */ - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto out_shutdown; if (ctx->sequence > sequence) continue; @@ -1198,6 +1385,7 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. 
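The xlog_wait()-and-restart idiom used here (and in xlog_cil_order_write() above) drops xc_push_lock while sleeping, so the committing list must be rescanned from the top after every wakeup. A compressed, runnable userspace sketch of the same idiom with a condition variable; the single flag stands in for ctx->commit_lsn and all names are illustrative only:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t push_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t commit_wait = PTHREAD_COND_INITIALIZER;
static bool commit_lsn_set;	/* stands in for ctx->commit_lsn != 0 */

static void *pusher(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&push_lock);
	commit_lsn_set = true;	/* the commit record now has an LSN */
	pthread_cond_broadcast(&commit_wait);
	pthread_mutex_unlock(&push_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, pusher, NULL);
	pthread_mutex_lock(&push_lock);
restart:
	if (!commit_lsn_set) {
		/*
		 * The wait drops the lock while sleeping, so whatever was
		 * scanned may have changed; restart from the top. (Unlike
		 * pthread_cond_wait(), xlog_wait() does not retake the lock,
		 * which is why the kernel loop re-locks at its restart label.)
		 */
		pthread_cond_wait(&commit_wait, &push_lock);
		goto restart;
	}
	pthread_mutex_unlock(&push_lock);
	pthread_join(t, NULL);
	printf("sequence stable\n");
	return 0;
}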
*/ + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); goto restart; } @@ -1282,32 +1470,35 @@ xlog_cil_init( cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); if (!cil) return -ENOMEM; + /* + * Limit the CIL pipeline depth to 4 concurrent works to bound the + * concurrency the log spinlocks will be exposed to. + */ + cil->xc_push_wq = alloc_workqueue("xfs-cil/%s", + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), + 4, log->l_mp->m_super->s_id); + if (!cil->xc_push_wq) + goto out_destroy_cil; - ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL); - if (!ctx) { - kmem_free(cil); - return -ENOMEM; - } - - INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_push_lock); init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_start_wait); init_waitqueue_head(&cil->xc_commit_wait); - - INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); - ctx->sequence = 1; - ctx->cil = cil; - cil->xc_ctx = ctx; - cil->xc_current_sequence = ctx->sequence; - cil->xc_log = log; log->l_cilp = cil; + + ctx = xlog_cil_ctx_alloc(); + xlog_cil_ctx_switch(cil, ctx); + return 0; + +out_destroy_cil: + kmem_free(cil); + return -ENOMEM; } void @@ -1321,6 +1512,7 @@ xlog_cil_destroy( } ASSERT(list_empty(&log->l_cilp->xc_cil)); + destroy_workqueue(log->l_cilp->xc_push_wq); kmem_free(log->l_cilp); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f3e79a45d60a..844fbeec3545 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -12,15 +12,6 @@ struct xlog_ticket; struct xfs_mount; /* - * Flags for log structure - */ -#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ -#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ -#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being - shutdown */ -#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ - -/* * get client id from packed copy. 
* * this hack is here because the xlog_pack code copies four bytes @@ -47,7 +38,6 @@ enum xlog_iclog_state { XLOG_STATE_DONE_SYNC, /* Done syncing to disk */ XLOG_STATE_CALLBACK, /* Callback functions now */ XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */ - XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */ }; #define XLOG_STATE_STRINGS \ @@ -56,8 +46,7 @@ enum xlog_iclog_state { { XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \ { XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \ { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \ - { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }, \ - { XLOG_STATE_IOERROR, "XLOG_STATE_IOERROR" } + { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" } /* * In core log flags @@ -251,6 +240,7 @@ struct xfs_cil_ctx { xfs_csn_t sequence; /* chkpt sequence # */ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ int nvecs; /* number of regions */ int space_used; /* aggregate size of regions */ @@ -259,6 +249,7 @@ struct xfs_cil_ctx { struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ struct work_struct discard_endio_work; + struct work_struct push_work; }; /* @@ -281,16 +272,18 @@ struct xfs_cil { struct xlog *xc_log; struct list_head xc_cil; spinlock_t xc_cil_lock; + struct workqueue_struct *xc_push_wq; struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; struct xfs_cil_ctx *xc_ctx; spinlock_t xc_push_lock ____cacheline_aligned_in_smp; xfs_csn_t xc_push_seq; + bool xc_push_commit_stable; struct list_head xc_committing; wait_queue_head_t xc_commit_wait; + wait_queue_head_t xc_start_wait; xfs_csn_t xc_current_sequence; - struct work_struct xc_push_work; wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp; @@ -407,7 +400,7 @@ struct xlog { struct xfs_buftarg *l_targ; /* buftarg of log */ struct workqueue_struct *l_ioend_workqueue; /* for I/O completions */ struct delayed_work l_work; /* background flush work */ - uint l_flags; + long l_opstate; /* operational state */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ @@ -456,13 +449,40 @@ struct xlog { xfs_lsn_t l_recovery_lsn; uint32_t l_iclog_roundoff;/* padding roundoff */ + + /* Users of log incompat features should take a read lock. 
*/ + struct rw_semaphore l_incompat_users; }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) -#define XLOG_FORCED_SHUTDOWN(log) \ - (unlikely((log)->l_flags & XLOG_IO_ERROR)) +/* + * Bits for operational state + */ +#define XLOG_ACTIVE_RECOVERY 0 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 1 /* log was recovered */ +#define XLOG_IO_ERROR 2 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 3 /* log tail verify warning issued */ + +static inline bool +xlog_recovery_needed(struct xlog *log) +{ + return test_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); +} + +static inline bool +xlog_in_recovery(struct xlog *log) +{ + return test_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); +} + +static inline bool +xlog_is_shutdown(struct xlog *log) +{ + return test_bit(XLOG_IO_ERROR, &log->l_opstate); +} /* common routines */ extern int @@ -496,14 +516,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); -int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, uint optype); -int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, - struct xlog_in_core **iclog, xfs_lsn_t *lsn); +int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, + struct xfs_log_vec *log_vector, struct xlog_ticket *tic, + uint optype); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); +void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, + int eventual_size); int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, xfs_lsn_t log_tail_lsn); @@ -571,10 +591,14 @@ void xlog_cil_destroy(struct xlog *log); bool xlog_cil_empty(struct xlog *log); void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp, xfs_csn_t *commit_seq, bool regrant); +void xlog_cil_set_ctx_write_state(struct xfs_cil_ctx *ctx, + struct xlog_in_core *iclog); + /* * CIL force routines */ +void xlog_cil_flush(struct xlog *log); xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence); static inline void diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1721fce2ec94..10562ecbd9ea 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -26,6 +26,8 @@ #include "xfs_error.h" #include "xfs_buf_item.h" #include "xfs_ag.h" +#include "xfs_quota.h" + #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -79,8 +81,6 @@ xlog_alloc_buffer( struct xlog *log, int nbblks) { - int align_mask = xfs_buftarg_dma_alignment(log->l_targ); - /* * Pass log block 0 since we don't have an addr yet, buffer will be * verified on read. @@ -108,7 +108,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO); + return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL); } /* @@ -146,7 +146,7 @@ xlog_do_io( error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no, BBTOB(nbblks), data, op); - if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) { + if (error && !xlog_is_shutdown(log)) { xfs_alert(log->l_mp, "log recovery %s I/O error at daddr 0x%llx len %d error %d", op == REQ_OP_WRITE ? 
"write" : "read", @@ -375,7 +375,7 @@ out: static inline int xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh) { - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { int h_size = be32_to_cpu(rh->h_size); if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) && @@ -1347,7 +1347,7 @@ xlog_find_tail( * headers if we have a filesystem using non-persistent counters. */ if (clean) - log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate); /* * Make sure that there are no blocks in front of the head @@ -1504,7 +1504,7 @@ xlog_add_record( recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); recp->h_cycle = cpu_to_be32(cycle); recp->h_version = cpu_to_be32( - xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + xfs_has_logv2(log->l_mp) ? 2 : 1); recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); recp->h_fmt = cpu_to_be32(XLOG_FMT); @@ -1756,6 +1756,30 @@ xlog_recover_release_intent( spin_unlock(&ailp->ail_lock); } +int +xlog_recover_iget( + struct xfs_mount *mp, + xfs_ino_t ino, + struct xfs_inode **ipp) +{ + int error; + + error = xfs_iget(mp, NULL, ino, 0, 0, ipp); + if (error) + return error; + + error = xfs_qm_dqattach(*ipp); + if (error) { + xfs_irele(*ipp); + return error; + } + + if (VFS_I(*ipp)->i_nlink == 0) + xfs_iflags_set(*ipp, XFS_IRECOVERY); + + return 0; +} + /****************************************************************************** * * Log recover routines @@ -2062,7 +2086,9 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL); + ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL); + if (!ptr) + return -ENOMEM; memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -2786,6 +2812,13 @@ xlog_recover_process_iunlinks( } xfs_buf_rele(agibp); } + + /* + * Flush the pending unlinked inodes to ensure that the inactivations + * are fully completed on disk and the incore inodes can be reclaimed + * before we signal that recovery is complete. + */ + xfs_inodegc_flush(mp); } STATIC void @@ -2802,7 +2835,7 @@ xlog_unpack_data( dp += BBSIZE; } - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -2850,7 +2883,7 @@ xlog_recover_process( * the kernel from one that does not add CRCs by default. */ if (crc != old_crc) { - if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (old_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", le32_to_cpu(old_crc), @@ -2862,7 +2895,7 @@ xlog_recover_process( * If the filesystem is CRC enabled, this mismatch becomes a * fatal log corruption failure. */ - if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (xfs_has_crc(log->l_mp)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; } @@ -2948,7 +2981,7 @@ xlog_do_recovery_pass( * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. 
*/ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { /* * When using variable length iclogs, read first sector of * iclog header and extract the header size from it. Get a @@ -3280,10 +3313,7 @@ xlog_do_recover( if (error) return error; - /* - * If IO errors happened during recovery, bail out. - */ - if (XFS_FORCED_SHUTDOWN(mp)) + if (xlog_is_shutdown(log)) return -EIO; /* @@ -3305,7 +3335,7 @@ xlog_do_recover( xfs_buf_hold(bp); error = _xfs_buf_read(bp, XBF_READ); if (error) { - if (!XFS_FORCED_SHUTDOWN(mp)) { + if (!xlog_is_shutdown(log)) { xfs_buf_ioerror_alert(bp, __this_address); ASSERT(0); } @@ -3318,6 +3348,7 @@ xlog_do_recover( xfs_buf_relse(bp); /* re-initialise in-core superblock and geometry structures */ + mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); if (error) { @@ -3329,7 +3360,7 @@ xlog_do_recover( xlog_recover_check_summary(log); /* Normal transactions can now occur */ - log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); return 0; } @@ -3355,7 +3386,7 @@ xlog_recover( * could not be verified. Check the superblock LSN against the current * LSN now that it's known. */ - if (xfs_sb_version_hascrc(&log->l_mp->m_sb) && + if (xfs_has_crc(log->l_mp) && !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn)) return -EINVAL; @@ -3382,7 +3413,7 @@ xlog_recover( * (e.g. unsupported transactions, then simply reject the * attempt at recovery before touching anything. */ - if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + if (xfs_sb_is_v5(&log->l_mp->m_sb) && xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { xfs_warn(log->l_mp, @@ -3413,68 +3444,64 @@ xlog_recover( : "internal"); error = xlog_do_recover(log, head_blk, tail_blk); - log->l_flags |= XLOG_RECOVERY_NEEDED; + set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); } return error; } /* - * In the first part of recovery we replay inodes and buffers and build - * up the list of extent free items which need to be processed. Here - * we process the extent free items and clean up the on disk unlinked - * inode lists. This is separated from the first part of recovery so - * that the root and real-time bitmap inodes can be read in from disk in - * between the two stages. This is necessary so that we can free space - * in the real-time portion of the file system. + * In the first part of recovery we replay inodes and buffers and build up the + * list of intents which need to be processed. Here we process the intents and + * clean up the on disk unlinked inode lists. This is separated from the first + * part of recovery so that the root and real-time bitmap inodes can be read in + * from disk in between the two stages. This is necessary so that we can free + * space in the real-time portion of the file system. */ int xlog_recover_finish( struct xlog *log) { - /* - * Now we're ready to do the transactions needed for the - * rest of recovery. Start with completing all the extent - * free intent records and then process the unlinked inode - * lists. At this point, we essentially run in normal mode - * except that we're still performing recovery actions - * rather than accepting new requests. - */ - if (log->l_flags & XLOG_RECOVERY_NEEDED) { - int error; - error = xlog_recover_process_intents(log); - if (error) { - /* - * Cancel all the unprocessed intent items now so that - * we don't leave them pinned in the AIL. 
This can - * cause the AIL to livelock on the pinned item if - * anyone tries to push the AIL (inode reclaim does - * this) before we get around to xfs_log_mount_cancel. - */ - xlog_recover_cancel_intents(log); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - xfs_alert(log->l_mp, "Failed to recover intents"); - return error; - } + int error; + error = xlog_recover_process_intents(log); + if (error) { /* - * Sync the log to get all the intents out of the AIL. - * This isn't absolutely necessary, but it helps in - * case the unlink transactions would have problems - * pushing the intents out of the way. + * Cancel all the unprocessed intent items now so that we don't + * leave them pinned in the AIL. This can cause the AIL to + * livelock on the pinned item if anyone tries to push the AIL + * (inode reclaim does this) before we get around to + * xfs_log_mount_cancel. */ - xfs_log_force(log->l_mp, XFS_LOG_SYNC); - - xlog_recover_process_iunlinks(log); + xlog_recover_cancel_intents(log); + xfs_alert(log->l_mp, "Failed to recover intents"); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + return error; + } - xlog_recover_check_summary(log); + /* + * Sync the log to get all the intents out of the AIL. This isn't + * absolutely necessary, but it helps in case the unlink transactions + * would have problems pushing the intents out of the way. + */ + xfs_log_force(log->l_mp, XFS_LOG_SYNC); - xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", - log->l_mp->m_logname ? log->l_mp->m_logname - : "internal"); - log->l_flags &= ~XLOG_RECOVERY_NEEDED; - } else { - xfs_info(log->l_mp, "Ending clean mount"); + /* + * Now that we've recovered the log and all the intents, we can clear + * the log incompat feature bits in the superblock because there's no + * longer anything to protect. We rely on the AIL push to write out the + * updated superblock after everything else. 
+ */ + if (xfs_clear_incompat_log_features(log->l_mp)) { + error = xfs_sync_sb(log->l_mp, false); + if (error < 0) { + xfs_alert(log->l_mp, + "Failed to clear log incompat features on recovery"); + return error; + } } + + xlog_recover_process_iunlinks(log); + xlog_recover_check_summary(log); return 0; } @@ -3482,7 +3509,7 @@ void xlog_recover_cancel( struct xlog *log) { - if (log->l_flags & XLOG_RECOVERY_NEEDED) + if (xlog_recovery_needed(log)) xlog_recover_cancel_intents(log); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d0755494597f..06dac09eddbd 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -62,7 +62,7 @@ xfs_uuid_mount( /* Publish UUID in struct super_block */ uuid_copy(&mp->m_super->s_uuid, uuid); - if (mp->m_flags & XFS_MOUNT_NOUUID) + if (xfs_has_nouuid(mp)) return 0; if (uuid_is_null(uuid)) { @@ -104,7 +104,7 @@ xfs_uuid_unmount( uuid_t *uuid = &mp->m_sb.sb_uuid; int i; - if (mp->m_flags & XFS_MOUNT_NOUUID) + if (xfs_has_nouuid(mp)) return; mutex_lock(&xfs_uuid_table_mutex); @@ -225,6 +225,7 @@ reread: goto reread; } + mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); /* no need to be quiet anymore, so reset the buf ops */ @@ -318,7 +319,7 @@ xfs_validate_new_dalign( } } - if (!xfs_sb_version_hasdalign(&mp->m_sb)) { + if (!xfs_has_dalign(mp)) { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); return -EINVAL; @@ -349,8 +350,7 @@ xfs_update_alignment( sbp->sb_unit = mp->m_dalign; sbp->sb_width = mp->m_swidth; mp->m_update_sb = true; - } else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && - xfs_sb_version_hasdalign(&mp->m_sb)) { + } else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) { mp->m_dalign = sbp->sb_unit; mp->m_swidth = sbp->sb_width; } @@ -365,13 +365,16 @@ void xfs_set_low_space_thresholds( struct xfs_mount *mp) { - int i; + uint64_t dblocks = mp->m_sb.sb_dblocks; + uint64_t rtexts = mp->m_sb.sb_rextents; + int i; - for (i = 0; i < XFS_LOWSP_MAX; i++) { - uint64_t space = mp->m_sb.sb_dblocks; + do_div(dblocks, 100); + do_div(rtexts, 100); - do_div(space, 100); - mp->m_low_space[i] = space * (i + 1); + for (i = 0; i < XFS_LOWSP_MAX; i++) { + mp->m_low_space[i] = dblocks * (i + 1); + mp->m_low_rtexts[i] = rtexts * (i + 1); } } @@ -485,7 +488,7 @@ xfs_check_summary_counts( * counters. If any of them are obviously incorrect, we can recompute * them from the AGF headers in the next step. */ - if (XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && + if (xfs_is_clean(mp) && (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks || !xfs_verify_icount(mp, mp->m_sb.sb_icount) || mp->m_sb.sb_ifree > mp->m_sb.sb_icount)) @@ -502,8 +505,7 @@ xfs_check_summary_counts( * superblock to be correct and we don't need to do anything here. * Otherwise, recalculate the summary counters. */ - if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) || - XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) && + if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) && !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) return 0; @@ -514,7 +516,8 @@ xfs_check_summary_counts( * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, * so we need to unpin them, write them back and/or reclaim them before unmount - * can proceed. + * can proceed. In other words, callers are required to have inactivated all + * inodes. * * An inode cluster that has been freed can have its buffer still pinned in * memory because the transaction is still sitting in a iclog. 
The stale inodes @@ -543,9 +546,10 @@ xfs_unmount_flush_inodes( xfs_extent_busy_wait_all(mp); flush_workqueue(xfs_discard_wq); - mp->m_flags |= XFS_MOUNT_UNMOUNTING; + set_bit(XFS_OPSTATE_UNMOUNTING, &mp->m_opstate); xfs_ail_push_all_sync(mp->m_ail); + xfs_inodegc_stop(mp); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp); xfs_health_unmount(mp); @@ -607,29 +611,13 @@ xfs_mountfs( xfs_warn(mp, "correcting sb_features alignment problem"); sbp->sb_features2 |= sbp->sb_bad_features2; mp->m_update_sb = true; - - /* - * Re-check for ATTR2 in case it was found in bad_features2 - * slot. - */ - if (xfs_sb_version_hasattr2(&mp->m_sb) && - !(mp->m_flags & XFS_MOUNT_NOATTR2)) - mp->m_flags |= XFS_MOUNT_ATTR2; } - if (xfs_sb_version_hasattr2(&mp->m_sb) && - (mp->m_flags & XFS_MOUNT_NOATTR2)) { - xfs_sb_version_removeattr2(&mp->m_sb); - mp->m_update_sb = true; - - /* update sb_versionnum for the clearing of the morebits */ - if (!sbp->sb_features2) - mp->m_update_sb = true; - } /* always use v2 inodes by default now */ if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + mp->m_features |= XFS_FEAT_NLINK; mp->m_update_sb = true; } @@ -702,7 +690,7 @@ xfs_mountfs( * cluster size. Full inode chunk alignment must match the chunk size, * but that is checked on sb read verification... */ - if (xfs_sb_version_hassparseinodes(&mp->m_sb) && + if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align != XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) { xfs_warn(mp, @@ -764,6 +752,10 @@ xfs_mountfs( goto out_free_perag; } + error = xfs_inodegc_register_shrinker(mp); + if (error) + goto out_fail_wait; + /* * Log's mount-time initialization. The first part of recovery can place * some items on the AIL, to be handled when recovery is finished or @@ -774,7 +766,7 @@ xfs_mountfs( XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); if (error) { xfs_warn(mp, "log mount failed"); - goto out_fail_wait; + goto out_inodegc_shrinker; } /* Make sure the summary counts are ok. */ @@ -782,6 +774,23 @@ xfs_mountfs( if (error) goto out_log_dealloc; + /* Enable background inode inactivation workers. */ + xfs_inodegc_start(mp); + xfs_blockgc_start(mp); + + /* + * Now that we've recovered any pending superblock feature bit + * additions, we can finish setting up the attr2 behaviour for the + * mount. The noattr2 option overrides the superblock flag, so only + * check the superblock feature flag if the mount option is not set. + */ + if (xfs_has_noattr2(mp)) { + mp->m_features &= ~XFS_FEAT_ATTR2; + } else if (!xfs_has_attr2(mp) && + (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { + mp->m_features |= XFS_FEAT_ATTR2; + } + /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. @@ -825,7 +834,7 @@ xfs_mountfs( * the next remount into writeable mode. Otherwise we would never * perform the update e.g. for the root filesystem. 
*/ - if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (mp->m_update_sb && !xfs_is_readonly(mp)) { error = xfs_sync_sb(mp, false); if (error) { xfs_warn(mp, "failed to write sb changes"); @@ -836,13 +845,11 @@ xfs_mountfs( /* * Initialise the XFS quota management subsystem for this mount */ - if (XFS_IS_QUOTA_RUNNING(mp)) { + if (XFS_IS_QUOTA_ON(mp)) { error = xfs_qm_newmount(mp, &quotamount, &quotaflags); if (error) goto out_rtunmount; } else { - ASSERT(!XFS_IS_QUOTA_ON(mp)); - /* * If a file system had quotas running earlier, but decided to * mount without -o uquota/pquota/gquota options, revoke the @@ -884,10 +891,8 @@ xfs_mountfs( * We use the same quiesce mechanism as the rw->ro remount, as they are * semantically identical operations. */ - if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == - XFS_MOUNT_RDONLY) { + if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); - } /* * Complete the quota initialisation, post-log-replay component. @@ -910,7 +915,7 @@ xfs_mountfs( * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (!xfs_is_readonly(mp)) { resblks = xfs_default_resblks(mp); error = xfs_reserve_blocks(mp, &resblks, NULL); if (error) @@ -944,6 +949,15 @@ xfs_mountfs( xfs_irele(rip); /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); + + /* + * Inactivate all inodes that might still be in memory after a log + * intent recovery failure so that reclaim can free them. Metadata + * inodes and the root directory shouldn't need inactivation, but the + * mount failed for some reason, so pull down all the state and flee. + */ + xfs_inodegc_flush(mp); + /* * Flush all inode reclamation work and flush the log. * We have to do this /after/ rtunmount and qm_unmount because those @@ -958,6 +972,8 @@ xfs_mountfs( xfs_unmount_flush_inodes(mp); out_log_dealloc: xfs_log_mount_cancel(mp); + out_inodegc_shrinker: + unregister_shrinker(&mp->m_inodegc_shrinker); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_buftarg_drain(mp->m_logdev_targp); @@ -991,6 +1007,16 @@ xfs_unmountfs( uint64_t resblks; int error; + /* + * Perform all on-disk metadata updates required to inactivate inodes + * that the VFS evicted earlier in the unmount process. Freeing inodes + * and discarding CoW fork preallocations can cause shape changes to + * the free inode and refcount btrees, respectively, so we must finish + * this before we discard the metadata space reservations. Metadata + * inodes and the root directory do not require inactivation. + */ + xfs_inodegc_flush(mp); + xfs_blockgc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); @@ -1028,6 +1054,7 @@ xfs_unmountfs( #if defined(DEBUG) xfs_errortag_clearall(mp); #endif + unregister_shrinker(&mp->m_inodegc_shrinker); xfs_free_perag(mp); xfs_errortag_del(mp); @@ -1049,20 +1076,12 @@ xfs_fs_writable( { ASSERT(level > SB_UNFROZEN); if ((mp->m_super->s_writers.frozen >= level) || - XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)) + xfs_is_shutdown(mp) || xfs_is_readonly(mp)) return false; return true; } -/* - * Deltas for the block count can vary from 1 to very large, but lock contention - * only occurs on frequent small block count updates such as in the delayed - * allocation path for buffered writes (page a time updates).
Hence we set - * a large batch count (1024) to minimise global counter updates except when - * we get near to ENOSPC and we have to be very accurate with our updates. - */ -#define XFS_FDBLOCKS_BATCH 1024 int xfs_mod_fdblocks( struct xfs_mount *mp, @@ -1210,13 +1229,123 @@ void xfs_force_summary_recalc( struct xfs_mount *mp) { - if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (!xfs_has_lazysbcount(mp)) return; xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); } /* + * Enable a log incompat feature flag in the primary superblock. The caller + * cannot have any other transactions in progress. + */ +int +xfs_add_incompat_log_feature( + struct xfs_mount *mp, + uint32_t feature) +{ + struct xfs_dsb *dsb; + int error; + + ASSERT(hweight32(feature) == 1); + ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + + /* + * Force the log to disk and kick the background AIL thread to reduce + * the chances that the bwrite will stall waiting for the AIL to unpin + * the primary superblock buffer. This isn't a data integrity + * operation, so we don't need a synchronous push. + */ + error = xfs_log_force(mp, XFS_LOG_SYNC); + if (error) + return error; + xfs_ail_push_all(mp->m_ail); + + /* + * Lock the primary superblock buffer to serialize all callers that + * are trying to set feature bits. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_hold(mp->m_sb_bp); + + if (xfs_is_shutdown(mp)) { + error = -EIO; + goto rele; + } + + if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature)) + goto rele; + + /* + * Write the primary superblock to disk immediately, because we need + * the log_incompat bit to be set in the primary super now to protect + * the log items that we're going to commit later. + */ + dsb = mp->m_sb_bp->b_addr; + xfs_sb_to_disk(dsb, &mp->m_sb); + dsb->sb_features_log_incompat |= cpu_to_be32(feature); + error = xfs_bwrite(mp->m_sb_bp); + if (error) + goto shutdown; + + /* + * Add the feature bits to the incore superblock before we unlock the + * buffer. + */ + xfs_sb_add_incompat_log_features(&mp->m_sb, feature); + xfs_buf_relse(mp->m_sb_bp); + + /* Log the superblock to disk. */ + return xfs_sync_sb(mp, false); +shutdown: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); +rele: + xfs_buf_relse(mp->m_sb_bp); + return error; +} + +/* + * Clear all the log incompat flags from the superblock. + * + * The caller cannot be in a transaction, must ensure that the log does not + * contain any log items protected by any log incompat bit, and must ensure + * that there are no other threads that depend on the state of the log incompat + * feature flags in the primary super. + * + * Returns true if the superblock is dirty. + */ +bool +xfs_clear_incompat_log_features( + struct xfs_mount *mp) +{ + bool ret = false; + + if (!xfs_has_crc(mp) || + !xfs_sb_has_incompat_log_feature(&mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_ALL) || + xfs_is_shutdown(mp)) + return false; + + /* + * Update the incore superblock. We synchronize on the primary super + * buffer lock to be consistent with the add function, though at least + * in theory this shouldn't be necessary. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_hold(mp->m_sb_bp); + + if (xfs_sb_has_incompat_log_feature(&mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { + xfs_info(mp, "Clearing log incompat feature flags."); + xfs_sb_remove_incompat_log_features(&mp->m_sb); + ret = true; + } + + xfs_buf_relse(mp->m_sb_bp); + return ret; +} + +/* * Update the in-core delayed block counter. 
* * We prefer to update the counter without having to take a spinlock for every diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index c78b63fe779a..e091f3b3fa15 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -57,6 +57,18 @@ struct xfs_error_cfg { }; /* + * Per-cpu deferred inode inactivation GC lists. + */ +struct xfs_inodegc { + struct llist_head list; + struct work_struct work; + + /* approximate count of inodes in the list */ + unsigned int items; + unsigned int shrinker_hits; +}; + +/* * The struct xfsmount layout is optimised to separate read-mostly variables * from variables that are frequently modified. We put the read-mostly variables * first, then place all the other variables at the end. @@ -82,6 +94,9 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + struct list_head m_mount_list; /* global mount list */ + void __percpu *m_inodegc; /* percpu inodegc structures */ + /* * Optional cache of rt summary level per bitmap block with the * invariant that m_rsum_cache[bbno] <= the minimum i for which @@ -92,10 +107,10 @@ typedef struct xfs_mount { struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; - struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_gc_workqueue; struct workqueue_struct *m_sync_workqueue; + struct workqueue_struct *m_blockgc_wq; + struct workqueue_struct *m_inodegc_wq; int m_bsize; /* fs logical block size */ uint8_t m_blkbit_log; /* blocklog + NBBY */ @@ -131,11 +146,13 @@ typedef struct xfs_mount { uint m_rsumsize; /* size of rt summary, bytes */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_qflags; /* quota status flags */ - uint64_t m_flags; /* global mount flags */ - int64_t m_low_space[XFS_LOWSP_MAX]; + uint64_t m_features; /* active filesystem features */ + uint64_t m_low_space[XFS_LOWSP_MAX]; + uint64_t m_low_rtexts[XFS_LOWSP_MAX]; struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ /* low free space thresholds */ + unsigned long m_opstate; /* dynamic state flags */ bool m_always_cow; bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ @@ -193,6 +210,8 @@ typedef struct xfs_mount { xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ spinlock_t m_agirotor_lock;/* .. and lock protecting it */ + /* Memory shrinker to throttle and reprioritize inodegc */ + struct shrinker m_inodegc_shrinker; /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. @@ -225,38 +244,178 @@ typedef struct xfs_mount { #define M_IGEO(mp) (&(mp)->m_ino_geo) /* - * Flags for m_flags. + * Flags for m_features. + * + * These are all the active features in the filesystem, regardless of how + * they are configured. 
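This read-mostly-first layout rule is the same reasoning behind the ____cacheline_aligned_in_smp annotations in struct xfs_cil earlier in the patch: fields that are written frequently get pushed onto their own cachelines so stores to them do not invalidate the lines holding read-mostly data. A standalone C11 sketch of the idea (struct and field names invented, 64-byte cacheline assumed):

#include <stdalign.h>
#include <stddef.h>
#include <stdio.h>

#define CACHELINE 64	/* assumed cacheline size */

struct example_mount {
	/* read-mostly: every CPU can keep a clean shared copy */
	unsigned long long features;
	unsigned int blocksize;

	/* frequently written: starts a fresh cacheline so updates do not
	 * bounce the read-mostly line above between CPUs */
	alignas(CACHELINE) unsigned long opstate;
	long long free_blocks;
};

int main(void)
{
	printf("features at offset %zu\n", offsetof(struct example_mount, features));
	printf("opstate  at offset %zu\n", offsetof(struct example_mount, opstate));
	return 0;
}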
*/ -#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops - must be synchronous except - for space allocations */ -#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */ -#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) -#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem - operations, typically for - disk errors in metadata */ -#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ -#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment - allocations */ -#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ -#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ -#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ -#define XFS_MOUNT_ALLOCSIZE (1ULL << 12) /* specified allocation size */ -#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */ -#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */ -#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ -#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/ -#define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width - * allocation */ -#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */ -#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ -#define XFS_MOUNT_LARGEIO (1ULL << 22) /* report large preferred +#define XFS_FEAT_ATTR (1ULL << 0) /* xattrs present in fs */ +#define XFS_FEAT_NLINK (1ULL << 1) /* 32 bit link counts */ +#define XFS_FEAT_QUOTA (1ULL << 2) /* quota active */ +#define XFS_FEAT_ALIGN (1ULL << 3) /* inode alignment */ +#define XFS_FEAT_DALIGN (1ULL << 4) /* data alignment */ +#define XFS_FEAT_LOGV2 (1ULL << 5) /* version 2 logs */ +#define XFS_FEAT_SECTOR (1ULL << 6) /* sector size > 512 bytes */ +#define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ +#define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. 
*/ +#define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ +#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */ +#define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ +#define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ +#define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ +#define XFS_FEAT_V3INODES (1ULL << 14) /* Version 3 inodes */ +#define XFS_FEAT_PQUOTINO (1ULL << 15) /* non-shared proj/grp quotas */ +#define XFS_FEAT_FTYPE (1ULL << 16) /* inode type in dir */ +#define XFS_FEAT_FINOBT (1ULL << 17) /* free inode btree */ +#define XFS_FEAT_RMAPBT (1ULL << 18) /* reverse map btree */ +#define XFS_FEAT_REFLINK (1ULL << 19) /* reflinked files */ +#define XFS_FEAT_SPINODES (1ULL << 20) /* sparse inode chunks */ +#define XFS_FEAT_META_UUID (1ULL << 21) /* metadata UUID */ +#define XFS_FEAT_REALTIME (1ULL << 22) /* realtime device present */ +#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ +#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ +#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ + +/* Mount features */ +#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ +#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ +#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ +#define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred * I/O size in stat() */ -#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams - allocator */ -#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ -#define XFS_MOUNT_DAX_ALWAYS (1ULL << 26) -#define XFS_MOUNT_DAX_NEVER (1ULL << 27) +#define XFS_FEAT_WSYNC (1ULL << 52) /* synchronous metadata ops */ +#define XFS_FEAT_DIRSYNC (1ULL << 53) /* synchronous directory ops */ +#define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ +#define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ +#define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */ +#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/ +#define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ +#define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ +#define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ +#define XFS_FEAT_DAX_NEVER (1ULL << 61) /* DAX never enabled */ +#define XFS_FEAT_NORECOVERY (1ULL << 62) /* no recovery - dirty fs */ +#define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */ + +#define __XFS_HAS_FEAT(name, NAME) \ +static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ +{ \ + return mp->m_features & XFS_FEAT_ ## NAME; \ +} + +/* Some features can be added dynamically so they need a set wrapper, too. 
*/ +#define __XFS_ADD_FEAT(name, NAME) \ + __XFS_HAS_FEAT(name, NAME); \ +static inline void xfs_add_ ## name (struct xfs_mount *mp) \ +{ \ + mp->m_features |= XFS_FEAT_ ## NAME; \ + xfs_sb_version_add ## name(&mp->m_sb); \ +} + +/* Superblock features */ +__XFS_ADD_FEAT(attr, ATTR) +__XFS_HAS_FEAT(nlink, NLINK) +__XFS_ADD_FEAT(quota, QUOTA) +__XFS_HAS_FEAT(align, ALIGN) +__XFS_HAS_FEAT(dalign, DALIGN) +__XFS_HAS_FEAT(logv2, LOGV2) +__XFS_HAS_FEAT(sector, SECTOR) +__XFS_HAS_FEAT(extflg, EXTFLG) +__XFS_HAS_FEAT(asciici, ASCIICI) +__XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT) +__XFS_ADD_FEAT(attr2, ATTR2) +__XFS_HAS_FEAT(parent, PARENT) +__XFS_ADD_FEAT(projid32, PROJID32) +__XFS_HAS_FEAT(crc, CRC) +__XFS_HAS_FEAT(v3inodes, V3INODES) +__XFS_HAS_FEAT(pquotino, PQUOTINO) +__XFS_HAS_FEAT(ftype, FTYPE) +__XFS_HAS_FEAT(finobt, FINOBT) +__XFS_HAS_FEAT(rmapbt, RMAPBT) +__XFS_HAS_FEAT(reflink, REFLINK) +__XFS_HAS_FEAT(sparseinodes, SPINODES) +__XFS_HAS_FEAT(metauuid, META_UUID) +__XFS_HAS_FEAT(realtime, REALTIME) +__XFS_HAS_FEAT(inobtcounts, INOBTCNT) +__XFS_HAS_FEAT(bigtime, BIGTIME) +__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) + +/* + * Mount features + * + * These do not change dynamically - features that can come and go, such as 32 + * bit inodes and read-only state, are kept as operational state rather than + * features. + */ +__XFS_HAS_FEAT(noattr2, NOATTR2) +__XFS_HAS_FEAT(noalign, NOALIGN) +__XFS_HAS_FEAT(allocsize, ALLOCSIZE) +__XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) +__XFS_HAS_FEAT(wsync, WSYNC) +__XFS_HAS_FEAT(dirsync, DIRSYNC) +__XFS_HAS_FEAT(discard, DISCARD) +__XFS_HAS_FEAT(grpid, GRPID) +__XFS_HAS_FEAT(small_inums, SMALL_INUMS) +__XFS_HAS_FEAT(ikeep, IKEEP) +__XFS_HAS_FEAT(swalloc, SWALLOC) +__XFS_HAS_FEAT(filestreams, FILESTREAMS) +__XFS_HAS_FEAT(dax_always, DAX_ALWAYS) +__XFS_HAS_FEAT(dax_never, DAX_NEVER) +__XFS_HAS_FEAT(norecovery, NORECOVERY) +__XFS_HAS_FEAT(nouuid, NOUUID) + +/* + * Operational mount state flags + * + * Use these with atomic bit ops only! + */ +#define XFS_OPSTATE_UNMOUNTING 0 /* filesystem is unmounting */ +#define XFS_OPSTATE_CLEAN 1 /* mount was clean */ +#define XFS_OPSTATE_SHUTDOWN 2 /* stop all fs operations */ +#define XFS_OPSTATE_INODE32 3 /* inode32 allocator active */ +#define XFS_OPSTATE_READONLY 4 /* read-only fs */ + +/* + * If set, inactivation worker threads will be scheduled to process queued + * inodegc work. If not, queued inodes remain in memory waiting to be + * processed. + */ +#define XFS_OPSTATE_INODEGC_ENABLED 5 +/* + * If set, background speculative prealloc gc worker threads will be scheduled + * to process queued blockgc work. If not, inodes retain their preallocations + * until explicitly deleted. 
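The __XFS_HAS_FEAT()/__XFS_ADD_FEAT() generators above (and __XFS_IS_OPSTATE() just below) rely on ## token pasting, so a single invocation such as __XFS_HAS_FEAT(crc, CRC) expands into the complete xfs_has_crc() predicate. A stripped-down, compilable model of the same generator pattern (struct, bit values, and names invented for the example):

#include <stdbool.h>
#include <stdio.h>

#define FEAT_CRC	(1ULL << 13)
#define FEAT_REFLINK	(1ULL << 19)

struct mnt { unsigned long long features; };

/* One macro invocation stamps out one predicate via ## token pasting. */
#define HAS_FEAT(name, NAME) \
static inline bool has_ ## name (struct mnt *mp) \
{ \
	return (mp->features & FEAT_ ## NAME) != 0; \
}

HAS_FEAT(crc, CRC)
HAS_FEAT(reflink, REFLINK)

int main(void)
{
	struct mnt m = { .features = FEAT_CRC };

	printf("crc=%d reflink=%d\n", has_crc(&m), has_reflink(&m));
	return 0;
}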
+ */ +#define XFS_OPSTATE_BLOCKGC_ENABLED 6 + +#define __XFS_IS_OPSTATE(name, NAME) \ +static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ +{ \ + return test_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ +} \ +static inline bool xfs_clear_ ## name (struct xfs_mount *mp) \ +{ \ + return test_and_clear_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ +} \ +static inline bool xfs_set_ ## name (struct xfs_mount *mp) \ +{ \ + return test_and_set_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ +} + +__XFS_IS_OPSTATE(unmounting, UNMOUNTING) +__XFS_IS_OPSTATE(clean, CLEAN) +__XFS_IS_OPSTATE(shutdown, SHUTDOWN) +__XFS_IS_OPSTATE(inode32, INODE32) +__XFS_IS_OPSTATE(readonly, READONLY) +__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) +__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) + +#define XFS_OPSTATE_STRINGS \ + { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \ + { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \ + { (1UL << XFS_OPSTATE_SHUTDOWN), "shutdown" }, \ + { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \ + { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ + { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ + { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" } /* * Max and min values for mount-option defined I/O @@ -265,9 +424,7 @@ typedef struct xfs_mount { #define XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT -#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ - ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) -#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) +#define xfs_is_shutdown(mp) xfs_is_shutdown(mp) void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, int lnnum); #define xfs_force_shutdown(m,f) \ @@ -278,6 +435,12 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define XFS_SHUTDOWN_STRINGS \ + { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ + { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \ + { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \ + { SHUTDOWN_CORRUPT_INCORE, "corruption" } + /* * Flags for xfs_mountfs */ @@ -306,6 +469,15 @@ extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); +/* + * Deltas for the block count can vary from 1 to very large, but lock contention + * only occurs on frequent small block count updates such as in the delayed + * allocation path for buffered writes (page at a time updates). Hence we set + * a large batch count (1024) to minimise global counter updates except when + * we get near to ENOSPC and we have to be very accurate with our updates.
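XFS_FDBLOCKS_BATCH (defined just below) is the drift each CPU may accumulate before folding its delta into the global free-block count: large in the common case to keep the hot counter off shared cachelines, and dropped to a small value near ENOSPC where the count must be exact. A single-threaded toy model of that policy; the thresholds and names are invented, and the local slot stands in for one CPU's percpu_counter slot:

#include <stdio.h>

#define BATCH_LARGE	1024	/* cheap and imprecise: the common case */
#define BATCH_SMALL	1	/* precise: used when close to ENOSPC */

static long long global_count = 1000000;	/* "free blocks" */
static long long local_delta;			/* one CPU's private slot */

static void counter_add(long long delta, int batch)
{
	local_delta += delta;
	/* Fold into the global count only once the slot exceeds the batch,
	 * mimicking percpu_counter_add_batch(). */
	if (local_delta >= batch || local_delta <= -batch) {
		global_count += local_delta;
		local_delta = 0;
	}
}

int main(void)
{
	counter_add(-8, BATCH_LARGE);	/* stays local: no shared-line write */
	printf("global=%lld local=%lld\n", global_count, local_delta);
	counter_add(-8, BATCH_SMALL);	/* near ENOSPC: folded immediately */
	printf("global=%lld local=%lld\n", global_count, local_delta);
	return 0;
}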
+ */ +#define XFS_FDBLOCKS_BATCH 1024 + extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); @@ -325,6 +497,8 @@ int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, int error_class, int error); void xfs_force_summary_recalc(struct xfs_mount *mp); +int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); +bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 956cca24e67f..5e1d29d8b2e7 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -92,7 +92,7 @@ xfs_fs_map_blocks( uint lock_flags; int error = 0; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index fe341f3fd419..5608066d6e53 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -157,7 +157,7 @@ xfs_qm_dqpurge( } ASSERT(atomic_read(&dqp->q_pincount) == 0); - ASSERT(XFS_FORCED_SHUTDOWN(mp) || + ASSERT(xfs_is_shutdown(mp) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); @@ -185,17 +185,13 @@ out_unlock: /* * Purge the dquot cache. */ -void +static void xfs_qm_dqpurge_all( - struct xfs_mount *mp, - uint flags) + struct xfs_mount *mp) { - if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL); - if (flags & XFS_QMOPT_GQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL); - if (flags & XFS_QMOPT_PQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL); } /* @@ -206,7 +202,7 @@ xfs_qm_unmount( struct xfs_mount *mp) { if (mp->m_quotainfo) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_dqpurge_all(mp); xfs_qm_destroy_quotainfo(mp); } } @@ -299,8 +295,6 @@ xfs_qm_need_dqattach( { struct xfs_mount *mp = ip->i_mount; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return false; if (!XFS_IS_QUOTA_ON(mp)) return false; if (!XFS_NOT_DQATTACHED(mp, ip)) @@ -635,7 +629,7 @@ xfs_qm_init_quotainfo( struct xfs_quotainfo *qinf; int error; - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp)); qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0); @@ -662,7 +656,7 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); - if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + if (xfs_has_bigtime(mp)) { qinf->qi_expiry_min = xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN); qinf->qi_expiry_max = @@ -680,11 +674,11 @@ xfs_qm_init_quotainfo( xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP); xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ); - if (XFS_IS_UQUOTA_RUNNING(mp)) + if (XFS_IS_UQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf); - if (XFS_IS_GQUOTA_RUNNING(mp)) + if (XFS_IS_GQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf); - if (XFS_IS_PQUOTA_RUNNING(mp)) + if (XFS_IS_PQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf); qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; @@ -755,7 +749,7 @@ xfs_qm_qino_alloc( * with PQUOTA, just use sb_gquotino for sb_pquotino and * 
vice-versa. */ - if (!xfs_sb_version_has_pquotino(&mp->m_sb) && + if (!xfs_has_pquotino(mp) && (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) { xfs_ino_t ino = NULLFSINO; @@ -808,9 +802,9 @@ xfs_qm_qino_alloc( */ spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { - ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); + ASSERT(!xfs_has_quota(mp)); - xfs_sb_version_addquota(&mp->m_sb); + xfs_add_quota(mp); mp->m_sb.sb_uquotino = NULLFSINO; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; @@ -829,7 +823,7 @@ xfs_qm_qino_alloc( error = xfs_trans_commit(tp); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); } if (need_alloc) @@ -896,11 +890,11 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; - if (xfs_sb_version_hasbigtime(&mp->m_sb)) + if (xfs_has_bigtime(mp)) ddq->d_type |= XFS_DQTYPE_BIGTIME; } - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { xfs_update_cksum((char *)&dqb[j], sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -1147,7 +1141,7 @@ xfs_qm_dqusage_adjust( xfs_filblks_t rtblks = 0; /* total rt blks */ int error; - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp)); /* * rootino must have its resources accounted for, not so with the quota @@ -1288,7 +1282,7 @@ xfs_qm_quotacheck( flags = 0; ASSERT(uip || gip || pip); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp)); xfs_notice(mp, "Quotacheck needed: Please wait."); @@ -1359,7 +1353,7 @@ xfs_qm_quotacheck( * at this point (because we intentionally didn't in dqget_noattach). */ if (error) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_dqpurge_all(mp); goto error_return; } @@ -1418,7 +1412,7 @@ xfs_qm_mount_quotas( goto write_changes; } - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp)); /* * Allocate the quotainfo structure inside the mount struct, and @@ -1473,7 +1467,7 @@ xfs_qm_mount_quotas( * the incore structures are convinced that quotas are * off, but the on disk superblock doesn't know that ! */ - ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + ASSERT(!(XFS_IS_QUOTA_ON(mp))); xfs_alert(mp, "%s: Superblock update failed!", __func__); } @@ -1504,7 +1498,7 @@ xfs_qm_init_quotainos( /* * Get the uquota and gquota inodes */ - if (xfs_sb_version_hasquota(&mp->m_sb)) { + if (xfs_has_quota(mp)) { if (XFS_IS_UQUOTA_ON(mp) && mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); @@ -1645,7 +1639,7 @@ xfs_qm_vop_dqalloc( int error; uint lockflags; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; lockflags = XFS_ILOCK_EXCL; @@ -1776,7 +1770,7 @@ xfs_qm_vop_chown( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + ASSERT(XFS_IS_QUOTA_ON(ip->i_mount)); /* old dquot */ prevdq = *IO_olddq; @@ -1829,7 +1823,7 @@ xfs_qm_vop_rename_dqattach( struct xfs_mount *mp = i_tab[0]->i_mount; int i; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; for (i = 0; (i < 4 && i_tab[i]); i++) { @@ -1860,7 +1854,7 @@ xfs_qm_vop_create_dqattach( { struct xfs_mount *mp = tp->t_mountp; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1888,3 +1882,37 @@ xfs_qm_vop_create_dqattach( } } +/* Decide if this inode's dquot is near an enforcement boundary. 
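+ *
+ * "Near" here means the dquot is already over its inode or rt-block
+ * limits, or its data-block reservation sits in the preallocation
+ * throttling range: past the low watermark, and either beyond the high
+ * watermark or with less than the 5% low-space threshold left below it.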
*/ +bool +xfs_inode_near_dquot_enforcement( + struct xfs_inode *ip, + xfs_dqtype_t type) +{ + struct xfs_dquot *dqp; + int64_t freesp; + + /* We only care for quotas that are enabled and enforced. */ + dqp = xfs_inode_dquot(ip, type); + if (!dqp || !xfs_dquot_is_enforced(dqp)) + return false; + + if (xfs_dquot_res_over_limits(&dqp->q_ino) || + xfs_dquot_res_over_limits(&dqp->q_rtb)) + return true; + + /* For space on the data device, check the various thresholds. */ + if (!dqp->q_prealloc_hi_wmark) + return false; + + if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark) + return false; + + if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark) + return true; + + freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved; + if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT]) + return true; + + return false; +} diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index ebbb484c49dc..442a0f97a9d4 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -140,9 +140,6 @@ struct xfs_dquot_acct { extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); -/* dquot stuff */ -extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); - /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); extern int xfs_qm_scall_getquota(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index df00dfbf5c9d..b77673dd0558 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -75,7 +75,7 @@ xfs_qm_newmount( uint quotaondisk; uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; - quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && + quotaondisk = xfs_has_quota(mp) && (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); if (quotaondisk) { diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 13a56e1ea15c..47fe60e1a887 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -19,91 +19,11 @@ #include "xfs_qm.h" #include "xfs_icache.h" -STATIC int -xfs_qm_log_quotaoff( - struct xfs_mount *mp, - struct xfs_qoff_logitem **qoffstartp, - uint flags) -{ - struct xfs_trans *tp; - int error; - struct xfs_qoff_logitem *qoffi; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); - if (error) - goto out; - - qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; - spin_unlock(&mp->m_sb_lock); - - xfs_log_sb(tp); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp); - if (error) - goto out; - - *qoffstartp = qoffi; -out: - return error; -} - -STATIC int -xfs_qm_log_quotaoff_end( - struct xfs_mount *mp, - struct xfs_qoff_logitem **startqoff, - uint flags) -{ - struct xfs_trans *tp; - int error; - struct xfs_qoff_logitem *qoffi; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); - if (error) - return error; - - qoffi = xfs_trans_get_qoff_item(tp, *startqoff, - flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - *startqoff = NULL; - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. 
- */ - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); -} - -/* - * Turn off quota accounting and/or enforcement for all udquots and/or - * gdquots. Called only at unmount time. - * - * This assumes that there are no dquots of this file system cached - * incore, and modifies the ondisk dquot directly. Therefore, for example, - * it is an error to call this twice, without purging the cache. - */ int xfs_qm_scall_quotaoff( xfs_mount_t *mp, uint flags) { - struct xfs_quotainfo *q = mp->m_quotainfo; - uint dqtype; - int error; - uint inactivate_flags; - struct xfs_qoff_logitem *qoffstart = NULL; - /* * No file system can have quotas enabled on disk but not in core. * Note that quota utilities (like quotaoff) _expect_ @@ -111,160 +31,23 @@ xfs_qm_scall_quotaoff( */ if ((mp->m_qflags & flags) == 0) return -EEXIST; - error = 0; - - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); - - /* - * We don't want to deal with two quotaoffs messing up each other, - * so we're going to serialize it. quotaoff isn't exactly a performance - * critical thing. - * If quotaoff, then we must be dealing with the root filesystem. - */ - ASSERT(q); - mutex_lock(&q->qi_quotaofflock); /* - * If we're just turning off quota enforcement, change mp and go. + * We do not support actually turning off quota accounting any more. + * Just log a warning and ignore the accounting related flags. */ - if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { - mp->m_qflags &= ~(flags); + if (flags & XFS_ALL_QUOTA_ACCT) + xfs_info(mp, "disabling of quota accounting not supported."); - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = mp->m_qflags; - spin_unlock(&mp->m_sb_lock); - mutex_unlock(&q->qi_quotaofflock); - - /* XXX what to do if error ? Revert back to old vals incore ? */ - return xfs_sync_sb(mp, false); - } - - dqtype = 0; - inactivate_flags = 0; - /* - * If accounting is off, we must turn enforcement off, clear the - * quota 'CHKD' certificate to make it known that we have to - * do a quotacheck the next time this quota is turned on. - */ - if (flags & XFS_UQUOTA_ACCT) { - dqtype |= XFS_QMOPT_UQUOTA; - flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); - inactivate_flags |= XFS_UQUOTA_ACTIVE; - } - if (flags & XFS_GQUOTA_ACCT) { - dqtype |= XFS_QMOPT_GQUOTA; - flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); - inactivate_flags |= XFS_GQUOTA_ACTIVE; - } - if (flags & XFS_PQUOTA_ACCT) { - dqtype |= XFS_QMOPT_PQUOTA; - flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD); - inactivate_flags |= XFS_PQUOTA_ACTIVE; - } - - /* - * Nothing to do? Don't complain. This happens when we're just - * turning off quota enforcement. - */ - if ((mp->m_qflags & flags) == 0) - goto out_unlock; - - /* - * Write the LI_QUOTAOFF log record, and do SB changes atomically, - * and synchronously. If we fail to write, we should abort the - * operation as it cannot be recovered safely if we crash. - */ - error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); - if (error) - goto out_unlock; - - /* - * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct - * to take care of the race between dqget and quotaoff. We don't take - * any special locks to reset these bits. All processes need to check - * these bits *after* taking inode lock(s) to see if the particular - * quota type is in the process of being turned off. If *ACTIVE, it is - * guaranteed that all dquot structures and all quotainode ptrs will all - * stay valid as long as that inode is kept locked. - * - * There is no turning back after this. 
- */ - mp->m_qflags &= ~inactivate_flags; - - /* - * Give back all the dquot reference(s) held by inodes. - * Here we go thru every single incore inode in this file system, and - * do a dqrele on the i_udquot/i_gdquot that it may have. - * Essentially, as long as somebody has an inode locked, this guarantees - * that quotas will not be turned off. This is handy because in a - * transaction once we lock the inode(s) and check for quotaon, we can - * depend on the quota inodes (and other things) being valid as long as - * we keep the lock(s). - */ - error = xfs_dqrele_all_inodes(mp, flags); - ASSERT(!error); - - /* - * Next we make the changes in the quota flag in the mount struct. - * This isn't protected by a particular lock directly, because we - * don't want to take a mrlock every time we depend on quotas being on. - */ - mp->m_qflags &= ~flags; - - /* - * Go through all the dquots of this file system and purge them, - * according to what was turned off. - */ - xfs_qm_dqpurge_all(mp, dqtype); - - /* - * Transactions that had started before ACTIVE state bit was cleared - * could have logged many dquots, so they'd have higher LSNs than - * the first QUOTAOFF log record does. If we happen to crash when - * the tail of the log has gone past the QUOTAOFF record, but - * before the last dquot modification, those dquots __will__ - * recover, and that's not good. - * - * So, we have QUOTAOFF start and end logitems; the start - * logitem won't get overwritten until the end logitem appears... - */ - error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags); - if (error) { - /* We're screwed now. Shutdown is the only option. */ - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - goto out_unlock; - } - - /* - * If all quotas are completely turned off, close shop. - */ - if (mp->m_qflags == 0) { - mutex_unlock(&q->qi_quotaofflock); - xfs_qm_destroy_quotainfo(mp); - return 0; - } - - /* - * Release our quotainode references if we don't need them anymore. - */ - if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { - xfs_irele(q->qi_uquotaip); - q->qi_uquotaip = NULL; - } - if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) { - xfs_irele(q->qi_gquotaip); - q->qi_gquotaip = NULL; - } - if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) { - xfs_irele(q->qi_pquotaip); - q->qi_pquotaip = NULL; - } + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + mp->m_qflags &= ~(flags & XFS_ALL_QUOTA_ENFD); + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = mp->m_qflags; + spin_unlock(&mp->m_sb_lock); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); -out_unlock: - if (error && qoffstart) - xfs_qm_qoff_logitem_relse(qoffstart); - mutex_unlock(&q->qi_quotaofflock); - return error; + /* XXX what to do if error ? Revert back to old vals incore ? */ + return xfs_sync_sb(mp, false); } STATIC int @@ -322,7 +105,7 @@ xfs_qm_scall_trunc_qfiles( { int error = -EINVAL; - if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || + if (!xfs_has_quota(mp) || flags == 0 || (flags & ~XFS_QMOPT_QUOTALL)) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", __func__, flags, mp->m_qflags); @@ -421,7 +204,7 @@ xfs_qm_scall_quotaon( (mp->m_qflags & XFS_GQUOTA_ACCT))) return 0; - if (! XFS_IS_QUOTA_RUNNING(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; /* @@ -698,6 +481,10 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; + /* Flush inodegc work at the start of a quota reporting scan. */ + if (id == 0) + xfs_inodegc_flush(mp); + /* * Try to get the dquot. We don't want it allocated on disk, so don't * set doalloc. 
If it doesn't exist, we'll get ENOENT back. @@ -736,6 +523,10 @@ xfs_qm_scall_getquota_next( struct xfs_dquot *dqp; int error; + /* Flush inodegc work at the start of a quota reporting scan. */ + if (*id == 0) + xfs_inodegc_flush(mp); + error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) return error; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index d00d01302545..dcc785fdd345 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -113,6 +113,7 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) { return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); } +bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type); #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -168,6 +169,7 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) +#define xfs_inode_near_dquot_enforcement(ip, type) (false) #endif /* CONFIG_XFS_QUOTA */ static inline int diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 88d70c236a54..07989bd67728 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -60,18 +60,18 @@ xfs_fs_get_quota_state( struct xfs_quotainfo *q = mp->m_quotainfo; memset(state, 0, sizeof(*state)); - if (!XFS_IS_QUOTA_RUNNING(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; state->s_incoredqs = q->qi_dquots; - if (XFS_IS_UQUOTA_RUNNING(mp)) + if (XFS_IS_UQUOTA_ON(mp)) state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_UQUOTA_ENFORCED(mp)) state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; - if (XFS_IS_GQUOTA_RUNNING(mp)) + if (XFS_IS_GQUOTA_ON(mp)) state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_GQUOTA_ENFORCED(mp)) state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; - if (XFS_IS_PQUOTA_RUNNING(mp)) + if (XFS_IS_PQUOTA_ON(mp)) state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_PQUOTA_ENFORCED(mp)) state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; @@ -114,10 +114,8 @@ xfs_fs_set_info( if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS; if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK) return -EINVAL; if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0) @@ -164,7 +162,7 @@ xfs_quota_enable( if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return -ENOSYS; return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags)); @@ -179,10 +177,8 @@ xfs_quota_disable( if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -EINVAL; + return -ENOSYS; return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags)); } @@ -223,10 +219,8 @@ xfs_fs_get_dqblk( struct xfs_mount *mp = XFS_M(sb); xfs_dqid_t id; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS; id = from_kqid(&init_user_ns, qid); return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq); @@ -243,10 +237,8 @@ xfs_fs_get_nextdqblk( struct xfs_mount *mp = XFS_M(sb); xfs_dqid_t id; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS; id = from_kqid(&init_user_ns, *qid); ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type), @@ -269,10 +261,8 @@ xfs_fs_set_dqblk( if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if 
(!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS; return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), xfs_quota_type(qid.type), qdq); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 746f4eda724c..46904b793bd4 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -423,7 +423,7 @@ xfs_cui_validate_phys( struct xfs_mount *mp, struct xfs_phys_extent *refc) { - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return false; if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) @@ -522,6 +522,9 @@ xfs_cui_item_recover( error = xfs_trans_log_finish_refcount_update(tp, cudp, type, refc->pe_startblock, refc->pe_len, &new_fsb, &new_len, &rcur); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + refc, sizeof(*refc)); if (error) goto abort_error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c256104772cb..76355f293488 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -759,7 +759,7 @@ xfs_reflink_recover_cow( xfs_agnumber_t agno; int error = 0; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return 0; for_each_perag(mp, agno, pag) { @@ -967,7 +967,7 @@ xfs_reflink_ag_has_free_space( struct xfs_perag *pag; int error = 0; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; pag = xfs_perag_get(mp, agno); diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 487b00434b96..bea65f2fe657 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -8,8 +8,7 @@ static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip) { - return ip->i_mount->m_always_cow && - xfs_sb_version_hasreflink(&ip->i_mount->m_sb); + return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); } static inline bool xfs_is_cow_inode(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index dc4f0c9f0897..5f0695980467 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -466,7 +466,7 @@ xfs_rui_validate_map( struct xfs_mount *mp, struct xfs_map_extent *rmap) { - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return false; if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS) @@ -578,6 +578,9 @@ xfs_rui_item_recover( rmap->me_owner, whichfork, rmap->me_startoff, rmap->me_startblock, rmap->me_len, state, &rcur); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + rmap, sizeof(*rmap)); if (error) goto abort_error; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 699066fb9052..b8c79ee791af 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -951,8 +951,7 @@ xfs_growfs_rt( return -EINVAL; /* Unsupported realtime features. */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb) || - xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)) return -EOPNOTSUPP; nrblocks = in->newblocks; @@ -1131,6 +1130,9 @@ error_cancel: error = xfs_trans_commit(tp); if (error) break; + + /* Ensure the mount RT feature flag is now set. 
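+ * (XFS_FEAT_REALTIME is derived from sb_rblocks > 0 at mount time, so a
+ * filesystem that started with no realtime blocks must gain the feature
+ * bit here on the first successful grow of the realtime device.)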
*/ + mp->m_features |= XFS_FEAT_REALTIME; } if (error) goto out_free; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index ed885620589c..91b00289509b 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -22,9 +22,9 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( - struct xfs_trans *tp, - struct xfs_rtalloc_rec *rec, - void *priv); + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv); #ifdef CONFIG_XFS_RT /* @@ -124,10 +124,9 @@ int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); int xfs_rtalloc_query_range(struct xfs_trans *tp, - struct xfs_rtalloc_rec *low_rec, - struct xfs_rtalloc_rec *high_rec, - xfs_rtalloc_query_range_fn fn, - void *priv); + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, void *priv); int xfs_rtalloc_query_all(struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 2c9e26a44546..9a86d3ec2cb6 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -49,6 +49,28 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif +#ifdef CONFIG_HOTPLUG_CPU +static LIST_HEAD(xfs_mount_list); +static DEFINE_SPINLOCK(xfs_mount_list_lock); + +static inline void xfs_mount_list_add(struct xfs_mount *mp) +{ + spin_lock(&xfs_mount_list_lock); + list_add(&mp->m_mount_list, &xfs_mount_list); + spin_unlock(&xfs_mount_list_lock); +} + +static inline void xfs_mount_list_del(struct xfs_mount *mp) +{ + spin_lock(&xfs_mount_list_lock); + list_del(&mp->m_mount_list); + spin_unlock(&xfs_mount_list_lock); +} +#else /* !CONFIG_HOTPLUG_CPU */ +static inline void xfs_mount_list_add(struct xfs_mount *mp) {} +static inline void xfs_mount_list_del(struct xfs_mount *mp) {} +#endif + enum xfs_dax_mode { XFS_DAX_INODE = 0, XFS_DAX_ALWAYS = 1, @@ -62,15 +84,15 @@ xfs_mount_set_dax_mode( { switch (mode) { case XFS_DAX_INODE: - mp->m_flags &= ~(XFS_MOUNT_DAX_ALWAYS | XFS_MOUNT_DAX_NEVER); + mp->m_features &= ~(XFS_FEAT_DAX_ALWAYS | XFS_FEAT_DAX_NEVER); break; case XFS_DAX_ALWAYS: - mp->m_flags |= XFS_MOUNT_DAX_ALWAYS; - mp->m_flags &= ~XFS_MOUNT_DAX_NEVER; + mp->m_features |= XFS_FEAT_DAX_ALWAYS; + mp->m_features &= ~XFS_FEAT_DAX_NEVER; break; case XFS_DAX_NEVER: - mp->m_flags |= XFS_MOUNT_DAX_NEVER; - mp->m_flags &= ~XFS_MOUNT_DAX_ALWAYS; + mp->m_features |= XFS_FEAT_DAX_NEVER; + mp->m_features &= ~XFS_FEAT_DAX_ALWAYS; break; } } @@ -154,33 +176,32 @@ xfs_fs_show_options( { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_IKEEP, ",ikeep" }, - { XFS_MOUNT_WSYNC, ",wsync" }, - { XFS_MOUNT_NOALIGN, ",noalign" }, - { XFS_MOUNT_SWALLOC, ",swalloc" }, - { XFS_MOUNT_NOUUID, ",nouuid" }, - { XFS_MOUNT_NORECOVERY, ",norecovery" }, - { XFS_MOUNT_ATTR2, ",attr2" }, - { XFS_MOUNT_FILESTREAMS, ",filestreams" }, - { XFS_MOUNT_GRPID, ",grpid" }, - { XFS_MOUNT_DISCARD, ",discard" }, - { XFS_MOUNT_LARGEIO, ",largeio" }, - { XFS_MOUNT_DAX_ALWAYS, ",dax=always" }, - { XFS_MOUNT_DAX_NEVER, ",dax=never" }, + { XFS_FEAT_IKEEP, ",ikeep" }, + { XFS_FEAT_WSYNC, ",wsync" }, + { XFS_FEAT_NOALIGN, ",noalign" }, + { XFS_FEAT_SWALLOC, ",swalloc" }, + { XFS_FEAT_NOUUID, ",nouuid" }, + { XFS_FEAT_NORECOVERY, ",norecovery" }, + { XFS_FEAT_ATTR2, ",attr2" }, + { XFS_FEAT_FILESTREAMS, ",filestreams" 
}, + { XFS_FEAT_GRPID, ",grpid" }, + { XFS_FEAT_DISCARD, ",discard" }, + { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, + { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, + { XFS_FEAT_DAX_NEVER, ",dax=never" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); struct proc_xfs_info *xfs_infop; for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { - if (mp->m_flags & xfs_infop->flag) + if (mp->m_features & xfs_infop->flag) seq_puts(m, xfs_infop->str); } - seq_printf(m, ",inode%d", - (mp->m_flags & XFS_MOUNT_SMALL_INUMS) ? 32 : 64); + seq_printf(m, ",inode%d", xfs_has_small_inums(mp) ? 32 : 64); - if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + if (xfs_has_allocsize(mp)) seq_printf(m, ",allocsize=%dk", (1 << mp->m_allocsize_log) >> 10); @@ -201,25 +222,20 @@ xfs_fs_show_options( seq_printf(m, ",swidth=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); - if (mp->m_qflags & XFS_UQUOTA_ACCT) { - if (mp->m_qflags & XFS_UQUOTA_ENFD) - seq_puts(m, ",usrquota"); - else - seq_puts(m, ",uqnoenforce"); - } + if (mp->m_qflags & XFS_UQUOTA_ENFD) + seq_puts(m, ",usrquota"); + else if (mp->m_qflags & XFS_UQUOTA_ACCT) + seq_puts(m, ",uqnoenforce"); - if (mp->m_qflags & XFS_PQUOTA_ACCT) { - if (mp->m_qflags & XFS_PQUOTA_ENFD) - seq_puts(m, ",prjquota"); - else - seq_puts(m, ",pqnoenforce"); - } - if (mp->m_qflags & XFS_GQUOTA_ACCT) { - if (mp->m_qflags & XFS_GQUOTA_ENFD) - seq_puts(m, ",grpquota"); - else - seq_puts(m, ",gqnoenforce"); - } + if (mp->m_qflags & XFS_PQUOTA_ENFD) + seq_puts(m, ",prjquota"); + else if (mp->m_qflags & XFS_PQUOTA_ACCT) + seq_puts(m, ",pqnoenforce"); + + if (mp->m_qflags & XFS_GQUOTA_ENFD) + seq_puts(m, ",grpquota"); + else if (mp->m_qflags & XFS_GQUOTA_ACCT) + seq_puts(m, ",gqnoenforce"); if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); @@ -230,11 +246,11 @@ xfs_fs_show_options( /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount options; i.e. specifically - * whether or not XFS_MOUNT_SMALL_INUMS is set. + * whether or not XFS_FEAT_SMALL_INUMS is set. * * Inode allocation patterns are altered only if inode32 is requested - * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large. - * If altered, XFS_MOUNT_32BITINODES is set as well. + * (XFS_FEAT_SMALL_INUMS), and the filesystem is sufficiently large. + * If altered, XFS_OPSTATE_INODE32 is set as well. * * An agcount independent of that in the mount structure is provided * because in the growfs case, mp->m_sb.sb_agcount is not yet updated @@ -276,13 +292,13 @@ xfs_set_inode_alloc( /* * If user asked for no more than 32-bit inodes, and the fs is - * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter + * sufficiently large, set XFS_OPSTATE_INODE32 if we must alter * the allocator to accommodate the request. */ - if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) - mp->m_flags |= XFS_MOUNT_32BITINODES; + if (xfs_has_small_inums(mp) && ino > XFS_MAXINUMBER_32) + set_bit(XFS_OPSTATE_INODE32, &mp->m_opstate); else - mp->m_flags &= ~XFS_MOUNT_32BITINODES; + clear_bit(XFS_OPSTATE_INODE32, &mp->m_opstate); for (index = 0; index < agcount; index++) { struct xfs_perag *pag; @@ -291,7 +307,7 @@ xfs_set_inode_alloc( pag = xfs_perag_get(mp, index); - if (mp->m_flags & XFS_MOUNT_32BITINODES) { + if (xfs_is_inode32(mp)) { if (ino > XFS_MAXINUMBER_32) { pag->pagi_inodeok = 0; pag->pagf_metadata = 0; @@ -311,7 +327,7 @@ xfs_set_inode_alloc( xfs_perag_put(pag); } - return (mp->m_flags & XFS_MOUNT_32BITINODES) ? 
maxagi : agcount; + return xfs_is_inode32(mp) ? maxagi : agcount; } STATIC int @@ -468,7 +484,7 @@ xfs_setup_devices( if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { unsigned int log_sector_size = BBSIZE; - if (xfs_sb_version_hassector(&mp->m_sb)) + if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_setsize_buftarg(mp->m_logdev_targp, log_sector_size); @@ -501,37 +517,37 @@ xfs_init_mount_workqueues( if (!mp->m_unwritten_workqueue) goto out_destroy_buf; - mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), - 0, mp->m_super->s_id); - if (!mp->m_cil_workqueue) - goto out_destroy_unwritten; - mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) - goto out_destroy_cil; + goto out_destroy_unwritten; - mp->m_gc_workqueue = alloc_workqueue("xfs-gc/%s", - WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM, + mp->m_blockgc_wq = alloc_workqueue("xfs-blockgc/%s", + XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); - if (!mp->m_gc_workqueue) + if (!mp->m_blockgc_wq) goto out_destroy_reclaim; + mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 1, mp->m_super->s_id); + if (!mp->m_inodegc_wq) + goto out_destroy_blockgc; + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); if (!mp->m_sync_workqueue) - goto out_destroy_eofb; + goto out_destroy_inodegc; return 0; -out_destroy_eofb: - destroy_workqueue(mp->m_gc_workqueue); +out_destroy_inodegc: + destroy_workqueue(mp->m_inodegc_wq); +out_destroy_blockgc: + destroy_workqueue(mp->m_blockgc_wq); out_destroy_reclaim: destroy_workqueue(mp->m_reclaim_workqueue); -out_destroy_cil: - destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_buf: @@ -545,9 +561,9 @@ xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); - destroy_workqueue(mp->m_gc_workqueue); + destroy_workqueue(mp->m_blockgc_wq); + destroy_workqueue(mp->m_inodegc_wq); destroy_workqueue(mp->m_reclaim_workqueue); - destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); destroy_workqueue(mp->m_buf_workqueue); } @@ -596,32 +612,6 @@ xfs_fs_alloc_inode( return NULL; } -#ifdef DEBUG -static void -xfs_check_delalloc( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; - - if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) - return; - do { - if (isnullstartblock(got.br_startblock)) { - xfs_warn(ip->i_mount, - "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", - ip->i_ino, - whichfork == XFS_DATA_FORK ? "data" : "cow", - got.br_startoff, got.br_blockcount); - } - } while (xfs_iext_next_extent(ifp, &icur, &got)); -} -#else -#define xfs_check_delalloc(ip, whichfork) do { } while (0) -#endif - /* * Now that the generic code is guaranteed not to be accessing * the linux inode, we can inactivate and reclaim the inode. 
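
The helper conversions above (xfs_has_small_inums(), xfs_is_inode32() and
friends) come from the generator macros introduced in xfs_mount.h earlier in
this diff: one macro stamps out an is_/set_/clear_ family per flag, so call
sites test a named predicate instead of open-coding mask checks on m_flags.
A minimal user-space sketch of the same pattern; the names, struct and bit
numbers below are illustrative, not the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    struct mount_state { unsigned long opstate; };

    #define OPSTATE_INODE32  3
    #define OPSTATE_READONLY 4

    /* Stamp out is_/set_ helpers for one state bit, in the shape of
     * __XFS_IS_OPSTATE (minus the kernel's atomic bitops). */
    #define DEFINE_OPSTATE(name, NAME) \
    static bool is_ ## name(struct mount_state *mp) \
    { \
        return mp->opstate & (1UL << OPSTATE_ ## NAME); \
    } \
    static void set_ ## name(struct mount_state *mp) \
    { \
        mp->opstate |= 1UL << OPSTATE_ ## NAME; \
    }

    DEFINE_OPSTATE(inode32, INODE32)
    DEFINE_OPSTATE(readonly, READONLY)

    int main(void)
    {
        struct mount_state m = { 0 };

        set_inode32(&m);
        printf("inode32=%d readonly=%d\n", is_inode32(&m), is_readonly(&m));
        return 0;
    }

In the kernel the mutators are test_and_set_bit()/test_and_clear_bit(), so a
concurrent updater atomically learns the previous value; this sketch keeps
only the shape of the generated API.
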
@@ -637,30 +627,6 @@ xfs_fs_destroy_inode( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); - - xfs_inactive(ip); - - if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) { - xfs_check_delalloc(ip, XFS_DATA_FORK); - xfs_check_delalloc(ip, XFS_COW_FORK); - ASSERT(0); - } - - XFS_STATS_INC(ip->i_mount, vn_reclaim); - - /* - * We should never get here with one of the reclaim flags already set. - */ - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); - - /* - * We always use background reclaim here because even if the inode is - * clean, it still may be under IO and hence we have wait for IO - * completion to occur before we can reclaim the inode. The background - * reclaim path handles this more efficiently than we can here, so - * simply let background reclaim tear down all inodes. - */ xfs_inode_mark_reclaimable(ip); } @@ -709,8 +675,6 @@ xfs_fs_inode_init_once( atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); - mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, - "xfsino", ip->i_ino); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); } @@ -734,7 +698,7 @@ xfs_fs_drop_inode( * that. See the comment for this inode flag. */ if (ip->i_flags & XFS_IRECOVERY) { - ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED); + ASSERT(xlog_recovery_needed(ip->i_mount->m_log)); return 0; } @@ -757,6 +721,8 @@ xfs_fs_sync_fs( { struct xfs_mount *mp = XFS_M(sb); + trace_xfs_fs_sync_fs(mp, __return_address); + /* * Doing anything during the async pass would be counterproductive. */ @@ -773,6 +739,25 @@ xfs_fs_sync_fs( flush_delayed_work(&mp->m_log->l_work); } + /* + * If we are called with page faults frozen out, it means we are about + * to freeze the transaction subsystem. Take the opportunity to shut + * down inodegc because once SB_FREEZE_FS is set it's too late to + * prevent inactivation races with freeze. The fs doesn't get called + * again by the freezing process until after SB_FREEZE_FS has been set, + * so it's now or never. Same logic applies to speculative allocation + * garbage collection. + * + * We don't care if this is a normal syncfs call that does this or + * freeze that does this - we can run this multiple times without issue + * and we won't race with a restart because a restart can only occur + * when the state is either SB_FREEZE_FS or SB_FREEZE_COMPLETE. + */ + if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { + xfs_inodegc_stop(mp); + xfs_blockgc_stop(mp); + } + return 0; } @@ -791,6 +776,9 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; + /* Wait for whatever inactivations are in progress. */ + xfs_inodegc_flush(mp); + statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -886,10 +874,22 @@ xfs_fs_freeze( * set a GFP_NOFS context here to avoid recursion deadlocks. */ flags = memalloc_nofs_save(); - xfs_blockgc_stop(mp); xfs_save_resvblks(mp); ret = xfs_log_quiesce(mp); memalloc_nofs_restore(flags); + + /* + * For read-write filesystems, we need to restart the inodegc on error + * because we stopped it at SB_FREEZE_PAGEFAULT level and a thaw is not + * going to be run to restart it now. We are at SB_FREEZE_FS level + * here, so we can restart safely without racing with a stop in + * xfs_fs_sync_fs(). 
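+ *
+ * (For reference, freeze_super() raises s_writers.frozen through
+ * SB_FREEZE_WRITE and SB_FREEZE_PAGEFAULT, syncing the filesystem at the
+ * PAGEFAULT level - which is when xfs_fs_sync_fs() stops the gc workers -
+ * before setting SB_FREEZE_FS and calling ->freeze_fs, i.e. this function.)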
+ */ + if (ret && !xfs_is_readonly(mp)) { + xfs_blockgc_start(mp); + xfs_inodegc_start(mp); + } + return ret; } @@ -901,7 +901,18 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_blockgc_start(mp); + + /* + * Don't reactivate the inodegc worker on a readonly filesystem because + * inodes are sent directly to reclaim. Don't reactivate the blockgc + * worker because there are no speculative preallocations on a readonly + * filesystem. + */ + if (!xfs_is_readonly(mp)) { + xfs_blockgc_start(mp); + xfs_inodegc_start(mp); + } + return 0; } @@ -913,10 +924,8 @@ STATIC int xfs_finish_flags( struct xfs_mount *mp) { - int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); - /* Fail a mount where the logbuf is smaller than the log stripe */ - if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if (xfs_has_logv2(mp)) { if (mp->m_logbsize <= 0 && mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { mp->m_logbsize = mp->m_sb.sb_logsunit; @@ -938,33 +947,24 @@ xfs_finish_flags( /* * V5 filesystems always use attr2 format for attributes. */ - if (xfs_sb_version_hascrc(&mp->m_sb) && - (mp->m_flags & XFS_MOUNT_NOATTR2)) { + if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) { xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. " "attr2 is always enabled for V5 filesystems."); return -EINVAL; } /* - * mkfs'ed attr2 will turn on attr2 mount unless explicitly - * told by noattr2 to turn it off - */ - if (xfs_sb_version_hasattr2(&mp->m_sb) && - !(mp->m_flags & XFS_MOUNT_NOATTR2)) - mp->m_flags |= XFS_MOUNT_ATTR2; - - /* * prohibit r/w mounts of read-only filesystems */ - if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) { xfs_warn(mp, "cannot mount a read-only filesystem as read-write"); return -EROFS; } - if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && - (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) && - !xfs_sb_version_has_pquotino(&mp->m_sb)) { + if ((mp->m_qflags & XFS_GQUOTA_ACCT) && + (mp->m_qflags & XFS_PQUOTA_ACCT) && + !xfs_has_pquotino(mp)) { xfs_warn(mp, "Super block does not support project and group quota together"); return -EINVAL; @@ -1022,11 +1022,40 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); percpu_counter_destroy(&mp->m_fdblocks); - ASSERT(XFS_FORCED_SHUTDOWN(mp) || + ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); } +static int +xfs_inodegc_init_percpu( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + mp->m_inodegc = alloc_percpu(struct xfs_inodegc); + if (!mp->m_inodegc) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + init_llist_head(&gc->list); + gc->items = 0; + INIT_WORK(&gc->work, xfs_inodegc_worker); + } + return 0; +} + +static void +xfs_inodegc_free_percpu( + struct xfs_mount *mp) +{ + if (!mp->m_inodegc) + return; + free_percpu(mp->m_inodegc); +} + static void xfs_fs_put_super( struct super_block *sb) @@ -1043,6 +1072,8 @@ xfs_fs_put_super( xfs_freesb(mp); free_percpu(mp->m_stats.xs_stats); + xfs_mount_list_del(mp); + xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); @@ -1131,7 +1162,7 @@ xfs_fs_warn_deprecated( * already had the flag set */ if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) && - !!(XFS_M(fc->root->d_sb)->m_flags & flag) == value) + !!(XFS_M(fc->root->d_sb)->m_features & flag) == value) return; 
xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key); } @@ -1179,27 +1210,27 @@ xfs_fs_parse_param( if (suffix_kstrtoint(param->string, 10, &size)) return -EINVAL; parsing_mp->m_allocsize_log = ffs(size) - 1; - parsing_mp->m_flags |= XFS_MOUNT_ALLOCSIZE; + parsing_mp->m_features |= XFS_FEAT_ALLOCSIZE; return 0; case Opt_grpid: case Opt_bsdgroups: - parsing_mp->m_flags |= XFS_MOUNT_GRPID; + parsing_mp->m_features |= XFS_FEAT_GRPID; return 0; case Opt_nogrpid: case Opt_sysvgroups: - parsing_mp->m_flags &= ~XFS_MOUNT_GRPID; + parsing_mp->m_features &= ~XFS_FEAT_GRPID; return 0; case Opt_wsync: - parsing_mp->m_flags |= XFS_MOUNT_WSYNC; + parsing_mp->m_features |= XFS_FEAT_WSYNC; return 0; case Opt_norecovery: - parsing_mp->m_flags |= XFS_MOUNT_NORECOVERY; + parsing_mp->m_features |= XFS_FEAT_NORECOVERY; return 0; case Opt_noalign: - parsing_mp->m_flags |= XFS_MOUNT_NOALIGN; + parsing_mp->m_features |= XFS_FEAT_NOALIGN; return 0; case Opt_swalloc: - parsing_mp->m_flags |= XFS_MOUNT_SWALLOC; + parsing_mp->m_features |= XFS_FEAT_SWALLOC; return 0; case Opt_sunit: parsing_mp->m_dalign = result.uint_32; @@ -1208,62 +1239,58 @@ xfs_fs_parse_param( parsing_mp->m_swidth = result.uint_32; return 0; case Opt_inode32: - parsing_mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + parsing_mp->m_features |= XFS_FEAT_SMALL_INUMS; return 0; case Opt_inode64: - parsing_mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + parsing_mp->m_features &= ~XFS_FEAT_SMALL_INUMS; return 0; case Opt_nouuid: - parsing_mp->m_flags |= XFS_MOUNT_NOUUID; + parsing_mp->m_features |= XFS_FEAT_NOUUID; return 0; case Opt_largeio: - parsing_mp->m_flags |= XFS_MOUNT_LARGEIO; + parsing_mp->m_features |= XFS_FEAT_LARGE_IOSIZE; return 0; case Opt_nolargeio: - parsing_mp->m_flags &= ~XFS_MOUNT_LARGEIO; + parsing_mp->m_features &= ~XFS_FEAT_LARGE_IOSIZE; return 0; case Opt_filestreams: - parsing_mp->m_flags |= XFS_MOUNT_FILESTREAMS; + parsing_mp->m_features |= XFS_FEAT_FILESTREAMS; return 0; case Opt_noquota: parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; - parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; return 0; case Opt_quota: case Opt_uquota: case Opt_usrquota: - parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | - XFS_UQUOTA_ENFD); + parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD); return 0; case Opt_qnoenforce: case Opt_uqnoenforce: - parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + parsing_mp->m_qflags |= XFS_UQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD; return 0; case Opt_pquota: case Opt_prjquota: - parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_PQUOTA_ENFD); + parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD); return 0; case Opt_pqnoenforce: - parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + parsing_mp->m_qflags |= XFS_PQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD; return 0; case Opt_gquota: case Opt_grpquota: - parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_GQUOTA_ENFD); + parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD); return 0; case Opt_gqnoenforce: - parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + parsing_mp->m_qflags |= XFS_GQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD; return 0; case Opt_discard: - parsing_mp->m_flags |= XFS_MOUNT_DISCARD; + parsing_mp->m_features |= XFS_FEAT_DISCARD; return 0; case Opt_nodiscard: - parsing_mp->m_flags &= ~XFS_MOUNT_DISCARD; + parsing_mp->m_features &= ~XFS_FEAT_DISCARD; return 0; #ifdef 
CONFIG_FS_DAX case Opt_dax: @@ -1275,21 +1302,20 @@ xfs_fs_parse_param( #endif /* Following mount options will be removed in September 2025 */ case Opt_ikeep: - xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, true); - parsing_mp->m_flags |= XFS_MOUNT_IKEEP; + xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true); + parsing_mp->m_features |= XFS_FEAT_IKEEP; return 0; case Opt_noikeep: - xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, false); - parsing_mp->m_flags &= ~XFS_MOUNT_IKEEP; + xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false); + parsing_mp->m_features &= ~XFS_FEAT_IKEEP; return 0; case Opt_attr2: - xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_ATTR2, true); - parsing_mp->m_flags |= XFS_MOUNT_ATTR2; + xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true); + parsing_mp->m_features |= XFS_FEAT_ATTR2; return 0; case Opt_noattr2: - xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_NOATTR2, true); - parsing_mp->m_flags &= ~XFS_MOUNT_ATTR2; - parsing_mp->m_flags |= XFS_MOUNT_NOATTR2; + xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); + parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); @@ -1303,17 +1329,23 @@ static int xfs_fs_validate_params( struct xfs_mount *mp) { + /* No recovery flag requires a read-only mount */ + if (xfs_has_norecovery(mp) && !xfs_is_readonly(mp)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return -EINVAL; + } + /* - * no recovery flag requires a read-only mount + * We have not read the superblock at this point, so only the attr2 + * mount option can set the attr2 feature by this stage. */ - if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_warn(mp, "no-recovery mounts must be read-only."); + if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) { + xfs_warn(mp, "attr2 and noattr2 cannot both be specified."); return -EINVAL; } - if ((mp->m_flags & XFS_MOUNT_NOALIGN) && - (mp->m_dalign || mp->m_swidth)) { + + if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) { xfs_warn(mp, "sunit and swidth options incompatible with the noalign option"); return -EINVAL; @@ -1357,7 +1389,7 @@ xfs_fs_validate_params( return -EINVAL; } - if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) && + if (xfs_has_allocsize(mp) && (mp->m_allocsize_log > XFS_MAX_IO_LOG || mp->m_allocsize_log < XFS_MIN_IO_LOG)) { xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", @@ -1418,11 +1450,22 @@ xfs_fs_fill_super( if (error) goto out_destroy_workqueues; + error = xfs_inodegc_init_percpu(mp); + if (error) + goto out_destroy_counters; + + /* + * All percpu data structures requiring cleanup when a cpu goes offline + * must be allocated before adding this @mp to the cpu-dead handler's + * mount list. + */ + xfs_mount_list_add(mp); + /* Allocate stats memory before we do operations that might use it */ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { error = -ENOMEM; - goto out_destroy_counters; + goto out_destroy_inodegc; } error = xfs_readsb(mp, flags); @@ -1438,7 +1481,7 @@ xfs_fs_fill_super( goto out_free_sb; /* V4 support is undergoing deprecation. */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_has_crc(mp)) { #ifdef CONFIG_XFS_SUPPORT_V4 xfs_warn_once(mp, "Deprecated V4 format (crc=0) will not be supported after September 2030."); @@ -1451,7 +1494,7 @@ xfs_fs_fill_super( } /* Filesystem claims it needs repair, so refuse the mount. 
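 * (The needsrepair incompat bit is set while xfs_repair is rewriting
 * metadata; refusing the mount keeps a half-repaired filesystem from
 * being used before the repair completes.)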
*/ - if (xfs_sb_version_needsrepair(&mp->m_sb)) { + if (xfs_has_needsrepair(mp)) { xfs_warn(mp, "Filesystem needs repair. Please run xfs_repair."); error = -EFSCORRUPTED; goto out_free_sb; @@ -1523,7 +1566,7 @@ xfs_fs_fill_super( sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; - if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + if (xfs_has_bigtime(mp)) { sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN); sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX); } else { @@ -1536,14 +1579,10 @@ xfs_fs_fill_super( set_posix_acl_flag(sb); /* version 5 superblocks support inode version counters. */ - if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + if (xfs_has_crc(mp)) sb->s_flags |= SB_I_VERSION; - if (xfs_sb_version_hasbigtime(&mp->m_sb)) - xfs_warn(mp, - "EXPERIMENTAL big timestamp feature in use. Use at your own risk!"); - - if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) { + if (xfs_has_dax_always(mp)) { bool rtdev_is_dax = false, datadev_is_dax; xfs_warn(mp, @@ -1559,7 +1598,7 @@ xfs_fs_fill_super( "DAX unsupported by block device. Turning off DAX."); xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); } - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { xfs_alert(mp, "DAX and reflink cannot be used together!"); error = -EINVAL; @@ -1567,17 +1606,17 @@ xfs_fs_fill_super( } } - if (mp->m_flags & XFS_MOUNT_DISCARD) { + if (xfs_has_discard(mp)) { struct request_queue *q = bdev_get_queue(sb->s_bdev); if (!blk_queue_discard(q)) { xfs_warn(mp, "mounting with \"discard\" option, but " "the device does not support discard"); - mp->m_flags &= ~XFS_MOUNT_DISCARD; + mp->m_features &= ~XFS_FEAT_DISCARD; } } - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, "reflink not compatible with realtime device!"); @@ -1591,17 +1630,13 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { + if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) { xfs_alert(mp, "reverse mapping btree not compatible with realtime device!"); error = -EINVAL; goto out_filestream_unmount; } - if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) - xfs_warn(mp, - "EXPERIMENTAL inode btree counters feature in use. Use at your own risk!"); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1625,6 +1660,9 @@ xfs_fs_fill_super( xfs_freesb(mp); out_free_stats: free_percpu(mp->m_stats.xs_stats); + out_destroy_inodegc: + xfs_mount_list_del(mp); + xfs_inodegc_free_percpu(mp); out_destroy_counters: xfs_destroy_percpu_counters(mp); out_destroy_workqueues: @@ -1656,13 +1694,13 @@ xfs_remount_rw( struct xfs_sb *sbp = &mp->m_sb; int error; - if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + if (xfs_has_norecovery(mp)) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); return -EINVAL; } - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + if (xfs_sb_is_v5(sbp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_warn(mp, "ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", @@ -1671,7 +1709,7 @@ xfs_remount_rw( return -EINVAL; } - mp->m_flags &= ~XFS_MOUNT_RDONLY; + clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); /* * If this is the first remount to writeable state we might have some @@ -1708,6 +1746,9 @@ xfs_remount_rw( if (error && error != -ENOSPC) return error; + /* Re-enable the background inode inactivation worker. 
*/ + xfs_inodegc_start(mp); + return 0; } @@ -1730,6 +1771,15 @@ xfs_remount_ro( return error; } + /* + * Stop the inodegc background worker. xfs_fs_reconfigure already + * flushed all pending inodegc work when it sync'd the filesystem. + * The VFS holds s_umount, so we know that inodes cannot enter + * xfs_fs_destroy_inode during a remount operation. In readonly mode + * we send inodes straight to reclaim, so no inodes will be queued. + */ + xfs_inodegc_stop(mp); + /* Free the per-AG metadata reservation pool. */ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { @@ -1747,7 +1797,7 @@ xfs_remount_ro( xfs_save_resvblks(mp); xfs_log_clean(mp); - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); return 0; } @@ -1770,12 +1820,11 @@ xfs_fs_reconfigure( { struct xfs_mount *mp = XFS_M(fc->root->d_sb); struct xfs_mount *new_mp = fc->s_fs_info; - xfs_sb_t *sbp = &mp->m_sb; int flags = fc->sb_flags; int error; /* version 5 superblocks always support version counters. */ - if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + if (xfs_has_crc(mp)) fc->sb_flags |= SB_I_VERSION; error = xfs_fs_validate_params(new_mp); @@ -1785,28 +1834,26 @@ xfs_fs_reconfigure( sync_filesystem(mp->m_super); /* inode32 -> inode64 */ - if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && - !(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) { - mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); + if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { + mp->m_features &= ~XFS_FEAT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); } /* inode64 -> inode32 */ - if (!(mp->m_flags & XFS_MOUNT_SMALL_INUMS) && - (new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) { - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); + if (!xfs_has_small_inums(mp) && xfs_has_small_inums(new_mp)) { + mp->m_features |= XFS_FEAT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); } /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(flags & SB_RDONLY)) { + if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) { error = xfs_remount_rw(mp); if (error) return error; } /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (flags & SB_RDONLY)) { + if (!xfs_is_readonly(mp) && (flags & SB_RDONLY)) { error = xfs_remount_ro(mp); if (error) return error; @@ -1873,11 +1920,11 @@ static int xfs_init_fs_context( * Copy binary VFS mount flags we are interested in. 
*/ if (fc->sb_flags & SB_RDONLY) - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (fc->sb_flags & SB_DIRSYNC) - mp->m_flags |= XFS_MOUNT_DIRSYNC; + mp->m_features |= XFS_FEAT_DIRSYNC; if (fc->sb_flags & SB_SYNCHRONOUS) - mp->m_flags |= XFS_MOUNT_WSYNC; + mp->m_features |= XFS_FEAT_WSYNC; fc->s_fs_info = mp; fc->ops = &xfs_context_ops; @@ -2120,6 +2167,48 @@ xfs_destroy_workqueues(void) destroy_workqueue(xfs_alloc_wq); } +#ifdef CONFIG_HOTPLUG_CPU +static int +xfs_cpu_dead( + unsigned int cpu) +{ + struct xfs_mount *mp, *n; + + spin_lock(&xfs_mount_list_lock); + list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { + spin_unlock(&xfs_mount_list_lock); + xfs_inodegc_cpu_dead(mp, cpu); + spin_lock(&xfs_mount_list_lock); + } + spin_unlock(&xfs_mount_list_lock); + return 0; +} + +static int __init +xfs_cpu_hotplug_init(void) +{ + int error; + + error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL, + xfs_cpu_dead); + if (error < 0) + xfs_alert(NULL, +"Failed to initialise CPU hotplug, error %d. XFS is non-functional.", + error); + return error; +} + +static void +xfs_cpu_hotplug_destroy(void) +{ + cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD); +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static inline int xfs_cpu_hotplug_init(void) { return 0; } +static inline void xfs_cpu_hotplug_destroy(void) {} +#endif + STATIC int __init init_xfs_fs(void) { @@ -2132,10 +2221,14 @@ init_xfs_fs(void) xfs_dir_startup(); - error = xfs_init_zones(); + error = xfs_cpu_hotplug_init(); if (error) goto out; + error = xfs_init_zones(); + if (error) + goto out_destroy_hp; + error = xfs_init_workqueues(); if (error) goto out_destroy_zones; @@ -2215,6 +2308,8 @@ init_xfs_fs(void) xfs_destroy_workqueues(); out_destroy_zones: xfs_destroy_zones(); + out_destroy_hp: + xfs_cpu_hotplug_destroy(); out: return error; } @@ -2237,6 +2332,7 @@ exit_xfs_fs(void) xfs_destroy_workqueues(); xfs_destroy_zones(); xfs_uuid_table_free(); + xfs_cpu_hotplug_destroy(); } module_init(init_xfs_fs); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 1525636f4065..fc2c6a404647 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -63,7 +63,7 @@ xfs_readlink_bmap_ilocked( byte_cnt = pathlen; cur_chunk = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp)) { error = -EFSCORRUPTED; @@ -107,7 +107,7 @@ xfs_readlink( ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -168,7 +168,7 @@ xfs_symlink( trace_xfs_symlink(dp, link_name); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* @@ -321,9 +321,8 @@ xfs_symlink( * symlink transaction goes to disk before returning to * the user. 
*/ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); - } error = xfs_trans_commit(tp); if (error) @@ -445,7 +444,7 @@ xfs_inactive_symlink_rmt( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); goto error_unlock; } @@ -478,7 +477,7 @@ xfs_inactive_symlink( trace_xfs_inactive_symlink(ip); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index f1bc88f4367c..18dc5eca6c04 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -10,6 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sysfs.h" +#include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 7e01e00550ac..d269ef57ff01 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -20,6 +20,7 @@ #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" #include "xfs_quota.h" @@ -32,6 +33,7 @@ #include "xfs_icache.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_error.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 19260291ff8b..1033a95fbf8e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2,6 +2,41 @@ /* * Copyright (c) 2009, Christoph Hellwig * All Rights Reserved. + * + * NOTE: none of these tracepoints shall be considered a stable kernel ABI + * as they can change at any time. + * + * Current conventions for printing numbers measuring specific units: + * + * agno: allocation group number + * + * agino: per-AG inode number + * ino: filesystem inode number + * + * agbno: per-AG block number in fs blocks + * startblock: physical block number for file mappings. This is either a + * segmented fsblock for data device mappings, or a rfsblock + * for realtime device mappings + * fsbcount: number of blocks in an extent, in fs blocks + * + * daddr: physical block number in 512b blocks + * bbcount: number of blocks in a physical extent, in 512b blocks + * + * owner: reverse-mapping owner, usually inodes + * + * fileoff: file offset, in fs blocks + * pos: file offset, in bytes + * bytecount: number of bytes + * + * disize: ondisk file size, in bytes + * isize: incore file size, in bytes + * + * forkoff: inode fork offset, in bytes + * + * ireccount: number of inode records + * + * Numbers describing space allocations (blocks, extents, inodes) should be + * formatted in hexadecimal. 
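Those conventions are easiest to read off a concrete event. Below is a hypothetical tracepoint that follows them, assuming the usual trace-header boilerplate around it; nothing in it is part of the patch itself:

#include <linux/tracepoint.h>
#include <linux/types.h>
#include <linux/kdev_t.h>

/*
 * agno, agbno and fsbcount measure space allocations, so they are
 * printed in hexadecimal per the convention above; device numbers
 * stay decimal.
 */
TRACE_EVENT(example_extent_free,
	TP_PROTO(dev_t dev, u32 agno, u32 agbno, u32 len),
	TP_ARGS(dev, agno, agbno, len),
	TP_STRUCT__entry(
		__field(dev_t, dev)
		__field(u32, agno)
		__field(u32, agbno)
		__field(u32, len)
	),
	TP_fast_assign(
		__entry->dev = dev;
		__entry->agno = agno;
		__entry->agbno = agbno;
		__entry->len = len;
	),
	TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
		  MAJOR(__entry->dev), MINOR(__entry->dev),
		  __entry->agno, __entry->agbno, __entry->len)
);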
*/ #undef TRACE_SYSTEM #define TRACE_SYSTEM xfs @@ -139,7 +174,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __entry->refcount = refcount; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u refcount %d caller %pS", + TP_printk("dev %d:%d agno 0x%x refcount %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, @@ -157,6 +192,84 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); +TRACE_EVENT(xfs_inodegc_worker, + TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), + TP_ARGS(mp, shrinker_hits), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, shrinker_hits) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->shrinker_hits = shrinker_hits; + ), + TP_printk("dev %d:%d shrinker_hits %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->shrinker_hits) +); + +DECLARE_EVENT_CLASS(xfs_fs_class, + TP_PROTO(struct xfs_mount *mp, void *caller_ip), + TP_ARGS(mp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, mflags) + __field(unsigned long, opstate) + __field(unsigned long, sbflags) + __field(void *, caller_ip) + ), + TP_fast_assign( + if (mp) { + __entry->dev = mp->m_super->s_dev; + __entry->mflags = mp->m_features; + __entry->opstate = mp->m_opstate; + __entry->sbflags = mp->m_super->s_flags; + } + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d m_features 0x%llx opstate (%s) s_flags 0x%lx caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->mflags, + __print_flags(__entry->opstate, "|", XFS_OPSTATE_STRINGS), + __entry->sbflags, + __entry->caller_ip) +); + +#define DEFINE_FS_EVENT(name) \ +DEFINE_EVENT(xfs_fs_class, name, \ + TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ + TP_ARGS(mp, caller_ip)) +DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_start); +DEFINE_FS_EVENT(xfs_inodegc_stop); +DEFINE_FS_EVENT(xfs_inodegc_queue); +DEFINE_FS_EVENT(xfs_inodegc_throttle); +DEFINE_FS_EVENT(xfs_fs_sync_fs); +DEFINE_FS_EVENT(xfs_blockgc_start); +DEFINE_FS_EVENT(xfs_blockgc_stop); +DEFINE_FS_EVENT(xfs_blockgc_worker); +DEFINE_FS_EVENT(xfs_blockgc_flush_all); + +TRACE_EVENT(xfs_inodegc_shrinker_scan, + TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc, + void *caller_ip), + TP_ARGS(mp, sc, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, nr_to_scan) + __field(void *, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->nr_to_scan = sc->nr_to_scan; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d nr_to_scan %lu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan, + __entry->caller_ip) +); + DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), TP_ARGS(mp, agno), @@ -168,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_ag_class, __entry->dev = mp->m_super->s_dev; __entry->agno = agno; ), - TP_printk("dev %d:%d agno %u", + TP_printk("dev %d:%d agno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno) ); @@ -268,7 +381,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s cur %p/%d " - "offset %lld block %lld count %lld flag %d caller %pS", + "fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx flag %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -308,10 +421,7 @@ 
DECLARE_EVENT_CLASS(xfs_buf_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - if (bp->b_bn == XFS_BUF_DADDR_NULL) - __entry->bno = bp->b_maps[0].bm_bn; - else - __entry->bno = bp->b_bn; + __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); @@ -319,7 +429,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " + TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, @@ -370,7 +480,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) - __field(size_t, buffer_length) + __field(unsigned int, length) __field(int, hold) __field(int, pincount) __field(unsigned, lockval) @@ -379,19 +489,19 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; - __entry->buffer_length = BBTOB(bp->b_length); + __entry->bno = xfs_buf_daddr(bp); + __entry->length = bp->b_length; __entry->flags = flags; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, - __entry->buffer_length, + __entry->length, __entry->hold, __entry->pincount, __entry->lockval, @@ -413,7 +523,7 @@ TRACE_EVENT(xfs_buf_ioerror, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) - __field(size_t, buffer_length) + __field(unsigned int, length) __field(unsigned, flags) __field(int, hold) __field(int, pincount) @@ -423,8 +533,8 @@ TRACE_EVENT(xfs_buf_ioerror, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; - __entry->buffer_length = BBTOB(bp->b_length); + __entry->bno = xfs_buf_daddr(bp); + __entry->length = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; @@ -432,11 +542,11 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d error %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, - __entry->buffer_length, + __entry->length, __entry->hold, __entry->pincount, __entry->lockval, @@ -451,7 +561,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, buf_bno) - __field(size_t, buf_len) + __field(unsigned int, buf_len) __field(int, buf_hold) __field(int, buf_pincount) __field(int, buf_lockval) @@ -466,15 +576,15 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_flags = bip->bli_flags; __entry->bli_recur = bip->bli_recur; __entry->bli_refcount = atomic_read(&bip->bli_refcount); - __entry->buf_bno = bip->bli_buf->b_bn; - __entry->buf_len = BBTOB(bip->bli_buf->b_length); + __entry->buf_bno = xfs_buf_daddr(bip->bli_buf); + __entry->buf_len = bip->bli_buf->b_length; __entry->buf_flags = 
bip->bli_buf->b_flags; __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); __entry->buf_lockval = bip->bli_buf->b_sema.count; __entry->li_flags = bip->bli_item.li_flags; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d flags %s recur %d refcount %d bliflags %s " "liflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -534,7 +644,7 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, __entry->agno = agno; __entry->streams = xfs_filestream_peek_ag(mp, agno); ), - TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", + TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->agno, @@ -568,7 +678,7 @@ TRACE_EVENT(xfs_filestream_pick, __entry->free = free; __entry->nscan = nscan; ), - TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d", + TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d nscan %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->agno, @@ -616,14 +726,17 @@ DECLARE_EVENT_CLASS(xfs_inode_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) + __field(unsigned long, iflags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; + __entry->iflags = ip->i_flags; ), - TP_printk("dev %d:%d ino 0x%llx", + TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino) + __entry->ino, + __entry->iflags) ) #define DEFINE_INODE_EVENT(name) \ @@ -667,6 +780,10 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_reclaimable); +DEFINE_INODE_EVENT(xfs_inode_reclaiming); +DEFINE_INODE_EVENT(xfs_inode_set_need_inactive); +DEFINE_INODE_EVENT(xfs_inode_inactivating); /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -773,9 +890,12 @@ TRACE_EVENT(xfs_irec_merge_pre, __entry->nagino = nagino; __entry->nholemask = holemask; ), - TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->agino, __entry->holemask, __entry->nagino, + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->holemask, + __entry->nagino, __entry->nholemask) ) @@ -795,8 +915,11 @@ TRACE_EVENT(xfs_irec_merge_post, __entry->agino = agino; __entry->holemask = holemask; ), - TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev), - MINOR(__entry->dev), __entry->agno, __entry->agino, + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->agno, + __entry->agino, __entry->holemask) ) @@ -1301,7 +1424,7 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->offset = iocb->ki_pos; __entry->count = iov_iter_count(iter); ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1348,14 +1471,14 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? 
irec->br_blockcount : 0; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx", + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx " + "fork %s startoff 0x%llx startblock 0x%llx fsbcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count, - __entry->whichfork == XFS_COW_FORK ? "cow" : "data", + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount) @@ -1391,7 +1514,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __entry->count = count; ), TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " - "offset 0x%llx count %zd", + "pos 0x%llx bytecount 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, @@ -1427,7 +1550,7 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, __entry->size = ip->i_disk_size; __entry->new_size = new_size; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx new_size 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1458,7 +1581,7 @@ TRACE_EVENT(xfs_pagecache_inval, __entry->start = start; __entry->finish = finish; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx", + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1467,14 +1590,14 @@ TRACE_EVENT(xfs_pagecache_inval, ); TRACE_EVENT(xfs_bunmap, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len, int flags, unsigned long caller_ip), - TP_ARGS(ip, bno, len, flags, caller_ip), + TP_ARGS(ip, fileoff, len, flags, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) - __field(xfs_fileoff_t, bno) + __field(xfs_fileoff_t, fileoff) __field(xfs_filblks_t, len) __field(unsigned long, caller_ip) __field(int, flags) @@ -1483,17 +1606,17 @@ TRACE_EVENT(xfs_bunmap, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_disk_size; - __entry->bno = bno; + __entry->fileoff = fileoff; __entry->len = len; __entry->caller_ip = caller_ip; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx" "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->bno, + __entry->fileoff, __entry->len, __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS), (void *)__entry->caller_ip) @@ -1516,7 +1639,7 @@ DECLARE_EVENT_CLASS(xfs_extent_busy_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno %u agbno %u len %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1554,7 +1677,7 @@ TRACE_EVENT(xfs_extent_busy_trim, __entry->tbno = tbno; __entry->tlen = tlen; ), - TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1601,7 +1724,7 @@ DECLARE_EVENT_CLASS(xfs_agf_class, __entry->longest = be32_to_cpu(agf->agf_longest); __entry->caller_ip = caller_ip; ), 
- TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " + TP_printk("dev %d:%d agno 0x%x flags %s length %u roots b %u c %u " "levels b %u c %u flfirst %u fllast %u flcount %u " "freeblks %u longest %u caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1650,7 +1773,7 @@ TRACE_EVENT(xfs_free_extent, __entry->haveleft = haveleft; __entry->haveright = haveright; ), - TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x resv %d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1707,7 +1830,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->datatype = args->datatype; __entry->firstblock = args->tp->t_firstblock; ), - TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + TP_printk("dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u " "prod %u minleft %u total %u alignment %u minalignslop %u " "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " "datatype 0x%x firstblock 0x%llx", @@ -1785,7 +1908,7 @@ TRACE_EVENT(xfs_alloc_cur_check, __entry->diff = diff; __entry->new = new; ), - TP_printk("dev %d:%d btree %s bno 0x%x len 0x%x diff 0x%x new %d", + TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->bno, __entry->len, __entry->diff, __entry->new) @@ -2060,7 +2183,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->fork_off = XFS_IFORK_BOFF(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " - "broot size %d, fork offset %d", + "broot size %d, forkoff 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), @@ -2186,7 +2309,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, __entry->size = buf_f->blf_size; __entry->map_size = buf_f->blf_map_size; ), - TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + TP_printk("dev %d:%d daddr 0x%llx, bbcount 0x%x, flags 0x%x, size %d, " "map_size %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blkno, @@ -2237,7 +2360,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, __entry->boffset = in_f->ilf_boffset; ), TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " - "dsize %d, blkno 0x%llx, len %d, boffset %d", + "dsize %d, daddr 0x%llx, bbcount 0x%x, boffset %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -2278,10 +2401,14 @@ DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class, __entry->length = be32_to_cpu(in_f->icl_length); __entry->gen = be32_to_cpu(in_f->icl_gen); ), - TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u " - "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, __entry->agbno, __entry->count, __entry->isize, - __entry->length, __entry->gen) + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x ireccount %u isize %u gen 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->length, + __entry->count, + __entry->isize, + __entry->gen) ) #define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \ @@ -2307,7 +2434,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno %u agbno %u len %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2350,7 +2477,7 
@@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_ptrs[level]; - __entry->daddr = bp ? bp->b_bn : -1; + __entry->daddr = bp ? xfs_buf_daddr(bp) : -1; ), TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -2466,7 +2593,7 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d op %d agno %u agbno %u len %u", + TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, @@ -2513,13 +2640,13 @@ DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class, __entry->l_state = state; __entry->op = op; ), - TP_printk("dev %d:%d op %d agno %u agbno %u owner %lld %s offset %llu len %llu state %d", + TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->op, __entry->agno, __entry->agbno, __entry->ino, - __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data", + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, __entry->l_len, __entry->l_state) @@ -2583,7 +2710,7 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, if (unwritten) __entry->flags |= XFS_RMAP_UNWRITTEN; ), - TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%lx", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2616,7 +2743,7 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class, __entry->error = error; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u error %d caller %pS", + TP_printk("dev %d:%d agno 0x%x error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->error, @@ -2663,7 +2790,7 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class, __entry->offset = offset; __entry->flags = flags; ), - TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2732,7 +2859,7 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class, __entry->asked = r ? 
r->ar_asked : 0; __entry->len = len; ), - TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u " + TP_printk("dev %d:%d agno 0x%x resv %d freeblks %u flcount %u " "resv %u ask %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, @@ -2785,7 +2912,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, __entry->agbno = agbno; __entry->dir = dir; ), - TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2818,7 +2945,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2853,7 +2980,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, __entry->refcount = irec->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2893,8 +3020,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " + "agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2939,8 +3066,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->i2_refcount = i2->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u @ agbno %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " + "agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2991,9 +3118,9 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " + "agbno 0x%x fsbcount 0x%x refcount %u -- " + "agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -3080,7 +3207,7 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover, __entry->new_agbno = new_agbno; __entry->new_len = new_len; ), - TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u", + TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, @@ -3106,7 +3233,7 @@ DECLARE_EVENT_CLASS(xfs_inode_error_class, __entry->error = error; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino %llx error %d caller %pS", + TP_printk("dev %d:%d ino 0x%llx error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->error, @@ -3132,7 +3259,7 @@ DECLARE_EVENT_CLASS(xfs_double_io_class, __field(loff_t, src_isize) __field(loff_t, 
src_disize) __field(loff_t, src_offset) - __field(size_t, len) + __field(long long, len) __field(xfs_ino_t, dest_ino) __field(loff_t, dest_isize) __field(loff_t, dest_disize) @@ -3150,9 +3277,9 @@ DECLARE_EVENT_CLASS(xfs_double_io_class, __entry->dest_disize = dest->i_disk_size; __entry->dest_offset = doffset; ), - TP_printk("dev %d:%d count %zd " - "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> " - "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx", + TP_printk("dev %d:%d bytecount 0x%llx " + "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> " + "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->len, __entry->src_ino, @@ -3191,7 +3318,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __entry->pblk = irec->br_startblock; __entry->state = irec->br_state; ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d", + TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx fsbcount 0x%x startblock 0x%llx st %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, @@ -3231,9 +3358,8 @@ TRACE_EVENT(xfs_reflink_remap_blocks, __entry->dest_ino = dest->i_ino; __entry->dest_lblk = doffset; ), - TP_printk("dev %d:%d len 0x%llx " - "ino 0x%llx offset 0x%llx blocks -> " - "ino 0x%llx offset 0x%llx blocks", + TP_printk("dev %d:%d fsbcount 0x%llx " + "ino 0x%llx fileoff 0x%llx -> ino 0x%llx fileoff 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->len, __entry->src_ino, @@ -3272,9 +3398,7 @@ TRACE_EVENT(xfs_ioctl_clone, __entry->dest_ino = dest->i_ino; __entry->dest_isize = i_size_read(dest); ), - TP_printk("dev %d:%d " - "ino 0x%lx isize 0x%llx -> " - "ino 0x%lx isize 0x%llx", + TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->src_ino, __entry->src_isize, @@ -3310,7 +3434,7 @@ DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); /* fsmap traces */ DECLARE_EVENT_CLASS(xfs_fsmap_class, TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, - struct xfs_rmap_irec *rmap), + const struct xfs_rmap_irec *rmap), TP_ARGS(mp, keydev, agno, rmap), TP_STRUCT__entry( __field(dev_t, dev) @@ -3332,7 +3456,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, __entry->offset = rmap->rm_offset; __entry->flags = rmap->rm_flags; ), - TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x", + TP_printk("dev %d:%d keydev %d:%d agno 0x%x startblock 0x%llx fsbcount 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, @@ -3345,7 +3469,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, #define DEFINE_FSMAP_EVENT(name) \ DEFINE_EVENT(xfs_fsmap_class, name, \ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \ - struct xfs_rmap_irec *rmap), \ + const struct xfs_rmap_irec *rmap), \ TP_ARGS(mp, keydev, agno, rmap)) DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); @@ -3372,7 +3496,7 @@ DECLARE_EVENT_CLASS(xfs_getfsmap_class, __entry->offset = fsmap->fmr_offset; __entry->flags = fsmap->fmr_flags; ), - TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx", + TP_printk("dev %d:%d keydev %d:%d daddr 0x%llx bbcount 0x%llx owner 0x%llx fileoff_daddr 0x%llx flags 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->block, @@ -3471,7 +3595,7 @@ TRACE_EVENT(xfs_iunlink_update_bucket, __entry->old_ptr = old_ptr; 
__entry->new_ptr = new_ptr; ), - TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x", + TP_printk("dev %d:%d agno 0x%x bucket %u old 0x%x new 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->bucket, @@ -3497,7 +3621,7 @@ TRACE_EVENT(xfs_iunlink_update_dinode, __entry->old_ptr = old_ptr; __entry->new_ptr = new_ptr; ), - TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x", + TP_printk("dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agino, @@ -3518,7 +3642,7 @@ DECLARE_EVENT_CLASS(xfs_ag_inode_class, __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); ), - TP_printk("dev %d:%d agno %u agino %u", + TP_printk("dev %d:%d agno 0x%x agino 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agino) ) @@ -3570,7 +3694,7 @@ DECLARE_EVENT_CLASS(xfs_ag_corrupt_class, __entry->agno = agno; __entry->flags = flags; ), - TP_printk("dev %d:%d agno %u flags 0x%x", + TP_printk("dev %d:%d agno 0x%x flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->flags) ); @@ -3621,7 +3745,7 @@ TRACE_EVENT(xfs_iwalk_ag, __entry->agno = agno; __entry->startino = startino; ), - TP_printk("dev %d:%d agno %d startino %u", + TP_printk("dev %d:%d agno 0x%x startino 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startino) ) @@ -3642,7 +3766,7 @@ TRACE_EVENT(xfs_iwalk_ag_rec, __entry->startino = irec->ir_startino; __entry->freemask = irec->ir_free; ), - TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx", + TP_printk("dev %d:%d agno 0x%x startino 0x%x freemask 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startino, __entry->freemask) ) @@ -3689,8 +3813,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ TP_ARGS(size, flags, caller_ip)) DEFINE_KMEM_EVENT(kmem_alloc); -DEFINE_KMEM_EVENT(kmem_alloc_io); -DEFINE_KMEM_EVENT(kmem_alloc_large); TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), @@ -3707,7 +3829,7 @@ TRACE_EVENT(xfs_check_new_dalign, __entry->sb_rootino = mp->m_sb.sb_rootino; __entry->calc_rootino = calc_rootino; ), - TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu", + TP_printk("dev %d:%d new_dalign %d sb_rootino 0x%llx calc_rootino 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->new_dalign, __entry->sb_rootino, __entry->calc_rootino) @@ -3732,7 +3854,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, __entry->levels = cur->bc_ag.afake->af_levels; __entry->blocks = cur->bc_ag.afake->af_blocks; ), - TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u", + TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->agno, @@ -3764,12 +3886,12 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, __entry->blocks = cur->bc_ino.ifake->if_blocks; __entry->whichfork = cur->bc_ino.whichfork; ), - TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u", + TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->agno, __entry->agino, - __entry->whichfork == XFS_ATTR_FORK ? 
"attr" : "data", + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->levels, __entry->blocks) ) @@ -3847,7 +3969,7 @@ TRACE_EVENT(xfs_btree_bload_block, } __entry->nr_records = nr_records; ), - TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u", + TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, @@ -3934,7 +4056,6 @@ TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING); TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC); TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK); TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY); -TRACE_DEFINE_ENUM(XLOG_STATE_IOERROR); DECLARE_EVENT_CLASS(xlog_iclog_class, TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), @@ -3990,6 +4111,57 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); DEFINE_ICLOG_EVENT(xlog_iclog_write); +DECLARE_EVENT_CLASS(xfs_das_state_class, + TP_PROTO(int das, struct xfs_inode *ip), + TP_ARGS(das, ip), + TP_STRUCT__entry( + __field(int, das) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->das = das; + __entry->ino = ip->i_ino; + ), + TP_printk("state change %d ino 0x%llx", + __entry->das, __entry->ino) +) + +#define DEFINE_DAS_STATE_EVENT(name) \ +DEFINE_EVENT(xfs_das_state_class, name, \ + TP_PROTO(int das, struct xfs_inode *ip), \ + TP_ARGS(das, ip)) +DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); + +TRACE_EVENT(xfs_force_shutdown, + TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname, + int line_num), + TP_ARGS(mp, ptag, flags, fname, line_num), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ptag) + __field(int, flags) + __string(fname, fname) + __field(int, line_num) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ptag = ptag; + __entry->flags = flags; + __assign_str(fname, fname); + __entry->line_num = line_num; + ), + TP_printk("dev %d:%d tag %s flags %s file %s line_num %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->ptag, "|", XFS_PTAG_STRINGS), + __print_flags(__entry->flags, "|", XFS_SHUTDOWN_STRINGS), + __get_str(fname), + __entry->line_num) +); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 87bffd12c20c..67dec11e34c7 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -9,7 +9,6 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" -#include "xfs_log_priv.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_extent_busy.h" @@ -17,6 +16,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_defer.h" @@ -275,7 +275,7 @@ retry: WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || - xfs_sb_version_haslazysbcount(&mp->m_sb)); + xfs_has_lazysbcount(mp)); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; @@ -295,10 +295,7 @@ retry: * Do not perform a synchronous scan because callers can hold * other locks. 
*/ - error = xfs_blockgc_free_space(mp, NULL); - if (error) - return error; - + xfs_blockgc_flush_all(mp); want_retry = false; goto retry; } @@ -367,12 +364,12 @@ xfs_trans_mod_sb( switch (field) { case XFS_TRANS_SB_ICOUNT: tp->t_icount_delta += delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_IFREE: tp->t_ifree_delta += delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FDBLOCKS: @@ -401,7 +398,7 @@ xfs_trans_mod_sb( delta -= blkres_delta; } tp->t_fdblocks_delta += delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_RES_FDBLOCKS: @@ -411,7 +408,7 @@ xfs_trans_mod_sb( * be applied to the on-disk superblock. */ tp->t_res_fdblocks_delta += delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FREXTENTS: @@ -490,7 +487,7 @@ xfs_trans_apply_sb_deltas( /* * Only update the superblock counters if we are logging them */ - if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) { + if (!xfs_has_lazysbcount((tp->t_mountp))) { if (tp->t_icount_delta) be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); if (tp->t_ifree_delta) @@ -588,7 +585,7 @@ xfs_trans_unreserve_and_mod_sb( if (tp->t_blk_res > 0) blkdelta = tp->t_blk_res; if ((tp->t_fdblocks_delta != 0) && - (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY))) blkdelta += tp->t_fdblocks_delta; @@ -598,7 +595,7 @@ xfs_trans_unreserve_and_mod_sb( (tp->t_flags & XFS_TRANS_SB_DIRTY)) rtxdelta += tp->t_frextents_delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb) || + if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) { idelta = tp->t_icount_delta; ifreedelta = tp->t_ifree_delta; @@ -778,7 +775,7 @@ xfs_trans_committed_bulk( * object into the AIL as we are in a shutdown situation. */ if (aborted) { - ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount)); + ASSERT(xfs_is_shutdown(ailp->ail_mount)); if (lip->li_ops->iop_unpin) lip->li_ops->iop_unpin(lip, 1); continue; @@ -867,7 +864,7 @@ __xfs_trans_commit( if (!(tp->t_flags & XFS_TRANS_DIRTY)) goto out_unreserve; - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { error = -EIO; goto out_unreserve; } @@ -908,7 +905,7 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log)) + if (regrant && !xlog_is_shutdown(mp->m_log)) xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); else xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); @@ -953,12 +950,12 @@ xfs_trans_cancel( * filesystem. This happens in paths where we detect * corruption and decide to give up. 
*/ - if (dirty && !XFS_FORCED_SHUTDOWN(mp)) { + if (dirty && !xfs_is_shutdown(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { + if (!dirty && !xfs_is_shutdown(mp)) { struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index dbb69b4bf3ed..2a8c8dc54c95 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -17,6 +17,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #ifdef DEBUG /* @@ -429,8 +430,12 @@ xfsaild_push( /* * If we encountered pinned items or did not finish writing out all - * buffers the last time we ran, force the log first and wait for it - * before pushing again. + * buffers the last time we ran, force a background CIL push to get the + * items unpinned in the near future. We do not wait on the CIL push as + * that could stall us for seconds if there is enough background IO + * load. Stalling for that long when the tail of the log is pinned and + * needs flushing will hard stop the transaction subsystem when log + * space runs out. */ if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 && (!list_empty_careful(&ailp->ail_buf_list) || @@ -438,7 +443,7 @@ xfsaild_push( ailp->ail_log_flush = 0; XFS_STATS_INC(mp, xs_push_ail_flush); - xfs_log_force(mp, XFS_LOG_SYNC); + xlog_cil_flush(mp->m_log); } spin_lock(&ailp->ail_lock); @@ -615,7 +620,7 @@ xfsaild( * opportunity to release such buffers from the queue. */ ASSERT(list_empty(&ailp->ail_buf_list) || - XFS_FORCED_SHUTDOWN(ailp->ail_mount)); + xfs_is_shutdown(ailp->ail_mount)); xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } @@ -678,7 +683,7 @@ xfs_ail_push( struct xfs_log_item *lip; lip = xfs_ail_min(ailp); - if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) || + if (!lip || xfs_is_shutdown(ailp->ail_mount) || XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0) return; @@ -743,7 +748,7 @@ xfs_ail_update_finish( return; } - if (!XFS_FORCED_SHUTDOWN(mp)) + if (!xfs_is_shutdown(mp)) xlog_assign_tail_lsn_locked(mp); if (list_empty(&ailp->ail_head)) @@ -863,7 +868,7 @@ xfs_trans_ail_delete( spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (shutdown_type && !XFS_FORCED_SHUTDOWN(mp)) { + if (shutdown_type && !xfs_is_shutdown(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index d11d032da0b4..6549e50d852c 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -38,7 +38,7 @@ xfs_trans_buf_item_match( blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && - XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && + xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn && blip->bli_buf->b_length == len) { ASSERT(blip->bli_buf->b_map_count == nmaps); return blip->bli_buf; @@ -138,7 +138,7 @@ xfs_trans_get_buf_map( bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); - if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + if (xfs_is_shutdown(tp->t_mountp)) { xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; } @@ -244,7 +244,7 @@ xfs_trans_read_buf_map( * We never locked this buf ourselves, so we shouldn't * brelse it either. Just get out. 
*/ - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { trace_xfs_trans_read_buf_shut(bp, _RET_IP_); return -EIO; } @@ -300,7 +300,7 @@ xfs_trans_read_buf_map( return error; } - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { xfs_buf_relse(bp); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); return -EIO; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 48e09ea30ee5..3872ce671411 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -58,7 +58,7 @@ xfs_trans_log_dquot( /* Upgrade the dquot to bigtime format if possible. */ if (dqp->q_id != 0 && - xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) && + xfs_has_bigtime(tp->t_mountp) && !(dqp->q_type & XFS_DQTYPE_BIGTIME)) dqp->q_type |= XFS_DQTYPE_BIGTIME; @@ -132,8 +132,7 @@ xfs_trans_mod_dquot_byino( { xfs_mount_t *mp = tp->t_mountp; - if (!XFS_IS_QUOTA_RUNNING(mp) || - !XFS_IS_QUOTA_ON(mp) || + if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return; @@ -192,7 +191,7 @@ xfs_trans_mod_dquot( struct xfs_dqtrx *qtrx; ASSERT(tp); - ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); + ASSERT(XFS_IS_QUOTA_ON(tp->t_mountp)); qtrx = NULL; if (!delta) @@ -738,7 +737,7 @@ xfs_trans_reserve_quota_bydquots( { int error; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; ASSERT(flags & XFS_QMOPT_RESBLK_MASK); @@ -795,7 +794,7 @@ xfs_trans_reserve_quota_nblks( unsigned int qflags = 0; int error; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); @@ -836,51 +835,13 @@ xfs_trans_reserve_quota_icreate( { struct xfs_mount *mp = tp->t_mountp; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp, dblocks, 1, XFS_QMOPT_RES_REGBLKS); } -/* - * This routine is called to allocate a quotaoff log item. - */ -struct xfs_qoff_logitem * -xfs_trans_get_qoff_item( - struct xfs_trans *tp, - struct xfs_qoff_logitem *startqoff, - uint flags) -{ - struct xfs_qoff_logitem *q; - - ASSERT(tp != NULL); - - q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags); - ASSERT(q != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &q->qql_item); - return q; -} - - -/* - * This is called to mark the quotaoff logitem as needing - * to be logged when the transaction is committed. The logitem must - * already be associated with the given transaction. 
- */ -void -xfs_trans_log_quotaoff_item( - struct xfs_trans *tp, - struct xfs_qoff_logitem *qlp) -{ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags); -} - STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp) diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 70055d486bf7..ddc346a9df9b 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -462,7 +462,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) inode_dio_wait(inode); /* Serialize against page faults */ - down_write(&zi->i_mmap_sem); + filemap_invalidate_lock(inode->i_mapping); /* Serialize against zonefs_iomap_begin() */ mutex_lock(&zi->i_truncate_mutex); @@ -500,7 +500,7 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) unlock: mutex_unlock(&zi->i_truncate_mutex); - up_write(&zi->i_mmap_sem); + filemap_invalidate_unlock(inode->i_mapping); return ret; } @@ -575,18 +575,6 @@ static int zonefs_file_fsync(struct file *file, loff_t start, loff_t end, return ret; } -static vm_fault_t zonefs_filemap_fault(struct vm_fault *vmf) -{ - struct zonefs_inode_info *zi = ZONEFS_I(file_inode(vmf->vma->vm_file)); - vm_fault_t ret; - - down_read(&zi->i_mmap_sem); - ret = filemap_fault(vmf); - up_read(&zi->i_mmap_sem); - - return ret; -} - static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) { struct inode *inode = file_inode(vmf->vma->vm_file); @@ -607,16 +595,16 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) file_update_time(vmf->vma->vm_file); /* Serialize against truncates */ - down_read(&zi->i_mmap_sem); + filemap_invalidate_lock_shared(inode->i_mapping); ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops); - up_read(&zi->i_mmap_sem); + filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); return ret; } static const struct vm_operations_struct zonefs_file_vm_ops = { - .fault = zonefs_filemap_fault, + .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = zonefs_filemap_page_mkwrite, }; @@ -1155,7 +1143,6 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); - init_rwsem(&zi->i_mmap_sem); zi->i_wr_refcnt = 0; return &zi->i_vnode; diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 51141907097c..7b147907c328 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -70,12 +70,11 @@ struct zonefs_inode_info { * and changes to the inode private data, and in particular changes to * a sequential file size on completion of direct IO writes. * Serialization of mmap read IOs with truncate and syscall IO - * operations is done with i_mmap_sem in addition to i_truncate_mutex. - * Only zonefs_seq_file_truncate() takes both lock (i_mmap_sem first, - * i_truncate_mutex second). + * operations is done with invalidate_lock in addition to + * i_truncate_mutex. Only zonefs_seq_file_truncate() takes both lock + * (invalidate_lock first, i_truncate_mutex second). */ struct mutex i_truncate_mutex; - struct rw_semaphore i_mmap_sem; /* guarded by i_truncate_mutex */ unsigned int i_wr_refcnt;
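The zonefs conversion that closes this series is mechanical: the filesystem-private i_mmap_sem is replaced by the address_space invalidate_lock, and because the core fault path already takes that lock shared where it populates the page cache, the custom zonefs_filemap_fault() wrapper can be deleted in favor of plain filemap_fault(). A generic sketch of the resulting truncate-side pattern; the function below is illustrative, not zonefs code:

#include <linux/fs.h>
#include <linux/mm.h>

static int example_truncate(struct inode *inode, loff_t newsize)
{
	/* exclusive: keeps faults from instantiating pages ... */
	filemap_invalidate_lock(inode->i_mapping);
	truncate_setsize(inode, newsize);	/* ... while we drop them */
	filemap_invalidate_unlock(inode->i_mapping);
	return 0;
}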