diff options
Diffstat (limited to 'fs')
490 files changed, 45732 insertions, 11520 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index bb1b286c49ae..c381499f5416 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -97,10 +97,13 @@ static struct posix_acl *v9fs_get_cached_acl(struct inode *inode, int type) return acl; } -struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type) +struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu) { struct v9fs_session_info *v9ses; + if (rcu) + return ERR_PTR(-ECHILD); + v9ses = v9fs_inode2v9ses(inode); if (((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) || ((v9ses->flags & V9FS_ACL_MASK) != V9FS_POSIX_ACL)) { diff --git a/fs/9p/acl.h b/fs/9p/acl.h index e4f7e882272b..d43c8949e807 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -16,7 +16,7 @@ #ifdef CONFIG_9P_FS_POSIX_ACL extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type); +extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu); extern int v9fs_acl_chmod(struct inode *, struct p9_fid *); extern int v9fs_set_create_acl(struct inode *, struct p9_fid *, struct posix_acl *, struct posix_acl *); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index c4a2dc41beac..aab5e6538660 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -612,12 +612,7 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); inode = file_inode(vma->vm_file); - - if (!mapping_can_writeback(inode->i_mapping)) - wbc.nr_to_write = 0; - - might_sleep(); - sync_inode(inode, &wbc); + filemap_fdatawrite_wbc(inode->i_mapping, &wbc); } diff --git a/fs/Kconfig b/fs/Kconfig index e7940882cbe8..a6313a969bc5 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -43,7 +43,7 @@ source "fs/f2fs/Kconfig" source "fs/zonefs/Kconfig" config FS_DAX - bool "Direct Access (DAX) support" + bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) select DEV_PAGEMAP_OPS if (ZONE_DEVICE && 
!FS_DAX_LIMITED) @@ -53,8 +53,23 @@ config FS_DAX Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, then you can avoid using the pagecache to buffer I/Os. Turning - on this option will compile in support for DAX; you will need to - mount the filesystem using the -o dax option. + on this option will compile in support for DAX. + + For a DAX device to support file system access it needs to have + struct pages. For the nfit based NVDIMMs this can be enabled + using the ndctl utility: + + # ndctl create-namespace --force --reconfig=namespace0.0 \ + --mode=fsdax --map=mem + + See the 'create-namespace' man page for details on the overhead of + --map=mem: + https://docs.pmem.io/ndctl-user-guide/ndctl-man-pages/ndctl-create-namespace + + For ndctl to work CONFIG_DEV_DAX needs to be enabled as well. For most + file systems DAX support needs to be manually enabled globally or + per-inode using a mount option as well. See the file documentation in + Documentation/filesystems/dax.rst for details. If you do not have a block device that is capable of using this, or if unsure, say N. Saying Y will increase the size of the kernel @@ -136,6 +151,7 @@ menu "DOS/FAT/EXFAT/NT Filesystems" source "fs/fat/Kconfig" source "fs/exfat/Kconfig" source "fs/ntfs/Kconfig" +source "fs/ntfs3/Kconfig" endmenu endif # BLOCK @@ -218,8 +234,7 @@ config ARCH_SUPPORTS_HUGETLBFS config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ - ARCH_SUPPORTS_HUGETLBFS || BROKEN + depends on X86 || IA64 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. 
For architectures that support it, say Y here and read diff --git a/fs/Makefile b/fs/Makefile index 1f18802f43a4..84c5e4cdfee5 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -17,7 +17,7 @@ obj-y := open.o read_write.o file_table.o super.o \ kernel_read_file.o remap_range.o ifeq ($(CONFIG_BLOCK),y) -obj-y += buffer.o block_dev.o direct-io.o mpage.o +obj-y += buffer.o direct-io.o mpage.o else obj-y += no-block.o endif @@ -101,6 +101,7 @@ obj-$(CONFIG_CIFS) += cifs/ obj-$(CONFIG_SMB_SERVER) += ksmbd/ obj-$(CONFIG_HPFS_FS) += hpfs/ obj-$(CONFIG_NTFS_FS) += ntfs/ +obj-$(CONFIG_NTFS3_FS) += ntfs3/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ diff --git a/fs/attr.c b/fs/attr.c index 87ef39db1c34..473d21b3a86d 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -249,6 +249,34 @@ void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, } EXPORT_SYMBOL(setattr_copy); +int may_setattr(struct user_namespace *mnt_userns, struct inode *inode, + unsigned int ia_valid) +{ + int error; + + if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + } + + /* + * If utimes(2) and friends are called with times == NULL (or both + * times are UTIME_NOW), then we need to check for write permission + */ + if (ia_valid & ATTR_TOUCH) { + if (IS_IMMUTABLE(inode)) + return -EPERM; + + if (!inode_owner_or_capable(mnt_userns, inode)) { + error = inode_permission(mnt_userns, inode, MAY_WRITE); + if (error) + return error; + } + } + return 0; +} +EXPORT_SYMBOL(may_setattr); + /** * notify_change - modify attributes of a filesytem object * @mnt_userns: user namespace of the mount the inode was found from @@ -290,25 +318,9 @@ int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, WARN_ON_ONCE(!inode_is_locked(inode)); - if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - return -EPERM; - 
} - - /* - * If utimes(2) and friends are called with times == NULL (or both - * times are UTIME_NOW), then we need to check for write permission - */ - if (ia_valid & ATTR_TOUCH) { - if (IS_IMMUTABLE(inode)) - return -EPERM; - - if (!inode_owner_or_capable(mnt_userns, inode)) { - error = inode_permission(mnt_userns, inode, MAY_WRITE); - if (error) - return error; - } - } + error = may_setattr(mnt_userns, inode, ia_valid); + if (error) + return error; if ((ia_valid & ATTR_MODE)) { umode_t amode = attr->ia_mode; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 48e16144c1f7..12b8fdcc445b 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -121,7 +121,7 @@ static const char *bad_inode_get_link(struct dentry *dentry, return ERR_PTR(-EIO); } -static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type) +static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type, bool rcu) { return ERR_PTR(-EIO); } diff --git a/fs/befs/debug.c b/fs/befs/debug.c index eb7bd6c692c7..02fa66fb82c2 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -14,7 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #ifdef __KERNEL__ -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/kernel.h> diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 145917f734fe..0dcfc691e7e2 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -221,8 +221,7 @@ static int load_aout_binary(struct linux_binprm * bprm) } error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, - PROT_READ | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd_offset); if (error != N_TXTADDR(ex)) @@ -230,7 +229,7 @@ static int load_aout_binary(struct linux_binprm * bprm) error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + MAP_FIXED | MAP_PRIVATE, fd_offset + ex.a_text); if (error != N_DATADDR(ex)) return 
error; @@ -309,7 +308,7 @@ static int load_aout_library(struct file *file) /* Now use mmap to map the library into memory. */ error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, + MAP_FIXED | MAP_PRIVATE, N_TXTOFF(ex)); retval = error; if (error != start_addr) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 439ed81e755a..69d900a8473d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -622,7 +622,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, eppnt = interp_elf_phdata; for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { if (eppnt->p_type == PT_LOAD) { - int elf_type = MAP_PRIVATE | MAP_DENYWRITE; + int elf_type = MAP_PRIVATE; int elf_prot = make_prot(eppnt->p_flags, arch_state, true, true); unsigned long vaddr = 0; @@ -1070,7 +1070,7 @@ out_free_interp: elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); - elf_flags = MAP_PRIVATE | MAP_DENYWRITE; + elf_flags = MAP_PRIVATE; vaddr = elf_ppnt->p_vaddr; /* @@ -1384,7 +1384,7 @@ static int load_elf_library(struct file *file) (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED_NOREPLACE | MAP_PRIVATE | MAP_DENYWRITE, + MAP_FIXED_NOREPLACE | MAP_PRIVATE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); if (error != ELF_PAGESTART(eppnt->p_vaddr)) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index cf4028487dcc..6d8fd6030cbb 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1041,7 +1041,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, if (phdr->p_flags & PF_W) prot |= PROT_WRITE; if (phdr->p_flags & PF_X) prot |= PROT_EXEC; - flags = MAP_PRIVATE | MAP_DENYWRITE; + flags = MAP_PRIVATE; maddr = 0; switch (params->flags & ELF_FDPIC_FLAG_ARRANGEMENT) { diff --git a/fs/block_dev.c b/fs/block_dev.c deleted file mode 100644 index 45df6cbccf12..000000000000 --- 
a/fs/block_dev.c +++ /dev/null @@ -1,1695 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE - * Copyright (C) 2016 - 2020 Christoph Hellwig - */ - -#include <linux/init.h> -#include <linux/mm.h> -#include <linux/fcntl.h> -#include <linux/slab.h> -#include <linux/kmod.h> -#include <linux/major.h> -#include <linux/device_cgroup.h> -#include <linux/highmem.h> -#include <linux/blkdev.h> -#include <linux/backing-dev.h> -#include <linux/module.h> -#include <linux/blkpg.h> -#include <linux/magic.h> -#include <linux/buffer_head.h> -#include <linux/swap.h> -#include <linux/pagevec.h> -#include <linux/writeback.h> -#include <linux/mpage.h> -#include <linux/mount.h> -#include <linux/pseudo_fs.h> -#include <linux/uio.h> -#include <linux/namei.h> -#include <linux/log2.h> -#include <linux/cleancache.h> -#include <linux/task_io_accounting_ops.h> -#include <linux/falloc.h> -#include <linux/part_stat.h> -#include <linux/uaccess.h> -#include <linux/suspend.h> -#include "internal.h" -#include "../block/blk.h" - -struct bdev_inode { - struct block_device bdev; - struct inode vfs_inode; -}; - -static const struct address_space_operations def_blk_aops; - -static inline struct bdev_inode *BDEV_I(struct inode *inode) -{ - return container_of(inode, struct bdev_inode, vfs_inode); -} - -struct block_device *I_BDEV(struct inode *inode) -{ - return &BDEV_I(inode)->bdev; -} -EXPORT_SYMBOL(I_BDEV); - -static void bdev_write_inode(struct block_device *bdev) -{ - struct inode *inode = bdev->bd_inode; - int ret; - - spin_lock(&inode->i_lock); - while (inode->i_state & I_DIRTY) { - spin_unlock(&inode->i_lock); - ret = write_inode_now(inode, true); - if (ret) { - char name[BDEVNAME_SIZE]; - pr_warn_ratelimited("VFS: Dirty inode writeback failed " - "for block device %s (err=%d).\n", - bdevname(bdev, name), ret); - } - spin_lock(&inode->i_lock); - } - spin_unlock(&inode->i_lock); -} - -/* 
Kill _all_ buffers and pagecache , dirty or not.. */ -static void kill_bdev(struct block_device *bdev) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - - if (mapping_empty(mapping)) - return; - - invalidate_bh_lrus(); - truncate_inode_pages(mapping, 0); -} - -/* Invalidate clean unused buffers and pagecache. */ -void invalidate_bdev(struct block_device *bdev) -{ - struct address_space *mapping = bdev->bd_inode->i_mapping; - - if (mapping->nrpages) { - invalidate_bh_lrus(); - lru_add_drain_all(); /* make sure all lru add caches are flushed */ - invalidate_mapping_pages(mapping, 0, -1); - } - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_invalidate_inode(mapping); -} -EXPORT_SYMBOL(invalidate_bdev); - -/* - * Drop all buffers & page cache for given bdev range. This function bails - * with error if bdev has other exclusive owner (such as filesystem). - */ -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, - loff_t lstart, loff_t lend) -{ - /* - * If we don't hold exclusive handle for the device, upgrade to it - * while we discard the buffer cache to avoid discarding buffers - * under live filesystem. - */ - if (!(mode & FMODE_EXCL)) { - int err = bd_prepare_to_claim(bdev, truncate_bdev_range); - if (err) - goto invalidate; - } - - truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); - if (!(mode & FMODE_EXCL)) - bd_abort_claiming(bdev, truncate_bdev_range); - return 0; - -invalidate: - /* - * Someone else has handle exclusively open. Try invalidating instead. - * The 'end' argument is inclusive so the rounding is safe. 
- */ - return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, - lstart >> PAGE_SHIFT, - lend >> PAGE_SHIFT); -} - -static void set_init_blocksize(struct block_device *bdev) -{ - unsigned int bsize = bdev_logical_block_size(bdev); - loff_t size = i_size_read(bdev->bd_inode); - - while (bsize < PAGE_SIZE) { - if (size & bsize) - break; - bsize <<= 1; - } - bdev->bd_inode->i_blkbits = blksize_bits(bsize); -} - -int set_blocksize(struct block_device *bdev, int size) -{ - /* Size must be a power of two, and between 512 and PAGE_SIZE */ - if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) - return -EINVAL; - - /* Size cannot be smaller than the size supported by the device */ - if (size < bdev_logical_block_size(bdev)) - return -EINVAL; - - /* Don't change the size if it is same as current */ - if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { - sync_blockdev(bdev); - bdev->bd_inode->i_blkbits = blksize_bits(size); - kill_bdev(bdev); - } - return 0; -} - -EXPORT_SYMBOL(set_blocksize); - -int sb_set_blocksize(struct super_block *sb, int size) -{ - if (set_blocksize(sb->s_bdev, size)) - return 0; - /* If we get here, we know size is power of two - * and it's value is between 512 and PAGE_SIZE */ - sb->s_blocksize = size; - sb->s_blocksize_bits = blksize_bits(size); - return sb->s_blocksize; -} - -EXPORT_SYMBOL(sb_set_blocksize); - -int sb_min_blocksize(struct super_block *sb, int size) -{ - int minsize = bdev_logical_block_size(sb->s_bdev); - if (size < minsize) - size = minsize; - return sb_set_blocksize(sb, size); -} - -EXPORT_SYMBOL(sb_min_blocksize); - -static int -blkdev_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create) -{ - bh->b_bdev = I_BDEV(inode); - bh->b_blocknr = iblock; - set_buffer_mapped(bh); - return 0; -} - -static struct inode *bdev_file_inode(struct file *file) -{ - return file->f_mapping->host; -} - -static unsigned int dio_bio_write_op(struct kiocb *iocb) -{ - unsigned int op = REQ_OP_WRITE | 
REQ_SYNC | REQ_IDLE; - - /* avoid the need for a I/O completion work item */ - if (iocb->ki_flags & IOCB_DSYNC) - op |= REQ_FUA; - return op; -} - -#define DIO_INLINE_BIO_VECS 4 - -static void blkdev_bio_end_io_simple(struct bio *bio) -{ - struct task_struct *waiter = bio->bi_private; - - WRITE_ONCE(bio->bi_private, NULL); - blk_wake_io_task(waiter); -} - -static ssize_t -__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) -{ - struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; - loff_t pos = iocb->ki_pos; - bool should_dirty = false; - struct bio bio; - ssize_t ret; - blk_qc_t qc; - - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - if (nr_pages <= DIO_INLINE_BIO_VECS) - vecs = inline_vecs; - else { - vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), - GFP_KERNEL); - if (!vecs) - return -ENOMEM; - } - - bio_init(&bio, vecs, nr_pages); - bio_set_dev(&bio, bdev); - bio.bi_iter.bi_sector = pos >> 9; - bio.bi_write_hint = iocb->ki_hint; - bio.bi_private = current; - bio.bi_end_io = blkdev_bio_end_io_simple; - bio.bi_ioprio = iocb->ki_ioprio; - - ret = bio_iov_iter_get_pages(&bio, iter); - if (unlikely(ret)) - goto out; - ret = bio.bi_iter.bi_size; - - if (iov_iter_rw(iter) == READ) { - bio.bi_opf = REQ_OP_READ; - if (iter_is_iovec(iter)) - should_dirty = true; - } else { - bio.bi_opf = dio_bio_write_op(iocb); - task_io_account_write(ret); - } - if (iocb->ki_flags & IOCB_NOWAIT) - bio.bi_opf |= REQ_NOWAIT; - if (iocb->ki_flags & IOCB_HIPRI) - bio_set_polled(&bio, iocb); - - qc = submit_bio(&bio); - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(bio.bi_private)) - break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); - - bio_release_pages(&bio, 
should_dirty); - if (unlikely(bio.bi_status)) - ret = blk_status_to_errno(bio.bi_status); - -out: - if (vecs != inline_vecs) - kfree(vecs); - - bio_uninit(&bio); - - return ret; -} - -struct blkdev_dio { - union { - struct kiocb *iocb; - struct task_struct *waiter; - }; - size_t size; - atomic_t ref; - bool multi_bio : 1; - bool should_dirty : 1; - bool is_sync : 1; - struct bio bio; -}; - -static struct bio_set blkdev_dio_pool; - -static int blkdev_iopoll(struct kiocb *kiocb, bool wait) -{ - struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); - struct request_queue *q = bdev_get_queue(bdev); - - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); -} - -static void blkdev_bio_end_io(struct bio *bio) -{ - struct blkdev_dio *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; - - if (bio->bi_status && !dio->bio.bi_status) - dio->bio.bi_status = bio->bi_status; - - if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { - if (!dio->is_sync) { - struct kiocb *iocb = dio->iocb; - ssize_t ret; - - if (likely(!dio->bio.bi_status)) { - ret = dio->size; - iocb->ki_pos += ret; - } else { - ret = blk_status_to_errno(dio->bio.bi_status); - } - - dio->iocb->ki_complete(iocb, ret, 0); - if (dio->multi_bio) - bio_put(&dio->bio); - } else { - struct task_struct *waiter = dio->waiter; - - WRITE_ONCE(dio->waiter, NULL); - blk_wake_io_task(waiter); - } - } - - if (should_dirty) { - bio_check_pages_dirty(bio); - } else { - bio_release_pages(bio, false); - bio_put(bio); - } -} - -static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(inode); - struct blk_plug plug; - struct blkdev_dio *dio; - struct bio *bio; - bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; - bool is_read = (iov_iter_rw(iter) == READ), is_sync; - loff_t pos = iocb->ki_pos; - blk_qc_t qc = BLK_QC_T_NONE; - int ret = 
0; - - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); - - dio = container_of(bio, struct blkdev_dio, bio); - dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) { - dio->waiter = current; - bio_get(bio); - } else { - dio->iocb = iocb; - } - - dio->size = 0; - dio->multi_bio = false; - dio->should_dirty = is_read && iter_is_iovec(iter); - - /* - * Don't plug for HIPRI/polled IO, as those should go straight - * to issue - */ - if (!is_poll) - blk_start_plug(&plug); - - for (;;) { - bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = pos >> 9; - bio->bi_write_hint = iocb->ki_hint; - bio->bi_private = dio; - bio->bi_end_io = blkdev_bio_end_io; - bio->bi_ioprio = iocb->ki_ioprio; - - ret = bio_iov_iter_get_pages(bio, iter); - if (unlikely(ret)) { - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - break; - } - - if (is_read) { - bio->bi_opf = REQ_OP_READ; - if (dio->should_dirty) - bio_set_pages_dirty(bio); - } else { - bio->bi_opf = dio_bio_write_op(iocb); - task_io_account_write(bio->bi_iter.bi_size); - } - if (iocb->ki_flags & IOCB_NOWAIT) - bio->bi_opf |= REQ_NOWAIT; - - dio->size += bio->bi_iter.bi_size; - pos += bio->bi_iter.bi_size; - - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); - if (!nr_pages) { - bool polled = false; - - if (iocb->ki_flags & IOCB_HIPRI) { - bio_set_polled(bio, iocb); - polled = true; - } - - qc = submit_bio(bio); - - if (polled) - WRITE_ONCE(iocb->ki_cookie, qc); - break; - } - - if (!dio->multi_bio) { - /* - * AIO needs an extra reference to ensure the dio - * structure which is embedded into the first bio - * stays around. 
- */ - if (!is_sync) - bio_get(bio); - dio->multi_bio = true; - atomic_set(&dio->ref, 2); - } else { - atomic_inc(&dio->ref); - } - - submit_bio(bio); - bio = bio_alloc(GFP_KERNEL, nr_pages); - } - - if (!is_poll) - blk_finish_plug(&plug); - - if (!is_sync) - return -EIOCBQUEUED; - - for (;;) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (!READ_ONCE(dio->waiter)) - break; - - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); - } - __set_current_state(TASK_RUNNING); - - if (!ret) - ret = blk_status_to_errno(dio->bio.bi_status); - if (likely(!ret)) - ret = dio->size; - - bio_put(&dio->bio); - return ret; -} - -static ssize_t -blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) -{ - unsigned int nr_pages; - - if (!iov_iter_count(iter)) - return 0; - - nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); -} - -static __init int blkdev_init(void) -{ - return bioset_init(&blkdev_dio_pool, 4, - offsetof(struct blkdev_dio, bio), - BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); -} -module_init(blkdev_init); - -int __sync_blockdev(struct block_device *bdev, int wait) -{ - if (!bdev) - return 0; - if (!wait) - return filemap_flush(bdev->bd_inode->i_mapping); - return filemap_write_and_wait(bdev->bd_inode->i_mapping); -} - -/* - * Write out and wait upon all the dirty data associated with a block - * device via its mapping. Does not take the superblock lock. - */ -int sync_blockdev(struct block_device *bdev) -{ - return __sync_blockdev(bdev, 1); -} -EXPORT_SYMBOL(sync_blockdev); - -/* - * Write out and wait upon all dirty data associated with this - * device. Filesystem data as well as the underlying block - * device. Takes the superblock lock. 
- */ -int fsync_bdev(struct block_device *bdev) -{ - struct super_block *sb = get_super(bdev); - if (sb) { - int res = sync_filesystem(sb); - drop_super(sb); - return res; - } - return sync_blockdev(bdev); -} -EXPORT_SYMBOL(fsync_bdev); - -/** - * freeze_bdev -- lock a filesystem and force it into a consistent state - * @bdev: blockdevice to lock - * - * If a superblock is found on this device, we take the s_umount semaphore - * on it to make sure nobody unmounts until the snapshot creation is done. - * The reference counter (bd_fsfreeze_count) guarantees that only the last - * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze - * actually. - */ -int freeze_bdev(struct block_device *bdev) -{ - struct super_block *sb; - int error = 0; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) - goto done; - - sb = get_active_super(bdev); - if (!sb) - goto sync; - if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb); - else - error = freeze_super(sb); - deactivate_super(sb); - - if (error) { - bdev->bd_fsfreeze_count--; - goto done; - } - bdev->bd_fsfreeze_sb = sb; - -sync: - sync_blockdev(bdev); -done: - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; -} -EXPORT_SYMBOL(freeze_bdev); - -/** - * thaw_bdev -- unlock filesystem - * @bdev: blockdevice to unlock - * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). 
- */ -int thaw_bdev(struct block_device *bdev) -{ - struct super_block *sb; - int error = -EINVAL; - - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) - goto out; - - error = 0; - if (--bdev->bd_fsfreeze_count > 0) - goto out; - - sb = bdev->bd_fsfreeze_sb; - if (!sb) - goto out; - - if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb); - else - error = thaw_super(sb); - if (error) - bdev->bd_fsfreeze_count++; - else - bdev->bd_fsfreeze_sb = NULL; -out: - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; -} -EXPORT_SYMBOL(thaw_bdev); - -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) -{ - return block_write_full_page(page, blkdev_get_block, wbc); -} - -static int blkdev_readpage(struct file * file, struct page * page) -{ - return block_read_full_page(page, blkdev_get_block); -} - -static void blkdev_readahead(struct readahead_control *rac) -{ - mpage_readahead(rac, blkdev_get_block); -} - -static int blkdev_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - return block_write_begin(mapping, pos, len, flags, pagep, - blkdev_get_block); -} - -static int blkdev_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - int ret; - ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); - - unlock_page(page); - put_page(page); - - return ret; -} - -/* - * private llseek: - * for a block special file file_inode(file)->i_size is zero - * so we compute the size by hand (just as in block_read/write above) - */ -static loff_t block_llseek(struct file *file, loff_t offset, int whence) -{ - struct inode *bd_inode = bdev_file_inode(file); - loff_t retval; - - inode_lock(bd_inode); - retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - inode_unlock(bd_inode); - return retval; -} - -static int 
blkdev_fsync(struct file *filp, loff_t start, loff_t end, - int datasync) -{ - struct inode *bd_inode = bdev_file_inode(filp); - struct block_device *bdev = I_BDEV(bd_inode); - int error; - - error = file_write_and_wait_range(filp, start, end); - if (error) - return error; - - /* - * There is no need to serialise calls to blkdev_issue_flush with - * i_mutex and doing so causes performance issues with concurrent - * O_SYNC writers to a block device. - */ - error = blkdev_issue_flush(bdev); - if (error == -EOPNOTSUPP) - error = 0; - - return error; -} - -/** - * bdev_read_page() - Start reading a page from a block device - * @bdev: The device to read the page from - * @sector: The offset on the device to read the page to (need not be aligned) - * @page: The page to read - * - * On entry, the page should be locked. It will be unlocked when the page - * has been read. If the block driver implements rw_page synchronously, - * that will be true on exit from this function, but it need not be. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to read this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. 
- */ -int bdev_read_page(struct block_device *bdev, sector_t sector, - struct page *page) -{ - const struct block_device_operations *ops = bdev->bd_disk->fops; - int result = -EOPNOTSUPP; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return result; - - result = blk_queue_enter(bdev->bd_disk->queue, 0); - if (result) - return result; - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_READ); - blk_queue_exit(bdev->bd_disk->queue); - return result; -} - -/** - * bdev_write_page() - Start writing a page to a block device - * @bdev: The device to write the page to - * @sector: The offset on the device to write the page to (need not be aligned) - * @page: The page to write - * @wbc: The writeback_control for the write - * - * On entry, the page should be locked and not currently under writeback. - * On exit, if the write started successfully, the page will be unlocked and - * under writeback. If the write failed already (eg the driver failed to - * queue the page to the device), the page will still be locked. If the - * caller is a ->writepage implementation, it will need to unlock the page. - * - * Errors returned by this function are usually "soft", eg out of memory, or - * queue full; callers should try a different route to write this page rather - * than propagate an error back up the stack. - * - * Return: negative errno if an error occurs, 0 if submission was successful. 
- */ -int bdev_write_page(struct block_device *bdev, sector_t sector, - struct page *page, struct writeback_control *wbc) -{ - int result; - const struct block_device_operations *ops = bdev->bd_disk->fops; - - if (!ops->rw_page || bdev_get_integrity(bdev)) - return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_disk->queue, 0); - if (result) - return result; - - set_page_writeback(page); - result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, - REQ_OP_WRITE); - if (result) { - end_page_writeback(page); - } else { - clean_page_buffers(page); - unlock_page(page); - } - blk_queue_exit(bdev->bd_disk->queue); - return result; -} - -/* - * pseudo-fs - */ - -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); -static struct kmem_cache * bdev_cachep __read_mostly; - -static struct inode *bdev_alloc_inode(struct super_block *sb) -{ - struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); - - if (!ei) - return NULL; - memset(&ei->bdev, 0, sizeof(ei->bdev)); - return &ei->vfs_inode; -} - -static void bdev_free_inode(struct inode *inode) -{ - struct block_device *bdev = I_BDEV(inode); - - free_percpu(bdev->bd_stats); - kfree(bdev->bd_meta_info); - - if (!bdev_is_partition(bdev)) { - if (bdev->bd_disk && bdev->bd_disk->bdi) - bdi_put(bdev->bd_disk->bdi); - kfree(bdev->bd_disk); - } - - if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) - blk_free_ext_minor(MINOR(bdev->bd_dev)); - - kmem_cache_free(bdev_cachep, BDEV_I(inode)); -} - -static void init_once(void *data) -{ - struct bdev_inode *ei = data; - - inode_init_once(&ei->vfs_inode); -} - -static void bdev_evict_inode(struct inode *inode) -{ - truncate_inode_pages_final(&inode->i_data); - invalidate_inode_buffers(inode); /* is it needed here? 
*/ - clear_inode(inode); -} - -static const struct super_operations bdev_sops = { - .statfs = simple_statfs, - .alloc_inode = bdev_alloc_inode, - .free_inode = bdev_free_inode, - .drop_inode = generic_delete_inode, - .evict_inode = bdev_evict_inode, -}; - -static int bd_init_fs_context(struct fs_context *fc) -{ - struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); - if (!ctx) - return -ENOMEM; - fc->s_iflags |= SB_I_CGROUPWB; - ctx->ops = &bdev_sops; - return 0; -} - -static struct file_system_type bd_type = { - .name = "bdev", - .init_fs_context = bd_init_fs_context, - .kill_sb = kill_anon_super, -}; - -struct super_block *blockdev_superblock __read_mostly; -EXPORT_SYMBOL_GPL(blockdev_superblock); - -void __init bdev_cache_init(void) -{ - int err; - static struct vfsmount *bd_mnt; - - bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), - 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), - init_once); - err = register_filesystem(&bd_type); - if (err) - panic("Cannot register bdev pseudo-fs"); - bd_mnt = kern_mount(&bd_type); - if (IS_ERR(bd_mnt)) - panic("Cannot create bdev pseudo-fs"); - blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ -} - -struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) -{ - struct block_device *bdev; - struct inode *inode; - - inode = new_inode(blockdev_superblock); - if (!inode) - return NULL; - inode->i_mode = S_IFBLK; - inode->i_rdev = 0; - inode->i_data.a_ops = &def_blk_aops; - mapping_set_gfp_mask(&inode->i_data, GFP_USER); - - bdev = I_BDEV(inode); - mutex_init(&bdev->bd_fsfreeze_mutex); - spin_lock_init(&bdev->bd_size_lock); - bdev->bd_disk = disk; - bdev->bd_partno = partno; - bdev->bd_inode = inode; - bdev->bd_stats = alloc_percpu(struct disk_stats); - if (!bdev->bd_stats) { - iput(inode); - return NULL; - } - return bdev; -} - -void bdev_add(struct block_device *bdev, dev_t dev) -{ - bdev->bd_dev = dev; - bdev->bd_inode->i_rdev = dev; - 
bdev->bd_inode->i_ino = dev; - insert_inode_hash(bdev->bd_inode); -} - -long nr_blockdev_pages(void) -{ - struct inode *inode; - long ret = 0; - - spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) - ret += inode->i_mapping->nrpages; - spin_unlock(&blockdev_superblock->s_inode_list_lock); - - return ret; -} - -/** - * bd_may_claim - test whether a block device can be claimed - * @bdev: block device of interest - * @whole: whole block device containing @bdev, may equal @bdev - * @holder: holder trying to claim @bdev - * - * Test whether @bdev can be claimed by @holder. - * - * CONTEXT: - * spin_lock(&bdev_lock). - * - * RETURNS: - * %true if @bdev can be claimed, %false otherwise. - */ -static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, - void *holder) -{ - if (bdev->bd_holder == holder) - return true; /* already a holder */ - else if (bdev->bd_holder != NULL) - return false; /* held by someone else */ - else if (whole == bdev) - return true; /* is a whole device which isn't held */ - - else if (whole->bd_holder == bd_may_claim) - return true; /* is a partition of a device that is being partitioned */ - else if (whole->bd_holder != NULL) - return false; /* is a partition of a held device */ - else - return true; /* is a partition of an un-held device */ -} - -/** - * bd_prepare_to_claim - claim a block device - * @bdev: block device of interest - * @holder: holder trying to claim @bdev - * - * Claim @bdev. This function fails if @bdev is already claimed by another - * holder and waits if another claiming is in progress. return, the caller - * has ownership of bd_claiming and bd_holder[s]. - * - * RETURNS: - * 0 if @bdev can be claimed, -EBUSY otherwise. 
- */ -int bd_prepare_to_claim(struct block_device *bdev, void *holder) -{ - struct block_device *whole = bdev_whole(bdev); - - if (WARN_ON_ONCE(!holder)) - return -EINVAL; -retry: - spin_lock(&bdev_lock); - /* if someone else claimed, fail */ - if (!bd_may_claim(bdev, whole, holder)) { - spin_unlock(&bdev_lock); - return -EBUSY; - } - - /* if claiming is already in progress, wait for it to finish */ - if (whole->bd_claiming) { - wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); - DEFINE_WAIT(wait); - - prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&bdev_lock); - schedule(); - finish_wait(wq, &wait); - goto retry; - } - - /* yay, all mine */ - whole->bd_claiming = holder; - spin_unlock(&bdev_lock); - return 0; -} -EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ - -static void bd_clear_claiming(struct block_device *whole, void *holder) -{ - lockdep_assert_held(&bdev_lock); - /* tell others that we're done */ - BUG_ON(whole->bd_claiming != holder); - whole->bd_claiming = NULL; - wake_up_bit(&whole->bd_claiming, 0); -} - -/** - * bd_finish_claiming - finish claiming of a block device - * @bdev: block device of interest - * @holder: holder that has claimed @bdev - * - * Finish exclusive open of a block device. Mark the device as exlusively - * open by the holder and wake up all waiters for exclusive open to finish. 
- */ -static void bd_finish_claiming(struct block_device *bdev, void *holder) -{ - struct block_device *whole = bdev_whole(bdev); - - spin_lock(&bdev_lock); - BUG_ON(!bd_may_claim(bdev, whole, holder)); - /* - * Note that for a whole device bd_holders will be incremented twice, - * and bd_holder will be set to bd_may_claim before being set to holder - */ - whole->bd_holders++; - whole->bd_holder = bd_may_claim; - bdev->bd_holders++; - bdev->bd_holder = holder; - bd_clear_claiming(whole, holder); - spin_unlock(&bdev_lock); -} - -/** - * bd_abort_claiming - abort claiming of a block device - * @bdev: block device of interest - * @holder: holder that has claimed @bdev - * - * Abort claiming of a block device when the exclusive open failed. This can be - * also used when exclusive open is not actually desired and we just needed - * to block other exclusive openers for a while. - */ -void bd_abort_claiming(struct block_device *bdev, void *holder) -{ - spin_lock(&bdev_lock); - bd_clear_claiming(bdev_whole(bdev), holder); - spin_unlock(&bdev_lock); -} -EXPORT_SYMBOL(bd_abort_claiming); - -static void blkdev_flush_mapping(struct block_device *bdev) -{ - WARN_ON_ONCE(bdev->bd_holders); - sync_blockdev(bdev); - kill_bdev(bdev); - bdev_write_inode(bdev); -} - -static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - int ret = 0; - - if (disk->fops->open) { - ret = disk->fops->open(bdev, mode); - if (ret) { - /* avoid ghost partitions on a removed medium */ - if (ret == -ENOMEDIUM && - test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(disk, true); - return ret; - } - } - - if (!bdev->bd_openers) - set_init_blocksize(bdev); - if (test_bit(GD_NEED_PART_SCAN, &disk->state)) - bdev_disk_changed(disk, false); - bdev->bd_openers++; - return 0;; -} - -static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) -{ - if (!--bdev->bd_openers) - blkdev_flush_mapping(bdev); - if (bdev->bd_disk->fops->release) 
- bdev->bd_disk->fops->release(bdev->bd_disk, mode); -} - -static int blkdev_get_part(struct block_device *part, fmode_t mode) -{ - struct gendisk *disk = part->bd_disk; - int ret; - - if (part->bd_openers) - goto done; - - ret = blkdev_get_whole(bdev_whole(part), mode); - if (ret) - return ret; - - ret = -ENXIO; - if (!bdev_nr_sectors(part)) - goto out_blkdev_put; - - disk->open_partitions++; - set_init_blocksize(part); -done: - part->bd_openers++; - return 0; - -out_blkdev_put: - blkdev_put_whole(bdev_whole(part), mode); - return ret; -} - -static void blkdev_put_part(struct block_device *part, fmode_t mode) -{ - struct block_device *whole = bdev_whole(part); - - if (--part->bd_openers) - return; - blkdev_flush_mapping(part); - whole->bd_disk->open_partitions--; - blkdev_put_whole(whole, mode); -} - -struct block_device *blkdev_get_no_open(dev_t dev) -{ - struct block_device *bdev; - struct inode *inode; - - inode = ilookup(blockdev_superblock, dev); - if (!inode) { - blk_request_module(dev); - inode = ilookup(blockdev_superblock, dev); - if (!inode) - return NULL; - } - - /* switch from the inode reference to a device mode one: */ - bdev = &BDEV_I(inode)->bdev; - if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) - bdev = NULL; - iput(inode); - - if (!bdev) - return NULL; - if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || - !try_module_get(bdev->bd_disk->fops->owner)) { - put_device(&bdev->bd_device); - return NULL; - } - - return bdev; -} - -void blkdev_put_no_open(struct block_device *bdev) -{ - module_put(bdev->bd_disk->fops->owner); - put_device(&bdev->bd_device); -} - -/** - * blkdev_get_by_dev - open a block device by device number - * @dev: device number of block device to open - * @mode: FMODE_* mask - * @holder: exclusive holder identifier - * - * Open the block device described by device number @dev. If @mode includes - * %FMODE_EXCL, the block device is opened with exclusive access. Specifying - * %FMODE_EXCL with a %NULL @holder is invalid. 
Exclusive opens may nest for - * the same @holder. - * - * Use this interface ONLY if you really do not have anything better - i.e. when - * you are behind a truly sucky interface and all you are given is a device - * number. Everything else should use blkdev_get_by_path(). - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. - */ -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) -{ - bool unblock_events = true; - struct block_device *bdev; - struct gendisk *disk; - int ret; - - ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, - MAJOR(dev), MINOR(dev), - ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) | - ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0)); - if (ret) - return ERR_PTR(ret); - - bdev = blkdev_get_no_open(dev); - if (!bdev) - return ERR_PTR(-ENXIO); - disk = bdev->bd_disk; - - if (mode & FMODE_EXCL) { - ret = bd_prepare_to_claim(bdev, holder); - if (ret) - goto put_blkdev; - } - - disk_block_events(disk); - - mutex_lock(&disk->open_mutex); - ret = -ENXIO; - if (!disk_live(disk)) - goto abort_claiming; - if (bdev_is_partition(bdev)) - ret = blkdev_get_part(bdev, mode); - else - ret = blkdev_get_whole(bdev, mode); - if (ret) - goto abort_claiming; - if (mode & FMODE_EXCL) { - bd_finish_claiming(bdev, holder); - - /* - * Block event polling for write claims if requested. Any write - * holder makes the write_holder state stick until all are - * released. This is good enough and tracking individual - * writeable reference is too fragile given the way @mode is - * used in blkdev_get/put(). 
- */ - if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && - (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { - bdev->bd_write_holder = true; - unblock_events = false; - } - } - mutex_unlock(&disk->open_mutex); - - if (unblock_events) - disk_unblock_events(disk); - return bdev; - -abort_claiming: - if (mode & FMODE_EXCL) - bd_abort_claiming(bdev, holder); - mutex_unlock(&disk->open_mutex); - disk_unblock_events(disk); -put_blkdev: - blkdev_put_no_open(bdev); - return ERR_PTR(ret); -} -EXPORT_SYMBOL(blkdev_get_by_dev); - -/** - * blkdev_get_by_path - open a block device by name - * @path: path to the block device to open - * @mode: FMODE_* mask - * @holder: exclusive holder identifier - * - * Open the block device described by the device file at @path. If @mode - * includes %FMODE_EXCL, the block device is opened with exclusive access. - * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may - * nest for the same @holder. - * - * CONTEXT: - * Might sleep. - * - * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. - */ -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, - void *holder) -{ - struct block_device *bdev; - dev_t dev; - int error; - - error = lookup_bdev(path, &dev); - if (error) - return ERR_PTR(error); - - bdev = blkdev_get_by_dev(dev, mode, holder); - if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, mode); - return ERR_PTR(-EACCES); - } - - return bdev; -} -EXPORT_SYMBOL(blkdev_get_by_path); - -static int blkdev_open(struct inode * inode, struct file * filp) -{ - struct block_device *bdev; - - /* - * Preserve backwards compatibility and allow large file access - * even if userspace doesn't ask for it explicitly. Some mkfs - * binary needs it. We might want to drop this workaround - * during an unstable branch. 
- */ - filp->f_flags |= O_LARGEFILE; - - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; - - if (filp->f_flags & O_NDELAY) - filp->f_mode |= FMODE_NDELAY; - if (filp->f_flags & O_EXCL) - filp->f_mode |= FMODE_EXCL; - if ((filp->f_flags & O_ACCMODE) == 3) - filp->f_mode |= FMODE_WRITE_IOCTL; - - bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - filp->f_mapping = bdev->bd_inode->i_mapping; - filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); - return 0; -} - -void blkdev_put(struct block_device *bdev, fmode_t mode) -{ - struct gendisk *disk = bdev->bd_disk; - - /* - * Sync early if it looks like we're the last one. If someone else - * opens the block device between now and the decrement of bd_openers - * then we did a sync that we didn't need to, but that's not the end - * of the world and we want to avoid long (could be several minute) - * syncs while holding the mutex. - */ - if (bdev->bd_openers == 1) - sync_blockdev(bdev); - - mutex_lock(&disk->open_mutex); - if (mode & FMODE_EXCL) { - struct block_device *whole = bdev_whole(bdev); - bool bdev_free; - - /* - * Release a claim on the device. The holder fields - * are protected with bdev_lock. open_mutex is to - * synchronize disk_holder unlinking. - */ - spin_lock(&bdev_lock); - - WARN_ON_ONCE(--bdev->bd_holders < 0); - WARN_ON_ONCE(--whole->bd_holders < 0); - - if ((bdev_free = !bdev->bd_holders)) - bdev->bd_holder = NULL; - if (!whole->bd_holders) - whole->bd_holder = NULL; - - spin_unlock(&bdev_lock); - - /* - * If this was the last claim, remove holder link and - * unblock evpoll if it was a write holder. - */ - if (bdev_free && bdev->bd_write_holder) { - disk_unblock_events(disk); - bdev->bd_write_holder = false; - } - } - - /* - * Trigger event checking and tell drivers to flush MEDIA_CHANGE - * event. This is to ensure detection of media removal commanded - * from userland - e.g. eject(1). 
- */ - disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); - - if (bdev_is_partition(bdev)) - blkdev_put_part(bdev, mode); - else - blkdev_put_whole(bdev, mode); - mutex_unlock(&disk->open_mutex); - - blkdev_put_no_open(bdev); -} -EXPORT_SYMBOL(blkdev_put); - -static int blkdev_close(struct inode * inode, struct file * filp) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); - blkdev_put(bdev, filp->f_mode); - return 0; -} - -static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - fmode_t mode = file->f_mode; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - - return blkdev_ioctl(bdev, mode, cmd, arg); -} - -/* - * Write data to the block device. Only intended for the block device itself - * and the raw driver which basically is a fake block device. - * - * Does not take i_mutex for the write and thus is not for general purpose - * use. 
- */ -static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) -{ - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); - struct blk_plug plug; - size_t shorted = 0; - ssize_t ret; - - if (bdev_read_only(I_BDEV(bd_inode))) - return -EPERM; - - if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) - return -ETXTBSY; - - if (!iov_iter_count(from)) - return 0; - - if (iocb->ki_pos >= size) - return -ENOSPC; - - if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) - return -EOPNOTSUPP; - - size -= iocb->ki_pos; - if (iov_iter_count(from) > size) { - shorted = iov_iter_count(from) - size; - iov_iter_truncate(from, size); - } - - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - if (ret > 0) - ret = generic_write_sync(iocb, ret); - iov_iter_reexpand(from, iov_iter_count(from) + shorted); - blk_finish_plug(&plug); - return ret; -} - -static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) -{ - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); - loff_t pos = iocb->ki_pos; - size_t shorted = 0; - ssize_t ret; - - if (pos >= size) - return 0; - - size -= pos; - if (iov_iter_count(to) > size) { - shorted = iov_iter_count(to) - size; - iov_iter_truncate(to, size); - } - - ret = generic_file_read_iter(iocb, to); - iov_iter_reexpand(to, iov_iter_count(to) + shorted); - return ret; -} - -static int blkdev_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return generic_writepages(mapping, wbc); -} - -static const struct address_space_operations def_blk_aops = { - .set_page_dirty = __set_page_dirty_buffers, - .readpage = blkdev_readpage, - .readahead = blkdev_readahead, - .writepage = blkdev_writepage, - .write_begin = blkdev_write_begin, - .write_end = blkdev_write_end, - .writepages = blkdev_writepages, - .direct_IO 
= blkdev_direct_IO, - .migratepage = buffer_migrate_page_norefs, - .is_dirty_writeback = buffer_check_dirty_writeback, -}; - -#define BLKDEV_FALLOC_FL_SUPPORTED \ - (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ - FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) - -static long blkdev_fallocate(struct file *file, int mode, loff_t start, - loff_t len) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - loff_t end = start + len - 1; - loff_t isize; - int error; - - /* Fail if we don't recognize the flags. */ - if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - - /* Don't go off the end of the device. */ - isize = i_size_read(bdev->bd_inode); - if (start >= isize) - return -EINVAL; - if (end >= isize) { - if (mode & FALLOC_FL_KEEP_SIZE) { - len = isize - start; - end = start + len - 1; - } else - return -EINVAL; - } - - /* - * Don't allow IO that isn't aligned to logical block size. - */ - if ((start | len) & (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; - - /* Invalidate the page cache, including dirty pages. */ - error = truncate_bdev_range(bdev, file->f_mode, start, end); - if (error) - return error; - - switch (mode) { - case FALLOC_FL_ZERO_RANGE: - case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); - break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); - break; - case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); - break; - default: - return -EOPNOTSUPP; - } - if (error) - return error; - - /* - * Invalidate the page cache again; if someone wandered in and dirtied - * a page, we just discard it - userspace has no way of knowing whether - * the write happened before or after discard completing... 
- */ - return truncate_bdev_range(bdev, file->f_mode, start, end); -} - -const struct file_operations def_blk_fops = { - .open = blkdev_open, - .release = blkdev_close, - .llseek = block_llseek, - .read_iter = blkdev_read_iter, - .write_iter = blkdev_write_iter, - .iopoll = blkdev_iopoll, - .mmap = generic_file_mmap, - .fsync = blkdev_fsync, - .unlocked_ioctl = block_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = compat_blkdev_ioctl, -#endif - .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, - .fallocate = blkdev_fallocate, -}; - -/** - * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device - * @dev: return value of the block device's dev_t - * - * Get a reference to the blockdevice at @pathname in the current - * namespace if possible and return it. Return ERR_PTR(error) - * otherwise. - */ -int lookup_bdev(const char *pathname, dev_t *dev) -{ - struct inode *inode; - struct path path; - int error; - - if (!pathname || !*pathname) - return -EINVAL; - - error = kern_path(pathname, LOOKUP_FOLLOW, &path); - if (error) - return error; - - inode = d_backing_inode(path.dentry); - error = -ENOTBLK; - if (!S_ISBLK(inode->i_mode)) - goto out_path_put; - error = -EACCES; - if (!may_open_dev(&path)) - goto out_path_put; - - *dev = inode->i_rdev; - error = 0; -out_path_put: - path_put(&path); - return error; -} -EXPORT_SYMBOL(lookup_bdev); - -int __invalidate_device(struct block_device *bdev, bool kill_dirty) -{ - struct super_block *sb = get_super(bdev); - int res = 0; - - if (sb) { - /* - * no need to lock the super, get_super holds the - * read mutex so the filesystem cannot go away - * under us (->put_super runs with the write lock - * hold). 
- */ - shrink_dcache_sb(sb); - res = invalidate_inodes(sb, kill_dirty); - drop_super(sb); - } - invalidate_bdev(bdev); - return res; -} -EXPORT_SYMBOL(__invalidate_device); - -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) -{ - struct inode *inode, *old_inode = NULL; - - spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { - struct address_space *mapping = inode->i_mapping; - struct block_device *bdev; - - spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || - mapping->nrpages == 0) { - spin_unlock(&inode->i_lock); - continue; - } - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&blockdev_superblock->s_inode_list_lock); - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. 
- */ - iput(old_inode); - old_inode = inode; - bdev = I_BDEV(inode); - - mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) - func(bdev, arg); - mutex_unlock(&bdev->bd_disk->open_mutex); - - spin_lock(&blockdev_superblock->s_inode_list_lock); - } - spin_unlock(&blockdev_superblock->s_inode_list_lock); - iput(old_inode); -} diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index cec88a66bd6c..3dcf9bcc2326 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -36,6 +36,7 @@ btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o +btrfs-$(CONFIG_FS_VERITY) += verity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index d95eb5c8cb37..0a0d0eccee4e 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -16,13 +16,16 @@ #include "btrfs_inode.h" #include "xattr.h" -struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { int size; const char *name; char *value = NULL; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; @@ -53,7 +56,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) } static int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct posix_acl *acl, int type) + struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; const char *name; @@ -114,12 +118,12 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(&init_user_ns, inode, + ret = posix_acl_update_mode(mnt_userns, inode, &inode->i_mode, &acl); if 
(ret) return ret; } - ret = __btrfs_set_acl(NULL, inode, acl, type); + ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type); if (ret) inode->i_mode = old_mode; return ret; @@ -140,14 +144,14 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans, return ret; if (default_acl) { - ret = __btrfs_set_acl(trans, inode, default_acl, + ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); } if (acl) { if (!ret) - ret = __btrfs_set_acl(trans, inode, acl, + ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 78b202d198b8..f735b8798ba1 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1211,7 +1211,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, again: head = NULL; - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out; BUG_ON(ret == 0); @@ -1488,14 +1488,14 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots, - bool ignore_offset, bool skip_commit_root_sem) + bool skip_commit_root_sem) { int ret; if (!trans && !skip_commit_root_sem) down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots, ignore_offset); + time_seq, roots, false); if (!trans && !skip_commit_root_sem) up_read(&fs_info->commit_root_sem); return ret; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ff5f07f9940b..ba454032dbe2 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -47,7 +47,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, const u64 *extent_item_pos, bool ignore_offset); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct 
btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, bool ignore_offset, + u64 time_seq, struct ulist **roots, bool skip_commit_root_sem); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 9e7d9d0c763d..a3b830b8410a 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1561,7 +1561,7 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret) + if (ret && ret != -EAGAIN) btrfs_err(fs_info, "error relocating chunk %llu", bg->start); @@ -2105,11 +2105,22 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->used = em->len; bg->flags = map->type; ret = btrfs_add_block_group_cache(fs_info, bg); + /* + * We may have some valid block group cache added already, in + * that case we skip to the next one. + */ + if (ret == -EEXIST) { + ret = 0; + btrfs_put_block_group(bg); + continue; + } + if (ret) { btrfs_remove_free_space_cache(bg); btrfs_put_block_group(bg); break; } + btrfs_update_space_info(fs_info, bg->flags, em->len, em->len, 0, 0, &space_info); bg->space_info = space_info; @@ -2212,6 +2223,14 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) ret = check_chunk_block_group_mappings(info); error: btrfs_free_path(path); + /* + * We've hit some error while reading the extent tree, and have + * rescue=ibadroots mount option. + * Try to fill the tree using dummy block groups so that the user can + * continue to mount and grab their data. 
+ */ + if (ret && btrfs_test_opt(info, IGNOREBADROOTS)) + ret = fill_dummy_bgs(info); return ret; } @@ -2244,6 +2263,95 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); } +static int insert_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 chunk_offset, + u64 start, u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; + struct btrfs_path *path; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + int ret; + + WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); + WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); + if (ret) + goto out; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +/* + * This function belongs to phase 2. + * + * See the comment at btrfs_chunk_alloc() for details about the chunk allocation + * phases. 
+ */ +static int insert_dev_extents(struct btrfs_trans_handle *trans, + u64 chunk_offset, u64 chunk_size) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_device *device; + struct extent_map *em; + struct map_lookup *map; + u64 dev_offset; + u64 stripe_size; + int i; + int ret = 0; + + em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + stripe_size = em->orig_block_len; + + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the + * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, + * resulting in persisting a device extent item with such ID. + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); + for (i = 0; i < map->num_stripes; i++) { + device = map->stripes[i].dev; + dev_offset = map->stripes[i].physical; + + ret = insert_dev_extent(trans, device, chunk_offset, dev_offset, + stripe_size); + if (ret) + break; + } + mutex_unlock(&fs_info->fs_devices->device_list_mutex); + + free_extent_map(em); + return ret; +} + /* * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of * chunk allocation. 
@@ -2278,8 +2386,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) btrfs_abort_transaction(trans, ret); } - ret = btrfs_finish_chunk_alloc(trans, block_group->start, - block_group->length); + ret = insert_dev_extents(trans, block_group->start, + block_group->length); if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c652e19ad74e..76ee1452c57b 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -51,6 +51,13 @@ enum { * the file range, inode's io_tree). */ BTRFS_INODE_NO_DELALLOC_FLUSH, + /* + * Set when we are working on enabling verity for a file. Computing and + * writing the whole Merkle tree can take a while so we want to prevent + * races where two separate tasks attempt to simultaneously start verity + * on the same file. + */ + BTRFS_INODE_VERITY_IN_PROGRESS, }; /* in memory btrfs inode */ @@ -189,8 +196,10 @@ struct btrfs_inode { */ u64 csum_bytes; - /* flags field from the on disk inode */ + /* Backwards incompatible flags, lower half of inode_item::flags */ u32 flags; + /* Read-only compatibility flags, upper half of inode_item::flags */ + u32 ro_flags; /* * Counters to keep track of the number of extent item's we may use due @@ -348,6 +357,22 @@ struct btrfs_dio_private { u8 csums[]; }; +/* + * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two + * separate u32s. These two functions convert between the two representations. 
+ */ +static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) +{ + return (flags | ((u64)ro_flags << 32)); +} + +static inline void btrfs_inode_split_flags(u64 inode_item_flags, + u32 *flags, u32 *ro_flags) +{ + *flags = (u32)inode_item_flags; + *ro_flags = (u32)(inode_item_flags >> 32); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 169508609324..86816088927f 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -243,47 +243,6 @@ struct btrfsic_state { u32 datablock_size; }; -static void btrfsic_block_init(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_alloc(void); -static void btrfsic_block_free(struct btrfsic_block *b); -static void btrfsic_block_link_init(struct btrfsic_block_link *n); -static struct btrfsic_block_link *btrfsic_block_link_alloc(void); -static void btrfsic_block_link_free(struct btrfsic_block_link *n); -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void); -static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds); -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b); -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h); -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l); -static struct btrfsic_block_link 
*btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h); -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h); -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, - struct btrfsic_dev_state_hashtable *h); -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices); static int btrfsic_process_metablock(struct btrfsic_state *state, struct btrfsic_block *block, struct btrfsic_block_data_ctx *block_ctx, @@ -313,14 +272,6 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); static int btrfsic_read_block(struct btrfsic_state *state, struct btrfsic_block_data_ctx *block_ctx); -static void btrfsic_dump_database(struct btrfsic_state *state); -static int btrfsic_test_for_metadata(struct btrfsic_state *state, - char **datav, unsigned int num_pages); -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char **mapped_datav, - unsigned int num_pages, - struct bio *bio, int *bio_is_patched, - int submit_bio_bh_rw); static int btrfsic_process_written_superblock( struct btrfsic_state *state, struct btrfsic_block *const block, @@ -1558,10 +1509,8 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) /* Pages must be unmapped in reverse order */ while (num_pages > 0) { num_pages--; - if (block_ctx->datav[num_pages]) 
{ - kunmap_local(block_ctx->datav[num_pages]); + if (block_ctx->datav[num_pages]) block_ctx->datav[num_pages] = NULL; - } if (block_ctx->pagev[num_pages]) { __free_page(block_ctx->pagev[num_pages]); block_ctx->pagev[num_pages] = NULL; @@ -1638,7 +1587,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, i = j; } for (i = 0; i < num_pages; i++) - block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]); + block_ctx->datav[i] = page_address(block_ctx->pagev[i]); return block_ctx->len; } @@ -2703,7 +2652,7 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_for_each_segment(bvec, bio, iter) { BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = kmap_local_page(bvec.bv_page); + mapped_datav[i] = page_address(bvec.bv_page); i++; if (dev_state->state->print_mask & @@ -2716,9 +2665,6 @@ static void __btrfsic_submit_bio(struct bio *bio) mapped_datav, segs, bio, &bio_is_patched, bio->bi_opf); - /* Unmap in reverse order */ - for (--i; i >= 0; i--) - kunmap_local(mapped_datav[i]); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 30d82cdf128c..7869ad12bc6e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -172,10 +172,9 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, /* Hash through the page sector by sector */ for (pg_offset = 0; pg_offset < bytes_left; pg_offset += sectorsize) { - kaddr = kmap_atomic(page); + kaddr = page_address(page); crypto_shash_digest(shash, kaddr + pg_offset, sectorsize, csum); - kunmap_atomic(kaddr); if (memcmp(&csum, cb_sum, csum_size) != 0) { btrfs_print_data_csum_error(inode, disk_start, @@ -565,6 +564,16 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (isize == 0) return 0; + /* + * For current subpage support, we only support 64K page size, + * which means maximum compressed extent size (128K) is just 2x page + * size. 
+ * This makes readahead less effective, so here disable readahead for + * subpage for now, until full compressed write is supported. + */ + if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE) + return 0; + end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; while (last_offset < compressed_end) { @@ -673,6 +682,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, struct page *page; struct bio *comp_bio; u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; + u64 file_offset; u64 em_len; u64 em_start; struct extent_map *em; @@ -682,15 +692,17 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, em_tree = &BTRFS_I(inode)->extent_tree; + file_offset = bio_first_bvec_all(bio)->bv_offset + + page_offset(bio_first_page_all(bio)); + /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, - page_offset(bio_first_page_all(bio)), - fs_info->sectorsize); + em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); if (!em) return BLK_STS_IOERR; + ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) @@ -721,8 +733,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, goto fail1; for (pg_index = 0; pg_index < nr_pages; pg_index++) { - cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | - __GFP_HIGHMEM); + cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); if (!cb->compressed_pages[pg_index]) { faili = pg_index - 1; ret = BLK_STS_RESOURCE; @@ -1261,96 +1272,82 @@ void __cold btrfs_exit_compress(void) } /* - * Copy uncompressed data from working buffer to pages. + * Copy decompressed data from working buffer to pages. 
+ * + * @buf: The decompressed data buffer + * @buf_len: The decompressed data length + * @decompressed: Number of bytes that are already decompressed inside the + * compressed extent + * @cb: The compressed extent descriptor + * @orig_bio: The original bio that the caller wants to read for + * + * An easier to understand graph is like below: + * + * |<- orig_bio ->| |<- orig_bio->| + * |<------- full decompressed extent ----->| + * |<----------- @cb range ---->| + * | |<-- @buf_len -->| + * |<--- @decompressed --->| + * + * Note that, @cb can be a subpage of the full decompressed extent, but + * @cb->start always has the same as the orig_file_offset value of the full + * decompressed extent. * - * buf_start is the byte offset we're of the start of our workspace buffer. + * When reading compressed extent, we have to read the full compressed extent, + * while @orig_bio may only want part of the range. + * Thus this function will ensure only data covered by @orig_bio will be copied + * to. * - * total_out is the last byte of the buffer + * Return 0 if we have copied all needed contents for @orig_bio. + * Return >0 if we need continue decompress. */ -int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, - unsigned long total_out, u64 disk_start, - struct bio *bio) +int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed) { - unsigned long buf_offset; - unsigned long current_buf_start; - unsigned long start_byte; - unsigned long prev_start_byte; - unsigned long working_bytes = total_out - buf_start; - unsigned long bytes; - struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter); - - /* - * start byte is the first byte of the page we're currently - * copying into relative to the start of the compressed data. 
- */ - start_byte = page_offset(bvec.bv_page) - disk_start; - - /* we haven't yet hit data corresponding to this page */ - if (total_out <= start_byte) - return 1; - - /* - * the start of the data we care about is offset into - * the middle of our working buffer - */ - if (total_out > start_byte && buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes -= buf_offset; - } else { - buf_offset = 0; - } - current_buf_start = buf_start; - - /* copy bytes from the working buffer into the pages */ - while (working_bytes > 0) { - bytes = min_t(unsigned long, bvec.bv_len, - PAGE_SIZE - (buf_offset % PAGE_SIZE)); - bytes = min(bytes, working_bytes); - - memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + buf_offset, - bytes); - flush_dcache_page(bvec.bv_page); + struct bio *orig_bio = cb->orig_bio; + /* Offset inside the full decompressed extent */ + u32 cur_offset; + + cur_offset = decompressed; + /* The main loop to do the copy */ + while (cur_offset < decompressed + buf_len) { + struct bio_vec bvec; + size_t copy_len; + u32 copy_start; + /* Offset inside the full decompressed extent */ + u32 bvec_offset; + + bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter); + /* + * cb->start may underflow, but subtracting that value can still + * give us correct offset inside the full decompressed extent. 
+ */ + bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start; - buf_offset += bytes; - working_bytes -= bytes; - current_buf_start += bytes; + /* Haven't reached the bvec range, exit */ + if (decompressed + buf_len <= bvec_offset) + return 1; - /* check if we need to pick another page */ - bio_advance(bio, bytes); - if (!bio->bi_iter.bi_size) - return 0; - bvec = bio_iter_iovec(bio, bio->bi_iter); - prev_start_byte = start_byte; - start_byte = page_offset(bvec.bv_page) - disk_start; + copy_start = max(cur_offset, bvec_offset); + copy_len = min(bvec_offset + bvec.bv_len, + decompressed + buf_len) - copy_start; + ASSERT(copy_len); /* - * We need to make sure we're only adjusting - * our offset into compression working buffer when - * we're switching pages. Otherwise we can incorrectly - * keep copying when we were actually done. + * Extra range check to ensure we didn't go beyond + * @buf + @buf_len. */ - if (start_byte != prev_start_byte) { - /* - * make sure our new page is covered by this - * working buffer - */ - if (total_out <= start_byte) - return 1; + ASSERT(copy_start - decompressed < buf_len); + memcpy_to_page(bvec.bv_page, bvec.bv_offset, + buf + copy_start - decompressed, copy_len); + flush_dcache_page(bvec.bv_page); + cur_offset += copy_len; - /* - * the next page in the biovec might not be adjacent - * to the last page, but it might still be found - * inside this working buffer. 
bump our offset pointer - */ - if (total_out > start_byte && - current_buf_start < start_byte) { - buf_offset = start_byte - buf_start; - working_bytes = total_out - start_byte; - current_buf_start = buf_start + buf_offset; - } - } + bio_advance(orig_bio, copy_len); + /* Finished the bio */ + if (!orig_bio->bi_iter.bi_size) + return 0; } - return 1; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index c359f20920d0..399be0b435bf 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -86,9 +86,8 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, unsigned long *total_out); int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); -int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, - unsigned long total_out, u64 disk_start, - struct bio *bio); +int btrfs_decompress_buf2page(const char *buf, u32 buf_len, + struct compressed_bio *cb, u32 decompressed); blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int len, u64 disk_start, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c5c08c87e130..84627cbd5b5b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -726,21 +726,21 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, /* * search for key in the extent_buffer. The items start at offset p, - * and they are item_size apart. There are 'max' items in p. + * and they are item_size apart. * * the slot in the array is returned via slot, and it points to * the place where you would insert key if it is not found in * the array. 
* - * slot may point to max if the key is bigger than all of the keys + * Slot may point to total number of items if the key is bigger than + * all of the keys */ static noinline int generic_bin_search(struct extent_buffer *eb, unsigned long p, int item_size, - const struct btrfs_key *key, - int max, int *slot) + const struct btrfs_key *key, int *slot) { int low = 0; - int high = max; + int high = btrfs_header_nritems(eb); int ret; const int key_size = sizeof(struct btrfs_disk_key); @@ -799,15 +799,11 @@ int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, if (btrfs_header_level(eb) == 0) return generic_bin_search(eb, offsetof(struct btrfs_leaf, items), - sizeof(struct btrfs_item), - key, btrfs_header_nritems(eb), - slot); + sizeof(struct btrfs_item), key, slot); else return generic_bin_search(eb, offsetof(struct btrfs_node, ptrs), - sizeof(struct btrfs_key_ptr), - key, btrfs_header_nritems(eb), - slot); + sizeof(struct btrfs_key_ptr), key, slot); } static void root_add_used(struct btrfs_root *root, u32 size) @@ -1237,7 +1233,6 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, u64 target; u64 nread = 0; u64 nread_max; - struct extent_buffer *eb; u32 nr; u32 blocksize; u32 nscan = 0; @@ -1266,10 +1261,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, search = btrfs_node_blockptr(node, slot); blocksize = fs_info->nodesize; - eb = find_extent_buffer(fs_info, search); - if (eb) { - free_extent_buffer(eb); - return; + if (path->reada != READA_FORWARD_ALWAYS) { + struct extent_buffer *eb; + + eb = find_extent_buffer(fs_info, search); + if (eb) { + free_extent_buffer(eb); + return; + } } target = search; @@ -2103,6 +2102,27 @@ again: } /* + * Execute search and call btrfs_previous_item to traverse backwards if the item + * was not found. + * + * Return 0 if found, 1 if not found and < 0 if error. 
+ */ +int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + int ret; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret > 0) + ret = btrfs_previous_item(root, path, key->objectid, key->type); + + if (ret == 0) + btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]); + + return ret; +} + +/* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. * This is used after shifting pointers to the left, so it stops @@ -4358,16 +4378,6 @@ next: return 1; } -/* - * search the tree again to find a leaf with greater keys - * returns 0 if it found something or 1 if there are no greater leaves. - * returns < 0 on io errors. - */ -int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) -{ - return btrfs_next_old_leaf(root, path, 0); -} - int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) { diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e5e53e592d4f..dff2c8a3e059 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -281,7 +281,8 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_RO_SUPP \ (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ - BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID) + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ + BTRFS_FEATURE_COMPAT_RO_VERITY) #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL @@ -1012,8 +1013,6 @@ struct btrfs_fs_info { u64 zoned; }; - /* Max size to emit ZONE_APPEND write command */ - u64 max_zone_append_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; @@ -1484,20 +1483,20 @@ do { \ /* * Inode flags */ -#define BTRFS_INODE_NODATASUM (1 << 0) -#define BTRFS_INODE_NODATACOW (1 << 1) -#define BTRFS_INODE_READONLY (1 << 2) -#define BTRFS_INODE_NOCOMPRESS (1 << 3) -#define BTRFS_INODE_PREALLOC (1 << 4) -#define BTRFS_INODE_SYNC (1 << 5) -#define 
BTRFS_INODE_IMMUTABLE (1 << 6) -#define BTRFS_INODE_APPEND (1 << 7) -#define BTRFS_INODE_NODUMP (1 << 8) -#define BTRFS_INODE_NOATIME (1 << 9) -#define BTRFS_INODE_DIRSYNC (1 << 10) -#define BTRFS_INODE_COMPRESS (1 << 11) - -#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) +#define BTRFS_INODE_NODATASUM (1U << 0) +#define BTRFS_INODE_NODATACOW (1U << 1) +#define BTRFS_INODE_READONLY (1U << 2) +#define BTRFS_INODE_NOCOMPRESS (1U << 3) +#define BTRFS_INODE_PREALLOC (1U << 4) +#define BTRFS_INODE_SYNC (1U << 5) +#define BTRFS_INODE_IMMUTABLE (1U << 6) +#define BTRFS_INODE_APPEND (1U << 7) +#define BTRFS_INODE_NODUMP (1U << 8) +#define BTRFS_INODE_NOATIME (1U << 9) +#define BTRFS_INODE_DIRSYNC (1U << 10) +#define BTRFS_INODE_COMPRESS (1U << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) #define BTRFS_INODE_FLAG_MASK \ (BTRFS_INODE_NODATASUM | \ @@ -1514,6 +1513,10 @@ do { \ BTRFS_INODE_COMPRESS | \ BTRFS_INODE_ROOT_ITEM_INIT) +#define BTRFS_INODE_RO_VERITY (1U << 0) + +#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) + struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; @@ -2781,10 +2784,11 @@ enum btrfs_flush_state { FLUSH_DELAYED_REFS = 4, FLUSH_DELALLOC = 5, FLUSH_DELALLOC_WAIT = 6, - ALLOC_CHUNK = 7, - ALLOC_CHUNK_FORCE = 8, - RUN_DELAYED_IPUTS = 9, - COMMIT_TRANS = 10, + FLUSH_DELALLOC_FULL = 7, + ALLOC_CHUNK = 8, + ALLOC_CHUNK_FORCE = 9, + RUN_DELAYED_IPUTS = 10, + COMMIT_TRANS = 11, }; int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, @@ -2901,10 +2905,13 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); } -int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); + +int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path 
*path); + static inline int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *p, u64 time_seq) { @@ -2913,6 +2920,18 @@ static inline int btrfs_next_old_item(struct btrfs_root *root, return btrfs_next_old_leaf(root, p, time_seq); return 0; } + +/* + * Search the tree again to find a leaf with greater keys. + * + * Returns 0 if it found something or 1 if there are no greater leaves. + * Returns < 0 on error. + */ +static inline int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + return btrfs_next_old_leaf(root, path, 0); +} + static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); @@ -3145,7 +3164,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root); + struct btrfs_root *parent_root, + struct user_namespace *mnt_userns); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, @@ -3194,10 +3214,10 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written, struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); +int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, - u64 end, int uptodate); + u64 end, bool uptodate); extern const struct dentry_operations btrfs_dentry_operations; extern const struct iomap_ops btrfs_dio_iomap_ops; extern const struct iomap_dio_ops btrfs_dio_ops; @@ -3686,7 +3706,7 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) /* acl.c */ #ifdef 
CONFIG_BTRFS_FS_POSIX_ACL -struct posix_acl *btrfs_get_acl(struct inode *inode, int type); +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, @@ -3779,6 +3799,30 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } +/* verity.c */ +#ifdef CONFIG_FS_VERITY + +extern const struct fsverity_operations btrfs_verityops; +int btrfs_drop_verity_items(struct btrfs_inode *inode); + +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + +#else + +static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + return 0; +} + +#endif + /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 257c1e18abd4..1e08eb2b27f0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,7 +6,6 @@ #include <linux/slab.h> #include <linux/iversion.h> -#include <linux/sched/mm.h> #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" @@ -672,176 +671,119 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, } /* - * This helper will insert some continuous items into the same leaf according - * to the free space of the leaf. + * Insert a single delayed item or a batch of delayed items that have consecutive + * keys if they exist. 
*/ -static int btrfs_batch_insert_items(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_item *item) +static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_delayed_item *first_item) { - struct btrfs_delayed_item *curr, *next; - int free_space; - int total_size = 0; - struct extent_buffer *leaf; - char *data_ptr; - struct btrfs_key *keys; - u32 *data_size; - struct list_head head; - int slot; + LIST_HEAD(batch); + struct btrfs_delayed_item *curr; + struct btrfs_delayed_item *next; + const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + int total_size; int nitems; - int i; - int ret = 0; - - BUG_ON(!path->nodes[0]); + char *ins_data = NULL; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + int ret; - leaf = path->nodes[0]; - free_space = btrfs_leaf_free_space(leaf); - INIT_LIST_HEAD(&head); + list_add_tail(&first_item->tree_list, &batch); + nitems = 1; + total_size = first_item->data_len + sizeof(struct btrfs_item); + curr = first_item; - next = item; - nitems = 0; + while (true) { + int next_size; - /* - * count the number of the continuous items that we can insert in batch - */ - while (total_size + next->data_len + sizeof(struct btrfs_item) <= - free_space) { - total_size += next->data_len + sizeof(struct btrfs_item); - list_add_tail(&next->tree_list, &head); - nitems++; - - curr = next; next = __btrfs_next_delayed_item(curr); - if (!next) + if (!next || !btrfs_is_continuous_delayed_item(curr, next)) break; - if (!btrfs_is_continuous_delayed_item(curr, next)) + next_size = next->data_len + sizeof(struct btrfs_item); + if (total_size + next_size > max_size) break; - } - if (!nitems) { - ret = 0; - goto out; + list_add_tail(&next->tree_list, &batch); + nitems++; + total_size += next_size; + curr = next; } - keys = kmalloc_array(nitems, sizeof(struct btrfs_key), GFP_NOFS); - if (!keys) { - ret = -ENOMEM; - goto out; - } + if (nitems == 1) { + ins_keys 
= &first_item->key; + ins_sizes = &first_item->data_len; + } else { + int i = 0; - data_size = kmalloc_array(nitems, sizeof(u32), GFP_NOFS); - if (!data_size) { - ret = -ENOMEM; - goto error; + ins_data = kmalloc(nitems * sizeof(u32) + + nitems * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) { + ret = -ENOMEM; + goto out; + } + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32)); + list_for_each_entry(curr, &batch, tree_list) { + ins_keys[i] = curr->key; + ins_sizes[i] = curr->data_len; + i++; + } } - /* get keys of all the delayed items */ - i = 0; - list_for_each_entry(next, &head, tree_list) { - keys[i] = next->key; - data_size[i] = next->data_len; - i++; - } + ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes, + nitems); + if (ret) + goto out; - /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, nitems); + list_for_each_entry(curr, &batch, tree_list) { + char *data_ptr; - /* insert the dir index items */ - slot = path->slots[0]; - list_for_each_entry_safe(curr, next, &head, tree_list) { - data_ptr = btrfs_item_ptr(leaf, slot, char); - write_extent_buffer(leaf, &curr->data, - (unsigned long)data_ptr, - curr->data_len); - slot++; + data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); + write_extent_buffer(path->nodes[0], &curr->data, + (unsigned long)data_ptr, curr->data_len); + path->slots[0]++; + } - btrfs_delayed_item_release_metadata(root, curr); + /* + * Now release our path before releasing the delayed items and their + * metadata reservations, so that we don't block other tasks for more + * time than needed. 
+ */ + btrfs_release_path(path); + list_for_each_entry_safe(curr, next, &batch, tree_list) { list_del(&curr->tree_list); + btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); } - -error: - kfree(data_size); - kfree(keys); out: + kfree(ins_data); return ret; } -/* - * This helper can just do simple insertion that needn't extend item for new - * data, such as directory name index insertion, inode insertion. - */ -static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_delayed_item *delayed_item) -{ - struct extent_buffer *leaf; - unsigned int nofs_flag; - char *ptr; - int ret; - - nofs_flag = memalloc_nofs_save(); - ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key, - delayed_item->data_len); - memalloc_nofs_restore(nofs_flag); - if (ret < 0 && ret != -EEXIST) - return ret; - - leaf = path->nodes[0]; - - ptr = btrfs_item_ptr(leaf, path->slots[0], char); - - write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr, - delayed_item->data_len); - btrfs_mark_buffer_dirty(leaf); - - btrfs_delayed_item_release_metadata(root, delayed_item); - return 0; -} - -/* - * we insert an item first, then if there are some continuous items, we try - * to insert those items into the same leaf. 
- */ static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_root *root, struct btrfs_delayed_node *node) { - struct btrfs_delayed_item *curr, *prev; int ret = 0; -do_again: - mutex_lock(&node->mutex); - curr = __btrfs_first_delayed_insertion_item(node); - if (!curr) - goto insert_end; - - ret = btrfs_insert_delayed_item(trans, root, path, curr); - if (ret < 0) { - btrfs_release_path(path); - goto insert_end; - } + while (ret == 0) { + struct btrfs_delayed_item *curr; - prev = curr; - curr = __btrfs_next_delayed_item(prev); - if (curr && btrfs_is_continuous_delayed_item(prev, curr)) { - /* insert the continuous items into the same leaf */ - path->slots[0]++; - btrfs_batch_insert_items(root, path, curr); + mutex_lock(&node->mutex); + curr = __btrfs_first_delayed_insertion_item(node); + if (!curr) { + mutex_unlock(&node->mutex); + break; + } + ret = btrfs_insert_delayed_item(trans, root, path, curr); + mutex_unlock(&node->mutex); } - btrfs_release_delayed_item(prev); - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_release_path(path); - mutex_unlock(&node->mutex); - goto do_again; - -insert_end: - mutex_unlock(&node->mutex); return ret; } @@ -914,7 +856,6 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_delayed_node *node) { struct btrfs_delayed_item *curr, *prev; - unsigned int nofs_flag; int ret = 0; do_again: @@ -923,9 +864,7 @@ do_again: if (!curr) goto delete_fail; - nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); - memalloc_nofs_restore(nofs_flag); if (ret < 0) goto delete_fail; else if (ret > 0) { @@ -994,7 +933,6 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; - unsigned int nofs_flag; int mod; int ret; @@ -1007,9 +945,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, else 
mod = 1; - nofs_flag = memalloc_nofs_save(); ret = btrfs_lookup_inode(trans, root, path, &key, mod); - memalloc_nofs_restore(nofs_flag); if (ret > 0) ret = -ENOENT; if (ret < 0) @@ -1066,9 +1002,7 @@ search: key.type = BTRFS_INODE_EXTREF_KEY; key.offset = -1; - nofs_flag = memalloc_nofs_save(); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - memalloc_nofs_restore(nofs_flag); if (ret < 0) goto err_out; ASSERT(ret); @@ -1711,6 +1645,8 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, struct inode *inode) { + u64 flags; + btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); @@ -1723,7 +1659,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode_peek_iversion(inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_flags(inode_item, flags); btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, @@ -1781,7 +1719,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) btrfs_stack_inode_sequence(inode_item)); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); - BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); + btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), + &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 98b63ebed539..f1274d5c3805 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -170,6 +170,25 @@ out_free: 
return 0; } +static struct btrfs_dir_item *btrfs_lookup_match_dir( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, const char *name, + int name_len, int mod) +{ + const int ins_len = (mod < 0 ? -1 : 0); + const int cow = (mod != 0); + int ret; + + ret = btrfs_search_slot(trans, root, key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + + return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); +} + /* * lookup a directory item based on name. 'dir' is the objectid * we're searching in, and 'mod' tells us if you plan on deleting the @@ -181,23 +200,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, const char *name, int name_len, int mod) { - int ret; struct btrfs_key key; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; + struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return di; } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, @@ -211,7 +225,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, int slot; struct btrfs_path *path; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -220,20 +233,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - - /* return back any errors */ - if (ret < 0) - goto out; + di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0); + if (IS_ERR(di)) 
{ + ret = PTR_ERR(di); + /* Nothing found, we're safe */ + if (ret == -ENOENT) { + ret = 0; + goto out; + } - /* nothing found, we're safe */ - if (ret > 0) { - ret = 0; - goto out; + if (ret < 0) + goto out; } /* we found an item, look for our name in the item */ - di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len); if (di) { /* our exact name was found */ ret = -EEXIST; @@ -274,21 +287,13 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, u64 objectid, const char *name, int name_len, int mod) { - int ret; struct btrfs_key key; - int ins_len = mod < 0 ? -1 : 0; - int cow = mod != 0; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = objectid; - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); } struct btrfs_dir_item * @@ -345,21 +350,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, const char *name, u16 name_len, int mod) { - int ret; struct btrfs_key key; - int ins_len = mod < 0 ? 
-1 : 0; - int cow = mod != 0; + struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); - ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) + + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; - return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); + return di; } /* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a59ab7b9aea0..355ea88d5c5f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3314,6 +3314,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; + /* + * Flag our filesystem as having big metadata blocks if they are bigger + * than the page size. + */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { + if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) + btrfs_info(fs_info, + "flagging fs with big metadata feature"); + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; + } + + /* Set up fs_info before parsing mount options */ + nodesize = btrfs_super_nodesize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = sectorsize; + fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); + fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); + + fs_info->nodesize = nodesize; + fs_info->sectorsize = sectorsize; + fs_info->sectorsize_bits = ilog2(sectorsize); + fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; + fs_info->stripesize = stripesize; + ret = btrfs_parse_options(fs_info, options, sb->s_flags); if (ret) { err = ret; @@ -3341,30 +3365,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_info(fs_info, "has skinny extents"); /* - * flag our filesystem as having big metadata blocks if - * they are 
bigger than the page size - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - btrfs_info(fs_info, - "flagging fs with big metadata feature"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } - - nodesize = btrfs_super_nodesize(disk_super); - sectorsize = btrfs_super_sectorsize(disk_super); - stripesize = sectorsize; - fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); - fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); - - /* Cache block sizes */ - fs_info->nodesize = nodesize; - fs_info->sectorsize = sectorsize; - fs_info->sectorsize_bits = ilog2(sectorsize); - fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; - fs_info->stripesize = stripesize; - - /* * mixed block groups end up with duplicate but slightly offset * extent buffers for the same range. It leads to corruptions */ @@ -3392,11 +3392,16 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - /* For 4K sector size support, it's only read-only */ - if (PAGE_SIZE == SZ_64K && sectorsize == SZ_4K) { - if (!sb_rdonly(sb) || btrfs_super_log_root(disk_super)) { + if (sectorsize != PAGE_SIZE) { + btrfs_warn(fs_info, + "read-write for sector size %u with page size %lu is experimental", + sectorsize, PAGE_SIZE); + } + if (sectorsize != PAGE_SIZE) { + if (btrfs_super_incompat_flags(fs_info->super_copy) & + BTRFS_FEATURE_INCOMPAT_RAID56) { btrfs_err(fs_info, - "subpage sectorsize %u only supported read-only for page size %lu", + "RAID56 is not yet supported for sector size %u with page size %lu", sectorsize, PAGE_SIZE); err = -EINVAL; goto fail_alloc; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 268ce58d4569..fc3da7585fb7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -153,7 +153,7 @@ search_again: else key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(trans, fs_info->extent_root, 
&key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out_free; @@ -5950,9 +5950,9 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_block_group *cache = NULL; struct btrfs_device *device; - struct list_head *devices; u64 group_trimmed; u64 range_end = U64_MAX; u64 start; @@ -6016,9 +6016,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) btrfs_warn(fs_info, "failed to trim %llu block group(s), last error %d", bg_failed, bg_ret); - mutex_lock(&fs_info->fs_devices->device_list_mutex); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) continue; @@ -6031,7 +6031,7 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) trimmed += group_trimmed; } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); if (dev_failed) btrfs_warn(fs_info, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9e81d25dea70..aaddd7225348 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -13,6 +13,7 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/cleancache.h> +#include <linux/fsverity.h> #include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" @@ -172,6 +173,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio->bi_private = NULL; + /* Caller should ensure the bio has at least some range added */ + ASSERT(bio->bi_iter.bi_size); if (is_data_inode(tree->private_data)) ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, bio_flags); @@ -2245,18 
+2248,6 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, return bitset; } -/* - * helper function to set a given page up to date if all the - * extents in the tree for that page are up to date - */ -static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) -{ - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) - SetPageUptodate(page); -} - int free_io_failure(struct extent_io_tree *failure_tree, struct extent_io_tree *io_tree, struct io_failure_record *rec) @@ -2688,7 +2679,15 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) start + len <= page_offset(page) + PAGE_SIZE); if (uptodate) { - btrfs_page_set_uptodate(fs_info, page, start, len); + if (fsverity_active(page->mapping->host) && + !PageError(page) && + !PageUptodate(page) && + start < i_size_read(page->mapping->host) && + !fsverity_verify_page(page)) { + btrfs_page_set_error(fs_info, page, start, len); + } else { + btrfs_page_set_uptodate(fs_info, page, start, len); + } } else { btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_set_error(fs_info, page, start, len); @@ -2779,7 +2778,7 @@ next: void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { struct btrfs_inode *inode; - int uptodate = (err == 0); + const bool uptodate = (err == 0); int ret = 0; ASSERT(page && page->mapping); @@ -2787,8 +2786,14 @@ void end_extent_writepage(struct page *page, int err, u64 start, u64 end) btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); if (!uptodate) { - ClearPageUptodate(page); - SetPageError(page); + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + u32 len; + + ASSERT(end + 1 - start <= U32_MAX); + len = end + 1 - start; + + btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_page_set_error(fs_info, page, start, len); ret = err < 0 ? 
err : -EIO; mapping_set_error(page->mapping, ret); } @@ -3097,7 +3102,7 @@ readpage_ok: /* Update page status and unlock */ end_page_read(page, uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, uptodate); + start, end, PageUptodate(page)); } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -3153,11 +3158,13 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) return bio; } -struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; struct btrfs_io_bio *btrfs_bio; + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + /* this will never fail when it's backed by a bioset */ bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); @@ -3181,20 +3188,22 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size) * @size: portion of page that we want to write * @prev_bio_flags: flags of previous bio to see if we can merge the current one * @bio_flags: flags of the current bio to see if we can merge them - * @return: true if page was added, false otherwise * * Attempt to add a page to bio considering stripe alignment etc. * - * Return true if successfully page added. Otherwise, return false. + * Return >= 0 for the number of bytes added to the bio. + * Can return 0 if the current bio is already at stripe/zone boundary. + * Return <0 for error. 
*/ -static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, - struct page *page, - u64 disk_bytenr, unsigned int size, - unsigned int pg_offset, - unsigned long bio_flags) +static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, + struct page *page, + u64 disk_bytenr, unsigned int size, + unsigned int pg_offset, + unsigned long bio_flags) { struct bio *bio = bio_ctrl->bio; u32 bio_size = bio->bi_iter.bi_size; + u32 real_size; const sector_t sector = disk_bytenr >> SECTOR_SHIFT; bool contig; int ret; @@ -3203,29 +3212,36 @@ static bool btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, /* The limit should be calculated when bio_ctrl->bio is allocated */ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); if (bio_ctrl->bio_flags != bio_flags) - return false; + return 0; if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; if (!contig) - return false; + return 0; - if (bio_size + size > bio_ctrl->len_to_oe_boundary || - bio_size + size > bio_ctrl->len_to_stripe_boundary) - return false; + real_size = min(bio_ctrl->len_to_oe_boundary, + bio_ctrl->len_to_stripe_boundary) - bio_size; + real_size = min(real_size, size); + + /* + * If real_size is 0, never call bio_add_*_page(), as even size is 0, + * bio will still execute its endio function on the page! 
+ */ + if (real_size == 0) + return 0; if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = bio_add_zone_append_page(bio, page, size, pg_offset); + ret = bio_add_zone_append_page(bio, page, real_size, pg_offset); else - ret = bio_add_page(bio, page, size, pg_offset); + ret = bio_add_page(bio, page, real_size, pg_offset); - return ret == size; + return ret; } static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, - struct btrfs_inode *inode) + struct btrfs_inode *inode, u64 file_offset) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_io_geometry geom; @@ -3266,9 +3282,8 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, return 0; } - ASSERT(fs_info->max_zone_append_size > 0); /* Ordered extent not yet created, so we're good */ - ordered = btrfs_lookup_ordered_extent(inode, logical); + ordered = btrfs_lookup_ordered_extent(inode, file_offset); if (!ordered) { bio_ctrl->len_to_oe_boundary = U32_MAX; return 0; @@ -3280,6 +3295,62 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, return 0; } +static int alloc_new_bio(struct btrfs_inode *inode, + struct btrfs_bio_ctrl *bio_ctrl, + struct writeback_control *wbc, + unsigned int opf, + bio_end_io_t end_io_func, + u64 disk_bytenr, u32 offset, u64 file_offset, + unsigned long bio_flags) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct bio *bio; + int ret; + + /* + * For compressed page range, its disk_bytenr is always @disk_bytenr + * passed in, no matter if we have added any range into previous bio. 
+ */ + if (bio_flags & EXTENT_BIO_COMPRESSED) + bio = btrfs_bio_alloc(disk_bytenr); + else + bio = btrfs_bio_alloc(disk_bytenr + offset); + bio_ctrl->bio = bio; + bio_ctrl->bio_flags = bio_flags; + bio->bi_end_io = end_io_func; + bio->bi_private = &inode->io_tree; + bio->bi_write_hint = inode->vfs_inode.i_write_hint; + bio->bi_opf = opf; + ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); + if (ret < 0) + goto error; + if (wbc) { + struct block_device *bdev; + + bdev = fs_info->fs_devices->latest_bdev; + bio_set_dev(bio, bdev); + wbc_init_bio(wbc, bio); + } + if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *device; + + device = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(device)) { + ret = PTR_ERR(device); + goto error; + } + + btrfs_io_bio(bio)->device = device; + } + return 0; +error: + bio_ctrl->bio = NULL; + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + return ret; +} + /* * @opf: bio REQ_OP_* and REQ_* flags as one value * @wbc: optional writeback control for io accounting @@ -3305,61 +3376,67 @@ static int submit_extent_page(unsigned int opf, bool force_bio_submit) { int ret = 0; - struct bio *bio; - size_t io_size = min_t(size_t, size, PAGE_SIZE); struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - struct extent_io_tree *tree = &inode->io_tree; - struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned int cur = pg_offset; ASSERT(bio_ctrl); ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); - if (bio_ctrl->bio) { - bio = bio_ctrl->bio; - if (force_bio_submit || - !btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, io_size, - pg_offset, bio_flags)) { - ret = submit_one_bio(bio, mirror_num, bio_ctrl->bio_flags); + if (force_bio_submit && bio_ctrl->bio) { + ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); + bio_ctrl->bio = NULL; + if (ret < 0) + return ret; + } + + while (cur < pg_offset + 
size) { + u32 offset = cur - pg_offset; + int added; + + /* Allocate new bio if needed */ + if (!bio_ctrl->bio) { + ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, + end_io_func, disk_bytenr, offset, + page_offset(page) + cur, + bio_flags); + if (ret < 0) + return ret; + } + /* + * We must go through btrfs_bio_add_page() to ensure each + * page range won't cross various boundaries. + */ + if (bio_flags & EXTENT_BIO_COMPRESSED) + added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, + size - offset, pg_offset + offset, + bio_flags); + else + added = btrfs_bio_add_page(bio_ctrl, page, + disk_bytenr + offset, size - offset, + pg_offset + offset, bio_flags); + + /* Metadata page range should never be split */ + if (!is_data_inode(&inode->vfs_inode)) + ASSERT(added == 0 || added == size - offset); + + /* At least we added some page, update the account */ + if (wbc && added) + wbc_account_cgroup_owner(wbc, page, added); + + /* We have reached boundary, submit right now */ + if (added < size - offset) { + /* The bio should contain some page(s) */ + ASSERT(bio_ctrl->bio->bi_iter.bi_size); + ret = submit_one_bio(bio_ctrl->bio, mirror_num, + bio_ctrl->bio_flags); bio_ctrl->bio = NULL; if (ret < 0) return ret; - } else { - if (wbc) - wbc_account_cgroup_owner(wbc, page, io_size); - return 0; } + cur += added; } - - bio = btrfs_bio_alloc(disk_bytenr); - bio_add_page(bio, page, io_size, pg_offset); - bio->bi_end_io = end_io_func; - bio->bi_private = tree; - bio->bi_write_hint = page->mapping->host->i_write_hint; - bio->bi_opf = opf; - if (wbc) { - struct block_device *bdev; - - bdev = fs_info->fs_devices->latest_bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - wbc_account_cgroup_owner(wbc, page, io_size); - } - if (btrfs_is_zoned(fs_info) && bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; - - device = btrfs_zoned_get_device(fs_info, disk_bytenr, io_size); - if (IS_ERR(device)) - return PTR_ERR(device); - - btrfs_io_bio(bio)->device = device; - } - 
- bio_ctrl->bio = bio; - bio_ctrl->bio_flags = bio_flags; - ret = calc_bio_boundaries(bio_ctrl, inode); - - return ret; + return 0; } static int attach_extent_buffer_page(struct extent_buffer *eb, @@ -3488,7 +3565,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = 0; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; ret = set_page_extent_mapped(page); @@ -3519,6 +3595,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } begin_page_read(fs_info, page); while (cur <= end) { + unsigned long this_bio_flag = 0; bool force_bio_submit = false; u64 disk_bytenr; @@ -3627,7 +3704,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* the get_extent function already copied into the page */ if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { - check_page_uptodate(tree, page); unlock_extent(tree, cur, cur + iosize - 1); end_page_read(page, true, cur, iosize); cur = cur + iosize; @@ -3722,14 +3798,9 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, ret = btrfs_run_delalloc_range(inode, page, delalloc_start, delalloc_end, &page_started, nr_written, wbc); if (ret) { - SetPageError(page); - /* - * btrfs_run_delalloc_range should return < 0 for error - * but just in case, we use > 0 here meaning the IO is - * started, so we don't want to return > 0 unless - * things are going well. - */ - return ret < 0 ? 
ret : -EIO; + btrfs_page_set_error(inode->root->fs_info, page, + page_offset(page), PAGE_SIZE); + return ret; } /* * delalloc_end is already one less than the total length, so @@ -3829,9 +3900,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, int *nr_ret) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - u64 cur = start; + u64 cur = page_offset(page); + u64 end = cur + PAGE_SIZE - 1; u64 extent_offset; u64 block_start; struct extent_map *em; @@ -3841,7 +3911,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, const unsigned int write_flags = wbc_to_write_flags(wbc); bool compressed; - ret = btrfs_writepage_cow_fixup(page, start, end); + ret = btrfs_writepage_cow_fixup(page); if (ret) { /* Fixup worker will requeue */ redirty_page_for_writepage(wbc, page); @@ -3865,7 +3935,16 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (cur >= i_size) { btrfs_writepage_endio_finish_ordered(inode, page, cur, - end, 1); + end, true); + /* + * This range is beyond i_size, thus we don't need to + * bother writing back. + * But we still need to clear the dirty subpage bit, or + * the next time the page gets dirtied, we will try to + * writeback the sectors with subpage dirty bits, + * causing writeback without ordered extent. 
+ */ + btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur); break; } @@ -3915,7 +3994,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, nr++; else btrfs_writepage_endio_finish_ordered(inode, - page, cur, cur + iosize - 1, 1); + page, cur, cur + iosize - 1, true); + btrfs_page_clear_dirty(fs_info, page, cur, iosize); cur += iosize; continue; } @@ -3951,6 +4031,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, cur += iosize; nr++; } + /* + * If we finish without problem, we should not only clear page dirty, + * but also empty subpage dirty bits + */ + if (!ret) + btrfs_page_assert_not_dirty(fs_info, page); *nr_ret = nr; return ret; } @@ -3981,7 +4067,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, WARN_ON(!PageLocked(page)); - ClearPageError(page); + btrfs_page_clear_error(btrfs_sb(inode->i_sb), page, + page_offset(page), PAGE_SIZE); pg_offset = offset_in_page(i_size); if (page->index > end_index || @@ -4022,10 +4109,39 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (PageError(page)) { - ret = ret < 0 ? ret : -EIO; + /* + * Here we used to have a check for PageError() and then set @ret and + * call end_extent_writepage(). + * + * But in fact setting @ret here will cause different error paths + * between subpage and regular sectorsize. + * + * For regular page size, we never submit current page, but only add + * current page to current bio. + * The bio submission can only happen in next page. + * Thus if we hit the PageError() branch, @ret is already set to + * non-zero value and will not get updated for regular sectorsize. + * + * But for subpage case, it's possible we submit part of current page, + * thus can get PageError() set by submitted bio of the same page, + * while our @ret is still 0. + * + * So here we unify the behavior and don't set @ret. 
+ * Error can still be properly passed to higher layer as page will + * be set error, here we just don't handle the IO failure. + * + * NOTE: This is just a hotfix for subpage. + * The root fix will be properly ending ordered extent when we hit + * an error during writeback. + * + * But that needs a bigger refactoring, as we not only need to grab the + * submitted OE, but also need to know exactly at which bytenr we hit + * the error. + * Currently the full page based __extent_writepage_io() is not + * capable of that. + */ + if (PageError(page)) end_extent_writepage(page, ret, start, page_end); - } unlock_page(page); ASSERT(ret <= 0); return ret; @@ -4984,7 +5100,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, ret = __extent_writepage(page, &wbc_writepages, &epd); else { btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), - page, start, start + PAGE_SIZE - 1, 1); + page, start, start + PAGE_SIZE - 1, true); unlock_page(page); } put_page(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 62027f551b44..53abdc280451 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -280,7 +280,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct bio *btrfs_bio_alloc(u64 first_byte); struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); -struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index df6631eefc65..2673c6ba7a4e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -233,7 +233,6 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 objectid, u64 offset, int mod) { - int ret; struct btrfs_key file_key; int 
ins_len = mod < 0 ? -1 : 0; int cow = mod != 0; @@ -241,8 +240,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, file_key.objectid = objectid; file_key.offset = offset; file_key.type = BTRFS_EXTENT_DATA_KEY; - ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); - return ret; + + return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); } /* diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ee34497500e1..7ff577005d0f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -16,6 +16,7 @@ #include <linux/btrfs.h> #include <linux/uio.h> #include <linux/iversion.h> +#include <linux/fsverity.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -1340,7 +1341,18 @@ static int prepare_uptodate_page(struct inode *inode, unlock_page(page); return -EIO; } - if (page->mapping != inode->i_mapping) { + + /* + * Since btrfs_readpage() will unlock the page before it + * returns, there is a window where btrfs_releasepage() can be + * called to release the page. Here we check both inode + * mapping and PagePrivate() to make sure the page was not + * released. + * + * The private flag check is essential for subpage as we need + * to store extra bitmap using page->private. 
+ */ + if (page->mapping != inode->i_mapping || !PagePrivate(page)) { unlock_page(page); return -EAGAIN; } @@ -3604,7 +3616,13 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) static int btrfs_file_open(struct inode *inode, struct file *filp) { + int ret; + filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + + ret = fsverity_file_open(inode, filp); + if (ret) + return ret; return generic_file_open(inode, filp); } @@ -3633,6 +3651,9 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; + if (fsverity_active(inode)) + return 0; + if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) return 0; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2131ae5b9ed7..da0eee7c9e5f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -344,19 +344,13 @@ fail: static void readahead_cache(struct inode *inode) { - struct file_ra_state *ra; + struct file_ra_state ra; unsigned long last_index; - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return; - - file_ra_state_init(ra, inode->i_mapping); + file_ra_state_init(&ra, inode->i_mapping); last_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index); - - kfree(ra); + page_cache_sync_readahead(inode->i_mapping, &ra, NULL, 0, last_index); } static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, @@ -2544,6 +2538,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; + const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); spin_lock(&ctl->tree_lock); if (!used) @@ -2573,9 +2568,9 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, /* All the region is now unusable. 
Mark it as unused and reclaim */ if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); - } else if (block_group->zone_unusable >= - div_factor_fine(block_group->length, - fs_info->bg_reclaim_threshold)) { + } else if (bg_reclaim_threshold && + block_group->zone_unusable >= + div_factor_fine(block_group->length, bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } @@ -2652,8 +2647,11 @@ int btrfs_remove_free_space(struct btrfs_block_group *block_group, * btrfs_pin_extent_for_log_replay() when replaying the log. * Advance the pointer not to overwrite the tree-log nodes. */ - if (block_group->alloc_offset < offset + bytes) - block_group->alloc_offset = offset + bytes; + if (block_group->start + block_group->alloc_offset < + offset + bytes) { + block_group->alloc_offset = + offset + bytes - block_group->start; + } return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index bd5689fa290e..487533c35ddb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,6 +32,7 @@ #include <linux/sched/mm.h> #include <linux/iomap.h> #include <asm/unaligned.h> +#include <linux/fsverity.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" @@ -286,9 +287,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = kmap_atomic(cpage); + kaddr = page_address(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr); i++; ptr += cur_size; @@ -490,6 +490,9 @@ static noinline int add_async_extent(struct async_chunk *cow, */ static inline bool inode_can_compress(struct btrfs_inode *inode) { + /* Subpage doesn't support compression yet */ + if (inode->root->fs_info->sectorsize < PAGE_SIZE) + return false; if (inode->flags & BTRFS_INODE_NODATACOW || inode->flags & BTRFS_INODE_NODATASUM) return false; @@ -682,7 +685,11 @@ again: } } cont: - if (start == 0) { + /* + * Check cow_file_range() for why we don't even try to create 
inline + * extent for subpage case. + */ + if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { /* lets try to make an inline extent */ if (ret || total_in < actual_end) { /* we didn't compress the entire range, try @@ -973,7 +980,7 @@ retry: p->mapping = inode->vfs_inode.i_mapping; btrfs_writepage_endio_finish_ordered(inode, p, start, - end, 0); + end, false); p->mapping = NULL; extent_clear_unlock_delalloc(inode, start, end, NULL, 0, @@ -1080,7 +1087,17 @@ static noinline int cow_file_range(struct btrfs_inode *inode, inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - if (start == 0) { + /* + * Due to the page size limit, for subpage we can only trigger the + * writeback for the dirty sectors of page, that means data writeback + * is doing more writeback than what we want. + * + * This is especially unexpected for some call sites like fallocate, + * where we only increase i_size after everything is done. + * This means we can trigger inline extent even if we didn't want to. + * So here we skip inline extent creation completely. 
+ */ + if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, start, end, 0, BTRFS_COMPRESS_NONE, NULL); @@ -1290,11 +1307,6 @@ static noinline void async_cow_submit(struct btrfs_work *work) nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; - /* atomic_sub_return implies a barrier */ - if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < - 5 * SZ_1M) - cond_wake_up_nomb(&fs_info->async_submit_wait); - /* * ->inode could be NULL if async_chunk_start has failed to compress, * in which case we don't have anything to submit, yet we need to @@ -1303,6 +1315,11 @@ static noinline void async_cow_submit(struct btrfs_work *work) */ if (async_chunk->inode) submit_compressed_extents(async_chunk); + + /* atomic_sub_return implies a barrier */ + if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < + 5 * SZ_1M) + cond_wake_up_nomb(&fs_info->async_submit_wait); } static noinline void async_cow_free(struct btrfs_work *work) @@ -1946,6 +1963,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page ret = cow_file_range_async(inode, wbc, locked_page, start, end, page_started, nr_written); } + ASSERT(ret <= 0); if (ret) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); @@ -2285,7 +2303,6 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, struct extent_map *split_mid = NULL; struct extent_map *split_post = NULL; int ret = 0; - int modified; unsigned long flags; /* Sanity check */ @@ -2315,11 +2332,12 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, ASSERT(em->len == len); ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); + ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(!list_empty(&em->list)); flags = em->flags; 
clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &flags); - modified = !list_empty(&em->list); /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; @@ -2333,7 +2351,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, split_pre->compress_type = em->compress_type; split_pre->generation = em->generation; - replace_extent_mapping(em_tree, em, split_pre, modified); + replace_extent_mapping(em_tree, em, split_pre, 1); /* * Now we only have an extent_map at: @@ -2353,7 +2371,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, split_mid->flags = flags; split_mid->compress_type = em->compress_type; split_mid->generation = em->generation; - add_extent_mapping(em_tree, split_mid, modified); + add_extent_mapping(em_tree, split_mid, 1); } if (post) { @@ -2367,7 +2385,7 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, split_post->flags = flags; split_post->compress_type = em->compress_type; split_post->generation = em->generation; - add_extent_mapping(em_tree, split_post, modified); + add_extent_mapping(em_tree, split_post, 1); } /* Once for us */ @@ -2770,7 +2788,7 @@ out_page: * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. 
*/ -int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) +int btrfs_writepage_cow_fixup(struct page *page) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3171,7 +3189,7 @@ static void finish_ordered_fn(struct btrfs_work *work) void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, - u64 end, int uptodate) + u64 end, bool uptodate) { trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); @@ -3257,25 +3275,44 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, return 0; } - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + /* + * For subpage case, above PageChecked is not safe as it's not subpage + * compatible. + * But for now only cow fixup and compressed read utilize PageChecked + * flag, while in this context we can easily use io_bio->csum to + * determine if we really need to do csum verification. + * + * So for now, just exit if io_bio->csum is NULL, as it means it's + * compressed read, and its compressed data csum has already been + * verified. 
+ */ + if (io_bio->csum == NULL) return 0; - if (!root->fs_info->csum_root) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) return 0; - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && - test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { - clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM); + if (!root->fs_info->csum_root) return 0; - } ASSERT(page_offset(page) <= start && end <= page_offset(page) + PAGE_SIZE - 1); for (pg_off = offset_in_page(start); pg_off < offset_in_page(end); pg_off += sectorsize, bio_offset += sectorsize) { + u64 file_offset = pg_off + page_offset(page); int ret; + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, 1, NULL)) { + /* Skip the range without csum for data reloc inode */ + clear_extent_bits(io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM); + continue; + } ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, page_offset(page) + pg_off); if (ret < 0) { @@ -3520,7 +3557,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* * If we have an inode with links, there are a couple of - * possibilities. Old kernels (before v3.12) used to create an + * possibilities: + * + * 1. We were halfway through creating fsverity metadata for the + * file. In that case, the orphan item represents incomplete + * fsverity metadata which must be cleaned up with + * btrfs_drop_verity_items and deleting the orphan item. + + * 2. Old kernels (before v3.12) used to create an * orphan item for truncate indicating that there were possibly * extent items past i_size that needed to be deleted. In v3.12, * truncate was changed to update i_size in sync with the extent @@ -3538,8 +3582,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * but either way, we can delete the orphan item. 
*/ if (ret == -ENOENT || inode->i_nlink) { - if (!ret) + if (!ret) { + ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); + if (ret) + goto out; + } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -3728,7 +3776,8 @@ static int btrfs_read_locked_inode(struct inode *inode, rdev = btrfs_inode_rdev(leaf, inode_item); BTRFS_I(inode)->index_cnt = (u64)-1; - BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), + &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); cache_index: /* @@ -3859,6 +3908,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, struct inode *inode) { struct btrfs_map_token token; + u64 flags; btrfs_init_map_token(&token, leaf); @@ -3894,7 +3944,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); - btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } @@ -5088,15 +5140,13 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, int ret; /* - * Still need to make sure the inode looks like it's been updated so - * that any holes get logged if we fsync. + * If NO_HOLES is enabled, we don't need to do anything. + * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() + * or btrfs_update_inode() will be called, which guarantee that the next + * fsync will know this inode was changed and needs to be logged. 
*/ - if (btrfs_fs_incompat(fs_info, NO_HOLES)) { - inode->last_trans = fs_info->generation; - inode->last_sub_trans = root->log_transid; - inode->last_log_commit = root->last_log_commit; + if (btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; - } /* * 1 - for the one we're dropping @@ -5342,7 +5392,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(&init_user_ns, dentry, attr); + err = setattr_prepare(mnt_userns, dentry, attr); if (err) return err; @@ -5353,13 +5403,12 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr } if (attr->ia_valid) { - setattr_copy(&init_user_ns, inode, attr); + setattr_copy(mnt_userns, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(&init_user_ns, inode, - inode->i_mode); + err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); } return err; @@ -5522,6 +5571,7 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); if (!root) { + fsverity_cleanup_inode(inode); clear_inode(inode); return; } @@ -5604,6 +5654,7 @@ no_delete: * to retry these periodically in the future. 
*/ btrfs_remove_delayed_node(BTRFS_I(inode)); + fsverity_cleanup_inode(inode); clear_inode(inode); } @@ -6370,6 +6421,7 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct user_namespace *mnt_userns, struct inode *dir, const char *name, int name_len, u64 ref_objectid, u64 objectid, @@ -6479,7 +6531,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail_unlock; - inode_init_owner(&init_user_ns, inode, dir, mode); + inode_init_owner(mnt_userns, inode, dir, mode); inode_set_bytes(inode, 0); inode->i_mtime = current_time(inode); @@ -6664,9 +6716,9 @@ static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_unlock; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, - mode, &index); + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; @@ -6728,9 +6780,9 @@ static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_unlock; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, - mode, &index); + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; @@ -6873,8 +6925,9 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_fail; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid, + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + 
btrfs_ino(BTRFS_I(dir)), objectid, S_IFDIR | mode, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); @@ -8194,9 +8247,10 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return dip; } -static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, +static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { + struct inode *inode = iter->inode; const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & @@ -8206,13 +8260,13 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, u64 start_sector; int async_submit = 0; u64 submit_len; - int clone_offset = 0; - int clone_len; + u64 clone_offset = 0; + u64 clone_len; u64 logical; int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iomap->private; + struct btrfs_dio_data *dio_data = iter->iomap.private; struct extent_map *em = NULL; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); @@ -8255,9 +8309,9 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, status = errno_to_blk_status(ret); goto out_err_em; } - ASSERT(geom.len <= INT_MAX); - clone_len = min_t(int, submit_len, geom.len); + clone_len = min(submit_len, geom.len); + ASSERT(clone_len <= UINT_MAX); /* * This will never fail as it's passing GPF_NOFS and @@ -8401,11 +8455,47 @@ static void btrfs_readahead(struct readahead_control *rac) extent_readahead(rac); } +/* + * For releasepage() and invalidatepage() we have a race window where + * end_page_writeback() is called but the subpage spinlock is not yet released. + * If we continue to release/invalidate the page, we could cause use-after-free + * for subpage spinlock. So this function is to spin and wait for subpage + * spinlock. 
+ */ +static void wait_subpage_spinlock(struct page *page) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_subpage *subpage; + + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + /* + * This may look insane as we just acquire the spinlock and release it, + * without doing anything. But we just want to make sure no one is + * still holding the subpage spinlock. + * And since the page is not dirty nor writeback, and we have page + * locked, the only possible way to hold a spinlock is from the endio + * function to clear page writeback. + * + * Here we just acquire the spinlock so that all existing callers + * should exit and we're safe to release/invalidate the page. + */ + spin_lock_irq(&subpage->lock); + spin_unlock_irq(&subpage->lock); +} + static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) { int ret = try_release_extent_mapping(page, gfp_flags); - if (ret == 1) + + if (ret == 1) { + wait_subpage_spinlock(page); clear_page_extent_mapped(page); + } return ret; } @@ -8469,6 +8559,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * do double ordered extent accounting on the same page. 
*/ wait_on_page_writeback(page); + wait_subpage_spinlock(page); /* * For subpage case, we have call sites like @@ -8557,7 +8648,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, spin_unlock_irq(&inode->ordered_tree.lock); if (btrfs_dec_test_ordered_pending(inode, &ordered, - cur, range_end + 1 - cur, 1)) { + cur, range_end + 1 - cur)) { btrfs_finish_ordered_io(ordered); /* * The ordered extent has finished, now we're again @@ -8938,7 +9029,8 @@ out: */ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, - struct btrfs_root *parent_root) + struct btrfs_root *parent_root, + struct user_namespace *mnt_userns) { struct inode *inode; int err; @@ -8949,7 +9041,8 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, if (err < 0) return err; - inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, ino, ino, + inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, + ino, ino, S_IFDIR | (~current_umask() & S_IRWXUGO), &index); if (IS_ERR(inode)) @@ -8993,6 +9086,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; + ei->ro_flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->dir_index = 0; @@ -9174,6 +9268,7 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; u32 bi_flags = BTRFS_I(inode)->flags; + u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; @@ -9186,13 +9281,15 @@ static int btrfs_getattr(struct user_namespace *mnt_userns, stat->attributes |= STATX_ATTR_IMMUTABLE; if (bi_flags & BTRFS_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; + if (bi_ro_flags & BTRFS_INODE_RO_VERITY) + stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE | 
STATX_ATTR_NODUMP); - generic_fillattr(&init_user_ns, inode, stat); + generic_fillattr(mnt_userns, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -9280,8 +9377,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(root); - root_log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9298,8 +9393,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(dest); - dest_log_pinned = true; ret = btrfs_insert_inode_ref(trans, root, old_dentry->d_name.name, old_dentry->d_name.len, @@ -9330,6 +9423,29 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode), 1); } + /* + * Now pin the logs of the roots. We do it to ensure that no other task + * can sync the logs while we are in progress with the rename, because + * that could result in an inconsistency in case any of the inodes that + * are part of this rename operation were logged before. + * + * We pin the logs even if at this precise moment none of the inodes was + * logged before. This is because right after we checked for that, some + * other task fsyncing some other inode not involved with this rename + * operation could log that one of our inodes exists. + * + * We don't need to pin the logs before the above calls to + * btrfs_insert_inode_ref(), since those don't ever need to change a log. 
+ */ + if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_pin_log_trans(root); + root_log_pinned = true; + } + if (new_ino != BTRFS_FIRST_FREE_OBJECTID) { + btrfs_pin_log_trans(dest); + dest_log_pinned = true; + } + /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); @@ -9411,8 +9527,7 @@ out_fail: if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) || btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) || - (new_inode && - btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation))) + btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)) btrfs_set_log_full_commit(trans); if (root_log_pinned) { @@ -9436,6 +9551,7 @@ out_notrans: static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry) { @@ -9448,7 +9564,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, if (ret) return ret; - inode = btrfs_new_inode(trans, root, dir, + inode = btrfs_new_inode(trans, root, mnt_userns, dir, dentry->d_name.name, dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), @@ -9485,9 +9601,10 @@ out: return ret; } -static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int btrfs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; @@ -9582,8 +9699,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - btrfs_pin_log_trans(root); - log_pinned = true; ret = btrfs_insert_inode_ref(trans, dest, new_dentry->d_name.name, new_dentry->d_name.len, @@ -9607,6 +9722,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { + /* + * Now pin the log. We do it to ensure that no other task can + * sync the log while we are in progress with the rename, as + * that could result in an inconsistency in case any of the + * inodes that are part of this rename operation were logged + * before. + * + * We pin the log even if at this precise moment none of the + * inodes was logged before. This is because right after we + * checked for that, some other task fsyncing some other inode + * not involved with this rename operation could log that one of + * our inodes exists. + * + * We don't need to pin the logs before the above call to + * btrfs_insert_inode_ref(), since that does not need to change + * a log. 
+ */ + btrfs_pin_log_trans(root); + log_pinned = true; ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, @@ -9660,8 +9794,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (flags & RENAME_WHITEOUT) { - ret = btrfs_whiteout_for_rename(trans, root, old_dir, - old_dentry); + ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, + old_dir, old_dentry); if (ret) { btrfs_abort_transaction(trans, ret); @@ -9711,7 +9845,8 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di return btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); - return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); + return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + new_dentry, flags); } struct btrfs_delalloc_work { @@ -9808,11 +9943,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = sync_inode(inode, wbc); - if (!ret && - test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = sync_inode(inode, wbc); + ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; @@ -9947,9 +10078,10 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) goto out_unlock; - inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, - dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), - objectid, S_IFLNK|S_IRWXUGO, &index); + inode = btrfs_new_inode(trans, root, mnt_userns, dir, + dentry->d_name.name, dentry->d_name.len, + btrfs_ino(BTRFS_I(dir)), objectid, + S_IFLNK | S_IRWXUGO, &index); if (IS_ERR(inode)) { err = PTR_ERR(inode); inode = NULL; @@ -10273,7 +10405,7 @@ static int btrfs_permission(struct user_namespace *mnt_userns, if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } - return 
generic_permission(&init_user_ns, inode, mask); + return generic_permission(mnt_userns, inode, mask); } static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, @@ -10298,7 +10430,7 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, if (ret) goto out; - inode = btrfs_new_inode(trans, root, dir, NULL, 0, + inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); if (IS_ERR(inode)) { ret = PTR_ERR(inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0ba98e08a029..cc61813213d8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -27,6 +27,7 @@ #include <linux/uaccess.h> #include <linux/iversion.h> #include <linux/fileattr.h> +#include <linux/fsverity.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -103,9 +104,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. 
*/ -static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags) +static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) { unsigned int iflags = 0; + u32 flags = binode->flags; + u32 ro_flags = binode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -121,6 +124,8 @@ static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags) iflags |= FS_DIRSYNC_FL; if (flags & BTRFS_INODE_NODATACOW) iflags |= FS_NOCOW_FL; + if (ro_flags & BTRFS_INODE_RO_VERITY) + iflags |= FS_VERITY_FL; if (flags & BTRFS_INODE_NOCOMPRESS) iflags |= FS_NOCOMP_FL; @@ -148,10 +153,12 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) new_fl |= S_NOATIME; if (binode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; + if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + new_fl |= S_VERITY; set_mask_bits(&inode->i_flags, - S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC, - new_fl); + S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | + S_VERITY, new_fl); } /* @@ -200,7 +207,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); - fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags)); + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); return 0; } @@ -224,7 +231,7 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, return -EOPNOTSUPP; fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(binode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; @@ -492,8 +499,8 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) return 1; } -static noinline int create_subvol(struct inode *dir, - struct dentry *dentry, +static noinline int create_subvol(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *name, int namelen, struct btrfs_qgroup_inherit *inherit) { @@ -638,7 +645,7 @@ 
static noinline int create_subvol(struct inode *dir, goto fail; } - ret = btrfs_create_subvol_root(trans, new_root, root); + ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns); btrfs_put_root(new_root); if (ret) { /* We potentially lose an unused inode item here */ @@ -830,7 +837,8 @@ free_pending: * nfs_async_unlink(). */ -static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) +static int btrfs_may_delete(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *victim, int isdir) { int error; @@ -840,12 +848,12 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(&init_user_ns, dir, d_inode(victim)) || + if (check_sticky(mnt_userns, dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; @@ -864,13 +872,16 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) } /* copy of may_create in fs/namei.c() */ -static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +static inline int btrfs_may_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); + if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns)) + return -EOVERFLOW; + return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); } /* @@ -879,6 +890,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * inside this filesystem so it's quite a bit simpler. 
*/ static noinline int btrfs_mksubvol(const struct path *parent, + struct user_namespace *mnt_userns, const char *name, int namelen, struct btrfs_root *snap_src, bool readonly, @@ -893,12 +905,12 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one_len(name, parent->dentry, namelen); + dentry = lookup_one(mnt_userns, name, parent->dentry, namelen); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; - error = btrfs_may_create(dir, dentry); + error = btrfs_may_create(mnt_userns, dir, dentry); if (error) goto out_dput; @@ -920,7 +932,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(dir, dentry, name, namelen, inherit); + error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -934,6 +946,7 @@ out_unlock: } static noinline int btrfs_mksnapshot(const struct path *parent, + struct user_namespace *mnt_userns, const char *name, int namelen, struct btrfs_root *root, bool readonly, @@ -963,7 +976,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - ret = btrfs_mksubvol(parent, name, namelen, + ret = btrfs_mksubvol(parent, mnt_userns, name, namelen, root, readonly, inherit); out: if (snapshot_force_cow) @@ -1792,6 +1805,7 @@ out_drop: } static noinline int __btrfs_ioctl_snap_create(struct file *file, + struct user_namespace *mnt_userns, const char *name, unsigned long fd, int subvol, bool readonly, struct btrfs_qgroup_inherit *inherit) @@ -1819,8 +1833,8 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, } if (subvol) { - ret = btrfs_mksubvol(&file->f_path, name, namelen, - NULL, readonly, inherit); + ret = btrfs_mksubvol(&file->f_path, mnt_userns, name, + namelen, NULL, readonly, inherit); } else { struct fd src = 
fdget(fd); struct inode *src_inode; @@ -1834,16 +1848,17 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; - } else if (!inode_owner_or_capable(&init_user_ns, src_inode)) { + } else if (!inode_owner_or_capable(mnt_userns, src_inode)) { /* * Subvolume creation is not restricted, but snapshots * are limited to own subvolumes only */ ret = -EPERM; } else { - ret = btrfs_mksnapshot(&file->f_path, name, namelen, - BTRFS_I(src_inode)->root, - readonly, inherit); + ret = btrfs_mksnapshot(&file->f_path, mnt_userns, + name, namelen, + BTRFS_I(src_inode)->root, + readonly, inherit); } fdput(src); } @@ -1867,8 +1882,9 @@ static noinline int btrfs_ioctl_snap_create(struct file *file, return PTR_ERR(vol_args); vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, - subvol, false, NULL); + ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + vol_args->name, vol_args->fd, subvol, + false, NULL); kfree(vol_args); return ret; @@ -1926,8 +1942,9 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, } } - ret = __btrfs_ioctl_snap_create(file, vol_args->name, vol_args->fd, - subvol, readonly, inherit); + ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file), + vol_args->name, vol_args->fd, subvol, + readonly, inherit); if (ret) goto free_inherit; free_inherit: @@ -1971,7 +1988,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(file_mnt_user_ns(file), inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -2382,23 +2399,16 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, key.offset = (u64)-1; while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) 
goto out; else if (ret > 0) { - ret = btrfs_previous_item(root, path, dirid, - BTRFS_INODE_REF_KEY); - if (ret < 0) - goto out; - else if (ret > 0) { - ret = -ENOENT; - goto out; - } + ret = -ENOENT; + goto out; } l = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(l, &key, slot); iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(l, iref); @@ -2429,7 +2439,8 @@ out: return ret; } -static int btrfs_search_path_in_tree_user(struct inode *inode, +static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns, + struct inode *inode, struct btrfs_ioctl_ino_lookup_user_args *args) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -2473,23 +2484,16 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { + ret = btrfs_search_backwards(root, &key, path); + if (ret < 0) + goto out_put; + else if (ret > 0) { + ret = -ENOENT; goto out_put; - } else if (ret > 0) { - ret = btrfs_previous_item(root, path, dirid, - BTRFS_INODE_REF_KEY); - if (ret < 0) { - goto out_put; - } else if (ret > 0) { - ret = -ENOENT; - goto out_put; - } } leaf = path->nodes[0]; slot = path->slots[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref); len = btrfs_inode_ref_name_len(leaf, iref); @@ -2527,7 +2531,7 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(&init_user_ns, temp_inode, + ret = inode_permission(mnt_userns, temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { @@ -2669,7 +2673,7 @@ static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp) return -EACCES; } - ret = btrfs_search_path_in_tree_user(inode, args); + ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args); if (ret == 0 && 
copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2905,6 +2909,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_root *dest = NULL; struct btrfs_ioctl_vol_args *vol_args = NULL; struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; + struct user_namespace *mnt_userns = file_mnt_user_ns(file); char *subvol_name, *subvol_name_ptr = NULL; int subvol_namelen; int err = 0; @@ -2932,6 +2937,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (err) goto out; } else { + struct inode *old_dir; + if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out; @@ -2968,6 +2975,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = PTR_ERR(parent); goto out_drop_write; } + old_dir = dir; dir = d_inode(parent); /* @@ -2978,6 +2986,20 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, */ destroy_parent = true; + /* + * On idmapped mounts, deletion via subvolid is + * restricted to subvolumes that are immediate + * ancestors of the inode referenced by the file + * descriptor in the ioctl. Otherwise the idmapping + * could potentially be abused to delete subvolumes + * anywhere in the filesystem the user wouldn't be able + * to delete without an idmapped mount. 
+ */ + if (old_dir != dir && mnt_userns != &init_user_ns) { + err = -EOPNOTSUPP; + goto free_parent; + } + subvol_name_ptr = btrfs_get_subvol_name_from_objectid( fs_info, vol_args2->subvolid); if (IS_ERR(subvol_name_ptr)) { @@ -3016,7 +3038,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (err == -EINTR) goto free_subvol_name; - dentry = lookup_one_len(subvol_name, parent, subvol_namelen); + dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock_dir; @@ -3058,14 +3080,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (root == dest) goto out_dput; - err = inode_permission(&init_user_ns, inode, - MAY_WRITE | MAY_EXEC); + err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; } /* check if subvolume may be deleted by a user */ - err = btrfs_may_delete(dir, dentry, 1); + err = btrfs_may_delete(mnt_userns, dir, dentry, 1); if (err) goto out_dput; @@ -3103,7 +3124,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_ioctl_defrag_range_args *range; + struct btrfs_ioctl_defrag_range_args range = {0}; int ret; ret = mnt_want_write_file(file); @@ -3115,6 +3136,12 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } + /* Subpage defrag will be supported in later commits */ + if (root->fs_info->sectorsize < PAGE_SIZE) { + ret = -ENOTTY; + goto out; + } + switch (inode->i_mode & S_IFMT) { case S_IFDIR: if (!capable(CAP_SYS_ADMIN)) { @@ -3135,33 +3162,24 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } - range = kzalloc(sizeof(*range), GFP_KERNEL); - if (!range) { - ret = -ENOMEM; - goto out; - } - if (argp) { - if (copy_from_user(range, argp, - sizeof(*range))) { + if 
(copy_from_user(&range, argp, sizeof(range))) { ret = -EFAULT; - kfree(range); goto out; } /* compression requires us to start the IO */ - if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { - range->flags |= BTRFS_DEFRAG_RANGE_START_IO; - range->extent_thresh = (u32)-1; + if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + range.flags |= BTRFS_DEFRAG_RANGE_START_IO; + range.extent_thresh = (u32)-1; } } else { /* the rest are all set to zero by kzalloc */ - range->len = (u64)-1; + range.len = (u64)-1; } ret = btrfs_defrag_file(file_inode(file), file, - range, BTRFS_OLDEST_GENERATION, 0); + &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; - kfree(range); break; default: ret = -EINVAL; @@ -3205,6 +3223,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; + struct block_device *bdev = NULL; + fmode_t mode; int ret; bool cancel = false; @@ -3237,9 +3257,9 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) /* Exclusive operation is now claimed */ if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) - ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); + ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode); else - ret = btrfs_rm_device(fs_info, vol_args->name, 0); + ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); btrfs_exclop_finish(fs_info); @@ -3255,6 +3275,8 @@ out: kfree(vol_args); err_drop: mnt_drop_write_file(file); + if (bdev) + blkdev_put(bdev, mode); return ret; } @@ -3263,6 +3285,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; + struct block_device *bdev = NULL; + fmode_t mode; int ret; bool cancel; @@ -3284,7 +3308,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user 
*arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, vol_args->name, 0); + ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); @@ -3293,7 +3317,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) kfree(vol_args); out_drop_write: mnt_drop_write_file(file); - + if (bdev) + blkdev_put(bdev, mode); return ret; } @@ -4404,25 +4429,20 @@ drop_write: static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_quota_rescan_args *qsa; + struct btrfs_ioctl_quota_rescan_args qsa = {0}; int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - qsa = kzalloc(sizeof(*qsa), GFP_KERNEL); - if (!qsa) - return -ENOMEM; - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - qsa->flags = 1; - qsa->progress = fs_info->qgroup_rescan_progress.objectid; + qsa.flags = 1; + qsa.progress = fs_info->qgroup_rescan_progress.objectid; } - if (copy_to_user(arg, qsa, sizeof(*qsa))) + if (copy_to_user(arg, &qsa, sizeof(qsa))) ret = -EFAULT; - kfree(qsa); return ret; } @@ -4436,6 +4456,7 @@ static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, } static long _btrfs_ioctl_set_received_subvol(struct file *file, + struct user_namespace *mnt_userns, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); @@ -4447,7 +4468,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; - if (!inode_owner_or_capable(&init_user_ns, inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -4552,7 +4573,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, args64->rtime.nsec = args32->rtime.nsec; args64->flags = args32->flags; - ret = _btrfs_ioctl_set_received_subvol(file, 
args64); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64); if (ret) goto out; @@ -4586,7 +4607,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, if (IS_ERR(sa)) return PTR_ERR(sa); - ret = _btrfs_ioctl_set_received_subvol(file, sa); + ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa); if (ret) goto out; @@ -5013,6 +5034,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_subvol_rootref(file, argp); case BTRFS_IOC_INO_LOOKUP_USER: return btrfs_ioctl_ino_lookup_user(file, argp); + case FS_IOC_ENABLE_VERITY: + return fsverity_ioctl_enable(file, (const void __user *)argp); + case FS_IOC_MEASURE_VERITY: + return fsverity_ioctl_measure(file, argp); } return -ENOTTY; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index cd042c7567a4..c25dfd1a8a54 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -14,6 +14,7 @@ #include <linux/lzo.h> #include <linux/refcount.h> #include "compression.h" +#include "ctree.h" #define LZO_LEN 4 @@ -140,18 +141,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = 0; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = page_address(in_page); /* * store the size of all chunks of compressed data in * the first 4 bytes */ - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); out_offset = LZO_LEN; tot_out = LZO_LEN; pages[0] = out_page; @@ -209,19 +210,18 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, if (out_len == 0 && tot_in >= len) break; - kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = 
kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages++] = out_page; pg_bytes_left = PAGE_SIZE; @@ -243,12 +243,11 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, break; bytes_left = len - tot_in; - kunmap(in_page); put_page(in_page); start += PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = page_address(in_page); in_len = min(bytes_left, PAGE_SIZE); } @@ -258,164 +257,130 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* store the size of all chunks of compressed data */ - sizes_ptr = kmap_local_page(pages[0]); + sizes_ptr = page_address(pages[0]); write_compress_length(sizes_ptr, tot_out); - kunmap_local(sizes_ptr); ret = 0; *total_out = tot_out; *total_in = tot_in; out: *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); - } return ret; } +/* + * Copy the compressed segment payload into @dest. + * + * For the payload there will be no padding, just need to do page switching. 
+ */ +static void copy_compressed_segment(struct compressed_bio *cb, + char *dest, u32 len, u32 *cur_in) +{ + u32 orig_in = *cur_in; + + while (*cur_in < orig_in + len) { + struct page *cur_page; + u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), + orig_in + len - *cur_in); + + ASSERT(copy_len); + cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + + memcpy(dest + *cur_in - orig_in, + page_address(cur_page) + offset_in_page(*cur_in), + copy_len); + + *cur_in += copy_len; + } +} + int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); - int ret = 0, ret2; - char *data_in; - unsigned long page_in_index = 0; - size_t srclen = cb->compressed_len; - unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); - unsigned long buf_start; - unsigned long buf_offset = 0; - unsigned long bytes; - unsigned long working_bytes; - size_t in_len; - size_t out_len; - const size_t max_segment_len = lzo1x_worst_compress(PAGE_SIZE); - unsigned long in_offset; - unsigned long in_page_bytes_left; - unsigned long tot_in; - unsigned long tot_out; - unsigned long tot_len; - char *buf; - bool may_late_unmap, need_unmap; - struct page **pages_in = cb->compressed_pages; - u64 disk_start = cb->start; - struct bio *orig_bio = cb->orig_bio; + const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + const u32 sectorsize = fs_info->sectorsize; + int ret; + /* Compressed data length, can be unaligned */ + u32 len_in; + /* Offset inside the compressed data */ + u32 cur_in = 0; + /* Bytes decompressed so far */ + u32 cur_out = 0; + + len_in = read_compress_length(page_address(cb->compressed_pages[0])); + cur_in += LZO_LEN; - data_in = kmap(pages_in[0]); - tot_len = read_compress_length(data_in); /* - * Compressed data header check. 
+ * LZO header length check * - * The real compressed size can't exceed the maximum extent length, and - * all pages should be used (whole unused page with just the segment - * header is not possible). If this happens it means the compressed - * extent is corrupted. + * The total length should not exceed the maximum extent length, + * and all sectors should be used. + * If this happens, it means the compressed extent is corrupted. */ - if (tot_len > min_t(size_t, BTRFS_MAX_COMPRESSED, srclen) || - tot_len < srclen - PAGE_SIZE) { - ret = -EUCLEAN; - goto done; + if (len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || + round_up(len_in, sectorsize) < cb->compressed_len) { + btrfs_err(fs_info, + "invalid lzo header, lzo len %u compressed len %u", + len_in, cb->compressed_len); + return -EUCLEAN; } - tot_in = LZO_LEN; - in_offset = LZO_LEN; - in_page_bytes_left = PAGE_SIZE - LZO_LEN; - - tot_out = 0; - - while (tot_in < tot_len) { - in_len = read_compress_length(data_in + in_offset); - in_page_bytes_left -= LZO_LEN; - in_offset += LZO_LEN; - tot_in += LZO_LEN; + /* Go through each lzo segment */ + while (cur_in < len_in) { + struct page *cur_page; + /* Length of the compressed segment */ + u32 seg_len; + u32 sector_bytes_left; + size_t out_len = lzo1x_worst_compress(sectorsize); /* - * Segment header check. - * - * The segment length must not exceed the maximum LZO - * compression size, nor the total compressed size. + * We should always have enough space for one segment header + * inside current sector. 
*/ - if (in_len > max_segment_len || tot_in + in_len > tot_len) { - ret = -EUCLEAN; - goto done; - } - - tot_in += in_len; - working_bytes = in_len; - may_late_unmap = need_unmap = false; - - /* fast path: avoid using the working buffer */ - if (in_page_bytes_left >= in_len) { - buf = data_in + in_offset; - bytes = in_len; - may_late_unmap = true; - goto cont; - } - - /* copy bytes from the pages into the working buffer */ - buf = workspace->cbuf; - buf_offset = 0; - while (working_bytes) { - bytes = min(working_bytes, in_page_bytes_left); - - memcpy(buf + buf_offset, data_in + in_offset, bytes); - buf_offset += bytes; -cont: - working_bytes -= bytes; - in_page_bytes_left -= bytes; - in_offset += bytes; - - /* check if we need to pick another page */ - if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN) - || in_page_bytes_left == 0) { - tot_in += in_page_bytes_left; - - if (working_bytes == 0 && tot_in >= tot_len) - break; - - if (page_in_index + 1 >= total_pages_in) { - ret = -EIO; - goto done; - } - - if (may_late_unmap) - need_unmap = true; - else - kunmap(pages_in[page_in_index]); - - data_in = kmap(pages_in[++page_in_index]); - - in_page_bytes_left = PAGE_SIZE; - in_offset = 0; - } - } - - out_len = max_segment_len; - ret = lzo1x_decompress_safe(buf, in_len, workspace->buf, - &out_len); - if (need_unmap) - kunmap(pages_in[page_in_index - 1]); + ASSERT(cur_in / sectorsize == + (cur_in + LZO_LEN - 1) / sectorsize); + cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; + ASSERT(cur_page); + seg_len = read_compress_length(page_address(cur_page) + + offset_in_page(cur_in)); + cur_in += LZO_LEN; + + /* Copy the compressed segment payload into workspace */ + copy_compressed_segment(cb, workspace->cbuf, seg_len, &cur_in); + + /* Decompress the data */ + ret = lzo1x_decompress_safe(workspace->cbuf, seg_len, + workspace->buf, &out_len); if (ret != LZO_E_OK) { - pr_warn("BTRFS: decompress failed\n"); + btrfs_err(fs_info, "failed to decompress"); ret = -EIO; - 
break; + goto out; } - buf_start = tot_out; - tot_out += out_len; + /* Copy the data into inode pages */ + ret = btrfs_decompress_buf2page(workspace->buf, out_len, cb, cur_out); + cur_out += out_len; - ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, - tot_out, disk_start, orig_bio); - if (ret2 == 0) - break; + /* All data read, exit */ + if (ret == 0) + goto out; + ret = 0; + + /* Check if the sector has enough space for a segment header */ + sector_bytes_left = sectorsize - (cur_in % sectorsize); + if (sector_bytes_left >= LZO_LEN) + continue; + + /* Skip the padding zeros */ + cur_in += sector_bytes_left; } -done: - kunmap(pages_in[page_in_index]); +out: if (!ret) - zero_fill_bio(orig_bio); + zero_fill_bio(cb->orig_bio); return ret; } @@ -466,7 +431,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = kmap_local_page(dest_page); + kaddr = page_address(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); /* @@ -476,7 +441,6 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, */ if (bytes < destlen) memset(kaddr+bytes, 0, destlen-bytes); - kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 6461ebc3a1c1..340f995652f2 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -5,7 +5,7 @@ #include <linux/sched.h> #include <linux/wait.h> -#include <asm/div64.h> +#include <linux/math64.h> #include <linux/rbtree.h> #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5c0f8481e25e..6b51fd2ec5ac 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -446,7 +446,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, * Will be also used to store the finished ordered extent. 
* @file_offset: File offset for the finished IO * @io_size: Length of the finish IO range - * @uptodate: If the IO finishes without problem * * Return true if the ordered extent is finished in the range, and update * @cached. @@ -457,7 +456,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, */ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate) + u64 file_offset, u64 io_size) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; @@ -486,8 +485,6 @@ have_entry: entry->bytes_left, io_size); entry->bytes_left -= io_size; - if (!uptodate) - set_bit(BTRFS_ORDERED_IOERR, &entry->flags); if (entry->bytes_left == 0) { /* @@ -1052,6 +1049,7 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, u64 len) { struct inode *inode = ordered->inode; + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; u64 file_offset = ordered->file_offset + pos; u64 disk_bytenr = ordered->disk_bytenr + pos; u64 num_bytes = len; @@ -1069,6 +1067,13 @@ static int clone_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pos, else type = __ffs(flags_masked); + /* + * The splitting extent is already counted and will be added again + * in btrfs_add_ordered_extent_*(). Subtract num_bytes to avoid + * double counting. 
+ */ + percpu_counter_add_batch(&fs_info->ordered_bytes, -num_bytes, + fs_info->delalloc_batch); if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered->flags)) { WARN_ON_ONCE(1); ret = btrfs_add_ordered_extent_compress(BTRFS_I(inode), diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index b2d88aba8420..4194e960ff61 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -177,7 +177,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, - u64 file_offset, u64 io_size, int uptodate); + u64 file_offset, u64 io_size); int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, int type); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 0fa121171ca1..db680f5be745 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1733,7 +1733,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, ASSERT(trans != NULL); ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, - false, true); + true); if (ret < 0) { trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; btrfs_warn(trans->fs_info, @@ -2651,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) /* Search commit root to find old_roots */ ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, - &record->old_roots, false, false); + &record->old_roots, false); if (ret < 0) goto cleanup; } @@ -2667,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) * current root. It's safe inside commit_transaction(). 
*/ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false); + record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); if (ret < 0) goto cleanup; if (qgroup_to_skip) { @@ -3201,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, num_bytes = found.offset; ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots, false, false); + &roots, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 244d499ebc72..d8d268ca8aa7 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1035,7 +1035,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; @@ -1054,7 +1054,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) for (; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; @@ -1636,10 +1636,10 @@ struct btrfs_plug_cb { static int plug_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, - plug_list); - struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, - plug_list); + const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, + plug_list); + const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, + plug_list); u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; @@ -2300,7 +2300,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) if (rbio->stripe_pages[index]) continue; - page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + page = 
alloc_page(GFP_NOFS); if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; @@ -2350,14 +2350,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!need_check) goto writeback; - p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + p_page = alloc_page(GFP_NOFS); if (!p_page) goto cleanup; SetPageUptodate(p_page); if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + q_page = alloc_page(GFP_NOFS); if (!q_page) { __free_page(p_page); goto cleanup; diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 8e026de74c44..d2062d5f71dd 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -264,8 +264,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info, struct block_entry *be = NULL, *exist; struct root_entry *re = NULL; - re = kzalloc(sizeof(struct root_entry), GFP_KERNEL); - be = kzalloc(sizeof(struct block_entry), GFP_KERNEL); + re = kzalloc(sizeof(struct root_entry), GFP_NOFS); + be = kzalloc(sizeof(struct block_entry), GFP_NOFS); if (!be || !re) { kfree(re); kfree(be); @@ -313,7 +313,7 @@ static int add_tree_block(struct btrfs_fs_info *fs_info, u64 ref_root, struct root_entry *re; struct ref_entry *ref = NULL, *exist; - ref = kmalloc(sizeof(struct ref_entry), GFP_KERNEL); + ref = kmalloc(sizeof(struct ref_entry), GFP_NOFS); if (!ref) return -ENOMEM; @@ -358,7 +358,7 @@ static int add_shared_data_ref(struct btrfs_fs_info *fs_info, struct block_entry *be; struct ref_entry *ref; - ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + ref = kzalloc(sizeof(struct ref_entry), GFP_NOFS); if (!ref) return -ENOMEM; be = add_block_entry(fs_info, bytenr, num_bytes, 0); @@ -393,7 +393,7 @@ static int add_extent_data_ref(struct btrfs_fs_info *fs_info, u64 offset = btrfs_extent_data_ref_offset(leaf, dref); u32 num_refs = btrfs_extent_data_ref_count(leaf, dref); - ref = kzalloc(sizeof(struct ref_entry), GFP_KERNEL); + ref = 
kzalloc(sizeof(struct ref_entry), GFP_NOFS); if (!ref) return -ENOMEM; be = add_block_entry(fs_info, bytenr, num_bytes, ref_root); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fc831597cb22..914d403b4415 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -24,6 +24,7 @@ #include "block-group.h" #include "backref.h" #include "misc.h" +#include "subpage.h" /* * Relocation overview @@ -2781,10 +2782,70 @@ static noinline_for_stack int prealloc_file_extent_cluster( u64 num_bytes; int nr; int ret = 0; + u64 i_size = i_size_read(&inode->vfs_inode); u64 prealloc_start = cluster->start - offset; u64 prealloc_end = cluster->end - offset; u64 cur_offset = prealloc_start; + /* + * For subpage case, previous i_size may not be aligned to PAGE_SIZE. + * This means the range [i_size, PAGE_END + 1) is filled with zeros by + * btrfs_do_readpage() call of previously relocated file cluster. + * + * If the current cluster starts in the above range, btrfs_do_readpage() + * will skip the read, and relocate_one_page() will later writeback + * the padding zeros as new data, causing data corruption. + * + * Here we have to manually invalidate the range (i_size, PAGE_END + 1). + */ + if (!IS_ALIGNED(i_size, PAGE_SIZE)) { + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 sectorsize = fs_info->sectorsize; + struct page *page; + + ASSERT(sectorsize < PAGE_SIZE); + ASSERT(IS_ALIGNED(i_size, sectorsize)); + + /* + * Subpage can't handle page with DIRTY but without UPTODATE + * bit as it can lead to the following deadlock: + * + * btrfs_readpage() + * | Page already *locked* + * |- btrfs_lock_and_flush_ordered_range() + * |- btrfs_start_ordered_extent() + * |- extent_write_cache_pages() + * |- lock_page() + * We try to lock the page we already hold. 
+ * + * Here we just writeback the whole data reloc inode, so that + * we will be ensured to have no dirty range in the page, and + * are safe to clear the uptodate bits. + * + * This shouldn't cause too much overhead, as we need to write + * the data back anyway. + */ + ret = filemap_write_and_wait(mapping); + if (ret < 0) + return ret; + + clear_extent_bits(&inode->io_tree, i_size, + round_up(i_size, PAGE_SIZE) - 1, + EXTENT_UPTODATE); + page = find_lock_page(mapping, i_size >> PAGE_SHIFT); + /* + * If page is freed we don't need to do anything then, as we + * will re-read the whole page anyway. + */ + if (page) { + btrfs_subpage_clear_uptodate(fs_info, page, i_size, + round_up(i_size, PAGE_SIZE) - i_size); + unlock_page(page); + put_page(page); + } + } + BUG_ON(cluster->start != cluster->boundary[0]); ret = btrfs_alloc_data_chunk_ondemand(inode, prealloc_end + 1 - prealloc_start); @@ -2886,19 +2947,149 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); -static int relocate_file_extent_cluster(struct inode *inode, - struct file_extent_cluster *cluster) +static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, + int cluster_nr) +{ + /* Last extent, use cluster end directly */ + if (cluster_nr >= cluster->nr - 1) + return cluster->end; + + /* Use next boundary start*/ + return cluster->boundary[cluster_nr + 1] - 1; +} + +static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, + struct file_extent_cluster *cluster, + int *cluster_nr, unsigned long page_index) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 offset = BTRFS_I(inode)->index_cnt; + const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT; + gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + struct page *page; u64 page_start; u64 page_end; + u64 cur; + int ret; + + ASSERT(page_index <= last_index); + page = find_lock_page(inode->i_mapping, page_index); 
+ if (!page) { + page_cache_sync_readahead(inode->i_mapping, ra, NULL, + page_index, last_index + 1 - page_index); + page = find_or_create_page(inode->i_mapping, page_index, mask); + if (!page) + return -ENOMEM; + } + ret = set_page_extent_mapped(page); + if (ret < 0) + goto release_page; + + if (PageReadahead(page)) + page_cache_async_readahead(inode->i_mapping, ra, NULL, page, + page_index, last_index + 1 - page_index); + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + ret = -EIO; + goto release_page; + } + } + + page_start = page_offset(page); + page_end = page_start + PAGE_SIZE - 1; + + /* + * Start from the cluster, as for subpage case, the cluster can start + * inside the page. + */ + cur = max(page_start, cluster->boundary[*cluster_nr] - offset); + while (cur <= page_end) { + u64 extent_start = cluster->boundary[*cluster_nr] - offset; + u64 extent_end = get_cluster_boundary_end(cluster, + *cluster_nr) - offset; + u64 clamped_start = max(page_start, extent_start); + u64 clamped_end = min(page_end, extent_end); + u32 clamped_len = clamped_end + 1 - clamped_start; + + /* Reserve metadata for this range */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), + clamped_len); + if (ret) + goto release_page; + + /* Mark the range delalloc and dirty for later writeback */ + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, + clamped_end, 0, NULL); + if (ret) { + clear_extent_bits(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + clamped_len, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), + clamped_len); + goto release_page; + } + btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); + + /* + * Set the boundary if it's inside the page. 
+ * Data relocation requires the destination extents to have the + * same size as the source. + * EXTENT_BOUNDARY bit prevents current extent from being merged + * with previous extent. + */ + if (in_range(cluster->boundary[*cluster_nr] - offset, + page_start, PAGE_SIZE)) { + u64 boundary_start = cluster->boundary[*cluster_nr] - + offset; + u64 boundary_end = boundary_start + + fs_info->sectorsize - 1; + + set_extent_bits(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY); + } + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end); + btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); + cur += clamped_len; + + /* Crossed extent end, go to next extent */ + if (cur >= extent_end) { + (*cluster_nr)++; + /* Just finished the last extent of the cluster, exit. */ + if (*cluster_nr >= cluster->nr) + break; + } + } + unlock_page(page); + put_page(page); + + balance_dirty_pages_ratelimited(inode->i_mapping); + btrfs_throttle(fs_info); + if (btrfs_should_cancel_balance(fs_info)) + ret = -ECANCELED; + return ret; + +release_page: + unlock_page(page); + put_page(page); + return ret; +} + +static int relocate_file_extent_cluster(struct inode *inode, + struct file_extent_cluster *cluster) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; - struct page *page; struct file_ra_state *ra; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); - int nr = 0; + int cluster_nr = 0; int ret = 0; if (!cluster->nr) @@ -2919,109 +3110,14 @@ static int relocate_file_extent_cluster(struct inode *inode, if (ret) goto out; - index = (cluster->start - offset) >> PAGE_SHIFT; last_index = (cluster->end - offset) >> PAGE_SHIFT; - while (index <= last_index) { - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - PAGE_SIZE); - if (ret) - goto out; - - page = find_lock_page(inode->i_mapping, index); - if (!page) { - 
page_cache_sync_readahead(inode->i_mapping, - ra, NULL, index, - last_index + 1 - index); - page = find_or_create_page(inode->i_mapping, index, - mask); - if (!page) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); - ret = -ENOMEM; - goto out; - } - } - ret = set_page_extent_mapped(page); - if (ret < 0) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - unlock_page(page); - put_page(page); - goto out; - } - - if (PageReadahead(page)) { - page_cache_async_readahead(inode->i_mapping, - ra, NULL, page, index, - last_index + 1 - index); - } - - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); - ret = -EIO; - goto out; - } - } - - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - - lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); - - if (nr < cluster->nr && - page_start + offset == cluster->boundary[nr]) { - set_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end, - EXTENT_BOUNDARY); - nr++; - } - - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, - page_end, 0, NULL); - if (ret) { - unlock_page(page); - put_page(page); - btrfs_delalloc_release_metadata(BTRFS_I(inode), - PAGE_SIZE, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE); - - clear_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end, - EXTENT_LOCKED | EXTENT_BOUNDARY); - goto out; - - } - set_page_dirty(page); - - unlock_extent(&BTRFS_I(inode)->io_tree, - page_start, page_end); - unlock_page(page); - put_page(page); - - index++; - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - 
balance_dirty_pages_ratelimited(inode->i_mapping); - btrfs_throttle(fs_info); - if (btrfs_should_cancel_balance(fs_info)) { - ret = -ECANCELED; - goto out; - } - } - WARN_ON(nr != cluster->nr); + for (index = (cluster->start - offset) >> PAGE_SHIFT; + index <= last_index && !ret; index++) + ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); if (btrfs_is_zoned(fs_info) && !ret) ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (ret == 0) + WARN_ON(cluster_nr != cluster->nr); out: kfree(ra); return ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6ac37ae6c811..72f9b865e847 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1198,7 +1198,7 @@ struct backref_ctx { static int __clone_root_cmp_bsearch(const void *key, const void *elt) { u64 root = (u64)(uintptr_t)key; - struct clone_root *cr = (struct clone_root *)elt; + const struct clone_root *cr = elt; if (root < cr->root->root_key.objectid) return -1; @@ -1209,8 +1209,8 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt) static int __clone_root_cmp_sort(const void *e1, const void *e2) { - struct clone_root *cr1 = (struct clone_root *)e1; - struct clone_root *cr2 = (struct clone_root *)e2; + const struct clone_root *cr1 = e1; + const struct clone_root *cr2 = e2; if (cr1->root->root_key.objectid < cr2->root->root_key.objectid) return -1; @@ -1307,7 +1307,7 @@ static int find_extent_clone(struct send_ctx *sctx, u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; - struct backref_ctx *backref_ctx = NULL; + struct backref_ctx backref_ctx = {0}; struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; @@ -1322,12 +1322,6 @@ static int find_extent_clone(struct send_ctx *sctx, /* We only use this path under the commit sem */ tmp_path->need_commit_sem = 0; - backref_ctx = kmalloc(sizeof(*backref_ctx), GFP_KERNEL); - if (!backref_ctx) { - ret = -ENOMEM; - goto out; - } - if (data_offset >= ino_size) 
{ /* * There may be extents that lie behind the file's size. @@ -1392,12 +1386,12 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root->found_refs = 0; } - backref_ctx->sctx = sctx; - backref_ctx->found = 0; - backref_ctx->cur_objectid = ino; - backref_ctx->cur_offset = data_offset; - backref_ctx->found_itself = 0; - backref_ctx->extent_len = num_bytes; + backref_ctx.sctx = sctx; + backref_ctx.found = 0; + backref_ctx.cur_objectid = ino; + backref_ctx.cur_offset = data_offset; + backref_ctx.found_itself = 0; + backref_ctx.extent_len = num_bytes; /* * The last extent of a file may be too large due to page alignment. @@ -1405,7 +1399,7 @@ static int find_extent_clone(struct send_ctx *sctx, * __iterate_backrefs work. */ if (data_offset + num_bytes >= ino_size) - backref_ctx->extent_len = ino_size - data_offset; + backref_ctx.extent_len = ino_size - data_offset; /* * Now collect all backrefs. @@ -1416,12 +1410,12 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = 0; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, - backref_ctx, false); + &backref_ctx, false); if (ret < 0) goto out; - if (!backref_ctx->found_itself) { + if (!backref_ctx.found_itself) { /* found a bug in backref code? 
*/ ret = -EIO; btrfs_err(fs_info, @@ -1434,7 +1428,7 @@ static int find_extent_clone(struct send_ctx *sctx, "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", data_offset, ino, num_bytes, logical); - if (!backref_ctx->found) + if (!backref_ctx.found) btrfs_debug(fs_info, "no clones found"); cur_clone_root = NULL; @@ -1458,7 +1452,6 @@ static int find_extent_clone(struct send_ctx *sctx, out: btrfs_free_path(tmp_path); - kfree(backref_ctx); return ret; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f79bf85f2439..5ada02e0e629 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -493,6 +493,11 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, long time_left; int loops; + delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); + if (delalloc_bytes == 0 && ordered_bytes == 0) + return; + /* Calc the number of the pages we need flush for space reservation */ if (to_reclaim == U64_MAX) { items = U64_MAX; @@ -500,22 +505,21 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, /* * to_reclaim is set to however much metadata we need to * reclaim, but reclaiming that much data doesn't really track - * exactly, so increase the amount to reclaim by 2x in order to - * make sure we're flushing enough delalloc to hopefully reclaim - * some metadata reservations. + * exactly. What we really want to do is reclaim full inode's + * worth of reservations, however that's not available to us + * here. We will take a fraction of the delalloc bytes for our + * flushing loops and hope for the best. Delalloc will expand + * the amount we write to cover an entire dirty extent, which + * will reclaim the metadata reservation for that range. If + * it's not enough subsequent flush stages will be more + * aggressive. 
*/ + to_reclaim = max(to_reclaim, delalloc_bytes >> 3); items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; - to_reclaim = items * EXTENT_SIZE_PER_ITEM; } trans = (struct btrfs_trans_handle *)current->journal_info; - delalloc_bytes = percpu_counter_sum_positive( - &fs_info->delalloc_bytes); - ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes); - if (delalloc_bytes == 0 && ordered_bytes == 0) - return; - /* * If we are doing more ordered than delalloc we need to just wait on * ordered extents, otherwise we'll waste time trying to flush delalloc @@ -528,9 +532,49 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, while ((delalloc_bytes || ordered_bytes) && loops < 3) { u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; long nr_pages = min_t(u64, temp, LONG_MAX); + int async_pages; btrfs_start_delalloc_roots(fs_info, nr_pages, true); + /* + * We need to make sure any outstanding async pages are now + * processed before we continue. This is because things like + * sync_inode() try to be smart and skip writing if the inode is + * marked clean. We don't use filemap_fwrite for flushing + * because we want to control how many pages we write out at a + * time, thus this is the only safe way to make sure we've + * waited for outstanding compressed workers to have started + * their jobs and thus have ordered extents set up properly. + * + * This exists because we do not want to wait for each + * individual inode to finish its async work, we simply want to + * start the IO on everybody, and then come back here and wait + * for all of the async work to catch up. Once we're done with + * that we know we'll have ordered extents for everything and we + * can decide if we wait for that or not. + * + * If we choose to replace this in the future, make absolutely + * sure that the proper waiting is being done in the async case, + * as there have been bugs in that area before. 
+ */ + async_pages = atomic_read(&fs_info->async_delalloc_pages); + if (!async_pages) + goto skip_async; + + /* + * We don't want to wait forever: if we wrote fewer pages in this + * loop than we have outstanding, only wait for that number of + * pages, otherwise we can wait for all async pages to finish + * before continuing. + */ + if (async_pages > nr_pages) + async_pages -= nr_pages; + else + async_pages = 0; + wait_event(fs_info->async_submit_wait, + atomic_read(&fs_info->async_delalloc_pages) <= + async_pages); +skip_async: loops++; if (wait_ordered && !trans) { btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); @@ -595,8 +639,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: + case FLUSH_DELALLOC_FULL: + if (state == FLUSH_DELALLOC_FULL) + num_bytes = U64_MAX; shrink_delalloc(fs_info, space_info, num_bytes, - state == FLUSH_DELALLOC_WAIT, for_preempt); + state != FLUSH_DELALLOC, for_preempt); break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: @@ -686,7 +733,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, { u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 98); + u64 thresh = div_factor_fine(space_info->total_bytes, 90); u64 used; /* If we're just plain full then async reclaim just slows us down. */ @@ -694,6 +741,20 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, global_rsv_size) >= thresh) return false; + used = space_info->bytes_may_use + space_info->bytes_pinned; + + /* The total flushable belongs to the global rsv, don't flush. */ + if (global_rsv_size >= used) + return false; + + /* + * 128MiB is 1/4 of the maximum global rsv size. If we have less than + * that devoted to other reservations then there's no sense in flushing, + * we don't have a lot of things that need flushing. 
+ */ + if (used - global_rsv_size <= SZ_128M) + return false; + /* * We have tickets queued, bail so we don't compete with the async * flushers. @@ -824,6 +885,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; + trace_btrfs_fail_all_tickets(fs_info, space_info); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "cannot satisfy tickets, dumping space info"); __btrfs_dump_space_info(fs_info, space_info); @@ -905,6 +968,14 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } /* + * We do not want to empty the system of delalloc unless we're + * under heavy pressure, so allow one trip through the flushing + * logic before we start doing a FLUSH_DELALLOC_FULL. + */ + if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles) + flush_state++; + + /* * We don't want to force a chunk allocation until we've tried * pretty hard to reclaim space. Think of the case where we * freed up a bunch of space and so have a lot of pinned space @@ -1067,7 +1138,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work) * so if we now have space to allocate do the force chunk allocation. 
*/ static const enum btrfs_flush_state data_flush_states[] = { - FLUSH_DELALLOC_WAIT, + FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, ALLOC_CHUNK_FORCE, @@ -1156,6 +1227,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { FLUSH_DELAYED_REFS, FLUSH_DELALLOC, FLUSH_DELALLOC_WAIT, + FLUSH_DELALLOC_FULL, ALLOC_CHUNK, COMMIT_TRANS, }; diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 8260f8bb3ff0..f429256f56db 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -73,7 +73,7 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ } \ token->kaddr = page_address(token->eb->pages[idx]); \ token->offset = idx << PAGE_SHIFT; \ - if (oip + size <= PAGE_SIZE) \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \ return get_unaligned_le##bits(token->kaddr + oip); \ \ memcpy(lebytes, token->kaddr + oip, part); \ @@ -94,7 +94,7 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (oip + size <= PAGE_SIZE) \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \ return get_unaligned_le##bits(kaddr + oip); \ \ memcpy(lebytes, kaddr + oip, part); \ @@ -124,7 +124,7 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \ } \ token->kaddr = page_address(token->eb->pages[idx]); \ token->offset = idx << PAGE_SHIFT; \ - if (oip + size <= PAGE_SIZE) { \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ put_unaligned_le##bits(val, token->kaddr + oip); \ return; \ } \ @@ -146,7 +146,7 @@ void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ u8 lebytes[sizeof(u##bits)]; \ \ ASSERT(check_setget_bounds(eb, ptr, off, size)); \ - if (oip + size <= PAGE_SIZE) { \ + if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \ put_unaligned_le##bits(val, kaddr + oip); \ return; \ } \ diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 
640bcd21bf28..cb10e56ee31e 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -435,8 +435,10 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); subpage->writeback_bitmap &= ~tmp; - if (subpage->writeback_bitmap == 0) + if (subpage->writeback_bitmap == 0) { + ASSERT(PageWriteback(page)); end_page_writeback(page); + } spin_unlock_irqrestore(&subpage->lock, flags); } @@ -559,3 +561,23 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, PageWriteback); IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, PageOrdered); + +/* + * Make sure not only the page dirty bit is cleared, but also subpage dirty bit + * is cleared. + */ +void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct page *page) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + ASSERT(!PageDirty(page)); + if (fs_info->sectorsize == PAGE_SIZE) + return; + + ASSERT(PagePrivate(page) && page->private); + ASSERT(subpage->dirty_bitmap == 0); +} diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 4d7aca85d915..0120948f37a1 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -126,4 +126,7 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); +void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, + struct page *page); + #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d07b18b2b250..537d90bf5d84 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1201,21 +1201,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) { goto err; } else if (ret > 0) { - 
ret = btrfs_previous_item(root, path, subvol_objectid, - BTRFS_ROOT_BACKREF_KEY); - if (ret < 0) { - goto err; - } else if (ret > 0) { - ret = -ENOENT; - goto err; - } + ret = -ENOENT; + goto err; } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); subvol_objectid = key.offset; root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -1248,21 +1241,14 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + ret = btrfs_search_backwards(fs_root, &key, path); if (ret < 0) { goto err; } else if (ret > 0) { - ret = btrfs_previous_item(fs_root, path, dirid, - BTRFS_INODE_REF_KEY); - if (ret < 0) { - goto err; - } else if (ret > 0) { - ret = -ENOENT; - goto err; - } + ret = -ENOENT; + goto err; } - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); dirid = key.offset; inode_ref = btrfs_item_ptr(path->nodes[0], @@ -1353,6 +1339,9 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_op = &btrfs_super_ops; sb->s_d_op = &btrfs_dentry_operations; sb->s_export_op = &btrfs_export_ops; +#ifdef CONFIG_FS_VERITY + sb->s_vop = &btrfs_verityops; +#endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; #ifdef CONFIG_BTRFS_FS_POSIX_ACL @@ -2041,13 +2030,6 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto restore; } - if (fs_info->sectorsize < PAGE_SIZE) { - btrfs_warn(fs_info, - "read-write mount is not yet allowed for sectorsize %u page size %lu", - fs_info->sectorsize, PAGE_SIZE); - ret = -EINVAL; - goto restore; - } /* * NOTE: when remounting with a change that does writes, don't @@ -2096,16 +2078,15 @@ restore: } /* Used to sort the devices by max_avail(descending sort) */ -static inline int btrfs_cmp_device_free_bytes(const void *dev_info1, - const void *dev_info2) +static int btrfs_cmp_device_free_bytes(const void *a, const void *b) { - if (((struct 
btrfs_device_info *)dev_info1)->max_avail > - ((struct btrfs_device_info *)dev_info2)->max_avail) + const struct btrfs_device_info *dev_info1 = a; + const struct btrfs_device_info *dev_info2 = b; + + if (dev_info1->max_avail > dev_info2->max_avail) return -1; - else if (((struct btrfs_device_info *)dev_info1)->max_avail < - ((struct btrfs_device_info *)dev_info2)->max_avail) + else if (dev_info1->max_avail < dev_info2->max_avail) return 1; - else return 0; } @@ -2381,7 +2362,7 @@ static struct file_system_type btrfs_root_fs_type = { .name = "btrfs", .mount = btrfs_mount_root, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("btrfs"); @@ -2572,6 +2553,11 @@ static void __init btrfs_print_mod_info(void) #else ", zoned=no" #endif +#ifdef CONFIG_FS_VERITY + ", fsverity=yes" +#else + ", fsverity=no" +#endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 9d1d140118ff..25a6f587852b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -22,6 +22,26 @@ #include "block-group.h" #include "qgroup.h" +/* + * Structure name Path + * -------------------------------------------------------------------------- + * btrfs_supported_static_feature_attrs /sys/fs/btrfs/features + * btrfs_supported_feature_attrs /sys/fs/btrfs/features and + * /sys/fs/btrfs/<uuid>/features + * btrfs_attrs /sys/fs/btrfs/<uuid> + * devid_attrs /sys/fs/btrfs/<uuid>/devinfo/<devid> + * allocation_attrs /sys/fs/btrfs/<uuid>/allocation + * qgroup_attrs /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid> + * space_info_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type> + * raid_attrs /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile> + * + * When built with CONFIG_BTRFS_DEBUG: + * + * btrfs_debug_feature_attrs /sys/fs/btrfs/debug + * btrfs_debug_mount_attrs /sys/fs/btrfs/<uuid>/debug + * discard_debug_attrs 
/sys/fs/btrfs/<uuid>/debug/discard + */ + struct btrfs_feature_attr { struct kobj_attribute kobj_attr; enum btrfs_feature_set feature_set; @@ -267,7 +287,17 @@ BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif +#ifdef CONFIG_FS_VERITY +BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); +#endif +/* + * Features which depend on feature bits and may differ between each fs. + * + * /sys/fs/btrfs/features - all available features implemented by this version + * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or + * can be changed on a mounted filesystem. + */ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), BTRFS_FEAT_ATTR_PTR(default_subvol), @@ -285,16 +315,12 @@ static struct attribute *btrfs_supported_feature_attrs[] = { #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(zoned), #endif +#ifdef CONFIG_FS_VERITY + BTRFS_FEAT_ATTR_PTR(verity), +#endif NULL }; -/* - * Features which depend on feature bits and may differ between each fs. - * - * /sys/fs/btrfs/features lists all available features of this kernel while - * /sys/fs/btrfs/UUID/features shows features of the fs which are enabled or - * can be changed online. - */ static const struct attribute_group btrfs_feature_attr_group = { .name = "features", .is_visible = btrfs_feature_visible, @@ -366,6 +392,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; + /* 4K sector size is also supported with 64K page size */ + if (PAGE_SIZE == SZ_64K) + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K); + /* Only sectorsize == PAGE_SIZE is now supported */ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE); @@ -374,6 +404,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, BTRFS_ATTR(static_feature, supported_sectorsizes, supported_sectorsizes_show); +/* + * Features which only depend on kernel version. 
+ * + * These are listed in /sys/fs/btrfs/features along with + * btrfs_supported_feature_attrs. + */ static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), @@ -383,12 +419,6 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = { NULL }; -/* - * Features which only depend on kernel version. - * - * These are listed in /sys/fs/btrfs/features along with - * btrfs_feature_attr_group - */ static const struct attribute_group btrfs_static_feature_attr_group = { .name = "features", .attrs = btrfs_supported_static_feature_attrs, @@ -547,6 +577,11 @@ static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, btrfs_discard_max_discard_size_store); +/* + * Per-filesystem debugging of discard (when mounted with discard=async). + * + * Path: /sys/fs/btrfs/<uuid>/debug/discard/ + */ static const struct attribute *discard_debug_attrs[] = { BTRFS_ATTR_PTR(discard, discardable_bytes), BTRFS_ATTR_PTR(discard, discardable_extents), @@ -560,15 +595,19 @@ static const struct attribute *discard_debug_attrs[] = { }; /* - * Runtime debugging exported via sysfs + * Per-filesystem runtime debugging exported via sysfs. * - * /sys/fs/btrfs/debug - applies to module or all filesystems - * /sys/fs/btrfs/UUID - applies only to the given filesystem + * Path: /sys/fs/btrfs/UUID/debug/ */ static const struct attribute *btrfs_debug_mount_attrs[] = { NULL, }; +/* + * Runtime debugging exported via sysfs, applies to all mounted filesystems. + * + * Path: /sys/fs/btrfs/debug + */ static struct attribute *btrfs_debug_feature_attrs[] = { NULL }; @@ -637,6 +676,11 @@ static ssize_t raid_bytes_show(struct kobject *kobj, return scnprintf(buf, PAGE_SIZE, "%llu\n", val); } +/* + * Allocation information about block group profiles. 
+ * + * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/<bg-profile>/ + */ static struct attribute *raid_attrs[] = { BTRFS_ATTR_PTR(raid, total_bytes), BTRFS_ATTR_PTR(raid, used_bytes), @@ -676,6 +720,11 @@ SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +/* + * Allocation information about block group types. + * + * Path: /sys/fs/btrfs/<uuid>/allocation/<bg-type>/ + */ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, flags), BTRFS_ATTR_PTR(space_info, total_bytes), @@ -703,6 +752,11 @@ static struct kobj_type space_info_ktype = { .default_groups = space_info_groups, }; +/* + * Allocation information about block groups. + * + * Path: /sys/fs/btrfs/<uuid>/allocation/ + */ static const struct attribute *allocation_attrs[] = { BTRFS_ATTR_PTR(allocation, global_rsv_reserved), BTRFS_ATTR_PTR(allocation, global_rsv_size), @@ -974,7 +1028,8 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); ssize_t ret; - ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold); + ret = scnprintf(buf, PAGE_SIZE, "%d\n", + READ_ONCE(fs_info->bg_reclaim_threshold)); return ret; } @@ -991,16 +1046,21 @@ static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, if (ret) return ret; - if (thresh <= 50 || thresh > 100) + if (thresh != 0 && (thresh <= 50 || thresh > 100)) return -EINVAL; - fs_info->bg_reclaim_threshold = thresh; + WRITE_ONCE(fs_info->bg_reclaim_threshold, thresh); return len; } BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, btrfs_bg_reclaim_threshold_store); +/* + * Per-filesystem information and stats. 
+ * + * Path: /sys/fs/btrfs/<uuid>/ + */ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -1510,6 +1570,11 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, } BTRFS_ATTR(devid, error_stats, btrfs_devinfo_error_stats_show); +/* + * Information about one device. + * + * Path: /sys/fs/btrfs/<uuid>/devinfo/<devid>/ + */ static struct attribute *devid_attrs[] = { BTRFS_ATTR_PTR(devid, error_stats), BTRFS_ATTR_PTR(devid, in_fs_metadata), @@ -1799,6 +1864,11 @@ QGROUP_RSV_ATTR(data, BTRFS_QGROUP_RSV_DATA); QGROUP_RSV_ATTR(meta_pertrans, BTRFS_QGROUP_RSV_META_PERTRANS); QGROUP_RSV_ATTR(meta_prealloc, BTRFS_QGROUP_RSV_META_PREALLOC); +/* + * Qgroup information. + * + * Path: /sys/fs/btrfs/<uuid>/qgroups/<level>_<qgroupid>/ + */ static struct attribute *qgroup_attrs[] = { BTRFS_ATTR_PTR(qgroup, referenced), BTRFS_ATTR_PTR(qgroup, exclusive), diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 98b5aaba46f1..19ba7d5b7d8f 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -223,8 +223,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, * we can only call btrfs_qgroup_account_extent() directly to test * quota. 
*/ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -236,8 +235,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -260,8 +258,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, old_roots = NULL; new_roots = NULL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -272,8 +269,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, if (ret) return -EINVAL; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -324,8 +320,7 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -337,8 +332,7 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -359,8 +353,7 @@ static int test_multiple_refs(struct btrfs_root *root, return 
-EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -372,8 +365,7 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); @@ -400,8 +392,7 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); @@ -413,8 +404,7 @@ static int test_multiple_refs(struct btrfs_root *root, if (ret) return ret; - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, - false, false); + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); if (ret) { ulist_free(old_roots); ulist_free(new_roots); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a8b2e0d2c025..7733e8ac0a69 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -24,6 +24,7 @@ #include "compression.h" #include "volumes.h" #include "misc.h" +#include "btrfs_inode.h" /* * Error message should follow the following format: @@ -873,13 +874,22 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, } } - if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes != 2) || - (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) || - (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) || - (type & BTRFS_BLOCK_GROUP_DUP && num_stripes != 2) || + if (unlikely((type & BTRFS_BLOCK_GROUP_RAID10 && + 
sub_stripes != btrfs_raid_array[BTRFS_RAID_RAID10].sub_stripes) || + (type & BTRFS_BLOCK_GROUP_RAID1 && + num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID1C3 && + num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C3].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID1C4 && + num_stripes != btrfs_raid_array[BTRFS_RAID_RAID1C4].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID5 && + num_stripes < btrfs_raid_array[BTRFS_RAID_RAID5].devs_min) || + (type & BTRFS_BLOCK_GROUP_RAID6 && + num_stripes < btrfs_raid_array[BTRFS_RAID_RAID6].devs_min) || + (type & BTRFS_BLOCK_GROUP_DUP && + num_stripes != btrfs_raid_array[BTRFS_RAID_DUP].dev_stripes) || ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 && - num_stripes != 1))) { + num_stripes != btrfs_raid_array[BTRFS_RAID_SINGLE].dev_stripes))) { chunk_err(leaf, chunk, logical, "invalid num_stripes:sub_stripes %u:%u for profile %llu", num_stripes, sub_stripes, @@ -999,6 +1009,8 @@ static int check_inode_item(struct extent_buffer *leaf, u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); u32 mode; int ret; + u32 flags; + u32 ro_flags; ret = check_inode_key(leaf, key, slot); if (unlikely(ret < 0)) @@ -1054,11 +1066,17 @@ static int check_inode_item(struct extent_buffer *leaf, btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } - if (unlikely(btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK)) { + btrfs_inode_split_flags(btrfs_inode_flags(leaf, iitem), &flags, &ro_flags); + if (unlikely(flags & ~BTRFS_INODE_FLAG_MASK)) { inode_item_err(leaf, slot, - "unknown flags detected: 0x%llx", - btrfs_inode_flags(leaf, iitem) & - ~BTRFS_INODE_FLAG_MASK); + "unknown incompat flags detected: 0x%x", flags); + return -EUCLEAN; + } + if (unlikely(!sb_rdonly(fs_info->sb) && + (ro_flags & ~BTRFS_INODE_RO_FLAG_MASK))) { + inode_item_err(leaf, slot, + "unknown ro-compat flags detected on writeable mount: 0x%x", + ro_flags); return -EUCLEAN; } return 0; diff --git a/fs/btrfs/tree-log.c 
b/fs/btrfs/tree-log.c index e6430ac9bbe8..f7efc26aa82a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -753,7 +753,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); - if (ret == 0) { + if (ret < 0) { + goto out; + } else if (ret == 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, ins.objectid, ins.offset, 0); @@ -3039,8 +3041,6 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, list_del_init(&ctx->list); ctx->log_ret = error; } - - INIT_LIST_HEAD(&root->log_ctxs[index]); } /* @@ -3328,10 +3328,16 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out_wake_log_root; } - mutex_lock(&root->log_mutex); - if (root->last_log_commit < log_transid) - root->last_log_commit = log_transid; - mutex_unlock(&root->log_mutex); + /* + * We know there can only be one task here, since we have not yet set + * root->log_commit[index1] to 0 and any task attempting to sync the + * log must wait for the previous log transaction to commit if it's + * still in progress or wait for the current log transaction commit if + * someone else already started it. We use <= and not < because the + * first log transaction has an ID of 0. + */ + ASSERT(root->last_log_commit <= log_transid); + root->last_log_commit = log_transid; out_wake_log_root: mutex_lock(&log_root_tree->log_mutex); @@ -3417,14 +3423,10 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, } /* - * Check if an inode was logged in the current transaction. We can't always rely - * on an inode's logged_trans value, because it's an in-memory only field and - * therefore not persisted. 
This means that its value is lost if the inode gets - * evicted and loaded again from disk (in which case it has a value of 0, and - * certainly it is smaller then any possible transaction ID), when that happens - * the full_sync flag is set in the inode's runtime flags, so on that case we - * assume eviction happened and ignore the logged_trans value, assuming the - * worst case, that the inode was logged before in the current transaction. + * Check if an inode was logged in the current transaction. This may often + * return some false positives, because logged_trans is an in memory only field, + * not persisted anywhere. This is meant to be used in contexts where a false + * positive has no functional consequences. */ static bool inode_logged(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) @@ -3432,8 +3434,17 @@ static bool inode_logged(struct btrfs_trans_handle *trans, if (inode->logged_trans == trans->transid) return true; - if (inode->last_trans == trans->transid && - test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) && + /* + * The inode's logged_trans is always 0 when we load it (because it is + * not persisted in the inode item or elsewhere). So if it is 0 and the + * inode was last modified in the current transaction, then the inode may + * have been logged before in the current transaction, then evicted and + * loaded again in the current transaction - or may have never been logged + * in the current transaction, but since we can not be sure, we have to + * assume it was, otherwise our callers can leave an inconsistent log.
+ */ + if (inode->logged_trans == 0 && + inode->last_trans == trans->transid && !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags)) return true; @@ -3913,6 +3924,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, u64 logged_isize) { struct btrfs_map_token token; + u64 flags; btrfs_init_map_token(&token, leaf); @@ -3962,20 +3974,49 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); - btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags); + flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, + BTRFS_I(inode)->ro_flags); + btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } static int log_inode_item(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - struct btrfs_inode *inode) + struct btrfs_inode *inode, bool inode_item_dropped) { struct btrfs_inode_item *inode_item; int ret; - ret = btrfs_insert_empty_item(trans, log, path, - &inode->location, sizeof(*inode_item)); - if (ret && ret != -EEXIST) + /* + * If we are doing a fast fsync and the inode was logged before in the + * current transaction, then we know the inode was previously logged and + * it exists in the log tree. For performance reasons, in this case use + * btrfs_search_slot() directly with ins_len set to 0 so that we never + * attempt a write lock on the leaf's parent, which adds unnecessary lock + * contention in case there are concurrent fsyncs for other inodes of the + * same subvolume. Using btrfs_insert_empty_item() when the inode item + * already exists can also result in unnecessarily splitting a leaf. 
+ */ + if (!inode_item_dropped && inode->logged_trans == trans->transid) { + ret = btrfs_search_slot(trans, log, &inode->location, path, 0, 1); + ASSERT(ret <= 0); + if (ret > 0) + ret = -ENOENT; + } else { + /* + * This means it is the first fsync in the current transaction, + * so the inode item is not in the log and we need to insert it. + * We can never get -EEXIST because we are only called for a fast + * fsync and in case an inode eviction happens after the inode was + * logged before in the current transaction, when we load again + * the inode, we set BTRFS_INODE_NEEDS_FULL_SYNC on its runtime + * flags and set ->logged_trans to 0. + */ + ret = btrfs_insert_empty_item(trans, log, path, &inode->location, + sizeof(*inode_item)); + ASSERT(ret != -EEXIST); + } + if (ret) return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); @@ -4160,7 +4201,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, static int extent_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct extent_map *em1, *em2; + const struct extent_map *em1, *em2; em1 = list_entry(a, struct extent_map, list); em2 = list_entry(b, struct extent_map, list); @@ -5053,8 +5094,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, /* * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of - * the inode is not updated when we only log that it exists and - * it has the full sync bit set (see btrfs_log_inode()). + * the inode is not updated when we only log that it exists (see + * btrfs_log_inode()). 
*/ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -5299,6 +5340,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, bool need_log_inode_item = true; bool xattrs_logged = false; bool recursive_logging = false; + bool inode_item_dropped = true; path = btrfs_alloc_path(); if (!path) @@ -5433,6 +5475,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } else { if (inode_only == LOG_INODE_ALL) fast_search = true; + inode_item_dropped = false; goto log_extents; } @@ -5466,7 +5509,7 @@ log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); if (need_log_inode_item) { - err = log_inode_item(trans, log, dst_path, inode); + err = log_inode_item(trans, log, dst_path, inode, inode_item_dropped); if (err) goto out_unlock; /* @@ -5573,6 +5616,13 @@ static bool need_log_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { /* + * If a directory was not modified, no dentries added or removed, we can + * and should avoid logging it. + */ + if (S_ISDIR(inode->vfs_inode.i_mode) && inode->last_trans < trans->transid) + return false; + + /* * If this inode does not have new/updated/deleted xattrs since the last * time it was logged and is flagged as logged in the current transaction, * we can skip logging it. 
As for new/deleted names, those are updated in diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c new file mode 100644 index 000000000000..28d443d3ef93 --- /dev/null +++ b/fs/btrfs/verity.c @@ -0,0 +1,811 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/posix_acl_xattr.h> +#include <linux/iversion.h> +#include <linux/fsverity.h> +#include <linux/sched/mm.h> +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" + +/* + * Implementation of the interface defined in struct fsverity_operations. + * + * The main question is how and where to store the verity descriptor and the + * Merkle tree. We store both in dedicated btree items in the filesystem tree, + * together with the rest of the inode metadata. This means we'll need to do + * extra work to encrypt them once encryption is supported in btrfs, but btrfs + * has a lot of careful code around i_size and it seems better to make a new key + * type than try and adjust all of our expectations for i_size. + * + * Note that this differs from the implementation in ext4 and f2fs, where + * this data is stored as if it were in the file, but past EOF. However, btrfs + * does not have a widespread mechanism for caching opaque metadata pages, so we + * do pretend that the Merkle tree pages themselves are past EOF for the + * purposes of caching them (as opposed to creating a virtual inode). + * + * fs verity items are stored under two different key types on disk. + * The descriptor items: + * [ inode objectid, BTRFS_VERITY_DESC_ITEM_KEY, offset ] + * + * At offset 0, we store a btrfs_verity_descriptor_item which tracks the + * size of the descriptor item and some extra data for encryption. + * Starting at offset 1, these hold the generic fs verity descriptor. 
+ * The latter are opaque to btrfs, we just read and write them as a blob for + * the higher level verity code. The most common descriptor size is 256 bytes. + * + * The merkle tree items: + * [ inode objectid, BTRFS_VERITY_MERKLE_ITEM_KEY, offset ] + * + * These also start at offset 0, and correspond to the merkle tree bytes. + * So when fsverity asks for page 0 of the merkle tree, we pull up one page + * starting at offset 0 for this key type. These are also opaque to btrfs, + * we're blindly storing whatever fsverity sends down. + * + * Another important consideration is the fact that the Merkle tree data scales + * linearly with the size of the file (with 4K pages/blocks and SHA-256, it's + * ~1/127th the size) so for large files, writing the tree can be a lengthy + * operation. For that reason, we guard the whole enable verity operation + * (between begin_enable_verity and end_enable_verity) with an orphan item. + * Again, because the data can be pretty large, it's quite possible that we + * could run out of space writing it, so we try our best to handle errors by + * stopping and rolling back rather than aborting the victim transaction. + */ + +#define MERKLE_START_ALIGN 65536 + +/* + * Compute the logical file offset where we cache the Merkle tree. + * + * @inode: inode of the verity file + * + * For the purposes of caching the Merkle tree pages, as required by + * fs-verity, it is convenient to do size computations in terms of a file + * offset, rather than in terms of page indices. + * + * Use 64K to be sure it's past the last page in the file, even with 64K pages. + * That rounding operation itself can overflow loff_t, so we do it in u64 and + * check. + * + * Returns the file offset on success, negative error code on failure. 
+ */ +static loff_t merkle_file_pos(const struct inode *inode) +{ + u64 sz = inode->i_size; + u64 rounded = round_up(sz, MERKLE_START_ALIGN); + + if (rounded > inode->i_sb->s_maxbytes) + return -EFBIG; + + return rounded; +} + +/* + * Drop all the items for this inode with this key_type. + * + * @inode: inode to drop items for + * @key_type: type of items to drop (BTRFS_VERITY_DESC_ITEM or + * BTRFS_VERITY_MERKLE_ITEM) + * + * Before doing a verity enable we cleanup any existing verity items. + * This is also used to clean up if a verity enable failed half way through. + * + * Returns number of dropped items on success, negative error code on failure. + */ +static int drop_verity_items(struct btrfs_inode *inode, u8 key_type) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = inode->root; + struct btrfs_path *path; + struct btrfs_key key; + int count = 0; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + /* 1 for the item being dropped */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + + /* + * Walk backwards through all the items until we find one that + * isn't from our key type or objectid + */ + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = 0; + /* No more keys of this type, we're done */ + if (path->slots[0] == 0) + break; + path->slots[0]--; + } else if (ret < 0) { + btrfs_end_transaction(trans); + goto out; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + /* No more keys of this type, we're done */ + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + + /* + * This shouldn't be a performance sensitive function because + * it's not used as part of truncate. 
If it ever becomes + * perf sensitive, change this to walk forward and bulk delete + * items + */ + ret = btrfs_del_items(trans, root, path, path->slots[0], 1); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + count++; + btrfs_release_path(path); + btrfs_end_transaction(trans); + } + ret = count; + btrfs_end_transaction(trans); +out: + btrfs_free_path(path); + return ret; +} + +/* + * Drop all verity items + * + * @inode: inode to drop verity items for + * + * In most contexts where we are dropping verity items, we want to do it for all + * the types of verity items, not a particular one. + * + * Returns: 0 on success, negative error code on failure. + */ +int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + int ret; + + ret = drop_verity_items(inode, BTRFS_VERITY_DESC_ITEM_KEY); + if (ret < 0) + return ret; + ret = drop_verity_items(inode, BTRFS_VERITY_MERKLE_ITEM_KEY); + if (ret < 0) + return ret; + + return 0; +} + +/* + * Insert and write inode items with a given key type and offset. + * + * @inode: inode to insert for + * @key_type: key type to insert + * @offset: item offset to insert at + * @src: source data to write + * @len: length of source data to write + * + * Write len bytes from src into items of up to 2K length. + * The inserted items will have key (ino, key_type, offset + off) where off is + * consecutively increasing from 0 up to the last item ending at offset + len. + * + * Returns 0 on success and a negative error code on failure. 
+ */ +static int write_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + const char *src, u64 len) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long copy_bytes; + unsigned long src_offset = 0; + void *data; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (len > 0) { + /* 1 for the new item being inserted */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = offset; + + /* + * Insert 2K at a time mostly to be friendly for smaller leaf + * size filesystems + */ + copy_bytes = min_t(u64, len, 2048); + + ret = btrfs_insert_empty_item(trans, root, path, &key, copy_bytes); + if (ret) { + btrfs_end_transaction(trans); + break; + } + + leaf = path->nodes[0]; + + data = btrfs_item_ptr(leaf, path->slots[0], void); + write_extent_buffer(leaf, src + src_offset, + (unsigned long)data, copy_bytes); + offset += copy_bytes; + src_offset += copy_bytes; + len -= copy_bytes; + + btrfs_release_path(path); + btrfs_end_transaction(trans); + } + + btrfs_free_path(path); + return ret; +} + +/* + * Read inode items of the given key type and offset from the btree. + * + * @inode: inode to read items of + * @key_type: key type to read + * @offset: item offset to read from + * @dest: Buffer to read into. This parameter has slightly tricky + * semantics. If it is NULL, the function will not do any copying + * and will just return the size of all the items up to len bytes. + * If dest_page is passed, then the function will kmap_local the + * page and ignore dest, but it must still be non-NULL to avoid the + * counting-only behavior. + * @len: length in bytes to read + * @dest_page: copy into this page instead of the dest buffer + * + * Helper function to read items from the btree. 
This returns the number of + * bytes read or < 0 for errors. We can return short reads if the items don't + * exist on disk or aren't big enough to fill the desired length. Supports + * reading into a provided buffer (dest) or into the page cache + * + * Returns number of bytes read or a negative error code on failure. + */ +static int read_key_bytes(struct btrfs_inode *inode, u8 key_type, u64 offset, + char *dest, u64 len, struct page *dest_page) +{ + struct btrfs_path *path; + struct btrfs_root *root = inode->root; + struct extent_buffer *leaf; + struct btrfs_key key; + u64 item_end; + u64 copy_end; + int copied = 0; + u32 copy_offset; + unsigned long copy_bytes; + unsigned long dest_offset = 0; + void *data; + char *kaddr = dest; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (dest_page) + path->reada = READA_FORWARD; + + key.objectid = btrfs_ino(inode); + key.type = key_type; + key.offset = offset; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto out; + } else if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + + while (len > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != btrfs_ino(inode) || key.type != key_type) + break; + + item_end = btrfs_item_size_nr(leaf, path->slots[0]) + key.offset; + + if (copied > 0) { + /* + * Once we've copied something, we want all of the items + * to be sequential + */ + if (key.offset != offset) + break; + } else { + /* + * Our initial offset might be in the middle of an + * item. Make sure it all makes sense. 
+ */ + if (key.offset > offset) + break; + if (item_end <= offset) + break; + } + + /* dest = NULL to just sum all the item lengths */ + if (!dest) + copy_end = item_end; + else + copy_end = min(offset + len, item_end); + + /* Number of bytes in this item we want to copy */ + copy_bytes = copy_end - offset; + + /* Offset from the start of item for copying */ + copy_offset = offset - key.offset; + + if (dest) { + if (dest_page) + kaddr = kmap_local_page(dest_page); + + data = btrfs_item_ptr(leaf, path->slots[0], void); + read_extent_buffer(leaf, kaddr + dest_offset, + (unsigned long)data + copy_offset, + copy_bytes); + + if (dest_page) + kunmap_local(kaddr); + } + + offset += copy_bytes; + dest_offset += copy_bytes; + len -= copy_bytes; + copied += copy_bytes; + + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + /* + * We've reached the last slot in this leaf and we need + * to go to the next leaf. + */ + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + break; + } else if (ret > 0) { + ret = 0; + break; + } + } + } +out: + btrfs_free_path(path); + if (!ret) + ret = copied; + return ret; +} + +/* + * Delete an fsverity orphan + * + * @trans: transaction to do the delete in + * @inode: inode to orphan + * + * Capture verity orphan specific logic that is repeated in the couple places + * we delete verity orphans. Specifically, handling ENOENT and ignoring inodes + * with 0 links. + * + * Returns zero on success or a negative error code on failure. + */ +static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) +{ + struct btrfs_root *root = inode->root; + int ret; + + /* + * If the inode has no links, it is either already unlinked, or was + * created with O_TMPFILE. In either case, it should have an orphan from + * that other operation. Rather than reference count the orphans, we + * simply ignore them here, because we only invoke the verity path in + * the orphan logic when i_nlink is 1.
+ */ + if (!inode->vfs_inode.i_nlink) + return 0; + + ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); + if (ret == -ENOENT) + ret = 0; + return ret; +} + +/* + * Rollback in-progress verity if we encounter an error. + * + * @inode: inode verity had an error for + * + * We try to handle recoverable errors while enabling verity by rolling it back + * and just failing the operation, rather than having an fs level error no + * matter what. However, any error in rollback is unrecoverable. + * + * Returns 0 on success, negative error code on failure. + */ +static int rollback_verity(struct btrfs_inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = inode->root; + int ret; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + truncate_inode_pages(inode->vfs_inode.i_mapping, inode->vfs_inode.i_size); + clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + ret = btrfs_drop_verity_items(inode); + if (ret) { + btrfs_handle_fs_error(root->fs_info, ret, + "failed to drop verity items in rollback %llu", + (u64)inode->vfs_inode.i_ino); + goto out; + } + + /* + * 1 for updating the inode flag + * 1 for deleting the orphan + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_handle_fs_error(root->fs_info, ret, + "failed to start transaction in verity rollback %llu", + (u64)inode->vfs_inode.i_ino); + goto out; + } + inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + ret = del_orphan(trans, inode); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } + btrfs_end_transaction(trans); +out: + return ret; +} + +/* + * Finalize making the file a valid verity file + * + * @inode: inode to be marked as verity + * @desc: contents of the verity descriptor to write (not NULL) + * @desc_size: size of the verity 
descriptor + * + * Do the actual work of finalizing verity after successfully writing the Merkle + * tree: + * + * - write out the descriptor items + * - mark the inode with the verity flag + * - delete the orphan item + * - mark the ro compat bit + * - clear the in progress bit + * + * Returns 0 on success, negative error code on failure. + */ +static int finish_verity(struct btrfs_inode *inode, const void *desc, + size_t desc_size) +{ + struct btrfs_trans_handle *trans = NULL; + struct btrfs_root *root = inode->root; + struct btrfs_verity_descriptor_item item; + int ret; + + /* Write out the descriptor item */ + memset(&item, 0, sizeof(item)); + btrfs_set_stack_verity_descriptor_size(&item, desc_size); + ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 0, + (const char *)&item, sizeof(item)); + if (ret) + goto out; + + /* Write out the descriptor itself */ + ret = write_key_bytes(inode, BTRFS_VERITY_DESC_ITEM_KEY, 1, + desc, desc_size); + if (ret) + goto out; + + /* + * 1 for updating the inode flag + * 1 for deleting the orphan + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + inode->ro_flags |= BTRFS_INODE_RO_VERITY; + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto end_trans; + ret = del_orphan(trans, inode); + if (ret) + goto end_trans; + clear_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + btrfs_set_fs_compat_ro(root->fs_info, VERITY); +end_trans: + btrfs_end_transaction(trans); +out: + return ret; + +} + +/* + * fsverity op that begins enabling verity. + * + * @filp: file to enable verity on + * + * Begin enabling fsverity for the file. We drop any existing verity items, add + * an orphan and set the in progress bit. + * + * Returns 0 on success, negative error code on failure. 
+ */ +static int btrfs_begin_enable_verity(struct file *filp) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(filp)); + struct btrfs_root *root = inode->root; + struct btrfs_trans_handle *trans; + int ret; + + ASSERT(inode_is_locked(file_inode(filp))); + + if (test_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags)) + return -EBUSY; + + /* + * This should almost never do anything, but theoretically, it's + * possible that we failed to enable verity on a file, then were + * interrupted or failed while rolling back, failed to cleanup the + * orphan, and finally attempt to enable verity again. + */ + ret = btrfs_drop_verity_items(inode); + if (ret) + return ret; + + /* 1 for the orphan item */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + ret = btrfs_orphan_add(trans, inode); + if (!ret) + set_bit(BTRFS_INODE_VERITY_IN_PROGRESS, &inode->runtime_flags); + btrfs_end_transaction(trans); + + /* Propagate a btrfs_orphan_add() failure instead of reporting success */ + return ret; +} + +/* + * fsverity op that ends enabling verity. + * + * @filp: file we are finishing enabling verity on + * @desc: verity descriptor to write out (NULL in error conditions) + * @desc_size: size of the verity descriptor (variable with signatures) + * @merkle_tree_size: size of the merkle tree in bytes + * + * If desc is null, then VFS is signaling an error occurred during verity + * enable, and we should try to rollback. Otherwise, attempt to finish verity. + * + * Returns 0 on success, negative error code on error.
+ */ +static int btrfs_end_enable_verity(struct file *filp, const void *desc, + size_t desc_size, u64 merkle_tree_size) +{ + struct btrfs_inode *inode = BTRFS_I(file_inode(filp)); + int ret = 0; + int rollback_ret; + + ASSERT(inode_is_locked(file_inode(filp))); + + if (desc == NULL) + goto rollback; + + ret = finish_verity(inode, desc, desc_size); + if (ret) + goto rollback; + return ret; + +rollback: + rollback_ret = rollback_verity(inode); + if (rollback_ret) + btrfs_err(inode->root->fs_info, + "failed to rollback verity items: %d", rollback_ret); + return ret; +} + +/* + * fsverity op that gets the struct fsverity_descriptor. + * + * @inode: inode to get the descriptor of + * @buf: output buffer for the descriptor contents + * @buf_size: size of the output buffer. 0 to query the size + * + * fsverity does a two pass setup for reading the descriptor, in the first pass + * it calls with buf_size = 0 to query the size of the descriptor, and then in + * the second pass it actually reads the descriptor off disk. + * + * Returns the size on success or a negative error code on failure. + */ +static int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + u64 true_size; + int ret = 0; + struct btrfs_verity_descriptor_item item; + + memset(&item, 0, sizeof(item)); + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 0, + (char *)&item, sizeof(item), NULL); + if (ret < 0) + return ret; + + if (item.reserved[0] != 0 || item.reserved[1] != 0) + return -EUCLEAN; + + true_size = btrfs_stack_verity_descriptor_size(&item); + if (true_size > INT_MAX) + return -EUCLEAN; + + if (buf_size == 0) + return true_size; + if (buf_size < true_size) + return -ERANGE; + + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_DESC_ITEM_KEY, 1, + buf, buf_size, NULL); + if (ret < 0) + return ret; + if (ret != true_size) + return -EIO; + + return true_size; +} + +/* + * fsverity op that reads and caches a merkle tree page. 
+ * + * @inode: inode to read a merkle tree page for + * @index: page index relative to the start of the merkle tree + * @num_ra_pages: number of pages to readahead. Optional, we ignore it + * + * The Merkle tree is stored in the filesystem btree, but its pages are cached + * with a logical position past EOF in the inode's mapping. + * + * Returns the page we read, or an ERR_PTR on error. + */ +static struct page *btrfs_read_merkle_tree_page(struct inode *inode, + pgoff_t index, + unsigned long num_ra_pages) +{ + struct page *page; + u64 off = (u64)index << PAGE_SHIFT; + loff_t merkle_pos = merkle_file_pos(inode); + int ret; + + if (merkle_pos < 0) + return ERR_PTR(merkle_pos); + if (merkle_pos > inode->i_sb->s_maxbytes - off - PAGE_SIZE) + return ERR_PTR(-EFBIG); + index += merkle_pos >> PAGE_SHIFT; +again: + page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); + if (page) { + if (PageUptodate(page)) + return page; + + lock_page(page); + /* + * We only insert uptodate pages, so !Uptodate has to be + * an error + */ + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } + unlock_page(page); + return page; + } + + page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); + if (!page) + return ERR_PTR(-ENOMEM); + + /* + * Merkle item keys are indexed from byte 0 in the merkle tree. + * They have the form: + * + * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] + */ + ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, + page_address(page), PAGE_SIZE, page); + if (ret < 0) { + put_page(page); + return ERR_PTR(ret); + } + if (ret < PAGE_SIZE) + memzero_page(page, ret, PAGE_SIZE - ret); + + SetPageUptodate(page); + ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS); + + if (!ret) { + /* Inserted and ready for fsverity */ + unlock_page(page); + } else { + put_page(page); + /* Did someone race us into inserting this page? 
*/ + if (ret == -EEXIST) + goto again; + page = ERR_PTR(ret); + } + return page; +} + +/* + * fsverity op that writes a Merkle tree block into the btree. + * + * @inode: inode to write a Merkle tree block for + * @buf: Merkle tree data block to write + * @index: index of the block in the Merkle tree + * @log_blocksize: log base 2 of the Merkle tree block size + * + * Note that the block size could be different from the page size, so it is not + * safe to assume that index is a page index. + * + * Returns 0 on success or negative error code on failure + */ +static int btrfs_write_merkle_tree_block(struct inode *inode, const void *buf, + u64 index, int log_blocksize) +{ + u64 off = index << log_blocksize; + u64 len = 1ULL << log_blocksize; + loff_t merkle_pos = merkle_file_pos(inode); + + if (merkle_pos < 0) + return merkle_pos; + if (merkle_pos > inode->i_sb->s_maxbytes - off - len) + return -EFBIG; + + return write_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, + off, buf, len); +} + +const struct fsverity_operations btrfs_verityops = { + .begin_enable_verity = btrfs_begin_enable_verity, + .end_enable_verity = btrfs_end_enable_verity, + .get_verity_descriptor = btrfs_get_verity_descriptor, + .read_merkle_tree_page = btrfs_read_merkle_tree_page, + .write_merkle_tree_block = btrfs_write_merkle_tree_block, +}; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 70f94b75f25a..464485aa7318 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -38,7 +38,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .sub_stripes = 2, .dev_stripes = 1, .devs_max = 0, /* 0 == as many as possible */ - .devs_min = 4, + .devs_min = 2, .tolerated_failures = 1, .devs_increment = 2, .ncopies = 2, @@ -103,7 +103,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .sub_stripes = 1, .dev_stripes = 1, .devs_max = 0, - .devs_min = 2, + .devs_min = 1, .tolerated_failures = 0, .devs_increment = 1, .ncopies = 1, @@ -153,6 +153,32 
@@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; +/* + * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which + * can be used as index to access btrfs_raid_array[]. + */ +enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) +{ + if (flags & BTRFS_BLOCK_GROUP_RAID10) + return BTRFS_RAID_RAID10; + else if (flags & BTRFS_BLOCK_GROUP_RAID1) + return BTRFS_RAID_RAID1; + else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) + return BTRFS_RAID_RAID1C3; + else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) + return BTRFS_RAID_RAID1C4; + else if (flags & BTRFS_BLOCK_GROUP_DUP) + return BTRFS_RAID_DUP; + else if (flags & BTRFS_BLOCK_GROUP_RAID0) + return BTRFS_RAID_RAID0; + else if (flags & BTRFS_BLOCK_GROUP_RAID5) + return BTRFS_RAID_RAID5; + else if (flags & BTRFS_BLOCK_GROUP_RAID6) + return BTRFS_RAID_RAID6; + + return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ +} + const char *btrfs_bg_type_to_raid_name(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); @@ -404,44 +430,6 @@ void __exit btrfs_cleanup_fs_uuids(void) } } -/* - * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error. - * Returned struct is not linked onto any lists and must be destroyed using - * btrfs_free_device. 
- */ -static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) -{ - struct btrfs_device *dev; - - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) - return ERR_PTR(-ENOMEM); - - /* - * Preallocate a bio that's always going to be used for flushing device - * barriers and matches the device lifespan - */ - dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); - if (!dev->flush_bio) { - kfree(dev); - return ERR_PTR(-ENOMEM); - } - - INIT_LIST_HEAD(&dev->dev_list); - INIT_LIST_HEAD(&dev->dev_alloc_list); - INIT_LIST_HEAD(&dev->post_commit_list); - - atomic_set(&dev->reada_in_flight, 0); - atomic_set(&dev->dev_stats_ccnt, 0); - btrfs_device_data_ordered_init(dev); - INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - extent_io_tree_init(fs_info, &dev->alloc_state, - IO_TREE_DEVICE_ALLOC_STATE, NULL); - - return dev; -} - static noinline struct btrfs_fs_devices *find_fsid( const u8 *fsid, const u8 *metadata_fsid) { @@ -570,6 +558,8 @@ static int btrfs_free_stale_devices(const char *path, struct btrfs_device *device, *tmp_device; int ret = 0; + lockdep_assert_held(&uuid_mutex); + if (path) ret = -ENOENT; @@ -1000,11 +990,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) struct btrfs_device *orig_dev; int ret = 0; + lockdep_assert_held(&uuid_mutex); + fs_devices = alloc_fs_devices(orig->fsid, NULL); if (IS_ERR(fs_devices)) return fs_devices; - mutex_lock(&orig->device_list_mutex); fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { @@ -1036,10 +1027,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } - mutex_unlock(&orig->device_list_mutex); return fs_devices; error: - mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(ret); } @@ -1130,6 +1119,9 @@ 
static void btrfs_close_one_device(struct btrfs_device *device) fs_devices->rw_devices--; } + if (device->devid == BTRFS_DEV_REPLACE_DEVID) + clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) fs_devices->missing_devices--; @@ -1228,7 +1220,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, static int devid_cmp(void *priv, const struct list_head *a, const struct list_head *b) { - struct btrfs_device *dev1, *dev2; + const struct btrfs_device *dev1, *dev2; dev1 = list_entry(a, struct btrfs_device, dev_list); dev2 = list_entry(b, struct btrfs_device, dev_list); @@ -1598,14 +1590,9 @@ again: key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out; - if (ret > 0) { - ret = btrfs_previous_item(root, path, key.objectid, key.type); - if (ret < 0) - goto out; - } while (1) { l = path->nodes[0]; @@ -1759,48 +1746,6 @@ out: return ret; } -static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, - struct btrfs_device *device, - u64 chunk_offset, u64 start, u64 num_bytes) -{ - int ret; - struct btrfs_path *path; - struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_root *root = fs_info->dev_root; - struct btrfs_dev_extent *extent; - struct extent_buffer *leaf; - struct btrfs_key key; - - WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)); - WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - key.objectid = device->devid; - key.offset = start; - key.type = BTRFS_DEV_EXTENT_KEY; - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*extent)); - if (ret) - goto out; - - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_dev_extent); - btrfs_set_dev_extent_chunk_tree(leaf, extent, - 
BTRFS_CHUNK_TREE_OBJECTID); - btrfs_set_dev_extent_chunk_objectid(leaf, extent, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); - btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - - btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(leaf); -out: - btrfs_free_path(path); - return ret; -} - static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree; @@ -1925,15 +1870,17 @@ out: * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. */ -static void update_dev_time(const char *path_name) +static void update_dev_time(struct block_device *bdev) { - struct file *filp; + struct inode *inode = bdev->bd_inode; + struct timespec64 now; - filp = filp_open(path_name, O_RDWR, 0); - if (IS_ERR(filp)) + /* Shouldn't happen but just in case. */ + if (!inode) return; - file_update_time(filp); - filp_close(filp, NULL); + + now = current_time(inode); + generic_update_time(inode, &now, S_MTIME | S_CTIME); } static int btrfs_rm_dev_item(struct btrfs_device *device) @@ -2003,12 +1950,8 @@ static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info, if (!(all_avail & btrfs_raid_array[i].bg_flag)) continue; - if (num_devices < btrfs_raid_array[i].devs_min) { - int ret = btrfs_raid_array[i].mindev_error; - - if (ret) - return ret; - } + if (num_devices < btrfs_raid_array[i].devs_min) + return btrfs_raid_array[i].mindev_error; } return 0; @@ -2113,11 +2056,11 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(device_path); + update_dev_time(bdev); } int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid, struct block_device **bdev, fmode_t *mode) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2137,7 +2080,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, 
const char *device_path, if (IS_ERR(device)) { if (PTR_ERR(device) == -ENOENT && - strcmp(device_path, "missing") == 0) + device_path && strcmp(device_path, "missing") == 0) ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else ret = PTR_ERR(device); @@ -2231,15 +2174,26 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_unlock(&fs_devices->device_list_mutex); /* - * at this point, the device is zero sized and detached from - * the devices list. All that's left is to zero out the old - * supers and free the device. + * At this point, the device is zero sized and detached from the + * devices list. All that's left is to zero out the old supers and + * free the device. + * + * We cannot call btrfs_close_bdev() here because we're holding the sb + * write lock, and blkdev_put() will pull in the ->open_mutex on the + * block device and it's dependencies. Instead just flush the device + * and let the caller do the final blkdev_put. */ - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device->bdev, device->name->str); + if (device->bdev) { + sync_blockdev(device->bdev); + invalidate_bdev(device->bdev); + } + } - btrfs_close_bdev(device); + *bdev = device->bdev; + *mode = device->mode; synchronize_rcu(); btrfs_free_device(device); @@ -2766,7 +2720,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_forget_devices(device_path); /* Update ctime/mtime for blkid or udev */ - update_dev_time(device_path); + update_dev_time(bdev); return ret; @@ -3622,10 +3576,7 @@ static u64 calc_data_stripes(u64 type, int num_stripes) const int ncopies = btrfs_raid_array[index].ncopies; const int nparity = btrfs_raid_array[index].nparity; - if (nparity) - return num_stripes - nparity; - else - return num_stripes / ncopies; + return (num_stripes - nparity) / ncopies; } /* [pstart, pend) */ @@ -4025,6 +3976,13 @@ static 
inline int validate_convert_profile(struct btrfs_fs_info *fs_info, if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return true; + if (fs_info->sectorsize < PAGE_SIZE && + bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { + btrfs_err(fs_info, + "RAID56 is not yet supported for sectorsize %u with page size %lu", + fs_info->sectorsize, PAGE_SIZE); + return false; + } /* Profile is valid and does not have bits outside of the allowed set */ if (alloc_profile_is_valid(bargs->target, 1) && (bargs->target & ~allowed) == 0) @@ -5464,56 +5422,6 @@ out: } /* - * This function, btrfs_finish_chunk_alloc(), belongs to phase 2. - * - * See the comment at btrfs_chunk_alloc() for details about the chunk allocation - * phases. - */ -int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, - u64 chunk_offset, u64 chunk_size) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_device *device; - struct extent_map *em; - struct map_lookup *map; - u64 dev_offset; - u64 stripe_size; - int i; - int ret = 0; - - em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size); - if (IS_ERR(em)) - return PTR_ERR(em); - - map = em->map_lookup; - stripe_size = em->orig_block_len; - - /* - * Take the device list mutex to prevent races with the final phase of - * a device replace operation that replaces the device object associated - * with the map's stripes, because the device object's id can change - * at any time during that final phase of the device replace operation - * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the - * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID, - * resulting in persisting a device extent item with such ID. 
- */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); - for (i = 0; i < map->num_stripes; i++) { - device = map->stripes[i].dev; - dev_offset = map->stripes[i].physical; - - ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, - dev_offset, stripe_size); - if (ret) - break; - } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - - free_extent_map(em); - return ret; -} - -/* * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system * chunks. @@ -6923,9 +6831,31 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - dev = __alloc_device(fs_info); - if (IS_ERR(dev)) - return dev; + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + /* + * Preallocate a bio that's always going to be used for flushing device + * barriers and matches the device lifespan + */ + dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0); + if (!dev->flush_bio) { + kfree(dev); + return ERR_PTR(-ENOMEM); + } + + INIT_LIST_HEAD(&dev->dev_list); + INIT_LIST_HEAD(&dev->dev_alloc_list); + INIT_LIST_HEAD(&dev->post_commit_list); + + atomic_set(&dev->reada_in_flight, 0); + atomic_set(&dev->dev_stats_ccnt, 0); + btrfs_device_data_ordered_init(dev); + INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + extent_io_tree_init(fs_info, &dev->alloc_state, + IO_TREE_DEVICE_ALLOC_STATE, NULL); if (devid) tmp = *devid; @@ -6961,15 +6891,7 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) { - int index = btrfs_bg_flags_to_raid_index(type); - int ncopies = btrfs_raid_array[index].ncopies; - const int nparity = btrfs_raid_array[index].nparity; - int data_stripes; - - if (nparity) - data_stripes = num_stripes - 
nparity; - else - data_stripes = num_stripes / ncopies; + const int data_stripes = calc_data_stripes(type, num_stripes); return div_u64(chunk_len, data_stripes); } @@ -8144,7 +8066,7 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) goto out; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_item(root, path); + ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; /* No dev extents at all? Not good */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 55a8ba244716..2183361db614 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -472,7 +472,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u8 *uuid); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, - const char *device_path, u64 devid); + const char *device_path, u64 devid, + struct block_device **bdev, fmode_t *mode); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -508,8 +509,6 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); -int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, - u64 chunk_offset, u64 chunk_size); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); @@ -568,32 +567,6 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } -/* - * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which - * can be used as index to access btrfs_raid_array[]. 
- */ -static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) -{ - if (flags & BTRFS_BLOCK_GROUP_RAID10) - return BTRFS_RAID_RAID10; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - return BTRFS_RAID_RAID1; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) - return BTRFS_RAID_RAID1C3; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) - return BTRFS_RAID_RAID1C4; - else if (flags & BTRFS_BLOCK_GROUP_DUP) - return BTRFS_RAID_DUP; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - return BTRFS_RAID_RAID0; - else if (flags & BTRFS_BLOCK_GROUP_RAID5) - return BTRFS_RAID_RAID5; - else if (flags & BTRFS_BLOCK_GROUP_RAID6) - return BTRFS_RAID_RAID6; - - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ -} - void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); @@ -603,6 +576,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct block_device *bdev, const char *device_path); +enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags); int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c3fa7d3fa770..8afa90074891 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -121,12 +121,12 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,26 +148,22 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); - } in_page = 
find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = page_address(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); - } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = page_address(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -196,18 +192,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -234,18 +229,17 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ - kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -264,13 +258,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); - } return ret; } @@ -286,10 +275,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) 
unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - u64 disk_start = cb->start; - struct bio *orig_bio = cb->orig_bio; - data_in = kmap(pages_in[page_in_index]); + data_in = page_address(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -311,7 +298,6 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); - kunmap(pages_in[page_in_index]); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -326,9 +312,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (buf_start == total_out) break; - ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, - total_out, disk_start, - orig_bio); + ret2 = btrfs_decompress_buf2page(workspace->buf, + total_out - buf_start, cb, buf_start); if (ret2 == 0) { ret = 0; goto done; @@ -339,17 +324,16 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - kunmap(pages_in[page_in_index]); + page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = kmap(pages_in[page_in_index]); + data_in = page_address(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; - workspace->strm.avail_in = min(tmp, - PAGE_SIZE); + workspace->strm.avail_in = min(tmp, PAGE_SIZE); } } if (ret != Z_STREAM_END) @@ -358,10 +342,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; done: zlib_inflateEnd(&workspace->strm); - if (data_in) - kunmap(pages_in[page_in_index]); if (!ret) - zero_fill_bio(orig_bio); + zero_fill_bio(cb->orig_bio); return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 907c2cc45c9c..47af1ab3bf12 
100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -245,7 +245,7 @@ static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info) goto out; if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_item(root, path); + ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; /* No dev extents at all? Not good */ @@ -296,7 +296,6 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; - struct request_queue *queue = bdev_get_queue(bdev); sector_t nr_sectors; sector_t sector = 0; struct blk_zone *zones = NULL; @@ -348,19 +347,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); - zone_info->max_zone_append_size = - (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT; zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; - if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) { - btrfs_err(fs_info, "zoned: device %pg does not support zone append", - bdev); - ret = -EINVAL; - goto out; - } - zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; @@ -529,7 +519,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 zoned_devices = 0; u64 nr_devices = 0; u64 zone_size = 0; - u64 max_zone_append_size = 0; const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; @@ -565,11 +554,6 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) ret = -EINVAL; goto out; } - if (!max_zone_append_size || - (zone_info->max_zone_append_size && - zone_info->max_zone_append_size < max_zone_append_size)) - max_zone_append_size = - zone_info->max_zone_append_size; } nr_devices++; } @@ -619,7 +603,6 @@ int 
btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; - fs_info->max_zone_append_size = max_zone_append_size; fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; /* @@ -1318,9 +1301,6 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!btrfs_is_zoned(fs_info)) return false; - if (!fs_info->max_zone_append_size) - return false; - if (!is_data_inode(&inode->vfs_inode)) return false; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index b0ae2608cb6b..4b299705bb12 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -22,7 +22,6 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; - u64 max_zone_append_size; u32 nr_zones; unsigned long *seq_zones; unsigned long *empty_zones; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 3e26b466476a..56dce9f00988 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -399,19 +399,19 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = page_address(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); /* Allocate and map in the output buffer */ - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -446,19 +446,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | 
__GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -473,13 +472,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; - kunmap(in_page); put_page(in_page); start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = page_address(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -506,19 +504,18 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + out_page = alloc_page(GFP_NOFS); if (out_page == NULL) { ret = -ENOMEM; goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -534,12 +531,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, out: *out_pages = nr_pages; /* Cleanup */ - if (in_page) { - kunmap(in_page); + if (in_page) put_page(in_page); - } - if (out_page) - kunmap(out_page); return ret; } @@ -547,8 +540,6 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) { struct workspace *workspace = list_entry(ws, struct workspace, list); struct page **pages_in = cb->compressed_pages; - u64 disk_start = cb->start; - struct bio *orig_bio = cb->orig_bio; size_t srclen = 
cb->compressed_len; ZSTD_DStream *stream; int ret = 0; @@ -565,7 +556,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = page_address(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -589,7 +580,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->out_buf.pos = 0; ret = btrfs_decompress_buf2page(workspace->out_buf.dst, - buf_start, total_out, disk_start, orig_bio); + total_out - buf_start, cb, buf_start); if (ret == 0) break; @@ -601,23 +592,21 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - kunmap(pages_in[page_in_index++]); + page_in_index++; if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = page_address(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } } ret = 0; - zero_fill_bio(orig_bio); + zero_fill_bio(cb->orig_bio); done: - if (workspace->in_buf.src) - kunmap(pages_in[page_in_index]); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 6290c3afdba4..ab7573d72dd7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1912,7 +1912,7 @@ EXPORT_SYMBOL(page_zero_new_buffers); static void iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, - struct iomap *iomap) + const struct iomap *iomap) { loff_t offset = block << inode->i_blkbits; @@ -1966,7 +1966,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, } int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap) + get_block_t *get_block, const struct iomap *iomap) { unsigned from = pos & 
(PAGE_SIZE - 1); unsigned to = from + len; @@ -3268,33 +3268,6 @@ out: EXPORT_SYMBOL(try_to_free_buffers); /* - * There are no bdflush tunables left. But distributions are - * still running obsolete flush daemons, so we terminate them here. - * - * Use of bdflush() is deprecated and will be removed in a future kernel. - * The `flush-X' kernel threads fully replace bdflush daemons and this call. - */ -SYSCALL_DEFINE2(bdflush, int, func, long, data) -{ - static int msg_count; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (msg_count < 5) { - msg_count++; - printk(KERN_INFO - "warning: process `%s' used the obsolete bdflush" - " system call\n", current->comm); - printk(KERN_INFO "Fix your initscripts?\n"); - } - - if (func == 1) - do_exit(0); - return 0; -} - -/* * Buffer-head allocation */ static struct kmem_cache *bh_cachep __read_mostly; diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig index ff9ca55a9ae9..6827b40f7ddc 100644 --- a/fs/cachefiles/Kconfig +++ b/fs/cachefiles/Kconfig @@ -19,22 +19,3 @@ config CACHEFILES_DEBUG caching on files module. If this is set, the debugging output may be enabled by setting bits in /sys/modules/cachefiles/parameter/debug or by including a debugging specifier in /etc/cachefilesd.conf. - -config CACHEFILES_HISTOGRAM - bool "Gather latency information on CacheFiles" - depends on CACHEFILES && PROC_FS - help - - This option causes latency information to be gathered on CacheFiles - operation and exported through file: - - /proc/fs/cachefiles/histogram - - The generation of this histogram adds a certain amount of overhead to - execution as there are a number of points at which data is gathered, - and on a multi-CPU system these may be on cachelines that keep - bouncing between CPUs. On the other hand, the histogram may be - useful for debugging purposes. Saying 'N' here is recommended. - - See Documentation/filesystems/caching/cachefiles.rst for more - information. 
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 2227dc2d5498..02fd17731769 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -15,6 +15,4 @@ cachefiles-y := \ security.o \ xattr.o -cachefiles-$(CONFIG_CACHEFILES_HISTOGRAM) += proc.o - obj-$(CONFIG_CACHEFILES) := cachefiles.o diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index 38bb7764b454..d463d89f5db8 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -108,8 +108,6 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) atomic_set(&fsdef->usage, 1); fsdef->type = FSCACHE_COOKIE_TYPE_INDEX; - _debug("- fsdef %p", fsdef); - /* look up the directory at the root of the cache */ ret = kern_path(cache->rootdirname, LOOKUP_DIRECTORY, &path); if (ret < 0) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index da3948fdb615..da28ac1fa225 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -33,7 +33,7 @@ static struct fscache_object *cachefiles_alloc_object( cache = container_of(_cache, struct cachefiles_cache, cache); - _enter("{%s},%p,", cache->cache.identifier, cookie); + _enter("{%s},%x,", cache->cache.identifier, cookie->debug_id); lookup_data = kmalloc(sizeof(*lookup_data), cachefiles_gfp); if (!lookup_data) @@ -96,7 +96,7 @@ static struct fscache_object *cachefiles_alloc_object( lookup_data->key = key; object->lookup_data = lookup_data; - _leave(" = %p [%p]", &object->fscache, lookup_data); + _leave(" = %x [%p]", object->fscache.debug_id, lookup_data); return &object->fscache; nomem_key: @@ -379,7 +379,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache) const struct cred *saved_cred; int ret; - _enter("%p", _cache); + _enter("%s", _cache->tag->name); cache = container_of(_cache, struct cachefiles_cache, cache); diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 4ed83aa5253b..0a511c36dab8 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -181,31 
+181,6 @@ extern int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, char *filename); /* - * proc.c - */ -#ifdef CONFIG_CACHEFILES_HISTOGRAM -extern atomic_t cachefiles_lookup_histogram[HZ]; -extern atomic_t cachefiles_mkdir_histogram[HZ]; -extern atomic_t cachefiles_create_histogram[HZ]; - -extern int __init cachefiles_proc_init(void); -extern void cachefiles_proc_cleanup(void); -static inline -void cachefiles_hist(atomic_t histogram[], unsigned long start_jif) -{ - unsigned long jif = jiffies - start_jif; - if (jif >= HZ) - jif = HZ - 1; - atomic_inc(&histogram[jif]); -} - -#else -#define cachefiles_proc_init() (0) -#define cachefiles_proc_cleanup() do {} while (0) -#define cachefiles_hist(hist, start_jif) do {} while (0) -#endif - -/* * rdwr.c */ extern int cachefiles_read_or_alloc_page(struct fscache_retrieval *, diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index b13fb45fc3f3..fac2e8e7b533 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -70,7 +70,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres, _enter("%pD,%li,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, - i_size_read(file->f_inode)); + i_size_read(file_inode(file))); /* If the caller asked us to seek for data before doing the read, then * we should do that now. If we find a gap, we fill it with zeros. 
@@ -194,7 +194,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, _enter("%pD,%li,%llx,%zx/%llx", file, file_inode(file)->i_ino, start_pos, len, - i_size_read(file->f_inode)); + i_size_read(file_inode(file))); ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); if (!ki) @@ -410,7 +410,7 @@ int cachefiles_begin_read_operation(struct netfs_read_request *rreq, rreq->cache_resources.cache_priv = op; rreq->cache_resources.cache_priv2 = file; rreq->cache_resources.ops = &cachefiles_netfs_cache_ops; - rreq->cookie_debug_id = object->fscache.debug_id; + rreq->cache_resources.debug_id = object->fscache.debug_id; _leave(""); return 0; diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index be96f5fc5cac..7f94efc97e23 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -150,6 +150,6 @@ char *cachefiles_cook_key(const u8 *raw, int keylen, uint8_t type) key[len++] = 0; key[len] = 0; - _leave(" = %p %d", key, len); + _leave(" = %s %d", key, len); return key; } diff --git a/fs/cachefiles/main.c b/fs/cachefiles/main.c index ddf0cd58d60c..9c8d34c49b12 100644 --- a/fs/cachefiles/main.c +++ b/fs/cachefiles/main.c @@ -69,15 +69,9 @@ static int __init cachefiles_init(void) goto error_object_jar; } - ret = cachefiles_proc_init(); - if (ret < 0) - goto error_proc; - pr_info("Loaded\n"); return 0; -error_proc: - kmem_cache_destroy(cachefiles_object_jar); error_object_jar: misc_deregister(&cachefiles_dev); error_dev: @@ -94,7 +88,6 @@ static void __exit cachefiles_exit(void) { pr_info("Unloading\n"); - cachefiles_proc_cleanup(); kmem_cache_destroy(cachefiles_object_jar); misc_deregister(&cachefiles_dev); } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7bf0732ae25c..a9aca5ab5970 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -39,18 +39,18 @@ void __cachefiles_printk_object(struct cachefiles_object *object, pr_err("%sops=%u inp=%u exc=%u\n", prefix, object->fscache.n_ops, object->fscache.n_in_progress, 
object->fscache.n_exclusive); - pr_err("%sparent=%p\n", - prefix, object->fscache.parent); + pr_err("%sparent=%x\n", + prefix, object->fscache.parent ? object->fscache.parent->debug_id : 0); spin_lock(&object->fscache.lock); cookie = object->fscache.cookie; if (cookie) { - pr_err("%scookie=%p [pr=%p nd=%p fl=%lx]\n", + pr_err("%scookie=%x [pr=%x nd=%p fl=%lx]\n", prefix, - object->fscache.cookie, - object->fscache.cookie->parent, - object->fscache.cookie->netfs_data, - object->fscache.cookie->flags); + cookie->debug_id, + cookie->parent ? cookie->parent->debug_id : 0, + cookie->netfs_data, + cookie->flags); pr_err("%skey=[%u] '", prefix, cookie->key_len); k = (cookie->key_len <= sizeof(cookie->inline_key)) ? cookie->inline_key : cookie->key; @@ -110,7 +110,7 @@ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, /* found the dentry for */ found_dentry: - kdebug("preemptive burial: OBJ%x [%s] %p", + kdebug("preemptive burial: OBJ%x [%s] %pd", object->fscache.debug_id, object->fscache.state->name, dentry); @@ -140,7 +140,7 @@ static int cachefiles_mark_object_active(struct cachefiles_cache *cache, struct rb_node **_p, *_parent = NULL; struct dentry *dentry; - _enter(",%p", object); + _enter(",%x", object->fscache.debug_id); try_again: write_lock(&cache->active_lock); @@ -298,8 +298,6 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, _enter(",'%pd','%pd'", dir, rep); - _debug("remove %p from %p", rep, dir); - /* non-directories can just be unlinked */ if (!d_is_dir(rep)) { _debug("unlink stale object"); @@ -446,7 +444,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, struct dentry *dir; int ret; - _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); + _enter(",OBJ%x{%pd}", object->fscache.debug_id, object->dentry); ASSERT(object->dentry); ASSERT(d_backing_inode(object->dentry)); @@ -496,11 +494,10 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, struct dentry *dir, *next = NULL; 
struct inode *inode; struct path path; - unsigned long start; const char *name; int ret, nlen; - _enter("OBJ%x{%p},OBJ%x,%s,", + _enter("OBJ%x{%pd},OBJ%x,%s,", parent->fscache.debug_id, parent->dentry, object->fscache.debug_id, key); @@ -535,9 +532,7 @@ lookup_again: inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - start = jiffies; next = lookup_one_len(name, dir, nlen); - cachefiles_hist(cachefiles_lookup_histogram, start); if (IS_ERR(next)) { trace_cachefiles_lookup(object, next, NULL); goto lookup_error; @@ -545,7 +540,7 @@ lookup_again: inode = d_backing_inode(next); trace_cachefiles_lookup(object, next, inode); - _debug("next -> %p %s", next, inode ? "positive" : "negative"); + _debug("next -> %pd %s", next, inode ? "positive" : "negative"); if (!key) object->new = !inode; @@ -568,9 +563,7 @@ lookup_again: ret = security_path_mkdir(&path, next, 0); if (ret < 0) goto create_error; - start = jiffies; ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0); - cachefiles_hist(cachefiles_mkdir_histogram, start); if (!key) trace_cachefiles_mkdir(object, next, ret); if (ret < 0) @@ -583,8 +576,8 @@ lookup_again: } ASSERT(d_backing_inode(next)); - _debug("mkdir -> %p{%p{ino=%lu}}", - next, d_backing_inode(next), d_backing_inode(next)->i_ino); + _debug("mkdir -> %pd{ino=%lu}", + next, d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next)) { pr_err("inode %lu is not a directory\n", @@ -604,18 +597,16 @@ lookup_again: ret = security_path_mknod(&path, next, S_IFREG, 0); if (ret < 0) goto create_error; - start = jiffies; ret = vfs_create(&init_user_ns, d_inode(dir), next, S_IFREG, true); - cachefiles_hist(cachefiles_create_histogram, start); trace_cachefiles_create(object, next, ret); if (ret < 0) goto create_error; ASSERT(d_backing_inode(next)); - _debug("create -> %p{%p{ino=%lu}}", - next, d_backing_inode(next), d_backing_inode(next)->i_ino); + _debug("create -> %pd{ino=%lu}", + next, d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next) && 
!d_is_reg(next) @@ -765,7 +756,6 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, const char *dirname) { struct dentry *subdir; - unsigned long start; struct path path; int ret; @@ -775,16 +765,14 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, inode_lock(d_inode(dir)); retry: - start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); - cachefiles_hist(cachefiles_lookup_histogram, start); if (IS_ERR(subdir)) { if (PTR_ERR(subdir) == -ENOMEM) goto nomem_d_alloc; goto lookup_error; } - _debug("subdir -> %p %s", + _debug("subdir -> %pd %s", subdir, d_backing_inode(subdir) ? "positive" : "negative"); /* we need to create the subdir if it doesn't exist yet */ @@ -810,10 +798,8 @@ retry: } ASSERT(d_backing_inode(subdir)); - _debug("mkdir -> %p{%p{ino=%lu}}", - subdir, - d_backing_inode(subdir), - d_backing_inode(subdir)->i_ino); + _debug("mkdir -> %pd{ino=%lu}", + subdir, d_backing_inode(subdir)->i_ino); } inode_unlock(d_inode(dir)); @@ -876,7 +862,6 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, struct cachefiles_object *object; struct rb_node *_n; struct dentry *victim; - unsigned long start; int ret; //_enter(",%pd/,%s", @@ -885,13 +870,11 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, /* look up the victim */ inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); - cachefiles_hist(cachefiles_lookup_histogram, start); if (IS_ERR(victim)) goto lookup_error; - //_debug("victim -> %p %s", + //_debug("victim -> %pd %s", // victim, d_backing_inode(victim) ? 
"positive" : "negative"); /* if the object is no longer there then we probably retired the object @@ -922,7 +905,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, read_unlock(&cache->active_lock); - //_leave(" = %p", victim); + //_leave(" = %pd", victim); return victim; object_in_use: @@ -968,7 +951,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - _debug("victim -> %p %s", + _debug("victim -> %pd %s", victim, d_backing_inode(victim) ? "positive" : "negative"); /* okay... the victim is not being used so we can cull it diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c deleted file mode 100644 index 6e67aea0f24e..000000000000 --- a/fs/cachefiles/proc.c +++ /dev/null @@ -1,114 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* CacheFiles statistics - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include "internal.h" - -atomic_t cachefiles_lookup_histogram[HZ]; -atomic_t cachefiles_mkdir_histogram[HZ]; -atomic_t cachefiles_create_histogram[HZ]; - -/* - * display the latency histogram - */ -static int cachefiles_histogram_show(struct seq_file *m, void *v) -{ - unsigned long index; - unsigned x, y, z, t; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "JIFS SECS LOOKUPS MKDIRS CREATES\n"); - return 0; - case 2: - seq_puts(m, "===== ===== ========= ========= =========\n"); - return 0; - default: - index = (unsigned long) v - 3; - x = atomic_read(&cachefiles_lookup_histogram[index]); - y = atomic_read(&cachefiles_mkdir_histogram[index]); - z = atomic_read(&cachefiles_create_histogram[index]); - if (x == 0 && y == 0 && z == 0) - return 0; - - t = (index * 1000) / HZ; - - seq_printf(m, "%4lu 0.%03u %9u %9u %9u\n", index, t, x, y, z); - return 0; - } -} - -/* - * set up the iterator to start 
reading from the first line - */ -static void *cachefiles_histogram_start(struct seq_file *m, loff_t *_pos) -{ - if ((unsigned long long)*_pos >= HZ + 2) - return NULL; - if (*_pos == 0) - *_pos = 1; - return (void *)(unsigned long) *_pos; -} - -/* - * move to the next line - */ -static void *cachefiles_histogram_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return (unsigned long long)*pos > HZ + 2 ? - NULL : (void *)(unsigned long) *pos; -} - -/* - * clean up after reading - */ -static void cachefiles_histogram_stop(struct seq_file *m, void *v) -{ -} - -static const struct seq_operations cachefiles_histogram_ops = { - .start = cachefiles_histogram_start, - .stop = cachefiles_histogram_stop, - .next = cachefiles_histogram_next, - .show = cachefiles_histogram_show, -}; - -/* - * initialise the /proc/fs/cachefiles/ directory - */ -int __init cachefiles_proc_init(void) -{ - _enter(""); - - if (!proc_mkdir("fs/cachefiles", NULL)) - goto error_dir; - - if (!proc_create_seq("fs/cachefiles/histogram", S_IFREG | 0444, NULL, - &cachefiles_histogram_ops)) - goto error_histogram; - - _leave(" = 0"); - return 0; - -error_histogram: - remove_proc_entry("fs/cachefiles", NULL); -error_dir: - _leave(" = -ENOMEM"); - return -ENOMEM; -} - -/* - * clean up the /proc/fs/cachefiles/ directory - */ -void cachefiles_proc_cleanup(void) -{ - remove_proc_entry("fs/cachefiles/histogram", NULL); - remove_proc_entry("fs/cachefiles", NULL); -} diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index a591b5e09637..9e82de668595 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -36,7 +36,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) else snprintf(type, 3, "%02x", object->fscache.cookie->def->type); - _enter("%p{%s}", object, type); + _enter("%x{%s}", object->fscache.debug_id, type); /* attempt to install a type label directly */ ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type, @@ -134,7 +134,7 @@ int 
cachefiles_update_object_xattr(struct cachefiles_object *object, if (!dentry) return -ESTALE; - _enter("%p,#%d", object, auxdata->len); + _enter("%x,#%d", object->fscache.debug_id, auxdata->len); /* attempt to install the cache metadata directly */ _debug("SET #%u", auxdata->len); diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 529af59d9fd3..f4fc8e0b847c 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -29,7 +29,7 @@ static inline void ceph_set_cached_acl(struct inode *inode, spin_unlock(&ci->i_ceph_lock); } -struct posix_acl *ceph_get_acl(struct inode *inode, int type) +struct posix_acl *ceph_get_acl(struct inode *inode, int type, bool rcu) { int size; unsigned int retry_cnt = 0; @@ -37,6 +37,9 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) char *value = NULL; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7e7a897ae0d3..99b80b5c7a93 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1281,8 +1281,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); - /* zero the stale part of the page if we did a short copy */ if (!PageUptodate(page)) { + /* just return that nothing was copied on a short copy */ if (copied < len) { copied = 0; goto out; diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 1409d6149281..058ea2a04376 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -26,12 +26,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci); void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp); void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci); -int ceph_readpage_from_fscache(struct inode *inode, struct page *page); -int ceph_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head 
*pages, - unsigned *nr_pages); - static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { ci->fscache = NULL; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 39db97f149b9..6c0e52fd0743 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -703,29 +703,12 @@ void ceph_add_cap(struct inode *inode, */ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); - if (realm) { - struct ceph_snap_realm *oldrealm = ci->i_snap_realm; - if (oldrealm) { - spin_lock(&oldrealm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - spin_unlock(&oldrealm->inodes_with_caps_lock); - } - - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - if (realm->ino == ci->i_vino.ino) - realm->inode = inode; - spin_unlock(&realm->inodes_with_caps_lock); - - if (oldrealm) - ceph_put_snap_realm(mdsc, oldrealm); - } else { - pr_err("ceph_add_cap: couldn't find snap realm %llx\n", - realmino); - WARN_ON(!realm); - } + if (realm) + ceph_change_snap_realm(inode, realm); + else + WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n", + __func__, realmino, ci->i_vino.ino, + ci->i_snap_realm ? ci->i_snap_realm->ino : 0); } __check_cap_issue(ci, cap, issued); @@ -1112,20 +1095,6 @@ int ceph_is_any_caps(struct inode *inode) return ret; } -static void drop_inode_snap_realm(struct ceph_inode_info *ci) -{ - struct ceph_snap_realm *realm = ci->i_snap_realm; - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm_counter++; - ci->i_snap_realm = NULL; - if (realm->ino == ci->i_vino.ino) - realm->inode = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, - realm); -} - /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. 
* @@ -1145,17 +1114,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) return; } + lockdep_assert_held(&ci->i_ceph_lock); + dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); - if (ci->i_auth_cap == cap) { - WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) && - !mdsc->fsc->blocklisted); + if (ci->i_auth_cap == cap) ci->i_auth_cap = NULL; - } /* remove from session list */ spin_lock(&session->s_cap_lock); @@ -1201,12 +1169,34 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) * keep i_snap_realm. */ if (ci->i_wr_ref == 0 && ci->i_snap_realm) - drop_inode_snap_realm(ci); + ceph_change_snap_realm(&ci->vfs_inode, NULL); __cap_delay_cancel(mdsc, ci); } } +void ceph_remove_cap(struct ceph_cap *cap, bool queue_release) +{ + struct ceph_inode_info *ci = cap->ci; + struct ceph_fs_client *fsc; + + /* 'ci' being NULL means the remove have already occurred */ + if (!ci) { + dout("%s: cap inode is NULL\n", __func__); + return; + } + + lockdep_assert_held(&ci->i_ceph_lock); + + fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + WARN_ON_ONCE(ci->i_auth_cap == cap && + !list_empty(&ci->i_dirty_item) && + !fsc->blocklisted && + READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN); + + __ceph_remove_cap(cap, queue_release); +} + struct cap_msg_args { struct ceph_mds_session *session; u64 ino, cid, follows; @@ -1335,7 +1325,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); p = rb_next(p); - __ceph_remove_cap(cap, true); + ceph_remove_cap(cap, true); } spin_unlock(&ci->i_ceph_lock); } @@ -1746,6 +1736,9 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void) struct ceph_cap_flush *cf; cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); + if (!cf) + return NULL; + cf->is_capsnap = false; return cf; } @@ -1856,6 +1849,8 @@ 
static u64 __mark_caps_flushing(struct inode *inode, * try to invalidate mapping pages without blocking. */ static int try_nonblocking_invalidate(struct inode *inode) + __releases(ci->i_ceph_lock) + __acquires(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; @@ -2219,6 +2214,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid) */ static int unsafe_request_wait(struct inode *inode) { + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; int ret, err = 0; @@ -2238,6 +2234,81 @@ static int unsafe_request_wait(struct inode *inode) } spin_unlock(&ci->i_unsafe_lock); + /* + * Trigger to flush the journal logs in all the relevant MDSes + * manually, or in the worst case we must wait at most 5 seconds + * to wait the journal logs to be flushed by the MDSes periodically. + */ + if (req1 || req2) { + struct ceph_mds_session **sessions = NULL; + struct ceph_mds_session *s; + struct ceph_mds_request *req; + unsigned int max; + int i; + + /* + * The mdsc->max_sessions is unlikely to be changed + * mostly, here we will retry it by reallocating the + * sessions arrary memory to get rid of the mdsc->mutex + * lock. 
+ */ +retry: + max = mdsc->max_sessions; + sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO); + if (!sessions) + return -ENOMEM; + + spin_lock(&ci->i_unsafe_lock); + if (req1) { + list_for_each_entry(req, &ci->i_unsafe_dirops, + r_unsafe_dir_item) { + s = req->r_session; + if (unlikely(s->s_mds > max)) { + spin_unlock(&ci->i_unsafe_lock); + goto retry; + } + if (!sessions[s->s_mds]) { + s = ceph_get_mds_session(s); + sessions[s->s_mds] = s; + } + } + } + if (req2) { + list_for_each_entry(req, &ci->i_unsafe_iops, + r_unsafe_target_item) { + s = req->r_session; + if (unlikely(s->s_mds > max)) { + spin_unlock(&ci->i_unsafe_lock); + goto retry; + } + if (!sessions[s->s_mds]) { + s = ceph_get_mds_session(s); + sessions[s->s_mds] = s; + } + } + } + spin_unlock(&ci->i_unsafe_lock); + + /* the auth MDS */ + spin_lock(&ci->i_ceph_lock); + if (ci->i_auth_cap) { + s = ci->i_auth_cap->session; + if (!sessions[s->s_mds]) + sessions[s->s_mds] = ceph_get_mds_session(s); + } + spin_unlock(&ci->i_ceph_lock); + + /* send flush mdlog request to MDSes */ + for (i = 0; i < max; i++) { + s = sessions[i]; + if (s) { + send_flush_mdlog(s); + ceph_put_mds_session(s); + } + } + kfree(sessions); + } + dout("unsafe_request_wait %p wait on tid %llu %llu\n", inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); if (req1) { @@ -3008,7 +3079,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) - drop_inode_snap_realm(ci); + ceph_change_snap_realm(inode, NULL); } } if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { @@ -3114,7 +3185,16 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, break; } } - BUG_ON(!found); + + if (!found) { + /* + * The capsnap should already be removed when removing + * auth cap in the case of a forced unmount. 
+ */ + WARN_ON_ONCE(ci->i_auth_cap); + goto unlock; + } + capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = true; @@ -3136,6 +3216,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, complete_capsnap ? " (complete capsnap)" : ""); } +unlock: spin_unlock(&ci->i_ceph_lock); if (last) { @@ -3606,6 +3687,43 @@ out: iput(inode); } +void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + bool ret; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci); + + list_del_init(&capsnap->ci_item); + ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); + if (wake_ci) + *wake_ci = ret; + + spin_lock(&mdsc->cap_dirty_lock); + if (list_empty(&ci->i_cap_flush_list)) + list_del_init(&ci->i_flushing_item); + + ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); + if (wake_mdsc) + *wake_mdsc = ret; + spin_unlock(&mdsc->cap_dirty_lock); +} + +void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + lockdep_assert_held(&ci->i_ceph_lock); + + WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); + __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); +} + /* * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can * throw away our cap_snap. 
@@ -3643,23 +3761,10 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, capsnap, capsnap->follows); } } - if (flushed) { - WARN_ON(capsnap->dirty_pages || capsnap->writing); - dout(" removing %p cap_snap %p follows %lld\n", - inode, capsnap, follows); - list_del(&capsnap->ci_item); - wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); - - spin_lock(&mdsc->cap_dirty_lock); - - if (list_empty(&ci->i_cap_flush_list)) - list_del_init(&ci->i_flushing_item); - - wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, - &capsnap->cap_flush); - spin_unlock(&mdsc->cap_dirty_lock); - } + if (flushed) + ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); + if (flushed) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); @@ -3743,7 +3848,7 @@ retry: goto out_unlock; if (target < 0) { - __ceph_remove_cap(cap, false); + ceph_remove_cap(cap, false); goto out_unlock; } @@ -3778,7 +3883,7 @@ retry: change_auth_cap_ses(ci, tcap->session); } } - __ceph_remove_cap(cap, false); + ceph_remove_cap(cap, false); goto out_unlock; } else if (tsession) { /* add placeholder for the export tagert */ @@ -3795,7 +3900,7 @@ retry: spin_unlock(&mdsc->cap_dirty_lock); } - __ceph_remove_cap(cap, false); + ceph_remove_cap(cap, false); goto out_unlock; } @@ -3906,7 +4011,7 @@ retry: ocap->mseq, mds, le32_to_cpu(ph->seq), le32_to_cpu(ph->mseq)); } - __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); + ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } *old_issued = issued; @@ -4134,8 +4239,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, done: mutex_unlock(&session->s_mutex); done_unlocked: - ceph_put_string(extra_info.pool_ns); iput(inode); +out: + ceph_put_string(extra_info.pool_ns); return; flush_cap_releases: @@ -4150,7 +4256,7 @@ flush_cap_releases: bad: pr_err("ceph_handle_caps: corrupt message\n"); ceph_msg_dump(msg); - return; + goto out; } /* @@ -4225,33 +4331,9 @@ static void 
flush_dirty_session_caps(struct ceph_mds_session *s) dout("flush_dirty_caps done\n"); } -static void iterate_sessions(struct ceph_mds_client *mdsc, - void (*cb)(struct ceph_mds_session *)) -{ - int mds; - - mutex_lock(&mdsc->mutex); - for (mds = 0; mds < mdsc->max_sessions; ++mds) { - struct ceph_mds_session *s; - - if (!mdsc->sessions[mds]) - continue; - - s = ceph_get_mds_session(mdsc->sessions[mds]); - if (!s) - continue; - - mutex_unlock(&mdsc->mutex); - cb(s); - ceph_put_mds_session(s); - mutex_lock(&mdsc->mutex); - } - mutex_unlock(&mdsc->mutex); -} - void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { - iterate_sessions(mdsc, flush_dirty_session_caps); + ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); } void __ceph_touch_fmode(struct ceph_inode_info *ci, diff --git a/fs/ceph/file.c b/fs/ceph/file.c index e1d605a02d4a..d16fd2d5fd42 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1722,32 +1722,26 @@ retry_snap: goto out; } - err = file_remove_privs(file); - if (err) + down_read(&osdc->lock); + map_flags = osdc->osdmap->flags; + pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); + up_read(&osdc->lock); + if ((map_flags & CEPH_OSDMAP_FULL) || + (pool_flags & CEPH_POOL_FLAG_FULL)) { + err = -ENOSPC; goto out; + } - err = file_update_time(file); + err = file_remove_privs(file); if (err) goto out; - inode_inc_iversion_raw(inode); - if (ci->i_inline_version != CEPH_INLINE_NONE) { err = ceph_uninline_data(file, NULL); if (err < 0) goto out; } - down_read(&osdc->lock); - map_flags = osdc->osdmap->flags; - pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id); - up_read(&osdc->lock); - if ((map_flags & CEPH_OSDMAP_FULL) || - (pool_flags & CEPH_POOL_FLAG_FULL)) { - err = -ENOSPC; - goto out; - } - dout("aio_write %p %llx.%llx %llu~%zd getting caps. 
i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (fi->fmode & CEPH_FILE_MODE_LAZY) @@ -1759,6 +1753,12 @@ retry_snap: if (err < 0) goto out; + err = file_update_time(file); + if (err) + goto out_caps; + + inode_inc_iversion_raw(inode); + dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n", inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); @@ -1842,6 +1842,8 @@ retry_snap: } goto out_unlocked; +out_caps: + ceph_put_cap_refs(ci, got); out: if (direct_lock) ceph_end_io_direct(inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1bd2cc015913..2df1e1284451 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -581,16 +581,9 @@ void ceph_evict_inode(struct inode *inode) */ if (ci->i_snap_realm) { if (ceph_snap(inode) == CEPH_NOSNAP) { - struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", - realm); - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm = NULL; - if (realm->ino == ci->i_vino.ino) - realm->inode = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); + ci->i_snap_realm); + ceph_change_snap_realm(inode, NULL); } else { ceph_put_snapid_map(mdsc, ci->i_snapid_map); ci->i_snap_realm = NULL; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0b69aec23e5c..7cad180d6deb 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/bits.h> #include <linux/ktime.h> +#include <linux/bitmap.h> #include "super.h" #include "mds_client.h" @@ -652,14 +653,9 @@ const char *ceph_session_state_name(int s) struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) { - if (refcount_inc_not_zero(&s->s_ref)) { - dout("mdsc get_session %p %d -> %d\n", s, - refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref)); + if (refcount_inc_not_zero(&s->s_ref)) return s; - } else { - dout("mdsc get_session %p 0 -- 
FAIL\n", s); - return NULL; - } + return NULL; } void ceph_put_mds_session(struct ceph_mds_session *s) @@ -667,8 +663,6 @@ void ceph_put_mds_session(struct ceph_mds_session *s) if (IS_ERR_OR_NULL(s)) return; - dout("mdsc put_session %p %d -> %d\n", s, - refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1); if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); @@ -743,8 +737,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_mdsc = mdsc; s->s_mds = mds; s->s_state = CEPH_MDS_SESSION_NEW; - s->s_ttl = 0; - s->s_seq = 0; mutex_init(&s->s_mutex); ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr); @@ -753,17 +745,11 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_ttl = jiffies - 1; spin_lock_init(&s->s_cap_lock); - s->s_renew_requested = 0; - s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); - s->s_nr_caps = 0; refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); xa_init(&s->s_delegated_inos); - s->s_num_cap_releases = 0; - s->s_cap_reconnect = 0; - s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); INIT_WORK(&s->s_cap_release_work, ceph_cap_release_work); @@ -811,6 +797,33 @@ static void put_request_session(struct ceph_mds_request *req) } } +void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *), + bool check_state) +{ + int mds; + + mutex_lock(&mdsc->mutex); + for (mds = 0; mds < mdsc->max_sessions; ++mds) { + struct ceph_mds_session *s; + + s = __ceph_lookup_mds_session(mdsc, mds); + if (!s) + continue; + + if (check_state && !check_session_state(s)) { + ceph_put_mds_session(s); + continue; + } + + mutex_unlock(&mdsc->mutex); + cb(s); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); +} + void ceph_mdsc_release_request(struct kref *kref) { struct ceph_mds_request *req = 
container_of(kref, @@ -1155,7 +1168,7 @@ random: /* * session messages */ -static struct ceph_msg *create_session_msg(u32 op, u64 seq) +struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq) { struct ceph_msg *msg; struct ceph_mds_session_head *h; @@ -1163,7 +1176,8 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS, false); if (!msg) { - pr_err("create_session_msg ENOMEM creating msg\n"); + pr_err("ENOMEM creating session %s msg\n", + ceph_session_op_name(op)); return NULL; } h = msg->front.iov_base; @@ -1294,7 +1308,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); if (!msg) { - pr_err("create_session_msg ENOMEM creating msg\n"); + pr_err("ENOMEM creating session open msg\n"); return ERR_PTR(-ENOMEM); } p = msg->front.iov_base; @@ -1583,14 +1597,39 @@ out: return ret; } +static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_snap *capsnap; + int capsnap_release = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode); + + while (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + __ceph_remove_capsnap(inode, capsnap, NULL, NULL); + ceph_put_snap_context(capsnap->context); + ceph_put_cap_snap(capsnap); + capsnap_release++; + } + wake_up_all(&ci->i_cap_wq); + wake_up_all(&mdsc->cap_flushing_wq); + return capsnap_release; +} + static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; + struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); bool dirty_dropped = false; bool invalidate = false; + int 
capsnap_release = 0; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); @@ -1598,7 +1637,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { struct ceph_cap_flush *cf; - struct ceph_mds_client *mdsc = fsc->mdsc; if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { if (inode->i_data.nrpages > 0) @@ -1662,6 +1700,9 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } + + if (!list_empty(&ci->i_cap_snaps)) + capsnap_release = remove_capsnaps(mdsc, inode); } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -1678,6 +1719,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ceph_queue_invalidate(inode); if (dirty_dropped) iput(inode); + while (capsnap_release--) + iput(inode); return 0; } @@ -1803,8 +1846,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc, dout("send_renew_caps to mds%d (%s)\n", session->s_mds, ceph_mds_state_name(state)); - msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, - ++session->s_renew_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS, + ++session->s_renew_seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); @@ -1818,7 +1861,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc, dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", session->s_mds, ceph_session_state_name(session->s_state), seq); - msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); + msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); @@ -1870,7 +1913,8 @@ static int request_close_session(struct ceph_mds_session *session) dout("request_close_session mds%d state %s seq %lld\n", session->s_mds, ceph_session_state_name(session->s_state), session->s_seq); - msg = 
create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE, + session->s_seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); @@ -1965,7 +2009,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if (oissued) { /* we aren't the only cap.. just remove us */ - __ceph_remove_cap(cap, true); + ceph_remove_cap(cap, true); (*remaining)--; } else { struct dentry *dentry; @@ -4150,13 +4194,21 @@ static void check_new_map(struct ceph_mds_client *mdsc, struct ceph_mdsmap *newmap, struct ceph_mdsmap *oldmap) { - int i; + int i, j, err; int oldstate, newstate; struct ceph_mds_session *s; + unsigned long targets[DIV_ROUND_UP(CEPH_MAX_MDS, sizeof(unsigned long))] = {0}; dout("check_new_map new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); + if (newmap->m_info) { + for (i = 0; i < newmap->possible_max_rank; i++) { + for (j = 0; j < newmap->m_info[i].num_export_targets; j++) + set_bit(newmap->m_info[i].export_targets[j], targets); + } + } + for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { if (!mdsc->sessions[i]) continue; @@ -4210,6 +4262,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, if (s->s_state == CEPH_MDS_SESSION_RESTARTING && newstate >= CEPH_MDS_STATE_RECONNECT) { mutex_unlock(&mdsc->mutex); + clear_bit(i, targets); send_mds_reconnect(mdsc, s); mutex_lock(&mdsc->mutex); } @@ -4232,6 +4285,51 @@ static void check_new_map(struct ceph_mds_client *mdsc, } } + /* + * Only open and reconnect sessions that don't exist yet. + */ + for (i = 0; i < newmap->possible_max_rank; i++) { + /* + * In case the import MDS is crashed just after + * the EImportStart journal is flushed, so when + * a standby MDS takes over it and is replaying + * the EImportStart journal the new MDS daemon + * will wait the client to reconnect it, but the + * client may never register/open the session yet. 
+ * + * Will try to reconnect that MDS daemon if the + * rank number is in the export targets array and + * is the up:reconnect state. + */ + newstate = ceph_mdsmap_get_state(newmap, i); + if (!test_bit(i, targets) || newstate != CEPH_MDS_STATE_RECONNECT) + continue; + + /* + * The session maybe registered and opened by some + * requests which were choosing random MDSes during + * the mdsc->mutex's unlock/lock gap below in rare + * case. But the related MDS daemon will just queue + * that requests and be still waiting for the client's + * reconnection request in up:reconnect state. + */ + s = __ceph_lookup_mds_session(mdsc, i); + if (likely(!s)) { + s = __open_export_target_session(mdsc, i); + if (IS_ERR(s)) { + err = PTR_ERR(s); + pr_err("failed to open export target session, err %d\n", + err); + continue; + } + } + dout("send reconnect to export target mds.%d\n", i); + mutex_unlock(&mdsc->mutex); + send_mds_reconnect(mdsc, s); + ceph_put_mds_session(s); + mutex_lock(&mdsc->mutex); + } + for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) @@ -4409,24 +4507,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session, } /* - * lock unlock sessions, to wait ongoing session activities + * lock unlock the session, to wait ongoing session activities */ -static void lock_unlock_sessions(struct ceph_mds_client *mdsc) +static void lock_unlock_session(struct ceph_mds_session *s) { - int i; - - mutex_lock(&mdsc->mutex); - for (i = 0; i < mdsc->max_sessions; i++) { - struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); - if (!s) - continue; - mutex_unlock(&mdsc->mutex); - mutex_lock(&s->s_mutex); - mutex_unlock(&s->s_mutex); - ceph_put_mds_session(s); - mutex_lock(&mdsc->mutex); - } - mutex_unlock(&mdsc->mutex); + mutex_lock(&s->s_mutex); + mutex_unlock(&s->s_mutex); } static void maybe_recover_session(struct ceph_mds_client *mdsc) @@ -4448,6 +4534,8 @@ static void maybe_recover_session(struct 
ceph_mds_client *mdsc) bool check_session_state(struct ceph_mds_session *s) { + struct ceph_fs_client *fsc = s->s_mdsc->fsc; + switch (s->s_state) { case CEPH_MDS_SESSION_OPEN: if (s->s_ttl && time_after(jiffies, s->s_ttl)) { @@ -4456,8 +4544,9 @@ bool check_session_state(struct ceph_mds_session *s) } break; case CEPH_MDS_SESSION_CLOSING: - /* Should never reach this when we're unmounting */ - WARN_ON_ONCE(s->s_ttl); + /* Should never reach this when not force unmounting */ + WARN_ON_ONCE(s->s_ttl && + READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN); fallthrough; case CEPH_MDS_SESSION_NEW: case CEPH_MDS_SESSION_RESTARTING: @@ -4584,21 +4673,12 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_completion(&mdsc->safe_umount_waiters); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); - mdsc->sessions = NULL; - atomic_set(&mdsc->num_sessions, 0); - mdsc->max_sessions = 0; - mdsc->stopping = 0; - atomic64_set(&mdsc->quotarealms_count, 0); mdsc->quotarealms_inodes = RB_ROOT; mutex_init(&mdsc->quotarealms_inodes_mutex); - mdsc->last_snap_seq = 0; init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); - mdsc->num_snap_realms = 0; spin_lock_init(&mdsc->snap_empty_lock); - mdsc->last_tid = 0; - mdsc->oldest_tid = 0; mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; @@ -4610,11 +4690,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) mdsc->last_cap_flush_tid = 1; INIT_LIST_HEAD(&mdsc->cap_flush_list); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); - mdsc->num_cap_flushing = 0; spin_lock_init(&mdsc->cap_dirty_lock); init_waitqueue_head(&mdsc->cap_flushing_wq); INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work); - atomic_set(&mdsc->cap_reclaim_pending, 0); err = ceph_metric_init(&mdsc->metric); if (err) goto err_mdsmap; @@ -4676,6 +4754,30 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests 
done\n"); } +void send_flush_mdlog(struct ceph_mds_session *s) +{ + struct ceph_msg *msg; + + /* + * Pre-luminous MDS crashes when it sees an unknown session request + */ + if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS)) + return; + + mutex_lock(&s->s_mutex); + dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds, + ceph_session_state_name(s->s_state), s->s_seq); + msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG, + s->s_seq); + if (!msg) { + pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n", + s->s_mds, ceph_session_state_name(s->s_state), s->s_seq); + } else { + ceph_con_send(&s->s_con, msg); + } + mutex_unlock(&s->s_mutex); +} + /* * called before mount is ro, and before dentries are torn down. * (hmm, does this still race with new lookups?) @@ -4685,7 +4787,8 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc) dout("pre_umount\n"); mdsc->stopping = 1; - lock_unlock_sessions(mdsc); + ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true); + ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false); ceph_flush_dirty_caps(mdsc); wait_requests(mdsc); @@ -4912,7 +5015,6 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc) ceph_metric_destroy(&mdsc->metric); - flush_delayed_work(&mdsc->metric.delayed_work); fsc->mdsc = NULL; kfree(mdsc); dout("mdsc_destroy %p done\n", mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 20e42d8b66c6..97c7f7bfa55f 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -522,6 +522,11 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } +extern void send_flush_mdlog(struct ceph_mds_session *s); +extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc, + void (*cb)(struct ceph_mds_session *), + bool check_state); +extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq); extern void __ceph_queue_cap_release(struct ceph_mds_session *session, struct 
ceph_cap *cap); extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 3c444b9cb17b..61d67cbcb367 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -122,6 +122,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) int err; u8 mdsmap_v; u16 mdsmap_ev; + u32 target; m = kzalloc(sizeof(*m), GFP_NOFS); if (!m) @@ -260,9 +261,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) sizeof(u32), GFP_NOFS); if (!info->export_targets) goto nomem; - for (j = 0; j < num_export_targets; j++) - info->export_targets[j] = - ceph_decode_32(&pexport_targets); + for (j = 0; j < num_export_targets; j++) { + target = ceph_decode_32(&pexport_targets); + if (target >= m->possible_max_rank) { + err = -EIO; + goto corrupt; + } + info->export_targets[j] = target; + } } else { info->export_targets = NULL; } diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 5ac151eb0d49..04d5df29bbbf 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -302,6 +302,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m) if (!m) return; + cancel_delayed_work_sync(&m->delayed_work); + percpu_counter_destroy(&m->total_inodes); percpu_counter_destroy(&m->opened_inodes); percpu_counter_destroy(&m->i_caps_mis); @@ -309,8 +311,6 @@ void ceph_metric_destroy(struct ceph_client_metric *m) percpu_counter_destroy(&m->d_lease_mis); percpu_counter_destroy(&m->d_lease_hit); - cancel_delayed_work_sync(&m->delayed_work); - ceph_put_mds_session(m->session); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 15105f9da3fd..b41e6724c591 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -849,6 +849,43 @@ static void flush_snaps(struct ceph_mds_client *mdsc) dout("flush_snaps done\n"); } +/** + * ceph_change_snap_realm - change the snap_realm for an inode + * @inode: inode to move to new snap realm + * @realm: new realm to move inode into (may be NULL) + * + * Detach an inode from its old snaprealm 
(if any) and attach it to + * the new snaprealm (if any). The old snap realm reference held by + * the inode is put. If realm is non-NULL, then the caller's reference + * to it is taken over by the inode. + */ +void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_snap_realm *oldrealm = ci->i_snap_realm; + + lockdep_assert_held(&ci->i_ceph_lock); + + if (oldrealm) { + spin_lock(&oldrealm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + if (oldrealm->ino == ci->i_vino.ino) + oldrealm->inode = NULL; + spin_unlock(&oldrealm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, oldrealm); + } + + ci->i_snap_realm = realm; + + if (realm) { + spin_lock(&realm->inodes_with_caps_lock); + list_add(&ci->i_snap_realm_item, &realm->inodes_with_caps); + if (realm->ino == ci->i_vino.ino) + realm->inode = inode; + spin_unlock(&realm->inodes_with_caps_lock); + } +} /* * Handle a snap notification from the MDS. 
@@ -935,7 +972,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, }; struct inode *inode = ceph_find_inode(sb, vino); struct ceph_inode_info *ci; - struct ceph_snap_realm *oldrealm; if (!inode) continue; @@ -960,27 +996,10 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, } dout(" will move %p to split realm %llx %p\n", inode, realm->ino, realm); - /* - * Move the inode to the new realm - */ - oldrealm = ci->i_snap_realm; - spin_lock(&oldrealm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - spin_unlock(&oldrealm->inodes_with_caps_lock); - - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - if (realm->ino == ci->i_vino.ino) - realm->inode = inode; - spin_unlock(&realm->inodes_with_caps_lock); - - spin_unlock(&ci->i_ceph_lock); ceph_get_snap_realm(mdsc, realm); - ceph_put_snap_realm(mdsc, oldrealm); - + ceph_change_snap_realm(inode, realm); + spin_unlock(&ci->i_ceph_lock); iput(inode); continue; diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 4a79f3632260..573bb9556fb5 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -46,6 +46,7 @@ const char *ceph_session_op_name(int op) case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack"; case CEPH_SESSION_FORCE_RO: return "force_ro"; case CEPH_SESSION_REJECT: return "reject"; + case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog"; } return "???"; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b1a363641beb..a40eb14c282a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -418,7 +418,6 @@ struct ceph_inode_info { struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ }; - int i_snap_realm_counter; /* snap realm (if caps) */ struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; struct timespec64 i_btime; @@ -929,6 +928,7 @@ extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, extern int 
ceph_update_snap_trace(struct ceph_mds_client *m, void *p, void *e, bool deletion, struct ceph_snap_realm **realm_ret); +void ceph_change_snap_realm(struct inode *inode, struct ceph_snap_realm *realm); extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); @@ -1088,7 +1088,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); /* acl.c */ #ifdef CONFIG_CEPH_FS_POSIX_ACL -struct posix_acl *ceph_get_acl(struct inode *, int); +struct posix_acl *ceph_get_acl(struct inode *, int, bool); int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, @@ -1138,6 +1138,7 @@ extern void ceph_add_cap(struct inode *inode, unsigned cap, unsigned seq, u64 realmino, int flags, struct ceph_cap **new_cap); extern void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release); +extern void ceph_remove_cap(struct ceph_cap *cap, bool queue_release); extern void __ceph_remove_caps(struct ceph_inode_info *ci); extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); @@ -1163,6 +1164,12 @@ extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc); +extern void __ceph_remove_capsnap(struct inode *inode, + struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc); +extern void ceph_remove_capsnap(struct inode *inode, + struct ceph_cap_snap *capsnap, + bool *wake_ci, bool *wake_mdsc); extern void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession); extern bool __ceph_should_report_size(struct ceph_inode_info *ci); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 1242db8d3444..159a1ffa4f4b 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -340,6 +340,18 @@ static ssize_t ceph_vxattrcb_caps(struct ceph_inode_info *ci, char 
*val, ceph_cap_string(issued), issued); } +static ssize_t ceph_vxattrcb_auth_mds(struct ceph_inode_info *ci, + char *val, size_t size) +{ + int ret; + + spin_lock(&ci->i_ceph_lock); + ret = ceph_fmt_xattr(val, size, "%d", + ci->i_auth_cap ? ci->i_auth_cap->session->s_mds : -1); + spin_unlock(&ci->i_ceph_lock); + return ret; +} + #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name #define CEPH_XATTR_NAME2(_type, _name, _name2) \ XATTR_CEPH_PREFIX #_type "." #_name "." #_name2 @@ -473,6 +485,13 @@ static struct ceph_vxattr ceph_common_vxattrs[] = { .exists_cb = NULL, .flags = VXATTR_FLAG_READONLY, }, + { + .name = "ceph.auth_mds", + .name_size = sizeof("ceph.auth_mds"), + .getxattr_cb = ceph_vxattrcb_auth_mds, + .exists_cb = NULL, + .flags = VXATTR_FLAG_READONLY, + }, { .name = NULL, 0 } /* Required table terminator */ }; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index ac5e0c0e9181..1466b5d01cbb 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -45,7 +45,7 @@ static void configfs_d_iput(struct dentry * dentry, /* * Set sd->s_dentry to null only when this dentry is the one * that is going to be killed. Otherwise configfs_d_iput may - * run just after configfs_attach_attr and set sd->s_dentry to + * run just after configfs_lookup and set sd->s_dentry to * NULL even it's still in use. 
*/ if (sd->s_dentry == dentry) @@ -417,44 +417,16 @@ static void configfs_remove_dir(struct config_item * item) dput(dentry); } - -/* attaches attribute's configfs_dirent to the dentry corresponding to the - * attribute file - */ -static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * dentry) -{ - struct configfs_attribute * attr = sd->s_element; - struct inode *inode; - - spin_lock(&configfs_dirent_lock); - dentry->d_fsdata = configfs_get(sd); - sd->s_dentry = dentry; - spin_unlock(&configfs_dirent_lock); - - inode = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG); - if (IS_ERR(inode)) { - configfs_put(sd); - return PTR_ERR(inode); - } - if (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) { - inode->i_size = 0; - inode->i_fop = &configfs_bin_file_operations; - } else { - inode->i_size = PAGE_SIZE; - inode->i_fop = &configfs_file_operations; - } - d_add(dentry, inode); - return 0; -} - static struct dentry * configfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct configfs_dirent * parent_sd = dentry->d_parent->d_fsdata; struct configfs_dirent * sd; - int found = 0; - int err; + struct inode *inode = NULL; + + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); /* * Fake invisibility if dir belongs to a group/default groups hierarchy @@ -464,36 +436,39 @@ static struct dentry * configfs_lookup(struct inode *dir, * not complete their initialization, since the dentries of the * attributes won't be instantiated. 
*/ - err = -ENOENT; if (!configfs_dirent_is_ready(parent_sd)) - goto out; + return ERR_PTR(-ENOENT); + spin_lock(&configfs_dirent_lock); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { - if (sd->s_type & CONFIGFS_NOT_PINNED) { - const unsigned char * name = configfs_get_name(sd); + if ((sd->s_type & CONFIGFS_NOT_PINNED) && + !strcmp(configfs_get_name(sd), dentry->d_name.name)) { + struct configfs_attribute *attr = sd->s_element; + umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; - if (strcmp(name, dentry->d_name.name)) - continue; + dentry->d_fsdata = configfs_get(sd); + sd->s_dentry = dentry; + spin_unlock(&configfs_dirent_lock); - found = 1; - err = configfs_attach_attr(sd, dentry); - break; + inode = configfs_create(dentry, mode); + if (IS_ERR(inode)) { + configfs_put(sd); + return ERR_CAST(inode); + } + if (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) { + inode->i_size = 0; + inode->i_fop = &configfs_bin_file_operations; + } else { + inode->i_size = PAGE_SIZE; + inode->i_fop = &configfs_file_operations; + } + goto done; } } - - if (!found) { - /* - * If it doesn't exist and it isn't a NOT_PINNED item, - * it must be negative. - */ - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - d_add(dentry, NULL); - return NULL; - } - -out: - return ERR_PTR(err); + spin_unlock(&configfs_dirent_lock); +done: + d_add(dentry, inode); + return NULL; } /* diff --git a/fs/coredump.c b/fs/coredump.c index 07afb5ddb1c4..3224dee44d30 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -782,10 +782,17 @@ void do_coredump(const kernel_siginfo_t *siginfo) * filesystem. 
*/ mnt_userns = file_mnt_user_ns(cprm.file); - if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid())) + if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), + current_fsuid())) { + pr_info_ratelimited("Core dump to %s aborted: cannot preserve file owner\n", + cn.corename); goto close_fail; - if ((inode->i_mode & 0677) != 0600) + } + if ((inode->i_mode & 0677) != 0600) { + pr_info_ratelimited("Core dump to %s aborted: cannot preserve file permissions\n", + cn.corename); goto close_fail; + } if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; if (do_truncate(mnt_userns, cprm.file->f_path.dentry, @@ -1127,8 +1134,10 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, mmap_write_unlock(mm); - if (WARN_ON(i != *vma_count)) + if (WARN_ON(i != *vma_count)) { + kvfree(*vma_meta); return -EFAULT; + } *vma_data_size_ptr = vma_data_size; return 0; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index d00455440d08..eb538c28df94 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -26,7 +26,7 @@ * it to find the directory entry again if requested. Naively, that would just * mean using the ciphertext filenames. However, since the ciphertext filenames * can contain illegal characters ('\0' and '/'), they must be encoded in some - * way. We use base64. But that can cause names to exceed NAME_MAX (255 + * way. We use base64url. But that can cause names to exceed NAME_MAX (255 * bytes), so we also need to use a strong hash to abbreviate long names. * * The filesystem may also need another kind of hash, the "dirhash", to quickly @@ -38,7 +38,7 @@ * casefolded directories use this type of dirhash. At least in these cases, * each no-key name must include the name's dirhash too. * - * To meet all these requirements, we base64-encode the following + * To meet all these requirements, we base64url-encode the following * variable-length structure. 
It contains the dirhash, or 0's if the filesystem * didn't provide one; up to 149 bytes of the ciphertext name; and for * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes. @@ -52,15 +52,19 @@ struct fscrypt_nokey_name { u32 dirhash[2]; u8 bytes[149]; u8 sha256[SHA256_DIGEST_SIZE]; -}; /* 189 bytes => 252 bytes base64-encoded, which is <= NAME_MAX (255) */ +}; /* 189 bytes => 252 bytes base64url-encoded, which is <= NAME_MAX (255) */ /* - * Decoded size of max-size nokey name, i.e. a name that was abbreviated using + * Decoded size of max-size no-key name, i.e. a name that was abbreviated using * the strong hash and thus includes the 'sha256' field. This isn't simply * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included. */ #define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) +/* Encoded size of max-size no-key name */ +#define FSCRYPT_NOKEY_NAME_MAX_ENCODED \ + FSCRYPT_BASE64URL_CHARS(FSCRYPT_NOKEY_NAME_MAX) + static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -175,62 +179,82 @@ static int fname_decrypt(const struct inode *inode, return 0; } -static const char lookup_table[65] = - "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +static const char base64url_table[65] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; -#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) +#define FSCRYPT_BASE64URL_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) /** - * base64_encode() - base64-encode some bytes - * @src: the bytes to encode - * @len: number of bytes to encode - * @dst: (output) the base64-encoded string. Not NUL-terminated. + * fscrypt_base64url_encode() - base64url-encode some binary data + * @src: the binary data to encode + * @srclen: the length of @src in bytes + * @dst: (output) the base64url-encoded string. Not NUL-terminated. 
* - * Encodes the input string using characters from the set [A-Za-z0-9+,]. - * The encoded string is roughly 4/3 times the size of the input string. + * Encodes data using base64url encoding, i.e. the "Base 64 Encoding with URL + * and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't used, + * as it's unneeded and not required by the RFC. base64url is used instead of + * base64 to avoid the '/' character, which isn't allowed in filenames. * - * Return: length of the encoded string + * Return: the length of the resulting base64url-encoded string in bytes. + * This will be equal to FSCRYPT_BASE64URL_CHARS(srclen). */ -static int base64_encode(const u8 *src, int len, char *dst) +static int fscrypt_base64url_encode(const u8 *src, int srclen, char *dst) { - int i, bits = 0, ac = 0; + u32 ac = 0; + int bits = 0; + int i; char *cp = dst; - for (i = 0; i < len; i++) { - ac += src[i] << bits; + for (i = 0; i < srclen; i++) { + ac = (ac << 8) | src[i]; bits += 8; do { - *cp++ = lookup_table[ac & 0x3f]; - ac >>= 6; bits -= 6; + *cp++ = base64url_table[(ac >> bits) & 0x3f]; } while (bits >= 6); } if (bits) - *cp++ = lookup_table[ac & 0x3f]; + *cp++ = base64url_table[(ac << (6 - bits)) & 0x3f]; return cp - dst; } -static int base64_decode(const char *src, int len, u8 *dst) +/** + * fscrypt_base64url_decode() - base64url-decode a string + * @src: the string to decode. Doesn't need to be NUL-terminated. + * @srclen: the length of @src in bytes + * @dst: (output) the decoded binary data + * + * Decodes a string using base64url encoding, i.e. the "Base 64 Encoding with + * URL and Filename Safe Alphabet" specified by RFC 4648. '='-padding isn't + * accepted, nor are non-encoding characters such as whitespace. + * + * This implementation hasn't been optimized for performance. + * + * Return: the length of the resulting decoded binary data in bytes, + * or -1 if the string isn't a valid base64url string. 
+ */ +static int fscrypt_base64url_decode(const char *src, int srclen, u8 *dst) { - int i, bits = 0, ac = 0; - const char *p; - u8 *cp = dst; + u32 ac = 0; + int bits = 0; + int i; + u8 *bp = dst; + + for (i = 0; i < srclen; i++) { + const char *p = strchr(base64url_table, src[i]); - for (i = 0; i < len; i++) { - p = strchr(lookup_table, src[i]); if (p == NULL || src[i] == 0) - return -2; - ac += (p - lookup_table) << bits; + return -1; + ac = (ac << 6) | (p - base64url_table); bits += 6; if (bits >= 8) { - *cp++ = ac & 0xff; - ac >>= 8; bits -= 8; + *bp++ = (u8)(ac >> bits); } } - if (ac) + if (ac & ((1 << bits) - 1)) return -1; - return cp - dst; + return bp - dst; } bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, @@ -263,10 +287,8 @@ bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, int fscrypt_fname_alloc_buffer(u32 max_encrypted_len, struct fscrypt_str *crypto_str) { - const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX); - u32 max_presented_len; - - max_presented_len = max(max_encoded_len, max_encrypted_len); + u32 max_presented_len = max_t(u32, FSCRYPT_NOKEY_NAME_MAX_ENCODED, + max_encrypted_len); crypto_str->name = kmalloc(max_presented_len + 1, GFP_NOFS); if (!crypto_str->name) @@ -342,7 +364,7 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, offsetof(struct fscrypt_nokey_name, bytes)); BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) != offsetof(struct fscrypt_nokey_name, sha256)); - BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX); + BUILD_BUG_ON(FSCRYPT_NOKEY_NAME_MAX_ENCODED > NAME_MAX); nokey_name.dirhash[0] = hash; nokey_name.dirhash[1] = minor_hash; @@ -358,7 +380,8 @@ int fscrypt_fname_disk_to_usr(const struct inode *inode, nokey_name.sha256); size = FSCRYPT_NOKEY_NAME_MAX; } - oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name); + oname->len = fscrypt_base64url_encode((const u8 *)&nokey_name, size, + oname->name); return 0; } 
EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -432,14 +455,15 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * user-supplied name */ - if (iname->len > BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX)) + if (iname->len > FSCRYPT_NOKEY_NAME_MAX_ENCODED) return -ENOENT; fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = base64_decode(iname->name, iname->len, fname->crypto_buf.name); + ret = fscrypt_base64url_decode(iname->name, iname->len, + fname->crypto_buf.name); if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) || (ret > offsetof(struct fscrypt_nokey_name, sha256) && ret != FSCRYPT_NOKEY_NAME_MAX)) { diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index a73b0376e6f3..af74599ae1cf 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -384,3 +384,47 @@ err_kfree: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(fscrypt_get_symlink); + +/** + * fscrypt_symlink_getattr() - set the correct st_size for encrypted symlinks + * @path: the path for the encrypted symlink being queried + * @stat: the struct being filled with the symlink's attributes + * + * Override st_size of encrypted symlinks to be the length of the decrypted + * symlink target (or the no-key encoded symlink target, if the key is + * unavailable) rather than the length of the encrypted symlink target. This is + * necessary for st_size to match the symlink target that userspace actually + * sees. POSIX requires this, and some userspace programs depend on it. + * + * This requires reading the symlink target from disk if needed, setting up the + * inode's encryption key if possible, and then decrypting or encoding the + * symlink target. This makes lstat() more heavyweight than is normally the + * case. 
However, decrypted symlink targets will be cached in ->i_link, so + * usually the symlink won't have to be read and decrypted again later if/when + * it is actually followed, readlink() is called, or lstat() is called again. + * + * Return: 0 on success, -errno on failure + */ +int fscrypt_symlink_getattr(const struct path *path, struct kstat *stat) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = d_inode(dentry); + const char *link; + DEFINE_DELAYED_CALL(done); + + /* + * To get the symlink target that userspace will see (whether it's the + * decrypted target or the no-key encoded target), we can just get it in + * the same way the VFS does during path resolution and readlink(). + */ + link = READ_ONCE(inode->i_link); + if (!link) { + link = inode->i_op->get_link(dentry, inode, &done); + if (IS_ERR(link)) + return PTR_ERR(link); + } + stat->size = strlen(link); + do_delayed_call(&done); + return 0; +} +EXPORT_SYMBOL_GPL(fscrypt_symlink_getattr); diff --git a/fs/d_path.c b/fs/d_path.c index 23a53f7b5c71..cd60c7535181 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -22,13 +22,57 @@ static char *extract_string(struct prepend_buffer *p) return ERR_PTR(-ENAMETOOLONG); } -static void prepend(struct prepend_buffer *p, const char *str, int namelen) +static bool prepend_char(struct prepend_buffer *p, unsigned char c) { - p->len -= namelen; - if (likely(p->len >= 0)) { - p->buf -= namelen; - memcpy(p->buf, str, namelen); + if (likely(p->len > 0)) { + p->len--; + *--p->buf = c; + return true; + } + p->len = -1; + return false; +} + +/* + * The source of the prepend data can be an optimistoc load + * of a dentry name and length. And because we don't hold any + * locks, the length and the pointer to the name may not be + * in sync if a concurrent rename happens, and the kernel + * copy might fault as a result. + * + * The end result will correct itself when we check the + * rename sequence count, but we need to be able to handle + * the fault gracefully. 
+ */ +static bool prepend_copy(void *dst, const void *src, int len) +{ + if (unlikely(copy_from_kernel_nofault(dst, src, len))) { + memset(dst, 'x', len); + return false; } + return true; +} + +static bool prepend(struct prepend_buffer *p, const char *str, int namelen) +{ + // Already overflowed? + if (p->len < 0) + return false; + + // Will overflow? + if (p->len < namelen) { + // Fill as much as possible from the end of the name + str += namelen - p->len; + p->buf -= p->len; + prepend_copy(p->buf, str, p->len); + p->len = -1; + return false; + } + + // Fits fully + p->len -= namelen; + p->buf -= namelen; + return prepend_copy(p->buf, str, namelen); } /** @@ -40,32 +84,21 @@ static void prepend(struct prepend_buffer *p, const char *str, int namelen) * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are * fetched. However, there may be mismatch between length and pointer. - * The length cannot be trusted, we need to copy it byte-by-byte until - * the length is reached or a null byte is found. It also prepends "/" at + * But since the length cannot be trusted, we need to copy the name very + * carefully when doing the prepend_copy(). It also prepends "/" at * the beginning of the name. The sequence number check at the caller will * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. * - * Load acquire is needed to make sure that we see that terminating NUL. + * Load acquire is needed to make sure that we see the new name data even + * if we might get the length wrong. 
*/ static bool prepend_name(struct prepend_buffer *p, const struct qstr *name) { const char *dname = smp_load_acquire(&name->name); /* ^^^ */ u32 dlen = READ_ONCE(name->len); - char *s; - p->len -= dlen + 1; - if (unlikely(p->len < 0)) - return false; - s = p->buf -= dlen + 1; - *s++ = '/'; - while (dlen--) { - char c = *dname++; - if (!c) - break; - *s++ = c; - } - return true; + return prepend(p, dname, dlen) && prepend_char(p, '/'); } static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, @@ -158,7 +191,7 @@ restart: b = *p; if (b.len == p->len) - prepend(&b, "/", 1); + prepend_char(&b, '/'); *p = b; return error; @@ -186,7 +219,7 @@ char *__d_path(const struct path *path, { DECLARE_BUFFER(b, buf, buflen); - prepend(&b, "", 1); + prepend_char(&b, 0); if (unlikely(prepend_path(path, root, &b) > 0)) return NULL; return extract_string(&b); @@ -198,7 +231,7 @@ char *d_absolute_path(const struct path *path, struct path root = {}; DECLARE_BUFFER(b, buf, buflen); - prepend(&b, "", 1); + prepend_char(&b, 0); if (unlikely(prepend_path(path, &root, &b) > 1)) return ERR_PTR(-EINVAL); return extract_string(&b); @@ -255,7 +288,7 @@ char *d_path(const struct path *path, char *buf, int buflen) if (unlikely(d_unlinked(path->dentry))) prepend(&b, " (deleted)", 11); else - prepend(&b, "", 1); + prepend_char(&b, 0); prepend_path(path, &root, &b); rcu_read_unlock(); @@ -290,7 +323,7 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) /* these dentries are never renamed, so d_lock is not needed */ prepend(&b, " (deleted)", 11); prepend(&b, dentry->d_name.name, dentry->d_name.len); - prepend(&b, "/", 1); + prepend_char(&b, '/'); return extract_string(&b); } @@ -324,7 +357,7 @@ restart: } done_seqretry(&rename_lock, seq); if (b.len == p->len) - prepend(&b, "/", 1); + prepend_char(&b, '/'); return extract_string(&b); } @@ -332,7 +365,7 @@ char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { DECLARE_BUFFER(b, buf, 
buflen); - prepend(&b, "", 1); + prepend_char(&b, 0); return __dentry_path(dentry, &b); } EXPORT_SYMBOL(dentry_path_raw); @@ -344,7 +377,7 @@ char *dentry_path(const struct dentry *dentry, char *buf, int buflen) if (unlikely(d_unlinked(dentry))) prepend(&b, "//deleted", 10); else - prepend(&b, "", 1); + prepend_char(&b, 0); return __dentry_path(dentry, &b); } @@ -397,7 +430,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size) unsigned len; DECLARE_BUFFER(b, page, PATH_MAX); - prepend(&b, "", 1); + prepend_char(&b, 0); if (unlikely(prepend_path(&pwd, &root, &b) > 0)) prepend(&b, "(unreachable)", 13); rcu_read_unlock(); @@ -1005,12 +1005,12 @@ int dax_writeback_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) +static sector_t dax_iomap_sector(const struct iomap *iomap, loff_t pos) { return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; } -static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, +static int dax_iomap_pfn(const struct iomap *iomap, loff_t pos, size_t size, pfn_t *pfnp) { const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1066,6 +1066,66 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, return ret; } +#ifdef CONFIG_FS_DAX_PMD +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + const struct iomap *iomap, void **entry) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + unsigned long pmd_addr = vmf->address & PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = mapping->host; + pgtable_t pgtable = NULL; + struct page *zero_page; + spinlock_t *ptl; + pmd_t pmd_entry; + pfn_t pfn; + + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); + + if (unlikely(!zero_page)) + goto fallback; + + pfn = page_to_pfn_t(zero_page); + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, + DAX_PMD | DAX_ZERO_PAGE, false); + + if 
(arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm); + if (!pgtable) + return VM_FAULT_OOM; + } + + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (!pmd_none(*(vmf->pmd))) { + spin_unlock(ptl); + goto fallback; + } + + if (pgtable) { + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + mm_inc_nr_ptes(vma->vm_mm); + } + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); + pmd_entry = pmd_mkhuge(pmd_entry); + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); + spin_unlock(ptl); + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); + return VM_FAULT_NOPAGE; + +fallback: + if (pgtable) + pte_free(vma->vm_mm, pgtable); + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); + return VM_FAULT_FALLBACK; +} +#else +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, + const struct iomap *iomap, void **entry) +{ + return VM_FAULT_FALLBACK; +} +#endif /* CONFIG_FS_DAX_PMD */ + s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) { sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); @@ -1103,20 +1163,21 @@ s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) return size; } -static loff_t -dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t dax_iomap_iter(const struct iomap_iter *iomi, + struct iov_iter *iter) { + const struct iomap *iomap = &iomi->iomap; + loff_t length = iomap_length(iomi); + loff_t pos = iomi->pos; struct block_device *bdev = iomap->bdev; struct dax_device *dax_dev = iomap->dax_dev; - struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; size_t xfer; int id; if (iov_iter_rw(iter) == READ) { - end = min(end, i_size_read(inode)); + end = min(end, i_size_read(iomi->inode)); if (pos >= end) return 0; @@ -1133,7 +1194,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * written by write(2) is visible in mmap. 
*/ if (iomap->flags & IOMAP_F_NEW) { - invalidate_inode_pages2_range(inode->i_mapping, + invalidate_inode_pages2_range(iomi->inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); } @@ -1209,31 +1270,29 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = iocb->ki_pos, ret = 0, done = 0; - unsigned flags = 0; + struct iomap_iter iomi = { + .inode = iocb->ki_filp->f_mapping->host, + .pos = iocb->ki_pos, + .len = iov_iter_count(iter), + }; + loff_t done = 0; + int ret; if (iov_iter_rw(iter) == WRITE) { - lockdep_assert_held_write(&inode->i_rwsem); - flags |= IOMAP_WRITE; + lockdep_assert_held_write(&iomi.inode->i_rwsem); + iomi.flags |= IOMAP_WRITE; } else { - lockdep_assert_held(&inode->i_rwsem); + lockdep_assert_held(&iomi.inode->i_rwsem); } if (iocb->ki_flags & IOCB_NOWAIT) - flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_NOWAIT; - while (iov_iter_count(iter)) { - ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, - iter, dax_iomap_actor); - if (ret <= 0) - break; - pos += ret; - done += ret; - } + while ((ret = iomap_iter(&iomi, ops)) > 0) + iomi.processed = dax_iomap_iter(&iomi, iter); - iocb->ki_pos += done; + done = iomi.pos - iocb->ki_pos; + iocb->ki_pos = iomi.pos; return done ? done : ret; } EXPORT_SYMBOL_GPL(dax_iomap_rw); @@ -1250,44 +1309,146 @@ static vm_fault_t dax_fault_return(int error) * flushed on write-faults (non-cow), but not read-faults. */ static bool dax_fault_is_synchronous(unsigned long flags, - struct vm_area_struct *vma, struct iomap *iomap) + struct vm_area_struct *vma, const struct iomap *iomap) { return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) && (iomap->flags & IOMAP_F_DIRTY); } +/* + * When handling a synchronous page fault and the inode need a fsync, we can + * insert the PTE/PMD into page tables only after that fsync happened. 
Skip + * insertion for now and return the pfn so that caller can insert it after the + * fsync is done. + */ +static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn) +{ + if (WARN_ON_ONCE(!pfnp)) + return VM_FAULT_SIGBUS; + *pfnp = pfn; + return VM_FAULT_NEEDDSYNC; +} + +static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf, + const struct iomap_iter *iter) +{ + sector_t sector = dax_iomap_sector(&iter->iomap, iter->pos); + unsigned long vaddr = vmf->address; + vm_fault_t ret; + int error = 0; + + switch (iter->iomap.type) { + case IOMAP_HOLE: + case IOMAP_UNWRITTEN: + clear_user_highpage(vmf->cow_page, vaddr); + break; + case IOMAP_MAPPED: + error = copy_cow_page_dax(iter->iomap.bdev, iter->iomap.dax_dev, + sector, vmf->cow_page, vaddr); + break; + default: + WARN_ON_ONCE(1); + error = -EIO; + break; + } + + if (error) + return dax_fault_return(error); + + __SetPageUptodate(vmf->cow_page); + ret = finish_fault(vmf); + if (!ret) + return VM_FAULT_DONE_COW; + return ret; +} + +/** + * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault. + * @vmf: vm fault instance + * @iter: iomap iter + * @pfnp: pfn to be returned + * @xas: the dax mapping tree of a file + * @entry: an unlocked dax entry to be inserted + * @pmd: distinguish whether it is a pmd fault + */ +static vm_fault_t dax_fault_iter(struct vm_fault *vmf, + const struct iomap_iter *iter, pfn_t *pfnp, + struct xa_state *xas, void **entry, bool pmd) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + const struct iomap *iomap = &iter->iomap; + size_t size = pmd ? PMD_SIZE : PAGE_SIZE; + loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; + bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync = dax_fault_is_synchronous(iter->flags, vmf->vma, iomap); + unsigned long entry_flags = pmd ? DAX_PMD : 0; + int err = 0; + pfn_t pfn; + + if (!pmd && vmf->cow_page) + return dax_fault_cow_page(vmf, iter); + + /* if we are reading UNWRITTEN and HOLE, return a hole. 
*/ + if (!write && + (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) { + if (!pmd) + return dax_load_hole(xas, mapping, entry, vmf); + return dax_pmd_load_hole(xas, vmf, iomap, entry); + } + + if (iomap->type != IOMAP_MAPPED) { + WARN_ON_ONCE(1); + return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS; + } + + err = dax_iomap_pfn(&iter->iomap, pos, size, &pfn); + if (err) + return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err); + + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, entry_flags, + write && !sync); + + if (sync) + return dax_fault_synchronous_pfnp(pfnp, pfn); + + /* insert PMD pfn */ + if (pmd) + return vmf_insert_pfn_pmd(vmf, pfn, write); + + /* insert PTE pfn */ + if (write) + return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); +} + static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE(xas, &mapping->i_pages, vmf->pgoff); - struct inode *inode = mapping->host; - unsigned long vaddr = vmf->address; - loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; - unsigned flags = IOMAP_FAULT; - int error, major = 0; - bool write = vmf->flags & FAULT_FLAG_WRITE; - bool sync; + struct iomap_iter iter = { + .inode = mapping->host, + .pos = (loff_t)vmf->pgoff << PAGE_SHIFT, + .len = PAGE_SIZE, + .flags = IOMAP_FAULT, + }; vm_fault_t ret = 0; void *entry; - pfn_t pfn; + int error; - trace_dax_pte_fault(inode, vmf, ret); + trace_dax_pte_fault(iter.inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. 
*/ - if (pos >= i_size_read(inode)) { + if (iter.pos >= i_size_read(iter.inode)) { ret = VM_FAULT_SIGBUS; goto out; } - if (write && !vmf->cow_page) - flags |= IOMAP_WRITE; + if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + iter.flags |= IOMAP_WRITE; entry = grab_mapping_entry(&xas, mapping, 0); if (xa_is_internal(entry)) { @@ -1306,234 +1467,103 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto unlock_entry; } - /* - * Note that we don't bother to use iomap_apply here: DAX required - * the file system block size to be equal the page size, which means - * that we never have to deal with more than a single extent here. - */ - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); - if (iomap_errp) - *iomap_errp = error; - if (error) { - ret = dax_fault_return(error); - goto unlock_entry; - } - if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - error = -EIO; /* fs corruption? */ - goto error_finish_iomap; - } - - if (vmf->cow_page) { - sector_t sector = dax_iomap_sector(&iomap, pos); - - switch (iomap.type) { - case IOMAP_HOLE: - case IOMAP_UNWRITTEN: - clear_user_highpage(vmf->cow_page, vaddr); - break; - case IOMAP_MAPPED: - error = copy_cow_page_dax(iomap.bdev, iomap.dax_dev, - sector, vmf->cow_page, vaddr); - break; - default: - WARN_ON_ONCE(1); - error = -EIO; - break; + while ((error = iomap_iter(&iter, ops)) > 0) { + if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { + iter.processed = -EIO; /* fs corruption? 
*/ + continue; } - if (error) - goto error_finish_iomap; - - __SetPageUptodate(vmf->cow_page); - ret = finish_fault(vmf); - if (!ret) - ret = VM_FAULT_DONE_COW; - goto finish_iomap; - } - - sync = dax_fault_is_synchronous(flags, vma, &iomap); - - switch (iomap.type) { - case IOMAP_MAPPED: - if (iomap.flags & IOMAP_F_NEW) { + ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false); + if (ret != VM_FAULT_SIGBUS && + (iter.iomap.flags & IOMAP_F_NEW)) { count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; + count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + ret |= VM_FAULT_MAJOR; } - error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); - if (error < 0) - goto error_finish_iomap; - - entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, - 0, write && !sync); - /* - * If we are doing synchronous page fault and inode needs fsync, - * we can insert PTE into page tables only after that happens. - * Skip insertion for now and return the pfn so that caller can - * insert it after fsync is done. 
- */ - if (sync) { - if (WARN_ON_ONCE(!pfnp)) { - error = -EIO; - goto error_finish_iomap; - } - *pfnp = pfn; - ret = VM_FAULT_NEEDDSYNC | major; - goto finish_iomap; - } - trace_dax_insert_mapping(inode, vmf, entry); - if (write) - ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn); - else - ret = vmf_insert_mixed(vma, vaddr, pfn); - - goto finish_iomap; - case IOMAP_UNWRITTEN: - case IOMAP_HOLE: - if (!write) { - ret = dax_load_hole(&xas, mapping, &entry, vmf); - goto finish_iomap; - } - fallthrough; - default: - WARN_ON_ONCE(1); - error = -EIO; - break; + if (!(ret & VM_FAULT_ERROR)) + iter.processed = PAGE_SIZE; } - error_finish_iomap: - ret = dax_fault_return(error); - finish_iomap: - if (ops->iomap_end) { - int copied = PAGE_SIZE; + if (iomap_errp) + *iomap_errp = error; + if (!ret && error) + ret = dax_fault_return(error); - if (ret & VM_FAULT_ERROR) - copied = 0; - /* - * The fault is done by now and there's no way back (other - * thread may be already happily using PTE we have installed). - * Just ignore error from ->iomap_end since we cannot do much - * with it. 
- */ - ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); - } - unlock_entry: +unlock_entry: dax_unlock_entry(&xas, entry); - out: - trace_dax_pte_fault_done(inode, vmf, ret); - return ret | major; +out: + trace_dax_pte_fault_done(iter.inode, vmf, ret); + return ret; } #ifdef CONFIG_FS_DAX_PMD -static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, - struct iomap *iomap, void **entry) +static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas, + pgoff_t max_pgoff) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; - struct vm_area_struct *vma = vmf->vma; - struct inode *inode = mapping->host; - pgtable_t pgtable = NULL; - struct page *zero_page; - spinlock_t *ptl; - pmd_t pmd_entry; - pfn_t pfn; - - zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); - - if (unlikely(!zero_page)) - goto fallback; + bool write = vmf->flags & FAULT_FLAG_WRITE; - pfn = page_to_pfn_t(zero_page); - *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, - DAX_PMD | DAX_ZERO_PAGE, false); + /* + * Make sure that the faulting address's PMD offset (color) matches + * the PMD offset from the start of the file. This is necessary so + * that a PMD range in the page table overlaps exactly with a PMD + * range in the page cache. 
+ */ + if ((vmf->pgoff & PG_PMD_COLOUR) != + ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) + return true; - if (arch_needs_pgtable_deposit()) { - pgtable = pte_alloc_one(vma->vm_mm); - if (!pgtable) - return VM_FAULT_OOM; - } + /* Fall back to PTEs if we're going to COW */ + if (write && !(vmf->vma->vm_flags & VM_SHARED)) + return true; - ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); - if (!pmd_none(*(vmf->pmd))) { - spin_unlock(ptl); - goto fallback; - } + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vmf->vma->vm_start) + return true; + if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end) + return true; - if (pgtable) { - pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - mm_inc_nr_ptes(vma->vm_mm); - } - pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); - pmd_entry = pmd_mkhuge(pmd_entry); - set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); - spin_unlock(ptl); - trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); - return VM_FAULT_NOPAGE; + /* If the PMD would extend beyond the file size */ + if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff) + return true; -fallback: - if (pgtable) - pte_free(vma->vm_mm, pgtable); - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); - return VM_FAULT_FALLBACK; + return false; } static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { - struct vm_area_struct *vma = vmf->vma; - struct address_space *mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); - unsigned long pmd_addr = vmf->address & PMD_MASK; - bool write = vmf->flags & FAULT_FLAG_WRITE; - bool sync; - unsigned int iomap_flags = (write ? 
IOMAP_WRITE : 0) | IOMAP_FAULT; - struct inode *inode = mapping->host; - vm_fault_t result = VM_FAULT_FALLBACK; - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; + struct iomap_iter iter = { + .inode = mapping->host, + .len = PMD_SIZE, + .flags = IOMAP_FAULT, + }; + vm_fault_t ret = VM_FAULT_FALLBACK; pgoff_t max_pgoff; void *entry; - loff_t pos; int error; - pfn_t pfn; + + if (vmf->flags & FAULT_FLAG_WRITE) + iter.flags |= IOMAP_WRITE; /* * Check whether offset isn't beyond end of file now. Caller is * supposed to hold locks serializing us with truncate / punch hole so * this is a reliable test. */ - max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - - trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); - - /* - * Make sure that the faulting address's PMD offset (color) matches - * the PMD offset from the start of the file. This is necessary so - * that a PMD range in the page table overlaps exactly with a PMD - * range in the page cache. 
- */ - if ((vmf->pgoff & PG_PMD_COLOUR) != - ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) - goto fallback; + max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE); - /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) - goto fallback; - - /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) - goto fallback; - if ((pmd_addr + PMD_SIZE) > vma->vm_end) - goto fallback; + trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0); if (xas.xa_index >= max_pgoff) { - result = VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; goto out; } - /* If the PMD would extend beyond the file size */ - if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) + if (dax_fault_check_fallback(vmf, &xas, max_pgoff)) goto fallback; /* @@ -1544,7 +1574,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, */ entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); if (xa_is_internal(entry)) { - result = xa_to_internal(entry); + ret = xa_to_internal(entry); goto fallback; } @@ -1556,88 +1586,30 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, */ if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && !pmd_devmap(*vmf->pmd)) { - result = 0; + ret = 0; goto unlock_entry; } - /* - * Note that we don't use iomap_apply here. We aren't doing I/O, only - * setting up a mapping, so really we're using iomap_begin() as a way - * to look up our filesystem block. 
- */ - pos = (loff_t)xas.xa_index << PAGE_SHIFT; - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, - &srcmap); - if (error) - goto unlock_entry; - - if (iomap.offset + iomap.length < pos + PMD_SIZE) - goto finish_iomap; - - sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap); - - switch (iomap.type) { - case IOMAP_MAPPED: - error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); - if (error < 0) - goto finish_iomap; - - entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, - DAX_PMD, write && !sync); - - /* - * If we are doing synchronous page fault and inode needs fsync, - * we can insert PMD into page tables only after that happens. - * Skip insertion for now and return the pfn so that caller can - * insert it after fsync is done. - */ - if (sync) { - if (WARN_ON_ONCE(!pfnp)) - goto finish_iomap; - *pfnp = pfn; - result = VM_FAULT_NEEDDSYNC; - goto finish_iomap; - } + iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT; + while ((error = iomap_iter(&iter, ops)) > 0) { + if (iomap_length(&iter) < PMD_SIZE) + continue; /* actually breaks out of the loop */ - trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); - result = vmf_insert_pfn_pmd(vmf, pfn, write); - break; - case IOMAP_UNWRITTEN: - case IOMAP_HOLE: - if (WARN_ON_ONCE(write)) - break; - result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); - break; - default: - WARN_ON_ONCE(1); - break; + ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); + if (ret != VM_FAULT_FALLBACK) + iter.processed = PMD_SIZE; } - finish_iomap: - if (ops->iomap_end) { - int copied = PMD_SIZE; - - if (result == VM_FAULT_FALLBACK) - copied = 0; - /* - * The fault is done by now and there's no way back (other - * thread may be already happily using PMD we have installed). - * Just ignore error from ->iomap_end since we cannot do much - * with it. 
- */ - ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, - &iomap); - } - unlock_entry: +unlock_entry: dax_unlock_entry(&xas, entry); - fallback: - if (result == VM_FAULT_FALLBACK) { - split_huge_pmd(vma, vmf->pmd, vmf->address); +fallback: + if (ret == VM_FAULT_FALLBACK) { + split_huge_pmd(vmf->vma, vmf->pmd, vmf->address); count_vm_event(THP_FAULT_FALLBACK); } out: - trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); - return result; + trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret); + return ret; } #else static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index df00231d3ecc..7d162b0efbf0 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -179,8 +179,10 @@ static int open_proxy_open(struct inode *inode, struct file *filp) if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && - real_fops->owner->state == MODULE_STATE_GOING) + real_fops->owner->state == MODULE_STATE_GOING) { + r = -ENXIO; goto out; + } #endif /* Huh? Module did not clean up after itself at exit? */ @@ -314,8 +316,10 @@ static int full_proxy_open(struct inode *inode, struct file *filp) if (!fops_get(real_fops)) { #ifdef CONFIG_MODULES if (real_fops->owner && - real_fops->owner->state == MODULE_STATE_GOING) + real_fops->owner->state == MODULE_STATE_GOING) { + r = -ENXIO; goto out; + } #endif /* Huh? Module did not cleanup after itself at exit? 
*/ diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 10c36ae1a8f9..45ebbe602bbf 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -85,8 +85,10 @@ int dlm_recover_directory(struct dlm_ls *ls) for (;;) { int left; error = dlm_recovery_stopped(ls); - if (error) + if (error) { + error = -EINTR; goto out_free; + } error = dlm_rcom_names(ls, memb->nodeid, last_name, last_len); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 91d1ca3a121a..5f57538b5d45 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -468,7 +468,7 @@ struct dlm_rcom { struct dlm_opt_header { uint16_t t_type; uint16_t t_length; - uint32_t o_pad; + uint32_t t_pad; /* need to be 8 byte aligned */ char t_value[]; }; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index d71aba8c3e64..10eddfa6c3d7 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -498,7 +498,7 @@ static int new_lockspace(const char *name, const char *cluster, ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | DLM_LSFL_NEWEXCL)); - size = dlm_config.ci_rsbtbl_size; + size = READ_ONCE(dlm_config.ci_rsbtbl_size); ls->ls_rsbtbl_size = size; ls->ls_rsbtbl = vmalloc(array_size(size, sizeof(struct dlm_rsbtable))); @@ -793,6 +793,7 @@ static int release_lockspace(struct dlm_ls *ls, int force) if (ls_count == 1) { dlm_scand_stop(); + dlm_clear_members(ls); dlm_midcomms_shutdown(); } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 0ea9ae35da0b..8f715c620e1f 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -84,9 +84,7 @@ struct connection { struct list_head writequeue; /* List of outgoing writequeue_entries */ spinlock_t writequeue_lock; atomic_t writequeue_cnt; - void (*connect_action) (struct connection *); /* What to do to connect */ - void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ - bool (*eof_condition)(struct connection *con); /* What to do to eof check */ + struct mutex wq_alloc; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; 
@@ -145,6 +143,24 @@ struct dlm_node_addr { struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; +struct dlm_proto_ops { + bool try_new_addr; + const char *name; + int proto; + + int (*connect)(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len); + void (*sockopts)(struct socket *sock); + int (*bind)(struct socket *sock); + int (*listen_validate)(void); + void (*listen_sockopts)(struct socket *sock); + int (*listen_bind)(struct socket *sock); + /* What to do to shutdown */ + void (*shutdown_action)(struct connection *con); + /* What to do to eof check */ + bool (*eof_condition)(struct connection *con); +}; + static struct listen_sock_callbacks { void (*sk_error_report)(struct sock *); void (*sk_data_ready)(struct sock *); @@ -168,12 +184,26 @@ static struct hlist_head connection_hash[CONN_HASH_SIZE]; static DEFINE_SPINLOCK(connections_lock); DEFINE_STATIC_SRCU(connections_srcu); +static const struct dlm_proto_ops *dlm_proto_ops; + static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); -static void sctp_connect_to_sock(struct connection *con); -static void tcp_connect_to_sock(struct connection *con); -static void dlm_tcp_shutdown(struct connection *con); +/* need to held writequeue_lock */ +static struct writequeue_entry *con_next_wq(struct connection *con) +{ + struct writequeue_entry *e; + + if (list_empty(&con->writequeue)) + return NULL; + + e = list_first_entry(&con->writequeue, struct writequeue_entry, + list); + if (e->len == 0) + return NULL; + + return e; +} static struct connection *__find_con(int nodeid, int r) { @@ -208,20 +238,6 @@ static int dlm_con_init(struct connection *con, int nodeid) INIT_WORK(&con->rwork, process_recv_sockets); init_waitqueue_head(&con->shutdown_wait); - switch (dlm_config.ci_protocol) { - case DLM_PROTO_TCP: - con->connect_action = tcp_connect_to_sock; - con->shutdown_action = dlm_tcp_shutdown; - con->eof_condition = 
tcp_eof_condition; - break; - case DLM_PROTO_SCTP: - con->connect_action = sctp_connect_to_sock; - break; - default: - kfree(con->rx_buf); - return -EINVAL; - } - return 0; } @@ -249,6 +265,8 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) return NULL; } + mutex_init(&con->wq_alloc); + spin_lock(&connections_lock); /* Because multiple workqueues/threads calls this function it can * race on multiple cpu's. Instead of locking hot path __find_con() @@ -583,8 +601,7 @@ static void lowcomms_error_report(struct sock *sk) goto out; orig_report = listen_sock.sk_error_report; - if (con->sock == NULL || - kernel_getpeername(con->sock, (struct sockaddr *)&saddr) < 0) { + if (kernel_getpeername(sk->sk_socket, (struct sockaddr *)&saddr) < 0) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d, port %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), @@ -801,6 +818,7 @@ static void close_connection(struct connection *con, bool and_other, con->rx_leftover = 0; con->retries = 0; + clear_bit(CF_APP_LIMITED, &con->flags); clear_bit(CF_CONNECTED, &con->flags); clear_bit(CF_DELAY_CONNECT, &con->flags); clear_bit(CF_RECONNECT, &con->flags); @@ -877,7 +895,6 @@ static int con_realloc_receive_buf(struct connection *con, int newlen) /* Data received from remote end */ static int receive_from_sock(struct connection *con) { - int call_again_soon = 0; struct msghdr msg; struct kvec iov; int ret, buflen; @@ -897,41 +914,40 @@ static int receive_from_sock(struct connection *con) goto out_resched; } - /* calculate new buffer parameter regarding last receive and - * possible leftover bytes - */ - iov.iov_base = con->rx_buf + con->rx_leftover; - iov.iov_len = con->rx_buflen - con->rx_leftover; - - memset(&msg, 0, sizeof(msg)); - msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; - ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, - msg.msg_flags); - if (ret <= 0) - goto out_close; - else if (ret == iov.iov_len) - call_again_soon = 1; - - /* new buflen 
according readed bytes and leftover from last receive */ - buflen = ret + con->rx_leftover; - ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); - if (ret < 0) - goto out_close; + for (;;) { + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes + */ + iov.iov_base = con->rx_buf + con->rx_leftover; + iov.iov_len = con->rx_buflen - con->rx_leftover; + + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); + if (ret == -EAGAIN) + break; + else if (ret <= 0) + goto out_close; - /* calculate leftover bytes from process and put it into begin of - * the receive buffer, so next receive we have the full message - * at the start address of the receive buffer. - */ - con->rx_leftover = buflen - ret; - if (con->rx_leftover) { - memmove(con->rx_buf, con->rx_buf + ret, - con->rx_leftover); - call_again_soon = true; + /* new buflen according readed bytes and leftover from last receive */ + buflen = ret + con->rx_leftover; + ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); + if (ret < 0) + goto out_close; + + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. 
+ */ + con->rx_leftover = buflen - ret; + if (con->rx_leftover) { + memmove(con->rx_buf, con->rx_buf + ret, + con->rx_leftover); + } } - if (call_again_soon) - goto out_resched; - + dlm_midcomms_receive_done(con->nodeid); mutex_unlock(&con->sock_mutex); return 0; @@ -946,7 +962,8 @@ out_close: log_print("connection %p got EOF from %d", con, con->nodeid); - if (con->eof_condition && con->eof_condition(con)) { + if (dlm_proto_ops->eof_condition && + dlm_proto_ops->eof_condition(con)) { set_bit(CF_EOF, &con->flags); mutex_unlock(&con->sock_mutex); } else { @@ -1134,242 +1151,6 @@ static int sctp_bind_addrs(struct socket *sock, uint16_t port) return result; } -/* Initiate an SCTP association. - This is a special case of send_to_sock() in that we don't yet have a - peeled-off socket for this association, so we use the listening socket - and add the primary IP address of the remote node. - */ -static void sctp_connect_to_sock(struct connection *con) -{ - struct sockaddr_storage daddr; - int result; - int addr_len; - struct socket *sock; - unsigned int mark; - - mutex_lock(&con->sock_mutex); - - /* Some odd races can cause double-connects, ignore them */ - if (con->retries++ > MAX_CONNECT_RETRIES) - goto out; - - if (con->sock) { - log_print("node %d already connected.", con->nodeid); - goto out; - } - - memset(&daddr, 0, sizeof(daddr)); - result = nodeid_to_addr(con->nodeid, &daddr, NULL, true, &mark); - if (result < 0) { - log_print("no address for nodeid %d", con->nodeid); - goto out; - } - - /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_SCTP, &sock); - if (result < 0) - goto socket_err; - - sock_set_mark(sock->sk, mark); - - add_sock(sock, con); - - /* Bind to all addresses. 
*/ - if (sctp_bind_addrs(con->sock, 0)) - goto bind_err; - - make_sockaddr(&daddr, dlm_config.ci_tcp_port, &addr_len); - - log_print_ratelimited("connecting to %d", con->nodeid); - - /* Turn off Nagle's algorithm */ - sctp_sock_set_nodelay(sock->sk); - - /* - * Make sock->ops->connect() function return in specified time, - * since O_NONBLOCK argument in connect() function does not work here, - * then, we should restore the default value of this attribute. - */ - sock_set_sndtimeo(sock->sk, 5); - result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, - 0); - sock_set_sndtimeo(sock->sk, 0); - - if (result == -EINPROGRESS) - result = 0; - if (result == 0) { - if (!test_and_set_bit(CF_CONNECTED, &con->flags)) - log_print("successful connected to node %d", con->nodeid); - goto out; - } - -bind_err: - con->sock = NULL; - sock_release(sock); - -socket_err: - /* - * Some errors are fatal and this list might need adjusting. For other - * errors we try again until the max number of retries is reached. 
- */ - if (result != -EHOSTUNREACH && - result != -ENETUNREACH && - result != -ENETDOWN && - result != -EINVAL && - result != -EPROTONOSUPPORT) { - log_print("connect %d try %d error %d", con->nodeid, - con->retries, result); - mutex_unlock(&con->sock_mutex); - msleep(1000); - lowcomms_connect_sock(con); - return; - } - -out: - mutex_unlock(&con->sock_mutex); -} - -/* Connect a new socket to its peer */ -static void tcp_connect_to_sock(struct connection *con) -{ - struct sockaddr_storage saddr, src_addr; - unsigned int mark; - int addr_len; - struct socket *sock = NULL; - int result; - - mutex_lock(&con->sock_mutex); - if (con->retries++ > MAX_CONNECT_RETRIES) - goto out; - - /* Some odd races can cause double-connects, ignore them */ - if (con->sock) - goto out; - - /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_TCP, &sock); - if (result < 0) - goto out_err; - - memset(&saddr, 0, sizeof(saddr)); - result = nodeid_to_addr(con->nodeid, &saddr, NULL, false, &mark); - if (result < 0) { - log_print("no address for nodeid %d", con->nodeid); - goto out_err; - } - - sock_set_mark(sock->sk, mark); - - add_sock(sock, con); - - /* Bind to our cluster-known address connecting to avoid - routing problems */ - memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); - make_sockaddr(&src_addr, 0, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *) &src_addr, - addr_len); - if (result < 0) { - log_print("could not bind for connect: %d", result); - /* This *may* not indicate a critical error */ - } - - make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len); - - log_print_ratelimited("connecting to %d", con->nodeid); - - /* Turn off Nagle's algorithm */ - tcp_sock_set_nodelay(sock->sk); - - result = sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len, - O_NONBLOCK); - if (result == -EINPROGRESS) - result = 0; - if (result == 0) - goto out; - -out_err: - if (con->sock) { - 
sock_release(con->sock); - con->sock = NULL; - } else if (sock) { - sock_release(sock); - } - /* - * Some errors are fatal and this list might need adjusting. For other - * errors we try again until the max number of retries is reached. - */ - if (result != -EHOSTUNREACH && - result != -ENETUNREACH && - result != -ENETDOWN && - result != -EINVAL && - result != -EPROTONOSUPPORT) { - log_print("connect %d try %d error %d", con->nodeid, - con->retries, result); - mutex_unlock(&con->sock_mutex); - msleep(1000); - lowcomms_connect_sock(con); - return; - } -out: - mutex_unlock(&con->sock_mutex); - return; -} - -/* On error caller must run dlm_close_sock() for the - * listen connection socket. - */ -static int tcp_create_listen_sock(struct listen_connection *con, - struct sockaddr_storage *saddr) -{ - struct socket *sock = NULL; - int result = 0; - int addr_len; - - if (dlm_local_addr[0]->ss_family == AF_INET) - addr_len = sizeof(struct sockaddr_in); - else - addr_len = sizeof(struct sockaddr_in6); - - /* Create a socket to communicate with */ - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_TCP, &sock); - if (result < 0) { - log_print("Can't create listening comms socket"); - goto create_out; - } - - sock_set_mark(sock->sk, dlm_config.ci_mark); - - /* Turn off Nagle's algorithm */ - tcp_sock_set_nodelay(sock->sk); - - sock_set_reuseaddr(sock->sk); - - add_listen_sock(sock, con); - - /* Bind to our port */ - make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len); - result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len); - if (result < 0) { - log_print("Can't bind to port %d", dlm_config.ci_tcp_port); - goto create_out; - } - sock_set_keepalive(sock->sk); - - result = sock->ops->listen(sock, 5); - if (result < 0) { - log_print("Can't listen on port %d", dlm_config.ci_tcp_port); - goto create_out; - } - - return 0; - -create_out: - return result; -} - /* Get local addresses */ static void init_local(void) { @@ 
-1396,63 +1177,6 @@ static void deinit_local(void) kfree(dlm_local_addr[i]); } -/* Initialise SCTP socket and bind to all interfaces - * On error caller must run dlm_close_sock() for the - * listen connection socket. - */ -static int sctp_listen_for_all(struct listen_connection *con) -{ - struct socket *sock = NULL; - int result = -EINVAL; - - log_print("Using SCTP for communications"); - - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, - SOCK_STREAM, IPPROTO_SCTP, &sock); - if (result < 0) { - log_print("Can't create comms socket, check SCTP is loaded"); - goto out; - } - - sock_set_rcvbuf(sock->sk, NEEDED_RMEM); - sock_set_mark(sock->sk, dlm_config.ci_mark); - sctp_sock_set_nodelay(sock->sk); - - add_listen_sock(sock, con); - - /* Bind to all addresses. */ - result = sctp_bind_addrs(con->sock, dlm_config.ci_tcp_port); - if (result < 0) - goto out; - - result = sock->ops->listen(sock, 5); - if (result < 0) { - log_print("Can't set socket listening"); - goto out; - } - - return 0; - -out: - return result; -} - -static int tcp_listen_for_all(void) -{ - /* We don't support multi-homed hosts */ - if (dlm_local_count > 1) { - log_print("TCP protocol can't handle multi-homed hosts, " - "try SCTP"); - return -EINVAL; - } - - log_print("Using TCP for communications"); - - return tcp_create_listen_sock(&listen_con, dlm_local_addr[0]); -} - - - static struct writequeue_entry *new_writequeue_entry(struct connection *con, gfp_t allocation) { @@ -1528,19 +1252,37 @@ static struct dlm_msg *dlm_lowcomms_new_msg_con(struct connection *con, int len, { struct writequeue_entry *e; struct dlm_msg *msg; + bool sleepable; msg = kzalloc(sizeof(*msg), allocation); if (!msg) return NULL; + /* this mutex is being used as a wait to avoid multiple "fast" + * new writequeue page list entry allocs in new_wq_entry in + * normal operation which is sleepable context. 
Without it + * we could end in multiple writequeue entries with one + * dlm message because multiple callers were waiting at + * the writequeue_lock in new_wq_entry(). + */ + sleepable = gfpflags_normal_context(allocation); + if (sleepable) + mutex_lock(&con->wq_alloc); + kref_init(&msg->ref); e = new_wq_entry(con, len, allocation, ppc, cb, mh); if (!e) { + if (sleepable) + mutex_unlock(&con->wq_alloc); + kfree(msg); return NULL; } + if (sleepable) + mutex_unlock(&con->wq_alloc); + msg->ppc = *ppc; msg->len = len; msg->entry = e; @@ -1646,10 +1388,9 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg) /* Send a message */ static void send_to_sock(struct connection *con) { - int ret = 0; const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; struct writequeue_entry *e; - int len, offset; + int len, offset, ret; int count = 0; mutex_lock(&con->sock_mutex); @@ -1658,7 +1399,8 @@ static void send_to_sock(struct connection *con) spin_lock(&con->writequeue_lock); for (;;) { - if (list_empty(&con->writequeue)) + e = con_next_wq(con); + if (!e) break; e = list_first_entry(&con->writequeue, struct writequeue_entry, list); @@ -1667,25 +1409,22 @@ static void send_to_sock(struct connection *con) BUG_ON(len == 0 && e->users == 0); spin_unlock(&con->writequeue_lock); - ret = 0; - if (len) { - ret = kernel_sendpage(con->sock, e->page, offset, len, - msg_flags); - if (ret == -EAGAIN || ret == 0) { - if (ret == -EAGAIN && - test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && - !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { - /* Notify TCP that we're limited by the - * application window size. 
- */ - set_bit(SOCK_NOSPACE, &con->sock->flags); - con->sock->sk->sk_write_pending++; - } - cond_resched(); - goto out; - } else if (ret < 0) - goto out; - } + ret = kernel_sendpage(con->sock, e->page, offset, len, + msg_flags); + if (ret == -EAGAIN || ret == 0) { + if (ret == -EAGAIN && + test_bit(SOCKWQ_ASYNC_NOSPACE, &con->sock->flags) && + !test_and_set_bit(CF_APP_LIMITED, &con->flags)) { + /* Notify TCP that we're limited by the + * application window size. + */ + set_bit(SOCK_NOSPACE, &con->sock->flags); + con->sock->sk->sk_write_pending++; + } + cond_resched(); + goto out; + } else if (ret < 0) + goto out; /* Don't starve people filling buffers */ if (++count >= MAX_SEND_MSG_COUNT) { @@ -1770,12 +1509,9 @@ int dlm_lowcomms_close(int nodeid) static void process_recv_sockets(struct work_struct *work) { struct connection *con = container_of(work, struct connection, rwork); - int err; clear_bit(CF_READ_PENDING, &con->flags); - do { - err = receive_from_sock(con); - } while (!err); + receive_from_sock(con); } static void process_listen_recv_socket(struct work_struct *work) @@ -1783,6 +1519,74 @@ static void process_listen_recv_socket(struct work_struct *work) accept_from_sock(&listen_con); } +static void dlm_connect(struct connection *con) +{ + struct sockaddr_storage addr; + int result, addr_len; + struct socket *sock; + unsigned int mark; + + /* Some odd races can cause double-connects, ignore them */ + if (con->retries++ > MAX_CONNECT_RETRIES) + return; + + if (con->sock) { + log_print("node %d already connected.", con->nodeid); + return; + } + + memset(&addr, 0, sizeof(addr)); + result = nodeid_to_addr(con->nodeid, &addr, NULL, + dlm_proto_ops->try_new_addr, &mark); + if (result < 0) { + log_print("no address for nodeid %d", con->nodeid); + return; + } + + /* Create a socket to communicate with */ + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, dlm_proto_ops->proto, &sock); + if (result < 0) + goto socket_err; + + 
sock_set_mark(sock->sk, mark); + dlm_proto_ops->sockopts(sock); + + add_sock(sock, con); + + result = dlm_proto_ops->bind(sock); + if (result < 0) + goto add_sock_err; + + log_print_ratelimited("connecting to %d", con->nodeid); + make_sockaddr(&addr, dlm_config.ci_tcp_port, &addr_len); + result = dlm_proto_ops->connect(con, sock, (struct sockaddr *)&addr, + addr_len); + if (result < 0) + goto add_sock_err; + + return; + +add_sock_err: + dlm_close_sock(&con->sock); + +socket_err: + /* + * Some errors are fatal and this list might need adjusting. For other + * errors we try again until the max number of retries is reached. + */ + if (result != -EHOSTUNREACH && + result != -ENETUNREACH && + result != -ENETDOWN && + result != -EINVAL && + result != -EPROTONOSUPPORT) { + log_print("connect %d try %d error %d", con->nodeid, + con->retries, result); + msleep(1000); + lowcomms_connect_sock(con); + } +} + /* Send workqueue function */ static void process_send_sockets(struct work_struct *work) { @@ -1797,11 +1601,15 @@ static void process_send_sockets(struct work_struct *work) dlm_midcomms_unack_msg_resend(con->nodeid); } - if (con->sock == NULL) { /* not mutex protected so check it inside too */ + if (con->sock == NULL) { if (test_and_clear_bit(CF_DELAY_CONNECT, &con->flags)) msleep(1000); - con->connect_action(con); + + mutex_lock(&con->sock_mutex); + dlm_connect(con); + mutex_unlock(&con->sock_mutex); } + if (!list_empty(&con->writequeue)) send_to_sock(con); } @@ -1840,8 +1648,8 @@ static int work_start(void) static void shutdown_conn(struct connection *con) { - if (con->shutdown_action) - con->shutdown_action(con); + if (dlm_proto_ops->shutdown_action) + dlm_proto_ops->shutdown_action(con); } void dlm_lowcomms_shutdown(void) @@ -1948,8 +1756,198 @@ void dlm_lowcomms_stop(void) srcu_read_unlock(&connections_srcu, idx); work_stop(); deinit_local(); + + dlm_proto_ops = NULL; } +static int dlm_listen_for_all(void) +{ + struct socket *sock; + int result; + + log_print("Using 
%s for communications", + dlm_proto_ops->name); + + result = dlm_proto_ops->listen_validate(); + if (result < 0) + return result; + + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, dlm_proto_ops->proto, &sock); + if (result < 0) { + log_print("Can't create comms socket, check SCTP is loaded"); + goto out; + } + + sock_set_mark(sock->sk, dlm_config.ci_mark); + dlm_proto_ops->listen_sockopts(sock); + + result = dlm_proto_ops->listen_bind(sock); + if (result < 0) + goto out; + + save_listen_callbacks(sock); + add_listen_sock(sock, &listen_con); + + INIT_WORK(&listen_con.rwork, process_listen_recv_socket); + result = sock->ops->listen(sock, 5); + if (result < 0) { + dlm_close_sock(&listen_con.sock); + goto out; + } + + return 0; + +out: + sock_release(sock); + return result; +} + +static int dlm_tcp_bind(struct socket *sock) +{ + struct sockaddr_storage src_addr; + int result, addr_len; + + /* Bind to our cluster-known address connecting to avoid + * routing problems. 
+ */ + memcpy(&src_addr, dlm_local_addr[0], sizeof(src_addr)); + make_sockaddr(&src_addr, 0, &addr_len); + + result = sock->ops->bind(sock, (struct sockaddr *)&src_addr, + addr_len); + if (result < 0) { + /* This *may* not indicate a critical error */ + log_print("could not bind for connect: %d", result); + } + + return 0; +} + +static int dlm_tcp_connect(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len) +{ + int ret; + + ret = sock->ops->connect(sock, addr, addr_len, O_NONBLOCK); + switch (ret) { + case -EINPROGRESS: + fallthrough; + case 0: + return 0; + } + + return ret; +} + +static int dlm_tcp_listen_validate(void) +{ + /* We don't support multi-homed hosts */ + if (dlm_local_count > 1) { + log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); + return -EINVAL; + } + + return 0; +} + +static void dlm_tcp_sockopts(struct socket *sock) +{ + /* Turn off Nagle's algorithm */ + tcp_sock_set_nodelay(sock->sk); +} + +static void dlm_tcp_listen_sockopts(struct socket *sock) +{ + dlm_tcp_sockopts(sock); + sock_set_reuseaddr(sock->sk); +} + +static int dlm_tcp_listen_bind(struct socket *sock) +{ + int addr_len; + + /* Bind to our port */ + make_sockaddr(dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len); + return sock->ops->bind(sock, (struct sockaddr *)dlm_local_addr[0], + addr_len); +} + +static const struct dlm_proto_ops dlm_tcp_ops = { + .name = "TCP", + .proto = IPPROTO_TCP, + .connect = dlm_tcp_connect, + .sockopts = dlm_tcp_sockopts, + .bind = dlm_tcp_bind, + .listen_validate = dlm_tcp_listen_validate, + .listen_sockopts = dlm_tcp_listen_sockopts, + .listen_bind = dlm_tcp_listen_bind, + .shutdown_action = dlm_tcp_shutdown, + .eof_condition = tcp_eof_condition, +}; + +static int dlm_sctp_bind(struct socket *sock) +{ + return sctp_bind_addrs(sock, 0); +} + +static int dlm_sctp_connect(struct connection *con, struct socket *sock, + struct sockaddr *addr, int addr_len) +{ + int ret; + + /* + * Make 
sock->ops->connect() function return in specified time, + * since O_NONBLOCK argument in connect() function does not work here, + * then, we should restore the default value of this attribute. + */ + sock_set_sndtimeo(sock->sk, 5); + ret = sock->ops->connect(sock, addr, addr_len, 0); + sock_set_sndtimeo(sock->sk, 0); + if (ret < 0) + return ret; + + if (!test_and_set_bit(CF_CONNECTED, &con->flags)) + log_print("successful connected to node %d", con->nodeid); + + return 0; +} + +static int dlm_sctp_listen_validate(void) +{ + if (!IS_ENABLED(CONFIG_IP_SCTP)) { + log_print("SCTP is not enabled by this kernel"); + return -EOPNOTSUPP; + } + + request_module("sctp"); + return 0; +} + +static int dlm_sctp_bind_listen(struct socket *sock) +{ + return sctp_bind_addrs(sock, dlm_config.ci_tcp_port); +} + +static void dlm_sctp_sockopts(struct socket *sock) +{ + /* Turn off Nagle's algorithm */ + sctp_sock_set_nodelay(sock->sk); + sock_set_rcvbuf(sock->sk, NEEDED_RMEM); +} + +static const struct dlm_proto_ops dlm_sctp_ops = { + .name = "SCTP", + .proto = IPPROTO_SCTP, + .try_new_addr = true, + .connect = dlm_sctp_connect, + .sockopts = dlm_sctp_sockopts, + .bind = dlm_sctp_bind, + .listen_validate = dlm_sctp_listen_validate, + .listen_sockopts = dlm_sctp_sockopts, + .listen_bind = dlm_sctp_bind_listen, +}; + int dlm_lowcomms_start(void) { int error = -EINVAL; @@ -1976,23 +1974,27 @@ int dlm_lowcomms_start(void) /* Start listening */ switch (dlm_config.ci_protocol) { case DLM_PROTO_TCP: - error = tcp_listen_for_all(); + dlm_proto_ops = &dlm_tcp_ops; break; case DLM_PROTO_SCTP: - error = sctp_listen_for_all(&listen_con); + dlm_proto_ops = &dlm_sctp_ops; break; default: log_print("Invalid protocol identifier %d set", dlm_config.ci_protocol); error = -EINVAL; - break; + goto fail_proto_ops; } + + error = dlm_listen_for_all(); if (error) - goto fail_unlisten; + goto fail_listen; return 0; -fail_unlisten: +fail_listen: + dlm_proto_ops = NULL; +fail_proto_ops: dlm_allow_conn = 0; 
dlm_close_sock(&listen_con.sock); work_stop(); diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index aaae7115c00d..4ccae07cf005 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -46,6 +46,7 @@ int dlm_lowcomms_resend_msg(struct dlm_msg *msg); int dlm_lowcomms_connect_node(int nodeid); int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); +void dlm_midcomms_receive_done(int nodeid); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/member.c b/fs/dlm/member.c index d9e1e4170eb1..731d489aa323 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -443,8 +443,10 @@ static int ping_members(struct dlm_ls *ls) list_for_each_entry(memb, &ls->ls_nodes, list) { error = dlm_recovery_stopped(ls); - if (error) + if (error) { + error = -EINTR; break; + } error = dlm_rcom_status(ls, memb->nodeid, 0); if (error) break; diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index e3de268898ed..7ae39ec8d9b0 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -109,12 +109,6 @@ * compatibility. There exists better ways to make a better handling. * However this should be changed in the next major version bump of dlm. * - * Ack handling: - * - * Currently we send an ack message for every dlm message. However we - * can ack multiple dlm messages with one ack by just delaying the ack - * message. Will reduce some traffic but makes the drop detection slower. - * * Tail Size checking: * * There exists a message tail payload in e.g. 
DLM_MSG however we don't @@ -169,6 +163,7 @@ struct midcomms_node { #define DLM_NODE_FLAG_CLOSE 1 #define DLM_NODE_FLAG_STOP_TX 2 #define DLM_NODE_FLAG_STOP_RX 3 +#define DLM_NODE_ULP_DELIVERED 4 unsigned long flags; wait_queue_head_t shutdown_wait; @@ -480,11 +475,12 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, { if (seq == node->seq_next) { node->seq_next++; - /* send ack before fin */ - dlm_send_ack(node->nodeid, node->seq_next); switch (p->header.h_cmd) { case DLM_FIN: + /* send ack before fin */ + dlm_send_ack(node->nodeid, node->seq_next); + spin_lock(&node->state_lock); pr_debug("receive fin msg from node %d with state %s\n", node->nodeid, dlm_state_str(node->state)); @@ -534,6 +530,7 @@ static void dlm_midcomms_receive_buffer(union dlm_packet *p, default: WARN_ON(test_bit(DLM_NODE_FLAG_STOP_RX, &node->flags)); dlm_receive_buffer(p, node->nodeid); + set_bit(DLM_NODE_ULP_DELIVERED, &node->flags); break; } } else { @@ -933,6 +930,49 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) return ret; } +void dlm_midcomms_receive_done(int nodeid) +{ + struct midcomms_node *node; + int idx; + + idx = srcu_read_lock(&nodes_srcu); + node = nodeid2node(nodeid, 0); + if (!node) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + /* old protocol, we do nothing */ + switch (node->version) { + case DLM_VERSION_3_2: + break; + default: + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + /* do nothing if we didn't delivered stateful to ulp */ + if (!test_and_clear_bit(DLM_NODE_ULP_DELIVERED, + &node->flags)) { + srcu_read_unlock(&nodes_srcu, idx); + return; + } + + spin_lock(&node->state_lock); + /* we only ack if state is ESTABLISHED */ + switch (node->state) { + case DLM_ESTABLISHED: + spin_unlock(&node->state_lock); + dlm_send_ack(node->nodeid, node->seq_next); + break; + default: + spin_unlock(&node->state_lock); + /* do nothing FIN has it's own ack send */ + break; + }; + srcu_read_unlock(&nodes_srcu, idx); +} + void 
dlm_midcomms_unack_msg_resend(int nodeid) { struct midcomms_node *node; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 5651933f54a4..6cba86470278 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -89,22 +89,15 @@ static int create_rcom_stateless(struct dlm_ls *ls, int to_nodeid, int type, return 0; } -static void _send_rcom(struct dlm_ls *ls, struct dlm_rcom *rc) +static void send_rcom(struct dlm_mhandle *mh, struct dlm_rcom *rc) { dlm_rcom_out(rc); -} - -static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, - struct dlm_rcom *rc) -{ - _send_rcom(ls, rc); dlm_midcomms_commit_mhandle(mh); } -static void send_rcom_stateless(struct dlm_ls *ls, struct dlm_msg *msg, - struct dlm_rcom *rc) +static void send_rcom_stateless(struct dlm_msg *msg, struct dlm_rcom *rc) { - _send_rcom(ls, rc); + dlm_rcom_out(rc); dlm_lowcomms_commit_msg(msg); dlm_lowcomms_put_msg(msg); } @@ -204,7 +197,7 @@ retry: allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -287,7 +280,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) spin_unlock(&ls->ls_recover_lock); do_send: - send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); } static void receive_sync_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -327,7 +320,7 @@ retry: allow_sync_reply(ls, &rc->rc_id); memset(ls->ls_recover_buf, 0, DLM_MAX_SOCKET_BUFSIZE); - send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); error = dlm_wait_function(ls, &rcom_response); disallow_sync_reply(ls); @@ -356,7 +349,7 @@ static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen, nodeid); - send_rcom_stateless(ls, msg, rc); + send_rcom_stateless(msg, rc); } int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) @@ -373,7 +366,7 @@ 
int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid) memcpy(rc->rc_buf, r->res_name, r->res_length); rc->rc_id = (unsigned long) r->res_id; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); out: return error; } @@ -404,7 +397,7 @@ static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in) rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); } static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) @@ -461,7 +454,7 @@ int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) pack_rcom_lock(r, lkb, rl); rc->rc_id = (unsigned long) r; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); out: return error; } @@ -487,7 +480,7 @@ static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in) rc->rc_id = rc_in->rc_id; rc->rc_seq_reply = rc_in->rc_seq; - send_rcom(ls, mh, rc); + send_rcom(mh, rc); } /* If the lockspace doesn't exist then still send a status message diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c index 85e245392715..97d052cea5a9 100644 --- a/fs/dlm/recoverd.c +++ b/fs/dlm/recoverd.c @@ -125,8 +125,10 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv) dlm_recover_waiters_pre(ls); error = dlm_recovery_stopped(ls); - if (error) + if (error) { + error = -EINTR; goto fail; + } if (neg || dlm_no_directory(ls)) { /* diff --git a/fs/drop_caches.c b/fs/drop_caches.c index f00fcc4a4f72..e619c31b6bd9 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -3,6 +3,7 @@ * Implement the manual drop-all-pagecache function */ +#include <linux/pagemap.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/fs.h> @@ -27,7 +28,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) * we need to reschedule to avoid softlockups. 
*/ if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || - (inode->i_mapping->nrpages == 0 && !need_resched())) { + (mapping_empty(inode->i_mapping) && !need_resched())) { spin_unlock(&inode->i_lock); continue; } diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 906af0c1998c..14b747026742 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,6 +3,7 @@ config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select FS_IOMAP select LIBCRC32C help EROFS (Enhanced Read-Only File System) is a lightweight diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 3787a5fb0a42..9db829715652 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -2,35 +2,13 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #include "internal.h" #include <linux/prefetch.h> - +#include <linux/dax.h> #include <trace/events/erofs.h> -static void erofs_readendio(struct bio *bio) -{ - struct bio_vec *bvec; - blk_status_t err = bio->bi_status; - struct bvec_iter_all iter_all; - - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - - /* page is already locked */ - DBG_BUGON(PageUptodate(page)); - - if (err) - SetPageError(page); - else - SetPageUptodate(page); - - unlock_page(page); - /* page could be reclaimed now */ - } - bio_put(bio); -} - struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) { struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; @@ -59,13 +37,6 @@ static int erofs_map_blocks_flatmode(struct inode *inode, nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); lastblk = nblocks - tailendpacking; - if (offset >= inode->i_size) { - /* leave out-of-bound access unmapped */ - map->m_flags = 0; - map->m_plen = 0; - goto out; - } - /* there is no hole in flatmode */ map->m_flags = EROFS_MAP_MAPPED; @@ -100,217 +71,273 @@ static int erofs_map_blocks_flatmode(struct inode *inode, goto err_out; } -out: map->m_llen = map->m_plen; - 
err_out: trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); return err; } -static inline struct bio *erofs_read_raw_page(struct bio *bio, - struct address_space *mapping, - struct page *page, - erofs_off_t *last_block, - unsigned int nblocks, - unsigned int *eblks, - bool ra) +static int erofs_map_blocks(struct inode *inode, + struct erofs_map_blocks *map, int flags) { - struct inode *const inode = mapping->host; - struct super_block *const sb = inode->i_sb; - erofs_off_t current_block = (erofs_off_t)page->index; - int err; - - DBG_BUGON(!nblocks); - - if (PageUptodate(page)) { - err = 0; - goto has_updated; - } + struct super_block *sb = inode->i_sb; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_inode_chunk_index *idx; + struct page *page; + u64 chunknr; + unsigned int unit; + erofs_off_t pos; + int err = 0; - /* note that for readpage case, bio also equals to NULL */ - if (bio && - (*last_block + 1 != current_block || !*eblks)) { -submit_bio_retry: - submit_bio(bio); - bio = NULL; + if (map->m_la >= inode->i_size) { + /* leave out-of-bound access unmapped */ + map->m_flags = 0; + map->m_plen = 0; + goto out; } - if (!bio) { - struct erofs_map_blocks map = { - .m_la = blknr_to_addr(current_block), - }; - erofs_blk_t blknr; - unsigned int blkoff; - - err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW); - if (err) - goto err_out; - - /* zero out the holed page */ - if (!(map.m_flags & EROFS_MAP_MAPPED)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - - /* imply err = 0, see erofs_map_blocks */ - goto has_updated; - } - - /* for RAW access mode, m_plen must be equal to m_llen */ - DBG_BUGON(map.m_plen != map.m_llen); - - blknr = erofs_blknr(map.m_pa); - blkoff = erofs_blkoff(map.m_pa); - - /* deal with inline page */ - if (map.m_flags & EROFS_MAP_META) { - void *vsrc, *vto; - struct page *ipage; + if (vi->datalayout != EROFS_INODE_CHUNK_BASED) + return erofs_map_blocks_flatmode(inode, map, flags); - 
DBG_BUGON(map.m_plen > PAGE_SIZE); + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) + unit = sizeof(*idx); /* chunk index */ + else + unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ - ipage = erofs_get_meta_page(inode->i_sb, blknr); + chunknr = map->m_la >> vi->chunkbits; + pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + vi->xattr_isize, unit) + unit * chunknr; - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto err_out; - } + page = erofs_get_meta_page(inode->i_sb, erofs_blknr(pos)); + if (IS_ERR(page)) + return PTR_ERR(page); - vsrc = kmap_atomic(ipage); - vto = kmap_atomic(page); - memcpy(vto, vsrc + blkoff, map.m_plen); - memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); - kunmap_atomic(vto); - kunmap_atomic(vsrc); - flush_dcache_page(page); + map->m_la = chunknr << vi->chunkbits; + map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, + roundup(inode->i_size - map->m_la, EROFS_BLKSIZ)); - SetPageUptodate(page); - /* TODO: could we unlock the page earlier? */ - unlock_page(ipage); - put_page(ipage); + /* handle block map */ + if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { + __le32 *blkaddr = page_address(page) + erofs_blkoff(pos); - /* imply err = 0, see erofs_map_blocks */ - goto has_updated; + if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { + map->m_flags = 0; + } else { + map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; } + goto out_unlock; + } + /* parse chunk indexes */ + idx = page_address(page) + erofs_blkoff(pos); + switch (le32_to_cpu(idx->blkaddr)) { + case EROFS_NULL_ADDR: + map->m_flags = 0; + break; + default: + /* only one device is supported for now */ + if (idx->device_id) { + erofs_err(sb, "invalid device id %u @ %llu for nid %llu", + le16_to_cpu(idx->device_id), + chunknr, vi->nid); + err = -EFSCORRUPTED; + goto out_unlock; + } + map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); + map->m_flags = EROFS_MAP_MAPPED; + break; + } +out_unlock: + unlock_page(page); + 
put_page(page); +out: + map->m_llen = map->m_plen; + return err; +} - /* pa must be block-aligned for raw reading */ - DBG_BUGON(erofs_blkoff(map.m_pa)); +static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + struct erofs_map_blocks map; + + map.m_la = offset; + map.m_llen = length; + + ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + if (ret < 0) + return ret; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->dax_dev = EROFS_I_SB(inode)->dax_dev; + iomap->offset = map.m_la; + iomap->length = map.m_llen; + iomap->flags = 0; + iomap->private = NULL; + + if (!(map.m_flags & EROFS_MAP_MAPPED)) { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + if (!iomap->length) + iomap->length = length; + return 0; + } - /* max # of continuous pages */ - if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) - nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); + if (map.m_flags & EROFS_MAP_META) { + struct page *ipage; + + iomap->type = IOMAP_INLINE; + ipage = erofs_get_meta_page(inode->i_sb, + erofs_blknr(map.m_pa)); + if (IS_ERR(ipage)) + return PTR_ERR(ipage); + iomap->inline_data = page_address(ipage) + + erofs_blkoff(map.m_pa); + iomap->private = ipage; + } else { + iomap->type = IOMAP_MAPPED; + iomap->addr = map.m_pa; + } + return 0; +} - *eblks = bio_max_segs(nblocks); - bio = bio_alloc(GFP_NOIO, *eblks); +static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + struct page *ipage = iomap->private; - bio->bi_end_io = erofs_readendio; - bio_set_dev(bio, sb->s_bdev); - bio->bi_iter.bi_sector = (sector_t)blknr << - LOG_SECTORS_PER_BLOCK; - bio->bi_opf = REQ_OP_READ | (ra ? 
REQ_RAHEAD : 0); + if (ipage) { + DBG_BUGON(iomap->type != IOMAP_INLINE); + unlock_page(ipage); + put_page(ipage); + } else { + DBG_BUGON(iomap->type == IOMAP_INLINE); } + return written; +} - err = bio_add_page(bio, page, PAGE_SIZE, 0); - /* out of the extent or bio is full */ - if (err < PAGE_SIZE) - goto submit_bio_retry; - --*eblks; - *last_block = current_block; - return bio; +static const struct iomap_ops erofs_iomap_ops = { + .iomap_begin = erofs_iomap_begin, + .iomap_end = erofs_iomap_end, +}; -err_out: - /* for sync reading, set page error immediately */ - if (!ra) { - SetPageError(page); - ClearPageUptodate(page); +int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { +#ifdef CONFIG_EROFS_FS_ZIP + return iomap_fiemap(inode, fieinfo, start, len, + &z_erofs_iomap_report_ops); +#else + return -EOPNOTSUPP; +#endif } -has_updated: - unlock_page(page); - - /* if updated manually, continuous pages has a gap */ - if (bio) - submit_bio(bio); - return err ? ERR_PTR(err) : NULL; + return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops); } /* * since we dont have write or truncate flows, so no inode * locking needs to be held at the moment. 
*/ -static int erofs_raw_access_readpage(struct file *file, struct page *page) +static int erofs_readpage(struct file *file, struct page *page) { - erofs_off_t last_block; - unsigned int eblks; - struct bio *bio; - - trace_erofs_readpage(page, true); + return iomap_readpage(page, &erofs_iomap_ops); +} - bio = erofs_read_raw_page(NULL, page->mapping, - page, &last_block, 1, &eblks, false); +static void erofs_readahead(struct readahead_control *rac) +{ + return iomap_readahead(rac, &erofs_iomap_ops); +} - if (IS_ERR(bio)) - return PTR_ERR(bio); +static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +{ + return iomap_bmap(mapping, block, &erofs_iomap_ops); +} - if (bio) - submit_bio(bio); +static int erofs_prepare_dio(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + loff_t align = iocb->ki_pos | iov_iter_count(to) | + iov_iter_alignment(to); + struct block_device *bdev = inode->i_sb->s_bdev; + unsigned int blksize_mask; + + if (bdev) + blksize_mask = (1 << ilog2(bdev_logical_block_size(bdev))) - 1; + else + blksize_mask = (1 << inode->i_blkbits) - 1; + + if (align & blksize_mask) + return -EINVAL; return 0; } -static void erofs_raw_access_readahead(struct readahead_control *rac) +static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - erofs_off_t last_block; - unsigned int eblks; - struct bio *bio = NULL; - struct page *page; - - trace_erofs_readpages(rac->mapping->host, readahead_index(rac), - readahead_count(rac), true); - - while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - - bio = erofs_read_raw_page(bio, rac->mapping, page, &last_block, - readahead_count(rac), &eblks, true); - - /* all the page errors are ignored when readahead */ - if (IS_ERR(bio)) { - pr_err("%s, readahead error at page %lu of nid %llu\n", - __func__, page->index, - EROFS_I(rac->mapping->host)->nid); - - bio = NULL; - } - - put_page(page); + /* no need taking (shared) inode lock since 
it's a ro filesystem */ + if (!iov_iter_count(to)) + return 0; + +#ifdef CONFIG_FS_DAX + if (IS_DAX(iocb->ki_filp->f_mapping->host)) + return dax_iomap_rw(iocb, to, &erofs_iomap_ops); +#endif + if (iocb->ki_flags & IOCB_DIRECT) { + int err = erofs_prepare_dio(iocb, to); + + if (!err) + return iomap_dio_rw(iocb, to, &erofs_iomap_ops, + NULL, 0); + if (err < 0) + return err; } + return filemap_read(iocb, to, 0); +} + +/* for uncompressed (aligned) files and raw access for other files */ +const struct address_space_operations erofs_raw_access_aops = { + .readpage = erofs_readpage, + .readahead = erofs_readahead, + .bmap = erofs_bmap, + .direct_IO = noop_direct_IO, +}; - if (bio) - submit_bio(bio); +#ifdef CONFIG_FS_DAX +static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops); } -static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +static vm_fault_t erofs_dax_fault(struct vm_fault *vmf) { - struct inode *inode = mapping->host; - struct erofs_map_blocks map = { - .m_la = blknr_to_addr(block), - }; + return erofs_dax_huge_fault(vmf, PE_SIZE_PTE); +} - if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) { - erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE; +static const struct vm_operations_struct erofs_dax_vm_ops = { + .fault = erofs_dax_fault, + .huge_fault = erofs_dax_huge_fault, +}; - if (block >> LOG_SECTORS_PER_BLOCK >= blks) - return 0; - } +static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if (!IS_DAX(file_inode(file))) + return generic_file_readonly_mmap(file, vma); - if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW)) - return erofs_blknr(map.m_pa); + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) + return -EINVAL; + vma->vm_ops = &erofs_dax_vm_ops; + vma->vm_flags |= VM_HUGEPAGE; return 0; } - -/* for uncompressed (aligned) files and raw access for other 
files */ -const struct address_space_operations erofs_raw_access_aops = { - .readpage = erofs_raw_access_readpage, - .readahead = erofs_raw_access_readahead, - .bmap = erofs_bmap, +#else +#define erofs_file_mmap generic_file_readonly_mmap +#endif + +const struct file_operations erofs_file_fops = { + .llseek = generic_file_llseek, + .read_iter = erofs_file_read_iter, + .mmap = erofs_file_mmap, + .splice_read = generic_file_splice_read, }; diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 0f8da74570b4..b0b23f41abc3 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -4,6 +4,7 @@ * * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #ifndef __EROFS_FS_H #define __EROFS_FS_H @@ -19,10 +20,12 @@ #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 +#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ - EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER) + EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE) #define EROFS_SB_EXTSLOT_SIZE 16 @@ -64,13 +67,16 @@ struct erofs_super_block { * inode, [xattrs], last_inline_data, ... | ... | no-holed data * 3 - inode compression D: * inode, [xattrs], map_header, extents ... | ... - * 4~7 - reserved + * 4 - inode chunk-based E: + * inode, [xattrs], chunk indexes ... | ... 
+ * 5~7 - reserved */ enum { EROFS_INODE_FLAT_PLAIN = 0, EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, EROFS_INODE_FLAT_INLINE = 2, EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_CHUNK_BASED = 4, EROFS_INODE_DATALAYOUT_MAX }; @@ -90,6 +96,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_I_ALL \ ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) +/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ +#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F +/* with chunk indexes or just a 4-byte blkaddr array */ +#define EROFS_CHUNK_FORMAT_INDEXES 0x0020 + +#define EROFS_CHUNK_FORMAT_ALL \ + (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) + +struct erofs_inode_chunk_info { + __le16 format; /* chunk blkbits, etc. */ + __le16 reserved; +}; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ @@ -107,6 +126,9 @@ struct erofs_inode_compact { /* for device files, used to indicate old/new device # */ __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; } i_u; __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; @@ -135,6 +157,9 @@ struct erofs_inode_extended { /* for device files, used to indicate old/new device # */ __le32 rdev; + + /* for chunk-based files, it contains the summary info */ + struct erofs_inode_chunk_info c; } i_u; /* only used for 32-bit stat compatibility */ @@ -204,6 +229,19 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) e->e_name_len + le16_to_cpu(e->e_value_size)); } +/* represent a zeroed chunk (hole) */ +#define EROFS_NULL_ADDR -1 + +/* 4-byte block address array */ +#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) + +/* 8-byte inode chunk indexes */ +struct erofs_inode_chunk_index { + __le16 advise; /* always 0, don't care for now */ + __le16 device_id; /* back-end storage id, always 0 for 
now */ + __le32 blkaddr; /* start block address of this inode chunk */ +}; + /* maximum supported size of a physical compression cluster */ #define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) @@ -338,9 +376,14 @@ static inline void erofs_check_ondisk_layout_definitions(void) BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4); + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + /* keep in sync between 2 index structures for better extendibility */ + BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != + sizeof(struct z_erofs_vle_decompressed_index)); BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index aa8a0d770ba3..31ac3a73b390 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. 
* https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #include "xattr.h" @@ -122,8 +123,11 @@ static struct page *erofs_read_inode(struct inode *inode, /* total blocks for compressed files */ if (erofs_inode_is_data_compressed(vi->datalayout)) nblks = le32_to_cpu(die->i_u.compressed_blocks); - + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + /* fill chunked inode summary info */ + vi->chunkformat = le16_to_cpu(die->i_u.c.format); kfree(copied); + copied = NULL; break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); @@ -160,6 +164,8 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_size = le32_to_cpu(dic->i_size); if (erofs_inode_is_data_compressed(vi->datalayout)) nblks = le32_to_cpu(dic->i_u.compressed_blocks); + else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) + vi->chunkformat = le16_to_cpu(dic->i_u.c.format); break; default: erofs_err(inode->i_sb, @@ -169,11 +175,26 @@ static struct page *erofs_read_inode(struct inode *inode, goto err_out; } + if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_ALL)) { + erofs_err(inode->i_sb, + "unsupported chunk format %x of nid %llu", + vi->chunkformat, vi->nid); + err = -EOPNOTSUPP; + goto err_out; + } + vi->chunkbits = LOG_BLOCK_SIZE + + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); + } inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; inode->i_atime.tv_sec = inode->i_ctime.tv_sec; inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + inode->i_flags &= ~S_DAX; + if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) && + vi->datalayout == EROFS_INODE_FLAT_PLAIN) + inode->i_flags |= S_DAX; if (!nblks) /* measure inode.i_blocks as generic filesystems */ inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; @@ -247,7 +268,10 @@ static int erofs_fill_inode(struct inode *inode, int isdir) switch (inode->i_mode & S_IFMT) { case S_IFREG: 
inode->i_op = &erofs_generic_iops; - inode->i_fop = &generic_ro_fops; + if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_fop = &generic_ro_fops; + else + inode->i_fop = &erofs_file_fops; break; case S_IFDIR: inode->i_op = &erofs_dir_iops; @@ -358,6 +382,7 @@ const struct inode_operations erofs_generic_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, + .fiemap = erofs_fiemap, }; const struct inode_operations erofs_symlink_iops = { diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 543c2ff97d30..9524e155b38f 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -2,6 +2,7 @@ /* * Copyright (C) 2017-2018 HUAWEI, Inc. * https://www.huawei.com/ + * Copyright (C) 2021, Alibaba Cloud */ #ifndef __EROFS_INTERNAL_H #define __EROFS_INTERNAL_H @@ -15,6 +16,7 @@ #include <linux/magic.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <linux/iomap.h> #include "erofs_fs.h" /* redefine pr_fmt "erofs: " */ @@ -83,6 +85,7 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct dax_device *dax_dev; u32 blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -115,6 +118,8 @@ struct erofs_sb_info { /* Mount flags set via mount options or defaults */ #define EROFS_MOUNT_XATTR_USER 0x00000010 #define EROFS_MOUNT_POSIX_ACL 0x00000020 +#define EROFS_MOUNT_DAX_ALWAYS 0x00000040 +#define EROFS_MOUNT_DAX_NEVER 0x00000080 #define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option) @@ -257,6 +262,10 @@ struct erofs_inode { union { erofs_blk_t raw_blkaddr; + struct { + unsigned short chunkformat; + unsigned char chunkbits; + }; #ifdef CONFIG_EROFS_FS_ZIP struct { unsigned short z_advise; @@ -353,8 +362,15 @@ struct erofs_map_blocks { /* Flags used by erofs_map_blocks_flatmode() */ #define EROFS_GET_BLOCKS_RAW 0x0001 +/* + * Used to get the exact decompressed length, e.g. 
fiemap (consider lookback + * approach instead if possible since it's more metadata lightweight.) + */ +#define EROFS_GET_BLOCKS_FIEMAP 0x0002 /* zmap.c */ +extern const struct iomap_ops z_erofs_iomap_report_ops; + #ifdef CONFIG_EROFS_FS_ZIP int z_erofs_fill_inode(struct inode *inode); int z_erofs_map_blocks_iter(struct inode *inode, @@ -371,7 +387,10 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, #endif /* !CONFIG_EROFS_FS_ZIP */ /* data.c */ +extern const struct file_operations erofs_file_fops; struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); /* inode.c */ static inline unsigned long erofs_inode_hash(erofs_nid_t nid) @@ -441,8 +460,7 @@ int __init z_erofs_init_zip_subsystem(void); void z_erofs_exit_zip_subsystem(void); int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); -int erofs_try_to_free_cached_page(struct address_space *mapping, - struct page *page); +int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index a8271ce5e13f..8629e616028c 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -245,4 +245,5 @@ const struct inode_operations erofs_dir_iops = { .getattr = erofs_getattr, .listxattr = erofs_listxattr, .get_acl = erofs_get_acl, + .fiemap = erofs_fiemap, }; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 8fc6c04b54f4..11b88559f8bf 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -11,6 +11,7 @@ #include <linux/crc32c.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> +#include <linux/dax.h> #include "xattr.h" #define CREATE_TRACE_POINTS @@ -355,6 +356,8 @@ enum { Opt_user_xattr, Opt_acl, Opt_cache_strategy, + Opt_dax, + Opt_dax_enum, Opt_err }; @@ -365,14 +368,47 @@ 
static const struct constant_table erofs_param_cache_strategy[] = { {} }; +static const struct constant_table erofs_dax_param_enums[] = { + {"always", EROFS_MOUNT_DAX_ALWAYS}, + {"never", EROFS_MOUNT_DAX_NEVER}, + {} +}; + static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_flag_no("user_xattr", Opt_user_xattr), fsparam_flag_no("acl", Opt_acl), fsparam_enum("cache_strategy", Opt_cache_strategy, erofs_param_cache_strategy), + fsparam_flag("dax", Opt_dax), + fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), {} }; +static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) +{ +#ifdef CONFIG_FS_DAX + struct erofs_fs_context *ctx = fc->fs_private; + + switch (mode) { + case EROFS_MOUNT_DAX_ALWAYS: + warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + set_opt(ctx, DAX_ALWAYS); + clear_opt(ctx, DAX_NEVER); + return true; + case EROFS_MOUNT_DAX_NEVER: + set_opt(ctx, DAX_NEVER); + clear_opt(ctx, DAX_ALWAYS); + return true; + default: + DBG_BUGON(1); + return false; + } +#else + errorfc(fc, "dax options not supported"); + return false; +#endif +} + static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { @@ -412,6 +448,14 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "compression not supported, cache_strategy ignored"); #endif break; + case Opt_dax: + if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS)) + return -EINVAL; + break; + case Opt_dax_enum: + if (!erofs_fc_set_dax_mode(fc, result.uint_32)) + return -EINVAL; + break; default: return -ENOPARAM; } @@ -430,7 +474,7 @@ static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask) DBG_BUGON(mapping->a_ops != &managed_cache_aops); if (PagePrivate(page)) - ret = erofs_try_to_free_cached_page(mapping, page); + ret = erofs_try_to_free_cached_page(page); return ret; } @@ -496,10 +540,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; 
sb->s_fs_info = sbi; + sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); err = erofs_read_superblock(sb); if (err) return err; + if (test_opt(ctx, DAX_ALWAYS) && + !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { + errorfc(fc, "DAX unsupported by block device. Turning off DAX."); + clear_opt(ctx, DAX_ALWAYS); + } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; @@ -609,6 +659,7 @@ static void erofs_kill_sb(struct super_block *sb) sbi = EROFS_SB(sb); if (!sbi) return; + fs_put_dax(sbi->dax_dev); kfree(sbi); sb->s_fs_info = NULL; } @@ -711,8 +762,8 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) static int erofs_show_options(struct seq_file *seq, struct dentry *root) { - struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); - struct erofs_fs_context *ctx __maybe_unused = &sbi->ctx; + struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); + struct erofs_fs_context *ctx = &sbi->ctx; #ifdef CONFIG_EROFS_FS_XATTR if (test_opt(ctx, XATTR_USER)) @@ -734,6 +785,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND) seq_puts(seq, ",cache_strategy=readaround"); #endif + if (test_opt(ctx, DAX_ALWAYS)) + seq_puts(seq, ",dax=always"); + if (test_opt(ctx, DAX_NEVER)) + seq_puts(seq, ",dax=never"); return 0; } diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 8dd54b420a1d..778f2c52295d 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -673,12 +673,15 @@ ssize_t erofs_listxattr(struct dentry *dentry, } #ifdef CONFIG_EROFS_FS_POSIX_ACL -struct posix_acl *erofs_get_acl(struct inode *inode, int type) +struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu) { struct posix_acl *acl; int prefix, rc; char *value = NULL; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git 
a/fs/erofs/xattr.h b/fs/erofs/xattr.h index 366dcb400525..94090c74b3f7 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -80,7 +80,7 @@ static inline int erofs_getxattr(struct inode *inode, int index, #endif /* !CONFIG_EROFS_FS_XATTR */ #ifdef CONFIG_EROFS_FS_POSIX_ACL -struct posix_acl *erofs_get_acl(struct inode *inode, int type); +struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu); #else #define erofs_get_acl (NULL) #endif diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index cb4d0889eca9..11c7a1aaebad 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -309,7 +309,6 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); - struct address_space *const mapping = MNGD_MAPPING(sbi); int i; /* @@ -326,7 +325,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, if (!trylock_page(page)) return -EBUSY; - if (page->mapping != mapping) + if (!erofs_page_is_managed(sbi, page)) continue; /* barrier is implied in the following 'unlock_page' */ @@ -337,8 +336,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, return 0; } -int erofs_try_to_free_cached_page(struct address_space *mapping, - struct page *page) +int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); int ret = 0; /* 0 - busy */ diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index f68aea4baed7..9fb98d85a3ce 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -212,9 +212,34 @@ static unsigned int decode_compactedbits(unsigned int lobits, return lo; } +static int get_compacted_la_distance(unsigned int lclusterbits, + unsigned int encodebits, + unsigned int vcnt, u8 *in, int i) +{ + const unsigned int lomask = (1 << lclusterbits) - 1; + unsigned int lo, d1 = 0; + u8 type; + + DBG_BUGON(i >= vcnt); + + do { + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, 
&type); + + if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + return d1; + ++d1; + } while (++i < vcnt); + + /* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */ + if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT)) + d1 += lo - 1; + return d1; +} + static int unpack_compacted_index(struct z_erofs_maprecorder *m, unsigned int amortizedshift, - unsigned int eofs) + unsigned int eofs, bool lookahead) { struct erofs_inode *const vi = EROFS_I(m->inode); const unsigned int lclusterbits = vi->z_logical_clusterbits; @@ -243,6 +268,11 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; + + /* figure out lookahead_distance: delta[1] if needed */ + if (lookahead) + m->delta[1] = get_compacted_la_distance(lclusterbits, + encodebits, vcnt, in, i); if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { if (!big_pcluster) { DBG_BUGON(1); @@ -313,7 +343,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, } static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned long lcn) + unsigned long lcn, bool lookahead) { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); @@ -364,11 +394,12 @@ out: err = z_erofs_reload_indexes(m, erofs_blknr(pos)); if (err) return err; - return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); + return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos), + lookahead); } static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, - unsigned int lcn) + unsigned int lcn, bool lookahead) { const unsigned int datamode = EROFS_I(m->inode)->datalayout; @@ -376,7 +407,7 @@ static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, return legacy_load_cluster_from_disk(m, lcn); if (datamode == EROFS_INODE_FLAT_COMPRESSION) - return compacted_load_cluster_from_disk(m, lcn); + return compacted_load_cluster_from_disk(m, lcn, lookahead); return -EINVAL; } @@ 
-399,7 +430,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, /* load extent head logical cluster if needed */ lcn -= lookback_distance; - err = z_erofs_load_cluster_from_disk(m, lcn); + err = z_erofs_load_cluster_from_disk(m, lcn, false); if (err) return err; @@ -450,7 +481,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, if (m->compressedlcs) goto out; - err = z_erofs_load_cluster_from_disk(m, lcn); + err = z_erofs_load_cluster_from_disk(m, lcn, false); if (err) return err; @@ -498,6 +529,48 @@ err_bonus_cblkcnt: return -EFSCORRUPTED; } +static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) +{ + struct inode *inode = m->inode; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_map_blocks *map = m->map; + unsigned int lclusterbits = vi->z_logical_clusterbits; + u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; + int err; + + do { + /* handle the last EOF pcluster (no next HEAD lcluster) */ + if ((lcn << lclusterbits) >= inode->i_size) { + map->m_llen = inode->i_size - map->m_la; + return 0; + } + + err = z_erofs_load_cluster_from_disk(m, lcn, true); + if (err) + return err; + + if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + DBG_BUGON(!m->delta[1] && + m->clusterofs != 1 << lclusterbits); + } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) { + /* go on until the next HEAD lcluster */ + if (lcn != headlcn) + break; + m->delta[1] = 1; + } else { + erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + lcn += m->delta[1]; + } while (m->delta[1]); + + map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la; + return 0; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) @@ -531,7 +604,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << 
lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, initial_lcn); + err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false); if (err) goto unmap_out; @@ -581,6 +654,12 @@ int z_erofs_map_blocks_iter(struct inode *inode, err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto out; + + if (flags & EROFS_GET_BLOCKS_FIEMAP) { + err = z_erofs_get_extent_decompressedlen(&m); + if (!err) + map->m_flags |= EROFS_MAP_FULL_MAPPED; + } unmap_out: if (m.kaddr) kunmap_atomic(m.kaddr); @@ -596,3 +675,41 @@ out: DBG_BUGON(err < 0 && err != -ENOMEM); return err; } + +static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset, + loff_t length, unsigned int flags, + struct iomap *iomap, struct iomap *srcmap) +{ + int ret; + struct erofs_map_blocks map = { .m_la = offset }; + + ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP); + if (map.mpage) + put_page(map.mpage); + if (ret < 0) + return ret; + + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = map.m_la; + iomap->length = map.m_llen; + if (map.m_flags & EROFS_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + iomap->addr = map.m_pa; + } else { + iomap->type = IOMAP_HOLE; + iomap->addr = IOMAP_NULL_ADDR; + /* + * No strict rule how to describe extents for post EOF, yet + * we need do like below. Otherwise, iomap itself will get + * into an endless loop on post EOF. 
+ */ + if (iomap->offset >= inode->i_size) + iomap->length = length + map.m_la - offset; + } + iomap->flags = 0; + return 0; +} + +const struct iomap_ops z_erofs_iomap_report_ops = { + .iomap_begin = z_erofs_iomap_begin_report, +}; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e596e1d0bba..06f4c5ae1451 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -723,7 +723,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) */ call_rcu(&epi->rcu, epi_rcu_free); - atomic_long_dec(&ep->user->epoll_watches); + percpu_counter_dec(&ep->user->epoll_watches); return 0; } @@ -1439,7 +1439,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, { int error, pwake = 0; __poll_t revents; - long user_watches; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; @@ -1449,11 +1448,15 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, lockdep_assert_irqs_enabled(); - user_watches = atomic_long_read(&ep->user->epoll_watches); - if (unlikely(user_watches >= max_user_watches)) + if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, + max_user_watches) >= 0)) return -ENOSPC; - if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) + percpu_counter_inc(&ep->user->epoll_watches); + + if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { + percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; + } /* Item initialization follow here ... 
*/ INIT_LIST_HEAD(&epi->rdllink); @@ -1466,17 +1469,16 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { - kmem_cache_free(epi_cache, epi); if (tep) mutex_unlock(&tep->mtx); + kmem_cache_free(epi_cache, epi); + percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); - atomic_long_inc(&ep->user->epoll_watches); - /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. @@ -1684,8 +1686,8 @@ static int ep_send_events(struct eventpoll *ep, if (!revents) continue; - if (__put_user(revents, &events->events) || - __put_user(epi->event.data, &events->data)) { + events = epoll_put_uevent(revents, epi->event.data, events); + if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) @@ -1693,7 +1695,6 @@ static int ep_send_events(struct eventpoll *ep, break; } res++; - events++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { diff --git a/fs/exec.c b/fs/exec.c index 3b78b22addfb..a098c133d8d7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -217,8 +217,10 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * We are doing an exec(). 'current' is the process * doing the exec and bprm->mm is the new process's mm. 
*/ + mmap_read_lock(bprm->mm); ret = get_user_pages_remote(bprm->mm, pos, 1, gup_flags, &page, NULL, NULL); + mmap_read_unlock(bprm->mm); if (ret <= 0) return NULL; @@ -574,7 +576,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, } if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); + flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } @@ -592,7 +594,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, ret = 0; out: if (kmapped_page) { - flush_kernel_dcache_page(kmapped_page); + flush_dcache_page(kmapped_page); kunmap(kmapped_page); put_arg_page(kmapped_page); } @@ -634,7 +636,7 @@ int copy_string_kernel(const char *arg, struct linux_binprm *bprm) kaddr = kmap_atomic(page); flush_arg_page(bprm, pos & PAGE_MASK, page); memcpy(kaddr + offset_in_page(pos), arg, bytes_to_copy); - flush_kernel_dcache_page(page); + flush_dcache_page(page); kunmap_atomic(kaddr); put_arg_page(page); } @@ -1270,7 +1272,9 @@ int begin_new_exec(struct linux_binprm * bprm) * not visibile until then. This also enables the update * to be lockless. 
*/ - set_mm_exe_file(bprm->mm, bprm->file); + retval = set_mm_exe_file(bprm->mm, bprm->file); + if (retval) + goto out; /* If the binary is not readable then enforce mm->dumpable=0 */ would_dump(bprm, bprm->file); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index b9a9db98e94b..bf298967c5b8 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -141,13 +141,16 @@ fail: * inode->i_mutex: don't care */ struct posix_acl * -ext2_get_acl(struct inode *inode, int type) +ext2_get_acl(struct inode *inode, int type, bool rcu) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 917db5f6630a..925ab6287d35 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -55,7 +55,7 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); +extern struct posix_acl *ext2_get_acl(struct inode *inode, int type, bool rcu); extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 987bcf32ed46..d8d580b609ba 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -946,7 +946,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (test_opt(sb, DAX)) { - if (!bdev_dax_supported(sb->s_bdev, blocksize)) { + if (!dax_supported(dax_dev, sb->s_bdev, blocksize, 0, + bdev_nr_sectors(sb->s_bdev))) { ext2_msg(sb, KERN_ERR, "DAX unsupported by block device. 
Turning off DAX."); clear_opt(sbi->s_mount_opt, DAX); diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 49e7af6cc93f..7d89142e1421 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -10,7 +10,7 @@ ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ super.o symlink.o sysfs.o xattr.o xattr_hurd.o xattr_trusted.o \ - xattr_user.o fast_commit.o + xattr_user.o fast_commit.o orphan.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index c5eaffccecc3..0613dfcbfd4a 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -142,13 +142,16 @@ fail: * inode->i_mutex: don't care */ struct posix_acl * -ext4_get_acl(struct inode *inode, int type) +ext4_get_acl(struct inode *inode, int type, bool rcu) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 84b8942a57f2..3219669732bf 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -55,7 +55,7 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -struct posix_acl *ext4_get_acl(struct inode *inode, int type); +struct posix_acl *ext4_get_acl(struct inode *inode, int type, bool rcu); int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9dc6e74b265c..a0fb0c4bdc7c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -652,8 +652,14 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) * possible we just missed a transaction commit that did so */ smp_mb(); - if (sbi->s_mb_free_pending == 
0) + if (sbi->s_mb_free_pending == 0) { + if (test_opt(sb, DISCARD)) { + atomic_inc(&sbi->s_retry_alloc_pending); + flush_work(&sbi->s_discard_work); + atomic_dec(&sbi->s_retry_alloc_pending); + } return ext4_has_free_clusters(sbi, 1, 0); + } /* * it's possible we've just missed a transaction commit here, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 7ebaf66b6e31..90ff5acaf11f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1034,7 +1034,14 @@ struct ext4_inode_info { */ struct rw_semaphore xattr_sem; - struct list_head i_orphan; /* unlinked but open inodes */ + /* + * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise + * i_orphan is used. + */ + union { + struct list_head i_orphan; /* unlinked but open inodes */ + unsigned int i_orphan_idx; /* Index in orphan file */ + }; /* Fast commit related info */ @@ -1419,7 +1426,8 @@ struct ext4_super_block { __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ - __le32 s_reserved[95]; /* Padding to the end of the block */ + __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ + __le32 s_reserved[94]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1438,6 +1446,54 @@ struct ext4_super_block { #define EXT4_ENC_UTF8_12_1 1 +/* Types of ext4 journal triggers */ +enum ext4_journal_trigger_type { + EXT4_JTR_ORPHAN_FILE, + EXT4_JTR_NONE /* This must be the last entry for indexing to work! 
*/ +}; + +#define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE + +struct ext4_journal_trigger { + struct jbd2_buffer_trigger_type tr_triggers; + struct super_block *sb; +}; + +static inline struct ext4_journal_trigger *EXT4_TRIGGER( + struct jbd2_buffer_trigger_type *trigger) +{ + return container_of(trigger, struct ext4_journal_trigger, tr_triggers); +} + +#define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 + +/* Structure at the tail of orphan block */ +struct ext4_orphan_block_tail { + __le32 ob_magic; + __le32 ob_checksum; +}; + +static inline int ext4_inodes_per_orphan_block(struct super_block *sb) +{ + return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / + sizeof(u32); +} + +struct ext4_orphan_block { + atomic_t ob_free_entries; /* Number of free orphan entries in block */ + struct buffer_head *ob_bh; /* Buffer for orphan block */ +}; + +/* + * Info about orphan file. + */ +struct ext4_orphan_info { + int of_blocks; /* Number of orphan blocks in a file */ + __u32 of_csum_seed; /* Checksum seed for orphan file */ + struct ext4_orphan_block *of_binfo; /* Array with info about orphan + * file blocks */ +}; + /* * fourth extended-fs super-block data in memory */ @@ -1492,9 +1548,11 @@ struct ext4_sb_info { /* Journaling */ struct journal_s *s_journal; - struct list_head s_orphan; - struct mutex s_orphan_lock; unsigned long s_ext4_flags; /* Ext4 superblock flags */ + struct mutex s_orphan_lock; /* Protects on disk list changes */ + struct list_head s_orphan; /* List of orphaned inodes in on disk + list */ + struct ext4_orphan_info s_orphan_info; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; @@ -1527,6 +1585,9 @@ struct ext4_sb_info { unsigned int s_mb_free_pending; struct list_head s_freed_data_list; /* List of blocks to be freed after commit completed */ + struct list_head s_discard_list; + struct work_struct s_discard_work; + atomic_t s_retry_alloc_pending; struct rb_root s_mb_avg_fragment_size_root; rwlock_t s_mb_rb_lock; struct 
list_head *s_mb_largest_free_orders; @@ -1616,6 +1677,9 @@ struct ext4_sb_info { struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; + /* Journal triggers for checksum computation */ + struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; + /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; @@ -1826,6 +1890,7 @@ enum { EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ @@ -1927,6 +1992,7 @@ static inline bool ext4_verity_in_progress(struct inode *inode) */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 +#define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 @@ -1947,6 +2013,8 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 +#define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be + non-empty */ #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -2030,6 +2098,7 @@ EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) +EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) @@ -2044,6 +2113,7 @@ EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) 
EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) +EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) @@ -2077,7 +2147,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) -#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ + EXT4_FEATURE_COMPAT_ORPHAN_FILE) #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ @@ -2102,7 +2173,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ EXT4_FEATURE_RO_COMPAT_PROJECT |\ - EXT4_FEATURE_RO_COMPAT_VERITY) + EXT4_FEATURE_RO_COMPAT_VERITY |\ + EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -2138,6 +2210,8 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); } +extern int ext4_feature_set_ok(struct super_block *sb, int readonly); + /* * Superblock flags */ @@ -2150,7 +2224,6 @@ static inline int ext4_forced_shutdown(struct ext4_sb_info *sbi) return test_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags); } - /* * Default values for user and/or group using reserved blocks */ @@ -2911,13 +2984,14 @@ int ext4_get_block(struct inode *inode, sector_t iblock, int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, + struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, - int (*fn)(handle_t *handle, + int (*fn)(handle_t *handle, 
struct inode *inode, struct buffer_head *bh)); -int do_journal_get_write_access(handle_t *handle, +int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -2996,8 +3070,6 @@ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode); extern int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh); -extern int ext4_orphan_add(handle_t *, struct inode *); -extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); extern int ext4_search_dir(struct buffer_head *bh, @@ -3466,6 +3538,7 @@ static inline bool ext4_is_quota_journalled(struct super_block *sb) return (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); } +int ext4_enable_quotas(struct super_block *sb); #endif /* @@ -3727,6 +3800,19 @@ extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations ext4_verityops; +/* orphan.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern void ext4_orphan_cleanup(struct super_block *sb, + struct ext4_super_block *es); +extern void ext4_release_orphan_info(struct super_block *sb); +extern int ext4_init_orphan_info(struct super_block *sb); +extern int ext4_orphan_file_empty(struct super_block *sb); +extern void ext4_orphan_file_block_trigger( + struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size); + /* * Add new method to test whether block and inode bitmaps are properly * initialized. 
With uninit_bg reading the block from disk is not enough diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 44e59881a1f0..26435f3a3094 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -173,10 +173,11 @@ struct partial_cluster { #define EXT_MAX_EXTENT(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ - : 0) + : NULL) #define EXT_MAX_INDEX(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ - ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0) + ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ + : NULL) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b60f0152ea57..6def7339056d 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -218,9 +218,11 @@ static void ext4_check_bdev_write_error(struct super_block *sb) } int __ext4_journal_get_write_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh) + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type) { - int err = 0; + int err; might_sleep(); @@ -229,11 +231,18 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); - if (err) + if (err) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); + return err; + } } - return err; + if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + return 0; + BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); + jbd2_journal_set_triggers(bh, + &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); + return 0; } /* @@ -301,17 +310,27 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, } int __ext4_journal_get_create_access(const char *where, unsigned int line, - handle_t *handle, struct 
buffer_head *bh) + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type) { - int err = 0; + int err; - if (ext4_handle_valid(handle)) { - err = jbd2_journal_get_create_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); + if (!ext4_handle_valid(handle)) + return 0; + + err = jbd2_journal_get_create_access(handle, bh); + if (err) { + ext4_journal_abort_handle(where, line, __func__, bh, handle, + err); + return err; } - return err; + if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + return 0; + BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); + jbd2_journal_set_triggers(bh, + &EXT4_SB(sb)->s_journal_triggers[trigger_type].tr_triggers); + return 0; } int __ext4_handle_dirty_metadata(const char *where, unsigned int line, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0d2fa423b7ad..0e4fa644df01 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -231,26 +231,32 @@ int ext4_expand_extra_isize(struct inode *inode, * Wrapper functions with which ext4 calls into JBD. 
*/ int __ext4_journal_get_write_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type); int __ext4_forget(const char *where, unsigned int line, handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); int __ext4_journal_get_create_access(const char *where, unsigned int line, - handle_t *handle, struct buffer_head *bh); + handle_t *handle, struct super_block *sb, + struct buffer_head *bh, + enum ext4_journal_trigger_type trigger_type); int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh); -#define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) +#define ext4_journal_get_write_access(handle, sb, bh, trigger_type) \ + __ext4_journal_get_write_access(__func__, __LINE__, (handle), (sb), \ + (bh), (trigger_type)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ (bh), (block_nr)) -#define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) +#define ext4_journal_get_create_access(handle, sb, bh, trigger_type) \ + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (sb), \ + (bh), (trigger_type)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c33e0a2cb6c3..c0de30f25185 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -139,7 +139,8 @@ static int ext4_ext_get_access(handle_t *handle, struct inode *inode, if (path->p_bh) { /* path points to block */ BUFFER_TRACE(path->p_bh, "get_write_access"); - return 
ext4_journal_get_write_access(handle, path->p_bh); + return ext4_journal_get_write_access(handle, inode->i_sb, + path->p_bh, EXT4_JTR_NONE); } /* path points to leaf/index in inode body */ /* we use in-core data, no need to protect them */ @@ -1082,7 +1083,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } lock_buffer(bh); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) goto cleanup; @@ -1160,7 +1162,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } lock_buffer(bh); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) goto cleanup; @@ -1286,7 +1289,8 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, return -ENOMEM; lock_buffer(bh); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto out; @@ -3569,7 +3573,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, split_map.m_len - ee_block); err = ext4_ext_zeroout(inode, &zero_ex1); if (err) - goto out; + goto fallback; split_map.m_len = allocated; } if (split_map.m_lblk - ee_block + split_map.m_len < @@ -3583,7 +3587,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_ext_pblock(ex)); err = ext4_ext_zeroout(inode, &zero_ex2); if (err) - goto out; + goto fallback; } split_map.m_len += split_map.m_lblk - ee_block; @@ -3592,6 +3596,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } } +fallback: err = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (err > 0) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index e8195229c252..8e610a381862 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -775,28 +775,27 @@ static bool ext4_fc_add_tlv(struct super_block *sb, u16 
tag, u16 len, u8 *val, } /* Same as above, but adds dentry tlv. */ -static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u16 tag, - int parent_ino, int ino, int dlen, - const unsigned char *dname, - u32 *crc) +static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, + struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; + int dlen = fc_dentry->fcd_name.len; u8 *dst = ext4_fc_reserve_space(sb, sizeof(tl) + sizeof(fcd) + dlen, crc); if (!dst) return false; - fcd.fc_parent_ino = cpu_to_le32(parent_ino); - fcd.fc_ino = cpu_to_le32(ino); - tl.fc_tag = cpu_to_le16(tag); + fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); + fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); + tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); ext4_fc_memcpy(sb, dst, &tl, sizeof(tl), crc); dst += sizeof(tl); ext4_fc_memcpy(sb, dst, &fcd, sizeof(fcd), crc); dst += sizeof(fcd); - ext4_fc_memcpy(sb, dst, dname, dlen, crc); + ext4_fc_memcpy(sb, dst, fc_dentry->fcd_name.name, dlen, crc); dst += dlen; return true; @@ -992,11 +991,7 @@ __releases(&sbi->s_fc_lock) &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv( - sb, fc_dentry->fcd_op, - fc_dentry->fcd_parent, fc_dentry->fcd_ino, - fc_dentry->fcd_name.len, - fc_dentry->fcd_name.name, crc)) { + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } @@ -1035,11 +1030,7 @@ __releases(&sbi->s_fc_lock) if (ret) goto lock_and_exit; - if (!ext4_fc_add_dentry_tlv( - sb, fc_dentry->fcd_op, - fc_dentry->fcd_parent, fc_dentry->fcd_ino, - fc_dentry->fcd_name.len, - fc_dentry->fcd_name.name, crc)) { + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index d3b4ed91aa68..ac0e11bbb445 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -823,7 
+823,8 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (IS_ERR(handle)) goto out; BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto out_journal; lock_buffer(sbi->s_sbh); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e89fc0f770b0..f73e5eb43eae 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -300,7 +300,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } BUFFER_TRACE(bitmap_bh, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, bitmap_bh); + fatal = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (fatal) goto error_return; @@ -308,7 +309,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) gdp = ext4_get_group_desc(sb, block_group, &bh2); if (gdp) { BUFFER_TRACE(bh2, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, bh2); + fatal = ext4_journal_get_write_access(handle, sb, bh2, + EXT4_JTR_NONE); } ext4_lock_group(sb, block_group); cleared = ext4_test_and_clear_bit(bit, bitmap_bh->b_data); @@ -1085,7 +1087,8 @@ repeat_in_this_group: } } BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, inode_bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh, + EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; @@ -1127,7 +1130,8 @@ got: } BUFFER_TRACE(group_desc_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, group_desc_bh); + err = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); if (err) { ext4_std_error(sb, err); goto out; @@ -1144,7 +1148,8 @@ got: goto out; } BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); - err = ext4_journal_get_write_access(handle, block_bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh, + EXT4_JTR_NONE); if (err) { 
brelse(block_bitmap_bh); ext4_std_error(sb, err); @@ -1583,8 +1588,8 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, num = sbi->s_itb_per_group - used_blks; BUFFER_TRACE(group_desc_bh, "get_write_access"); - ret = ext4_journal_get_write_access(handle, - group_desc_bh); + ret = ext4_journal_get_write_access(handle, sb, group_desc_bh, + EXT4_JTR_NONE); if (ret) goto err_out; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index a7bc6ad656a9..89efa78ed4b2 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -354,7 +354,8 @@ static int ext4_alloc_branch(handle_t *handle, } lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, ar->inode->i_sb, + bh, EXT4_JTR_NONE); if (err) { unlock_buffer(bh); goto failed; @@ -429,7 +430,8 @@ static int ext4_splice_branch(handle_t *handle, */ if (where->bh) { BUFFER_TRACE(where->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, where->bh); + err = ext4_journal_get_write_access(handle, ar->inode->i_sb, + where->bh, EXT4_JTR_NONE); if (err) goto err_out; } @@ -728,7 +730,8 @@ static int ext4_ind_truncate_ensure_credits(handle_t *handle, return ret; if (bh) { BUFFER_TRACE(bh, "retaking write access"); - ret = ext4_journal_get_write_access(handle, bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (unlikely(ret)) return ret; } @@ -916,7 +919,8 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, if (this_bh) { /* For indirect block */ BUFFER_TRACE(this_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, this_bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, + this_bh, EXT4_JTR_NONE); /* Important: if we can't update the indirect pointers * to the blocks, we can't free them. 
*/ if (err) @@ -1079,7 +1083,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, */ BUFFER_TRACE(parent_bh, "get_write_access"); if (!ext4_journal_get_write_access(handle, - parent_bh)){ + inode->i_sb, parent_bh, + EXT4_JTR_NONE)) { *p = 0; BUFFER_TRACE(parent_bh, "call ext4_handle_dirty_metadata"); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 70cb64db33f7..82bf4ff6be28 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -264,7 +264,8 @@ static int ext4_create_inline_data(handle_t *handle, return error; BUFFER_TRACE(is.iloc.bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); if (error) goto out; @@ -350,7 +351,8 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); if (error) goto out; @@ -427,7 +429,8 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, goto out; BUFFER_TRACE(is.iloc.bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, is.iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, is.iloc.bh, + EXT4_JTR_NONE); if (error) goto out; @@ -593,7 +596,7 @@ retry: ret = __block_write_begin(page, from, to, ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), + ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), from, to, NULL, do_journal_get_write_access); } @@ -682,7 +685,8 @@ int ext4_try_to_write_inline_data(struct address_space *mapping, goto convert; } - ret = ext4_journal_get_write_access(handle, iloc.bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, + EXT4_JTR_NONE); if (ret) goto out; @@ -750,6 +754,12 @@ int 
ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, ext4_write_lock_xattr(inode, &no_expand); BUG_ON(!ext4_has_inline_data(inode)); + /* + * ei->i_inline_off may have changed since ext4_write_begin() + * called ext4_try_to_write_inline_data() + */ + (void) ext4_find_inline_data_nolock(inode); + kaddr = kmap_atomic(page); ext4_write_inline_data(inode, &iloc, kaddr, pos, len); kunmap_atomic(kaddr); @@ -923,7 +933,8 @@ retry_journal: if (ret < 0) goto out_release_page; } - ret = ext4_journal_get_write_access(handle, iloc.bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, + EXT4_JTR_NONE); if (ret) goto out_release_page; @@ -1028,7 +1039,8 @@ static int ext4_add_dirent_to_inline(handle_t *handle, return err; BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, iloc->bh, + EXT4_JTR_NONE); if (err) return err; ext4_insert_dentry(dir, inode, de, inline_size, fname); @@ -1223,7 +1235,8 @@ static int ext4_convert_inline_data_nolock(handle_t *handle, } lock_buffer(data_bh); - error = ext4_journal_get_create_access(handle, data_bh); + error = ext4_journal_get_create_access(handle, inode->i_sb, data_bh, + EXT4_JTR_NONE); if (error) { unlock_buffer(data_bh); error = -EIO; @@ -1707,7 +1720,8 @@ int ext4_delete_inline_entry(handle_t *handle, } BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (err) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 325c038e7b23..d18852d6029c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -139,7 +139,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, static void ext4_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int __ext4_journalled_writepage(struct page *page, unsigned int len); -static int 
ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); @@ -869,7 +868,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); - err = ext4_journal_get_create_access(handle, bh); + err = ext4_journal_get_create_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; @@ -954,12 +954,12 @@ out_brelse: return err; } -int ext4_walk_page_buffers(handle_t *handle, +int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, - int (*fn)(handle_t *handle, + int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; @@ -978,7 +978,7 @@ int ext4_walk_page_buffers(handle_t *handle, *partial = 1; continue; } - err = (*fn)(handle, bh); + err = (*fn)(handle, inode, bh); if (!ret) ret = err; } @@ -1009,7 +1009,7 @@ int ext4_walk_page_buffers(handle_t *handle, * is elevated. We'll still have enough credits for the tiny quotafile * write. 
*/ -int do_journal_get_write_access(handle_t *handle, +int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int dirty = buffer_dirty(bh); @@ -1028,7 +1028,8 @@ int do_journal_get_write_access(handle_t *handle, if (dirty) clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); - ret = ext4_journal_get_write_access(handle, bh); + ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (!ret && dirty) ret = ext4_handle_dirty_metadata(handle, NULL, bh); return ret; @@ -1208,8 +1209,8 @@ retry_journal: ret = __block_write_begin(page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { - ret = ext4_walk_page_buffers(handle, page_buffers(page), - from, to, NULL, + ret = ext4_walk_page_buffers(handle, inode, + page_buffers(page), from, to, NULL, do_journal_get_write_access); } @@ -1253,7 +1254,8 @@ retry_journal: } /* For write_end() in data=journal mode */ -static int write_end_fn(handle_t *handle, struct buffer_head *bh) +static int write_end_fn(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) @@ -1352,6 +1354,7 @@ errout: * to call ext4_handle_dirty_metadata() instead. 
*/ static void ext4_journalled_zero_new_buffers(handle_t *handle, + struct inode *inode, struct page *page, unsigned from, unsigned to) { @@ -1370,7 +1373,7 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, size = min(to, block_end) - start; zero_user(page, start, size); - write_end_fn(handle, bh); + write_end_fn(handle, inode, bh); } clear_buffer_new(bh); } @@ -1412,13 +1415,13 @@ static int ext4_journalled_write_end(struct file *file, copied = ret; } else if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; - ext4_journalled_zero_new_buffers(handle, page, from, to); + ext4_journalled_zero_new_buffers(handle, inode, page, from, to); } else { if (unlikely(copied < len)) - ext4_journalled_zero_new_buffers(handle, page, + ext4_journalled_zero_new_buffers(handle, inode, page, from + copied, to); - ret = ext4_walk_page_buffers(handle, page_buffers(page), from, - from + copied, &partial, + ret = ext4_walk_page_buffers(handle, inode, page_buffers(page), + from, from + copied, &partial, write_end_fn); if (!partial) SetPageUptodate(page); @@ -1619,7 +1622,8 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) +static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); } @@ -1851,13 +1855,15 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; } -static int bget_one(handle_t *handle, struct buffer_head *bh) +static int bget_one(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { get_bh(bh); return 0; } -static int bput_one(handle_t *handle, struct buffer_head *bh) +static int bput_one(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { put_bh(bh); return 0; @@ -1888,7 +1894,7 @@ static int __ext4_journalled_writepage(struct page *page, BUG(); goto out; } - 
ext4_walk_page_buffers(handle, page_bufs, 0, len, + ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, bget_one); } /* @@ -1920,11 +1926,11 @@ static int __ext4_journalled_writepage(struct page *page, if (inline_data) { ret = ext4_mark_inode_dirty(handle, inode); } else { - ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - do_journal_get_write_access); + ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, do_journal_get_write_access); - err = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, - write_end_fn); + err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, + NULL, write_end_fn); } if (ret == 0) ret = err; @@ -1941,7 +1947,7 @@ out: unlock_page(page); out_no_pagelock: if (!inline_data && page_bufs) - ext4_walk_page_buffers(NULL, page_bufs, 0, len, + ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, bput_one); brelse(inode_bh); return ret; @@ -2031,7 +2037,7 @@ static int ext4_writepage(struct page *page, * for the extremely common case, this is an optimization that * skips a useless round trip through ext4_bio_write_page(). 
*/ - if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL, + if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, ext4_bh_delay_or_unwritten)) { redirty_page_for_writepage(wbc, page); if ((current->flags & PF_MEMALLOC) || @@ -3794,7 +3800,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) goto unlock; } @@ -4329,101 +4336,93 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; - if (ext4_simulate_fail(sb, EXT4_SIM_INODE_EIO)) - goto simulate_eio; - if (!buffer_uptodate(bh)) { - lock_buffer(bh); + if (ext4_buffer_uptodate(bh)) + goto has_buffer; - if (ext4_buffer_uptodate(bh)) { - /* someone brought it uptodate while we waited */ - unlock_buffer(bh); - goto has_buffer; - } - - /* - * If we have all information of the inode in memory and this - * is the only valid inode in the block, we need not read the - * block. - */ - if (in_mem) { - struct buffer_head *bitmap_bh; - int i, start; + lock_buffer(bh); + /* + * If we have all information of the inode in memory and this + * is the only valid inode in the block, we need not read the + * block. + */ + if (in_mem) { + struct buffer_head *bitmap_bh; + int i, start; - start = inode_offset & ~(inodes_per_block - 1); + start = inode_offset & ~(inodes_per_block - 1); - /* Is the inode bitmap in cache? */ - bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); - if (unlikely(!bitmap_bh)) - goto make_io; + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); + if (unlikely(!bitmap_bh)) + goto make_io; - /* - * If the inode bitmap isn't in cache then the - * optimisation may end up performing two reads instead - * of one, so skip it. 
- */ - if (!buffer_uptodate(bitmap_bh)) { - brelse(bitmap_bh); - goto make_io; - } - for (i = start; i < start + inodes_per_block; i++) { - if (i == inode_offset) - continue; - if (ext4_test_bit(i, bitmap_bh->b_data)) - break; - } + /* + * If the inode bitmap isn't in cache then the + * optimisation may end up performing two reads instead + * of one, so skip it. + */ + if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); - if (i == start + inodes_per_block) { - /* all other inodes are free, so skip I/O */ - memset(bh->b_data, 0, bh->b_size); - set_buffer_uptodate(bh); - unlock_buffer(bh); - goto has_buffer; - } + goto make_io; } + for (i = start; i < start + inodes_per_block; i++) { + if (i == inode_offset) + continue; + if (ext4_test_bit(i, bitmap_bh->b_data)) + break; + } + brelse(bitmap_bh); + if (i == start + inodes_per_block) { + /* all other inodes are free, so skip I/O */ + memset(bh->b_data, 0, bh->b_size); + set_buffer_uptodate(bh); + unlock_buffer(bh); + goto has_buffer; + } + } make_io: - /* - * If we need to do any I/O, try to pre-readahead extra - * blocks from the inode table. - */ - blk_start_plug(&plug); - if (EXT4_SB(sb)->s_inode_readahead_blks) { - ext4_fsblk_t b, end, table; - unsigned num; - __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; - - table = ext4_inode_table(sb, gdp); - /* s_inode_readahead_blks is always a power of 2 */ - b = block & ~((ext4_fsblk_t) ra_blks - 1); - if (table > b) - b = table; - end = b + ra_blks; - num = EXT4_INODES_PER_GROUP(sb); - if (ext4_has_group_desc_csum(sb)) - num -= ext4_itable_unused_count(sb, gdp); - table += num / inodes_per_block; - if (end > table) - end = table; - while (b <= end) - ext4_sb_breadahead_unmovable(sb, b++); - } + /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. 
+ */ + blk_start_plug(&plug); + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; + + table = ext4_inode_table(sb, gdp); + /* s_inode_readahead_blks is always a power of 2 */ + b = block & ~((ext4_fsblk_t) ra_blks - 1); + if (table > b) + b = table; + end = b + ra_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (ext4_has_group_desc_csum(sb)) + num -= ext4_itable_unused_count(sb, gdp); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + ext4_sb_breadahead_unmovable(sb, b++); + } - /* - * There are other valid inodes in the buffer, this inode - * has in-inode xattrs, or we don't have this inode in memory. - * Read the block from disk. - */ - trace_ext4_load_inode(sb, ino); - ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); - blk_finish_plug(&plug); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - simulate_eio: - if (ret_block) - *ret_block = block; - brelse(bh); - return -EIO; - } + /* + * There are other valid inodes in the buffer, this inode + * has in-inode xattrs, or we don't have this inode in memory. + * Read the block from disk. 
+ */ + trace_ext4_load_inode(sb, ino); + ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); + blk_finish_plug(&plug); + wait_on_buffer(bh); + ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); + if (!buffer_uptodate(bh)) { + if (ret_block) + *ret_block = block; + brelse(bh); + return -EIO; } has_buffer: iloc->bh = bh; @@ -4602,6 +4601,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; @@ -4612,9 +4612,13 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && - (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)) || + ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum))) || (ino < EXT4_ROOT_INO) || - (ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) { + (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, @@ -4927,8 +4931,14 @@ static int ext4_inode_blocks_set(handle_t *handle, ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } + + /* + * This should never happen since sb->s_maxbytes should not have + * allowed this, sb->s_maxbytes was set according to the huge_file + * feature in ext4_fill_super(). + */ if (!ext4_has_feature_huge_file(sb)) - return -EFBIG; + return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* @@ -5031,16 +5041,14 @@ static int ext4_do_update_inode(handle_t *handle, spin_lock(&ei->i_raw_lock); - /* For fields not tracked in the in-memory inode, - * initialise them to zero for new inodes. 
*/ + /* + * For fields not tracked in the in-memory inode, initialise them + * to zero for new inodes. + */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); err = ext4_inode_blocks_set(handle, raw_inode, ei); - if (err) { - spin_unlock(&ei->i_raw_lock); - goto out_brelse; - } raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); @@ -5049,10 +5057,11 @@ static int ext4_do_update_inode(handle_t *handle, if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); -/* - * Fix up interoperability with old kernels. Otherwise, old inodes get - * re-used with the upper 16 bits of the uid/gid intact - */ + /* + * Fix up interoperability with old kernels. Otherwise, + * old inodes get re-used with the upper 16 bits of the + * uid/gid intact. + */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; @@ -5121,8 +5130,9 @@ static int ext4_do_update_inode(handle_t *handle, } } - BUG_ON(!ext4_has_feature_project(inode->i_sb) && - i_projid != EXT4_DEF_PROJID); + if (i_projid != EXT4_DEF_PROJID && + !ext4_has_feature_project(inode->i_sb)) + err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) @@ -5130,6 +5140,11 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); + if (err) { + EXT4_ERROR_INODE(inode, "corrupted inode contents"); + goto out_brelse; + } + if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); @@ -5137,13 +5152,15 @@ static int ext4_do_update_inode(handle_t *handle, BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) - goto out_brelse; + goto out_error; 
ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + err = ext4_journal_get_write_access(handle, sb, + EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); if (err) - goto out_brelse; + goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); @@ -5153,9 +5170,10 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); +out_error: + ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); - ext4_std_error(inode->i_sb, err); return err; } @@ -5742,7 +5760,8 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, err = ext4_get_inode_loc(inode, iloc); if (!err) { BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, + iloc->bh, EXT4_JTR_NONE); if (err) { brelse(iloc->bh); iloc->bh = NULL; @@ -5865,7 +5884,8 @@ int ext4_expand_extra_isize(struct inode *inode, ext4_write_lock_xattr(inode, &no_expand); BUFFER_TRACE(iloc->bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, iloc->bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, iloc->bh, + EXT4_JTR_NONE); if (error) { brelse(iloc->bh); goto out_unlock; @@ -6036,7 +6056,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return err; } -static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) +static int ext4_bh_unmapped(handle_t *handle, struct inode *inode, + struct buffer_head *bh) { return !buffer_mapped(bh); } @@ -6109,7 +6130,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) * inode to the transaction's list to writeprotect pages on commit. 
*/ if (page_has_buffers(page)) { - if (!ext4_walk_page_buffers(NULL, page_buffers(page), + if (!ext4_walk_page_buffers(NULL, inode, page_buffers(page), 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ @@ -6155,11 +6176,13 @@ retry_alloc: err = __block_write_begin(page, 0, len, ext4_get_block); if (!err) { ret = VM_FAULT_SIGBUS; - if (ext4_walk_page_buffers(handle, page_buffers(page), - 0, len, NULL, do_journal_get_write_access)) + if (ext4_walk_page_buffers(handle, inode, + page_buffers(page), 0, len, NULL, + do_journal_get_write_access)) goto out_error; - if (ext4_walk_page_buffers(handle, page_buffers(page), - 0, len, NULL, write_end_fn)) + if (ext4_walk_page_buffers(handle, inode, + page_buffers(page), 0, len, NULL, + write_end_fn)) goto out_error; if (ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 4fb5fe083c2b..606dee9e08a3 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1154,7 +1154,9 @@ resizefs_out: err = PTR_ERR(handle); goto pwsalt_err_exit; } - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, + sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto pwsalt_err_journal; lock_buffer(sbi->s_sbh); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 089c958aa2c3..72bfac2d6dce 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -408,6 +408,10 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr); +static int ext4_try_to_trim_range(struct super_block *sb, + struct ext4_buddy *e4b, ext4_grpblk_t start, + ext4_grpblk_t max, ext4_grpblk_t minblocks); + /* * The algorithm using this percpu seq counter goes below: * 1. 
We sample the percpu discard_pa_seq counter before trying for block @@ -2474,6 +2478,12 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, * This could return negative error code if something goes wrong * during ext4_mb_init_group(). This should not be called with * ext4_lock_group() held. + * + * Note: because we are conditionally operating with the group lock in + * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this + * function using __acquire and __release. This means we need to be + * super careful before messing with the error path handling via "goto + * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, int cr) @@ -2487,8 +2497,10 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); - if (should_lock) + if (should_lock) { ext4_lock_group(sb, group); + __release(ext4_group_lock_ptr(sb, group)); + } free = grp->bb_free; if (free == 0) goto out; @@ -2496,8 +2508,10 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; - if (should_lock) + if (should_lock) { + __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); + } /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { @@ -2524,12 +2538,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, return ret; } - if (should_lock) + if (should_lock) { ext4_lock_group(sb, group); + __release(ext4_group_lock_ptr(sb, group)); + } ret = ext4_mb_good_group(ac, group, cr); out: - if (should_lock) + if (should_lock) { + __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); + } return ret; } @@ -2965,6 +2983,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) } static void *ext4_mb_seq_structs_summary_start(struct seq_file 
*seq, loff_t *pos) +__acquires(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = PDE_DATA(file_inode(seq->file)); unsigned long position; @@ -3037,6 +3056,7 @@ static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +__releases(&EXT4_SB(sb)->s_mb_rb_lock) { struct super_block *sb = PDE_DATA(file_inode(seq->file)); @@ -3308,6 +3328,57 @@ static int ext4_groupinfo_create_slab(size_t size) return 0; } +static void ext4_discard_work(struct work_struct *work) +{ + struct ext4_sb_info *sbi = container_of(work, + struct ext4_sb_info, s_discard_work); + struct super_block *sb = sbi->s_sb; + struct ext4_free_data *fd, *nfd; + struct ext4_buddy e4b; + struct list_head discard_list; + ext4_group_t grp, load_grp; + int err = 0; + + INIT_LIST_HEAD(&discard_list); + spin_lock(&sbi->s_md_lock); + list_splice_init(&sbi->s_discard_list, &discard_list); + spin_unlock(&sbi->s_md_lock); + + load_grp = UINT_MAX; + list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { + /* + * If filesystem is umounting or no memory or suffering + * from no space, give up the discard + */ + if ((sb->s_flags & SB_ACTIVE) && !err && + !atomic_read(&sbi->s_retry_alloc_pending)) { + grp = fd->efd_group; + if (grp != load_grp) { + if (load_grp != UINT_MAX) + ext4_mb_unload_buddy(&e4b); + + err = ext4_mb_load_buddy(sb, grp, &e4b); + if (err) { + kmem_cache_free(ext4_free_data_cachep, fd); + load_grp = UINT_MAX; + continue; + } else { + load_grp = grp; + } + } + + ext4_lock_group(sb, grp); + ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, + fd->efd_start_cluster + fd->efd_count - 1, 1); + ext4_unlock_group(sb, grp); + } + kmem_cache_free(ext4_free_data_cachep, fd); + } + + if (load_grp != UINT_MAX) + ext4_mb_unload_buddy(&e4b); +} + int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -3376,6 +3447,9 @@ int ext4_mb_init(struct super_block *sb) 
spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list); + INIT_LIST_HEAD(&sbi->s_discard_list); + INIT_WORK(&sbi->s_discard_work, ext4_discard_work); + atomic_set(&sbi->s_retry_alloc_pending, 0); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; @@ -3474,6 +3548,14 @@ int ext4_mb_release(struct super_block *sb) struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; + if (test_opt(sb, DISCARD)) { + /* + * wait the discard work to drain all of ext4_free_data + */ + flush_work(&sbi->s_discard_work); + WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); + } + if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); @@ -3596,7 +3678,6 @@ static void ext4_free_data_in_buddy(struct super_block *sb, put_page(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->efd_group); - kmem_cache_free(ext4_free_data_cachep, entry); ext4_mb_unload_buddy(&e4b); mb_debug(sb, "freed %d blocks in %d structures\n", count, @@ -3611,10 +3692,9 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; - struct bio *discard_bio = NULL; struct list_head freed_data_list; struct list_head *cut_pos = NULL; - int err; + bool wake; INIT_LIST_HEAD(&freed_data_list); @@ -3629,30 +3709,20 @@ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) cut_pos); spin_unlock(&sbi->s_md_lock); - if (test_opt(sb, DISCARD)) { - list_for_each_entry(entry, &freed_data_list, efd_list) { - err = ext4_issue_discard(sb, entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, - &discard_bio); - if (err && err != -EOPNOTSUPP) { - ext4_msg(sb, KERN_WARNING, "discard request in" - " group:%d block:%d count:%d failed" - " with %d", entry->efd_group, - entry->efd_start_cluster, - entry->efd_count, err); - } else if (err == -EOPNOTSUPP) - break; - } + list_for_each_entry(entry, 
&freed_data_list, efd_list) + ext4_free_data_in_buddy(sb, entry); - if (discard_bio) { - submit_bio_wait(discard_bio); - bio_put(discard_bio); - } + if (test_opt(sb, DISCARD)) { + spin_lock(&sbi->s_md_lock); + wake = list_empty(&sbi->s_discard_list); + list_splice_tail(&freed_data_list, &sbi->s_discard_list); + spin_unlock(&sbi->s_md_lock); + if (wake) + queue_work(system_unbound_wq, &sbi->s_discard_work); + } else { + list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) + kmem_cache_free(ext4_free_data_cachep, entry); } - - list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) - ext4_free_data_in_buddy(sb, entry); } int __init ext4_init_mballoc(void) @@ -3726,7 +3796,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (err) goto out_err; @@ -3739,7 +3810,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ext4_free_group_clusters(sb, gdp)); BUFFER_TRACE(gdp_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdp_bh); + err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); if (err) goto out_err; @@ -5916,7 +5987,8 @@ do_more: } BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (err) goto error_return; @@ -5926,7 +5998,7 @@ do_more: * using it */ BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); + err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); if (err) goto error_return; #ifdef AGGRESSIVE_CHECK @@ -6107,7 +6179,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, } BUFFER_TRACE(bitmap_bh, "getting write access"); - err = ext4_journal_get_write_access(handle, 
bitmap_bh); + err = ext4_journal_get_write_access(handle, sb, bitmap_bh, + EXT4_JTR_NONE); if (err) goto error_return; @@ -6117,7 +6190,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * using it */ BUFFER_TRACE(gd_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gd_bh); + err = ext4_journal_get_write_access(handle, sb, gd_bh, EXT4_JTR_NONE); if (err) goto error_return; @@ -6183,19 +6256,19 @@ error_return: * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM - * @group: alloc. group we are working with * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To assure that no * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ -static int ext4_trim_extent(struct super_block *sb, int start, int count, - ext4_group_t group, struct ext4_buddy *e4b) +static int ext4_trim_extent(struct super_block *sb, + int start, int count, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; + ext4_group_t group = e4b->bd_group; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); @@ -6218,51 +6291,21 @@ __acquires(bitlock) return ret; } -/** - * ext4_trim_all_free -- function to trim all free space in alloc. group - * @sb: super block for file system - * @group: group to be trimmed - * @start: first group block to examine - * @max: last group block to examine - * @minblocks: minimum extent block count - * - * ext4_trim_all_free walks through group's buddy bitmap searching for free - * extents. When the free block is found, ext4_trim_extent is called to TRIM - * the extent. - * - * - * ext4_trim_all_free walks through group's block bitmap searching for free - * extents. When the free extent is found, mark it as used in group buddy - * bitmap. 
Then issue a TRIM command on this extent and free the extent in - * the group buddy bitmap. This is done until whole group is scanned. - */ -static ext4_grpblk_t -ext4_trim_all_free(struct super_block *sb, ext4_group_t group, - ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks) +static int ext4_try_to_trim_range(struct super_block *sb, + struct ext4_buddy *e4b, ext4_grpblk_t start, + ext4_grpblk_t max, ext4_grpblk_t minblocks) +__acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) +__releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { + ext4_grpblk_t next, count, free_count; void *bitmap; - ext4_grpblk_t next, count = 0, free_count = 0; - struct ext4_buddy e4b; int ret = 0; - trace_ext4_trim_all_free(sb, group, start, max); - - ret = ext4_mb_load_buddy(sb, group, &e4b); - if (ret) { - ext4_warning(sb, "Error %d loading buddy information for %u", - ret, group); - return ret; - } - bitmap = e4b.bd_bitmap; - - ext4_lock_group(sb, group); - if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && - minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) - goto out; - - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; + bitmap = e4b->bd_bitmap; + start = (e4b->bd_info->bb_first_free > start) ? 
+ e4b->bd_info->bb_first_free : start; + count = 0; + free_count = 0; while (start <= max) { start = mb_find_next_zero_bit(bitmap, max + 1, start); @@ -6271,8 +6314,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, next = mb_find_next_bit(bitmap, max + 1, start); if ((next - start) >= minblocks) { - ret = ext4_trim_extent(sb, start, - next - start, group, &e4b); + ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) break; ret = 0; @@ -6287,25 +6329,64 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, } if (need_resched()) { - ext4_unlock_group(sb, group); + ext4_unlock_group(sb, e4b->bd_group); cond_resched(); - ext4_lock_group(sb, group); + ext4_lock_group(sb, e4b->bd_group); } - if ((e4b.bd_info->bb_free - free_count) < minblocks) + if ((e4b->bd_info->bb_free - free_count) < minblocks) break; } - if (!ret) { - ret = count; - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + return count; +} + +/** + * ext4_trim_all_free -- function to trim all free space in alloc. group + * @sb: super block for file system + * @group: group to be trimmed + * @start: first group block to examine + * @max: last group block to examine + * @minblocks: minimum extent block count + * + * ext4_trim_all_free walks through group's block bitmap searching for free + * extents. When the free extent is found, mark it as used in group buddy + * bitmap. Then issue a TRIM command on this extent and free the extent in + * the group buddy bitmap. 
+ */ +static ext4_grpblk_t +ext4_trim_all_free(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t start, ext4_grpblk_t max, + ext4_grpblk_t minblocks) +{ + struct ext4_buddy e4b; + int ret; + + trace_ext4_trim_all_free(sb, group, start, max); + + ret = ext4_mb_load_buddy(sb, group, &e4b); + if (ret) { + ext4_warning(sb, "Error %d loading buddy information for %u", + ret, group); + return ret; + } + + ext4_lock_group(sb, group); + + if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || + minblocks < atomic_read(&EXT4_SB(sb)->s_last_trim_minblks)) { + ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); + if (ret >= 0) + EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); + } else { + ret = 0; } -out: + ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", - count, group); + ret, group); return ret; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f3bbcd4efb56..da7698341d7d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -70,7 +70,8 @@ static struct buffer_head *ext4_append(handle_t *handle, inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (err) { brelse(bh); ext4_std_error(inode->i_sb, err); @@ -1927,12 +1928,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, } BUFFER_TRACE(*bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, *bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, *bh, + EXT4_JTR_NONE); if (err) goto journal_error; BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, frame->bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, frame->bh, + EXT4_JTR_NONE); if (err) goto journal_error; @@ -2109,7 +2112,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, 
return err; } BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (err) { ext4_std_error(dir->i_sb, err); return err; @@ -2167,7 +2171,8 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, blocksize = dir->i_sb->s_blocksize; dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); BUFFER_TRACE(bh, "get_write_access"); - retval = ext4_journal_get_write_access(handle, bh); + retval = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (retval) { ext4_std_error(dir->i_sb, retval); brelse(bh); @@ -2419,7 +2424,7 @@ again: } BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) goto journal_error; @@ -2476,7 +2481,8 @@ again: node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, sb->s_blocksize); BUFFER_TRACE(frame->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, frame->bh); + err = ext4_journal_get_write_access(handle, sb, frame->bh, + EXT4_JTR_NONE); if (err) goto journal_error; if (!add_level) { @@ -2486,8 +2492,9 @@ again: icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ - err = ext4_journal_get_write_access(handle, - (frame - 1)->bh); + err = ext4_journal_get_write_access(handle, sb, + (frame - 1)->bh, + EXT4_JTR_NONE); if (err) goto journal_error; @@ -2636,7 +2643,8 @@ static int ext4_delete_entry(handle_t *handle, csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, dir->i_sb, bh, + EXT4_JTR_NONE); if (unlikely(err)) goto out; @@ -3046,186 +3054,6 @@ bool ext4_empty_dir(struct inode *inode) return true; } -/* - * ext4_orphan_add() links an unlinked or truncated inode into a list of - * 
such inodes, starting at the superblock, in case we crash before the - * file is closed/deleted, or in case the inode truncate spans multiple - * transactions and the last transaction is not recovered after a crash. - * - * At filesystem recovery time, we walk this list deleting unlinked - * inodes and truncating linked inodes in ext4_orphan_cleanup(). - * - * Orphan list manipulation functions must be called under i_mutex unless - * we are just creating the inode or deleting it. - */ -int ext4_orphan_add(handle_t *handle, struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_iloc iloc; - int err = 0, rc; - bool dirty = false; - - if (!sbi->s_journal || is_bad_inode(inode)) - return 0; - - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !inode_is_locked(inode)); - /* - * Exit early if inode already is on orphan list. This is a big speedup - * since we don't have to contend on the global s_orphan_lock. - */ - if (!list_empty(&EXT4_I(inode)->i_orphan)) - return 0; - - /* - * Orphan handling is only valid for files with data blocks - * being truncated, or files being unlinked. Note that we either - * hold i_mutex, or the inode can not be referenced from outside, - * so i_nlink should not be bumped due to race - */ - ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); - - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) - goto out; - - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto out; - - mutex_lock(&sbi->s_orphan_lock); - /* - * Due to previous errors inode may be already a part of on-disk - * orphan list. If so skip on-disk list modification. 
- */ - if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > - (le32_to_cpu(sbi->s_es->s_inodes_count))) { - /* Insert this inode at the head of the on-disk orphan list */ - NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); - lock_buffer(sbi->s_sbh); - sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - ext4_superblock_csum_set(sb); - unlock_buffer(sbi->s_sbh); - dirty = true; - } - list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); - mutex_unlock(&sbi->s_orphan_lock); - - if (dirty) { - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - rc = ext4_mark_iloc_dirty(handle, inode, &iloc); - if (!err) - err = rc; - if (err) { - /* - * We have to remove inode from in-memory list if - * addition to on disk orphan list failed. Stray orphan - * list entries can cause panics at unmount time. - */ - mutex_lock(&sbi->s_orphan_lock); - list_del_init(&EXT4_I(inode)->i_orphan); - mutex_unlock(&sbi->s_orphan_lock); - } - } else - brelse(iloc.bh); - - jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); - jbd_debug(4, "orphan inode %lu will point to %d\n", - inode->i_ino, NEXT_ORPHAN(inode)); -out: - ext4_std_error(sb, err); - return err; -} - -/* - * ext4_orphan_del() removes an unlinked or truncated inode from the list - * of such inodes stored on disk, because it is finally being cleaned up. - */ -int ext4_orphan_del(handle_t *handle, struct inode *inode) -{ - struct list_head *prev; - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - __u32 ino_next; - struct ext4_iloc iloc; - int err = 0; - - if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) - return 0; - - WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !inode_is_locked(inode)); - /* Do this quick check before taking global s_orphan_lock. 
*/ - if (list_empty(&ei->i_orphan)) - return 0; - - if (handle) { - /* Grab inode buffer early before taking global s_orphan_lock */ - err = ext4_reserve_inode_write(handle, inode, &iloc); - } - - mutex_lock(&sbi->s_orphan_lock); - jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); - - prev = ei->i_orphan.prev; - list_del_init(&ei->i_orphan); - - /* If we're on an error path, we may not have a valid - * transaction handle with which to update the orphan list on - * disk, but we still need to remove the inode from the linked - * list in memory. */ - if (!handle || err) { - mutex_unlock(&sbi->s_orphan_lock); - goto out_err; - } - - ino_next = NEXT_ORPHAN(inode); - if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %u\n", ino_next); - BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); - if (err) { - mutex_unlock(&sbi->s_orphan_lock); - goto out_brelse; - } - lock_buffer(sbi->s_sbh); - sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - ext4_superblock_csum_set(inode->i_sb); - unlock_buffer(sbi->s_sbh); - mutex_unlock(&sbi->s_orphan_lock); - err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - } else { - struct ext4_iloc iloc2; - struct inode *i_prev = - &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - - jbd_debug(4, "orphan inode %lu will point to %u\n", - i_prev->i_ino, ino_next); - err = ext4_reserve_inode_write(handle, i_prev, &iloc2); - if (err) { - mutex_unlock(&sbi->s_orphan_lock); - goto out_brelse; - } - NEXT_ORPHAN(i_prev) = ino_next; - err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); - mutex_unlock(&sbi->s_orphan_lock); - } - if (err) - goto out_brelse; - NEXT_ORPHAN(inode) = 0; - err = ext4_mark_iloc_dirty(handle, inode, &iloc); -out_err: - ext4_std_error(inode->i_sb, err); - return err; - -out_brelse: - brelse(iloc.bh); - goto out_err; -} - static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; @@ -3675,7 +3503,8 @@ 
static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent) if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino) return -EFSCORRUPTED; BUFFER_TRACE(ent->dir_bh, "get_write_access"); - return ext4_journal_get_write_access(handle, ent->dir_bh); + return ext4_journal_get_write_access(handle, ent->dir->i_sb, + ent->dir_bh, EXT4_JTR_NONE); } static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, @@ -3710,7 +3539,8 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); - retval = ext4_journal_get_write_access(handle, ent->bh); + retval = ext4_journal_get_write_access(handle, ent->dir->i_sb, ent->bh, + EXT4_JTR_NONE); if (retval) return retval; ent->de->inode = cpu_to_le32(ino); diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c new file mode 100644 index 000000000000..53adc8f570a3 --- /dev/null +++ b/fs/ext4/orphan.c @@ -0,0 +1,652 @@ +/* + * Ext4 orphan inode handling + */ +#include <linux/fs.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> + +#include "ext4.h" +#include "ext4_jbd2.h" + +static int ext4_orphan_file_add(handle_t *handle, struct inode *inode) +{ + int i, j, start; + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; + int ret = 0; + bool found = false; + __le32 *bdata; + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); + int looped = 0; + + /* + * Find block with free orphan entry. Use CPU number for a naive hash + * for a search start in the orphan file + */ + start = raw_smp_processor_id()*13 % oi->of_blocks; + i = start; + do { + if (atomic_dec_if_positive(&oi->of_binfo[i].ob_free_entries) + >= 0) { + found = true; + break; + } + if (++i >= oi->of_blocks) + i = 0; + } while (i != start); + + if (!found) { + /* + * For now we don't grow or shrink orphan file. We just use + * whatever was allocated at mke2fs time. 
The additional + * credits we would have to reserve for each orphan inode + * operation just don't seem worth it. + */ + return -ENOSPC; + } + + ret = ext4_journal_get_write_access(handle, inode->i_sb, + oi->of_binfo[i].ob_bh, EXT4_JTR_ORPHAN_FILE); + if (ret) { + atomic_inc(&oi->of_binfo[i].ob_free_entries); + return ret; + } + + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + /* Find empty slot in a block */ + j = 0; + do { + if (looped) { + /* + * Did we walk through the block several times without + * finding free entry? It is theoretically possible + * if entries get constantly allocated and freed or + * if the block is corrupted. Avoid indefinite looping + * and bail. We'll use orphan list instead. + */ + if (looped > 3) { + atomic_inc(&oi->of_binfo[i].ob_free_entries); + return -ENOSPC; + } + cond_resched(); + } + while (bdata[j]) { + if (++j >= inodes_per_ob) { + j = 0; + looped++; + } + } + } while (cmpxchg(&bdata[j], (__le32)0, cpu_to_le32(inode->i_ino)) != + (__le32)0); + + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + + return ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[i].ob_bh); +} + +/* + * ext4_orphan_add() links an unlinked or truncated inode into a list of + * such inodes, starting at the superblock, in case we crash before the + * file is closed/deleted, or in case the inode truncate spans multiple + * transactions and the last transaction is not recovered after a crash. + * + * At filesystem recovery time, we walk this list deleting unlinked + * inodes and truncating linked inodes in ext4_orphan_cleanup(). + * + * Orphan list manipulation functions must be called under i_mutex unless + * we are just creating the inode or deleting it. 
+ */ +int ext4_orphan_add(handle_t *handle, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_iloc iloc; + int err = 0, rc; + bool dirty = false; + + if (!sbi->s_journal || is_bad_inode(inode)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !inode_is_locked(inode)); + /* + * Inode orphaned in orphan file or in orphan list? + */ + if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || + !list_empty(&EXT4_I(inode)->i_orphan)) + return 0; + + /* + * Orphan handling is only valid for files with data blocks + * being truncated, or files being unlinked. Note that we either + * hold i_mutex, or the inode can not be referenced from outside, + * so i_nlink should not be bumped due to race + */ + ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + + if (sbi->s_orphan_info.of_blocks) { + err = ext4_orphan_file_add(handle, inode); + /* + * Fallback to normal orphan list of orphan file is + * out of space + */ + if (err != -ENOSPC) + return err; + } + + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); + if (err) + goto out; + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out; + + mutex_lock(&sbi->s_orphan_lock); + /* + * Due to previous errors inode may be already a part of on-disk + * orphan list. If so skip on-disk list modification. 
+ */ + if (!NEXT_ORPHAN(inode) || NEXT_ORPHAN(inode) > + (le32_to_cpu(sbi->s_es->s_inodes_count))) { + /* Insert this inode at the head of the on-disk orphan list */ + NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + lock_buffer(sbi->s_sbh); + sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); + dirty = true; + } + list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); + mutex_unlock(&sbi->s_orphan_lock); + + if (dirty) { + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; + if (err) { + /* + * We have to remove inode from in-memory list if + * addition to on disk orphan list failed. Stray orphan + * list entries can cause panics at unmount time. + */ + mutex_lock(&sbi->s_orphan_lock); + list_del_init(&EXT4_I(inode)->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + } + } else + brelse(iloc.bh); + + jbd_debug(4, "superblock will point to %lu\n", inode->i_ino); + jbd_debug(4, "orphan inode %lu will point to %d\n", + inode->i_ino, NEXT_ORPHAN(inode)); +out: + ext4_std_error(sb, err); + return err; +} + +static int ext4_orphan_file_del(handle_t *handle, struct inode *inode) +{ + struct ext4_orphan_info *oi = &EXT4_SB(inode->i_sb)->s_orphan_info; + __le32 *bdata; + int blk, off; + int inodes_per_ob = ext4_inodes_per_orphan_block(inode->i_sb); + int ret = 0; + + if (!handle) + goto out; + blk = EXT4_I(inode)->i_orphan_idx / inodes_per_ob; + off = EXT4_I(inode)->i_orphan_idx % inodes_per_ob; + if (WARN_ON_ONCE(blk >= oi->of_blocks)) + goto out; + + ret = ext4_journal_get_write_access(handle, inode->i_sb, + oi->of_binfo[blk].ob_bh, EXT4_JTR_ORPHAN_FILE); + if (ret) + goto out; + + bdata = (__le32 *)(oi->of_binfo[blk].ob_bh->b_data); + bdata[off] = 0; + atomic_inc(&oi->of_binfo[blk].ob_free_entries); + ret = ext4_handle_dirty_metadata(handle, NULL, oi->of_binfo[blk].ob_bh); +out: + ext4_clear_inode_state(inode, 
EXT4_STATE_ORPHAN_FILE); + INIT_LIST_HEAD(&EXT4_I(inode)->i_orphan); + + return ret; +} + +/* + * ext4_orphan_del() removes an unlinked or truncated inode from the list + * of such inodes stored on disk, because it is finally being cleaned up. + */ +int ext4_orphan_del(handle_t *handle, struct inode *inode) +{ + struct list_head *prev; + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + __u32 ino_next; + struct ext4_iloc iloc; + int err = 0; + + if (!sbi->s_journal && !(sbi->s_mount_state & EXT4_ORPHAN_FS)) + return 0; + + WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && + !inode_is_locked(inode)); + if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE)) + return ext4_orphan_file_del(handle, inode); + + /* Do this quick check before taking global s_orphan_lock. */ + if (list_empty(&ei->i_orphan)) + return 0; + + if (handle) { + /* Grab inode buffer early before taking global s_orphan_lock */ + err = ext4_reserve_inode_write(handle, inode, &iloc); + } + + mutex_lock(&sbi->s_orphan_lock); + jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino); + + prev = ei->i_orphan.prev; + list_del_init(&ei->i_orphan); + + /* If we're on an error path, we may not have a valid + * transaction handle with which to update the orphan list on + * disk, but we still need to remove the inode from the linked + * list in memory. 
*/ + if (!handle || err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_err; + } + + ino_next = NEXT_ORPHAN(inode); + if (prev == &sbi->s_orphan) { + jbd_debug(4, "superblock will point to %u\n", ino_next); + BUFFER_TRACE(sbi->s_sbh, "get_write_access"); + err = ext4_journal_get_write_access(handle, inode->i_sb, + sbi->s_sbh, EXT4_JTR_NONE); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + lock_buffer(sbi->s_sbh); + sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + ext4_superblock_csum_set(inode->i_sb); + unlock_buffer(sbi->s_sbh); + mutex_unlock(&sbi->s_orphan_lock); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); + } else { + struct ext4_iloc iloc2; + struct inode *i_prev = + &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; + + jbd_debug(4, "orphan inode %lu will point to %u\n", + i_prev->i_ino, ino_next); + err = ext4_reserve_inode_write(handle, i_prev, &iloc2); + if (err) { + mutex_unlock(&sbi->s_orphan_lock); + goto out_brelse; + } + NEXT_ORPHAN(i_prev) = ino_next; + err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2); + mutex_unlock(&sbi->s_orphan_lock); + } + if (err) + goto out_brelse; + NEXT_ORPHAN(inode) = 0; + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +out_err: + ext4_std_error(inode->i_sb, err); + return err; + +out_brelse: + brelse(iloc.bh); + goto out_err; +} + +#ifdef CONFIG_QUOTA +static int ext4_quota_on_mount(struct super_block *sb, int type) +{ + return dquot_quota_on_mount(sb, + rcu_dereference_protected(EXT4_SB(sb)->s_qf_names[type], + lockdep_is_held(&sb->s_umount)), + EXT4_SB(sb)->s_jquota_fmt, type); +} +#endif + +static void ext4_process_orphan(struct inode *inode, + int *nr_truncates, int *nr_orphans) +{ + struct super_block *sb = inode->i_sb; + int ret; + + dquot_initialize(inode); + if (inode->i_nlink) { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: truncating inode %lu to %lld bytes", + __func__, inode->i_ino, inode->i_size); + jbd_debug(2, "truncating 
inode %lu to %lld bytes\n", + inode->i_ino, inode->i_size); + inode_lock(inode); + truncate_inode_pages(inode->i_mapping, inode->i_size); + ret = ext4_truncate(inode); + if (ret) { + /* + * We need to clean up the in-core orphan list + * manually if ext4_truncate() failed to get a + * transaction handle. + */ + ext4_orphan_del(NULL, inode); + ext4_std_error(inode->i_sb, ret); + } + inode_unlock(inode); + (*nr_truncates)++; + } else { + if (test_opt(sb, DEBUG)) + ext4_msg(sb, KERN_DEBUG, + "%s: deleting unreferenced inode %lu", + __func__, inode->i_ino); + jbd_debug(2, "deleting unreferenced inode %lu\n", + inode->i_ino); + (*nr_orphans)++; + } + iput(inode); /* The delete magic happens here! */ +} + +/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at + * the superblock) which were deleted from all directories, but held open by + * a process at the time of a crash. We walk the list and try to delete these + * inodes at recovery time (only with a read-write filesystem). + * + * In order to keep the orphan inode chain consistent during traversal (in + * case of crash during recovery), we link each inode into the superblock + * orphan list_head and handle it the same way as an inode deletion during + * normal operation (which journals the operations for us). + * + * We only do an iget() and an iput() on each inode, which is very safe if we + * accidentally point at an in-use or already deleted inode. The worst that + * can happen in this case is that we get a "bit already cleared" message from + * ext4_free_inode(). The only reason we would point at a wrong inode is if + * e2fsck was run on this filesystem, and it must have already done the orphan + * inode cleanup for us, so we can safely abort without any further action. 
+ */ +void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) +{ + unsigned int s_flags = sb->s_flags; + int nr_orphans = 0, nr_truncates = 0; + struct inode *inode; + int i, j; +#ifdef CONFIG_QUOTA + int quota_update = 0; +#endif + __le32 *bdata; + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + + if (!es->s_last_orphan && !oi->of_blocks) { + jbd_debug(4, "no orphan inodes to clean up\n"); + return; + } + + if (bdev_read_only(sb->s_bdev)) { + ext4_msg(sb, KERN_ERR, "write access " + "unavailable, skipping orphan cleanup"); + return; + } + + /* Check if feature set would not allow a r/w mount */ + if (!ext4_feature_set_ok(sb, 0)) { + ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " + "unknown ROCOMPAT features"); + return; + } + + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + /* don't clear list on RO mount w/ errors */ + if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { + ext4_msg(sb, KERN_INFO, "Errors on filesystem, " + "clearing orphan list.\n"); + es->s_last_orphan = 0; + } + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + return; + } + + if (s_flags & SB_RDONLY) { + ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); + sb->s_flags &= ~SB_RDONLY; + } +#ifdef CONFIG_QUOTA + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. 
+ */ + if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (EXT4_SB(sb)->s_qf_names[i]) { + int ret = ext4_quota_on_mount(sb, i); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on journaled " + "quota: type %d: error %d", i, ret); + } + } +#endif + + while (es->s_last_orphan) { + /* + * We may have encountered an error during cleanup; if + * so, skip the rest. + */ + if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { + jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); + es->s_last_orphan = 0; + break; + } + + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { + es->s_last_orphan = 0; + break; + } + + list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); + } + + for (i = 0; i < oi->of_blocks; i++) { + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + for (j = 0; j < inodes_per_ob; j++) { + if (!bdata[j]) + continue; + inode = ext4_orphan_get(sb, le32_to_cpu(bdata[j])); + if (IS_ERR(inode)) + continue; + ext4_set_inode_state(inode, EXT4_STATE_ORPHAN_FILE); + EXT4_I(inode)->i_orphan_idx = i * inodes_per_ob + j; + ext4_process_orphan(inode, &nr_truncates, &nr_orphans); + } + } + +#define PLURAL(x) (x), ((x) == 1) ? 
"" : "s" + + if (nr_orphans) + ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", + PLURAL(nr_orphans)); + if (nr_truncates) + ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", + PLURAL(nr_truncates)); +#ifdef CONFIG_QUOTA + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } + } +#endif + sb->s_flags = s_flags; /* Restore SB_RDONLY status */ +} + +void ext4_release_orphan_info(struct super_block *sb) +{ + int i; + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + + if (!oi->of_blocks) + return; + for (i = 0; i < oi->of_blocks; i++) + brelse(oi->of_binfo[i].ob_bh); + kfree(oi->of_binfo); +} + +static struct ext4_orphan_block_tail *ext4_orphan_block_tail( + struct super_block *sb, + struct buffer_head *bh) +{ + return (struct ext4_orphan_block_tail *)(bh->b_data + sb->s_blocksize - + sizeof(struct ext4_orphan_block_tail)); +} + +static int ext4_orphan_file_block_csum_verify(struct super_block *sb, + struct buffer_head *bh) +{ + __u32 calculated; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct ext4_orphan_block_tail *ot; + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); + + if (!ext4_has_metadata_csum(sb)) + return 1; + + ot = ext4_orphan_block_tail(sb, bh); + calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, + (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); + calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + inodes_per_ob * sizeof(__u32)); + return le32_to_cpu(ot->ob_checksum) == calculated; +} + +/* This gets called only when checksumming is enabled */ +void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct super_block *sb = EXT4_TRIGGER(triggers)->sb; + __u32 csum; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + 
struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct ext4_orphan_block_tail *ot; + __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); + + csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, + (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); + csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, + inodes_per_ob * sizeof(__u32)); + ot = ext4_orphan_block_tail(sb, bh); + ot->ob_checksum = cpu_to_le32(csum); +} + +int ext4_init_orphan_info(struct super_block *sb) +{ + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + struct inode *inode; + int i, j; + int ret; + int free; + __le32 *bdata; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + struct ext4_orphan_block_tail *ot; + ino_t orphan_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_orphan_file_inum); + + if (!ext4_has_feature_orphan_file(sb)) + return 0; + + inode = ext4_iget(sb, orphan_ino, EXT4_IGET_SPECIAL); + if (IS_ERR(inode)) { + ext4_msg(sb, KERN_ERR, "get orphan inode failed"); + return PTR_ERR(inode); + } + oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; + oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; + oi->of_binfo = kmalloc(oi->of_blocks*sizeof(struct ext4_orphan_block), + GFP_KERNEL); + if (!oi->of_binfo) { + ret = -ENOMEM; + goto out_put; + } + for (i = 0; i < oi->of_blocks; i++) { + oi->of_binfo[i].ob_bh = ext4_bread(NULL, inode, i, 0); + if (IS_ERR(oi->of_binfo[i].ob_bh)) { + ret = PTR_ERR(oi->of_binfo[i].ob_bh); + goto out_free; + } + if (!oi->of_binfo[i].ob_bh) { + ret = -EIO; + goto out_free; + } + ot = ext4_orphan_block_tail(sb, oi->of_binfo[i].ob_bh); + if (le32_to_cpu(ot->ob_magic) != EXT4_ORPHAN_BLOCK_MAGIC) { + ext4_error(sb, "orphan file block %d: bad magic", i); + ret = -EIO; + goto out_free; + } + if (!ext4_orphan_file_block_csum_verify(sb, + oi->of_binfo[i].ob_bh)) { + ext4_error(sb, "orphan file block %d: bad checksum", i); + ret = -EIO; + goto out_free; + } + bdata = (__le32 *)(oi->of_binfo[i].ob_bh->b_data); + free = 0; + for (j = 0; j < inodes_per_ob; j++) + if 
(bdata[j] == 0) + free++; + atomic_set(&oi->of_binfo[i].ob_free_entries, free); + } + iput(inode); + return 0; +out_free: + for (i--; i >= 0; i--) + brelse(oi->of_binfo[i].ob_bh); + kfree(oi->of_binfo); +out_put: + iput(inode); + return ret; +} + +int ext4_orphan_file_empty(struct super_block *sb) +{ + struct ext4_orphan_info *oi = &EXT4_SB(sb)->s_orphan_info; + int i; + int inodes_per_ob = ext4_inodes_per_orphan_block(sb); + + if (!ext4_has_feature_orphan_file(sb)) + return 1; + for (i = 0; i < oi->of_blocks; i++) + if (atomic_read(&oi->of_binfo[i].ob_free_entries) != + inodes_per_ob) + return 0; + return 1; +} diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 7a9f1adef679..b63cb88ccdae 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -409,7 +409,8 @@ static struct buffer_head *bclean(handle_t *handle, struct super_block *sb, if (unlikely(!bh)) return ERR_PTR(-ENOMEM); BUFFER_TRACE(bh, "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, bh))) { + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); + if (err) { brelse(bh); bh = ERR_PTR(err); } else { @@ -474,7 +475,8 @@ static int set_flexbg_block_bitmap(struct super_block *sb, handle_t *handle, return -ENOMEM; BUFFER_TRACE(bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, sb, bh, + EXT4_JTR_NONE); if (err) { brelse(bh); return err; @@ -569,7 +571,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb, } BUFFER_TRACE(gdb, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb); + err = ext4_journal_get_write_access(handle, sb, gdb, + EXT4_JTR_NONE); if (err) { brelse(gdb); goto out; @@ -837,17 +840,18 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + 
EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (unlikely(err)) goto errout; BUFFER_TRACE(dind, "get_write_access"); - err = ext4_journal_get_write_access(handle, dind); + err = ext4_journal_get_write_access(handle, sb, dind, EXT4_JTR_NONE); if (unlikely(err)) { ext4_std_error(sb, err); goto errout; @@ -956,7 +960,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, n_group_desc[gdb_num] = gdb_bh; BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, EXT4_JTR_NONE); if (err) { kvfree(n_group_desc); brelse(gdb_bh); @@ -1042,7 +1046,8 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, for (i = 0; i < reserved_gdb; i++) { BUFFER_TRACE(primary[i], "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, primary[i]))) + if ((err = ext4_journal_get_write_access(handle, sb, primary[i], + EXT4_JTR_NONE))) goto exit_bh; } @@ -1149,10 +1154,9 @@ static void update_backups(struct super_block *sb, sector_t blk_off, char *data, backup_block, backup_block - ext4_group_first_block_no(sb, group)); BUFFER_TRACE(bh, "get_write_access"); - if ((err = ext4_journal_get_write_access(handle, bh))) { - brelse(bh); + if ((err = ext4_journal_get_write_access(handle, sb, bh, + EXT4_JTR_NONE))) break; - } lock_buffer(bh); memcpy(bh->b_data, data, size); if (rest) @@ -1232,7 +1236,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb, gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num); BUFFER_TRACE(gdb_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, gdb_bh); + err = ext4_journal_get_write_access(handle, sb, gdb_bh, + EXT4_JTR_NONE); if (!err && reserved_gdb && ext4_bg_num_gdb(sb, group)) err = reserve_backup_gdb(handle, 
resize_inode, group); @@ -1509,7 +1514,8 @@ static int ext4_flex_group_add(struct super_block *sb, } BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto exit_journal; @@ -1722,7 +1728,8 @@ static int ext4_group_extend_no_check(struct super_block *sb, } BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); + err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE); if (err) { ext4_warning(sb, "error %d on journal write access", err); goto errout; @@ -1884,7 +1891,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) return PTR_ERR(handle); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); - err = ext4_journal_get_write_access(handle, sbi->s_sbh); + err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, + EXT4_JTR_NONE); if (err) goto errout; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d6df62fc810c..0775950ee84e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -80,7 +80,6 @@ static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); -static int ext4_feature_set_ok(struct super_block *sb, int readonly); static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); @@ -1173,6 +1172,7 @@ static void ext4_put_super(struct super_block *sb) flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); + ext4_release_orphan_info(sb); /* * Unregister sysfs before destroying jbd2 journal. 
@@ -1198,6 +1198,7 @@ static void ext4_put_super(struct super_block *sb) if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!sb_rdonly(sb)) @@ -1582,14 +1583,12 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); -static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext4_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); -static int ext4_enable_quotas(struct super_block *sb); static struct dquot **ext4_get_dquots(struct inode *inode) { @@ -2684,8 +2683,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - if (sbi->s_journal) + if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); + if (ext4_has_feature_orphan_file(sb)) + ext4_set_feature_orphan_present(sb); + } err = ext4_commit_super(sb); done: @@ -2967,169 +2969,6 @@ static int ext4_check_descriptors(struct super_block *sb, return 1; } -/* ext4_orphan_cleanup() walks a singly-linked list of inodes (starting at - * the superblock) which were deleted from all directories, but held open by - * a process at the time of a crash. We walk the list and try to delete these - * inodes at recovery time (only with a read-write filesystem). 
- * - * In order to keep the orphan inode chain consistent during traversal (in - * case of crash during recovery), we link each inode into the superblock - * orphan list_head and handle it the same way as an inode deletion during - * normal operation (which journals the operations for us). - * - * We only do an iget() and an iput() on each inode, which is very safe if we - * accidentally point at an in-use or already deleted inode. The worst that - * can happen in this case is that we get a "bit already cleared" message from - * ext4_free_inode(). The only reason we would point at a wrong inode is if - * e2fsck was run on this filesystem, and it must have already done the orphan - * inode cleanup for us, so we can safely abort without any further action. - */ -static void ext4_orphan_cleanup(struct super_block *sb, - struct ext4_super_block *es) -{ - unsigned int s_flags = sb->s_flags; - int ret, nr_orphans = 0, nr_truncates = 0; -#ifdef CONFIG_QUOTA - int quota_update = 0; - int i; -#endif - if (!es->s_last_orphan) { - jbd_debug(4, "no orphan inodes to clean up\n"); - return; - } - - if (bdev_read_only(sb->s_bdev)) { - ext4_msg(sb, KERN_ERR, "write access " - "unavailable, skipping orphan cleanup"); - return; - } - - /* Check if feature set would not allow a r/w mount */ - if (!ext4_feature_set_ok(sb, 0)) { - ext4_msg(sb, KERN_INFO, "Skipping orphan cleanup due to " - "unknown ROCOMPAT features"); - return; - } - - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { - ext4_msg(sb, KERN_INFO, "Errors on filesystem, " - "clearing orphan list.\n"); - es->s_last_orphan = 0; - } - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); - return; - } - - if (s_flags & SB_RDONLY) { - ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~SB_RDONLY; - } -#ifdef CONFIG_QUOTA - /* - * Turn on quotas which were not enabled for read-only mounts if - * 
filesystem has quota feature, so that they are updated correctly. - */ - if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { - int ret = ext4_enable_quotas(sb); - - if (!ret) - quota_update = 1; - else - ext4_msg(sb, KERN_ERR, - "Cannot turn on quotas: error %d", ret); - } - - /* Turn on journaled quotas used for old sytle */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (EXT4_SB(sb)->s_qf_names[i]) { - int ret = ext4_quota_on_mount(sb, i); - - if (!ret) - quota_update = 1; - else - ext4_msg(sb, KERN_ERR, - "Cannot turn on journaled " - "quota: type %d: error %d", i, ret); - } - } -#endif - - while (es->s_last_orphan) { - struct inode *inode; - - /* - * We may have encountered an error during cleanup; if - * so, skip the rest. - */ - if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { - jbd_debug(1, "Skipping orphan recovery on fs with errors.\n"); - es->s_last_orphan = 0; - break; - } - - inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); - if (IS_ERR(inode)) { - es->s_last_orphan = 0; - break; - } - - list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); - dquot_initialize(inode); - if (inode->i_nlink) { - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, - "%s: truncating inode %lu to %lld bytes", - __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %lld bytes\n", - inode->i_ino, inode->i_size); - inode_lock(inode); - truncate_inode_pages(inode->i_mapping, inode->i_size); - ret = ext4_truncate(inode); - if (ret) { - /* - * We need to clean up the in-core orphan list - * manually if ext4_truncate() failed to get a - * transaction handle. 
- */ - ext4_orphan_del(NULL, inode); - ext4_std_error(inode->i_sb, ret); - } - inode_unlock(inode); - nr_truncates++; - } else { - if (test_opt(sb, DEBUG)) - ext4_msg(sb, KERN_DEBUG, - "%s: deleting unreferenced inode %lu", - __func__, inode->i_ino); - jbd_debug(2, "deleting unreferenced inode %lu\n", - inode->i_ino); - nr_orphans++; - } - iput(inode); /* The delete magic happens here! */ - } - -#define PLURAL(x) (x), ((x) == 1) ? "" : "s" - - if (nr_orphans) - ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted", - PLURAL(nr_orphans)); - if (nr_truncates) - ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", - PLURAL(nr_truncates)); -#ifdef CONFIG_QUOTA - /* Turn off quotas if they were enabled for orphan cleanup */ - if (quota_update) { - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); - } - } -#endif - sb->s_flags = s_flags; /* Restore SB_RDONLY status */ -} - /* * Maximal extent format file size. * Resulting logical blkno at s_maxbytes must fit in our on-disk @@ -3309,7 +3148,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) * Returns 1 if this filesystem can be mounted as requested, * 0 if it cannot be. 
*/ -static int ext4_feature_set_ok(struct super_block *sb, int readonly) +int ext4_feature_set_ok(struct super_block *sb, int readonly) { if (ext4_has_unknown_ext4_incompat_features(sb)) { ext4_msg(sb, KERN_ERR, @@ -4011,6 +3850,20 @@ static const char *ext4_quota_mode(struct super_block *sb) #endif } +static void ext4_setup_csum_trigger(struct super_block *sb, + enum ext4_journal_trigger_type type, + void (*trigger)( + struct jbd2_buffer_trigger_type *type, + struct buffer_head *bh, + void *mapped_data, + size_t size)) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + sbi->s_journal_triggers[type].sb = sb; + sbi->s_journal_triggers[type].tr_triggers.t_frozen = trigger; +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) { struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); @@ -4109,6 +3962,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) silent = 1; goto cantfind_ext4; } + ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, + ext4_orphan_file_block_trigger); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); @@ -4432,7 +4287,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } - if (bdev_dax_supported(sb->s_bdev, blocksize)) + if (dax_supported(dax_dev, sb->s_bdev, blocksize, 0, + bdev_nr_sectors(sb->s_bdev))) set_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags); if (sbi->s_mount_opt & EXT4_MOUNT_DAX_ALWAYS) { @@ -4773,6 +4629,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || + ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) @@ -5029,6 +4886,14 @@ no_journal: err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, GFP_KERNEL); } + /* + * Update the checksum after updating free space/inode + * counters. 
Otherwise the superblock can have an incorrect + * checksum in the buffer cache until it is written out and + * e2fsprogs programs trying to open a file system immediately + * after it is mounted can fail. + */ + ext4_superblock_csum_set(sb); if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb), GFP_KERNEL); @@ -5063,12 +4928,15 @@ no_journal: if (err) goto failed_mount7; + err = ext4_init_orphan_info(sb); + if (err) + goto failed_mount8; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. */ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) - goto failed_mount8; + goto failed_mount9; } #endif /* CONFIG_QUOTA */ @@ -5087,7 +4955,7 @@ no_journal: ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount8; + goto failed_mount9; } if (EXT4_SB(sb)->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -5133,6 +5001,8 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +failed_mount9: + ext4_release_orphan_info(sb); failed_mount8: ext4_unregister_sysfs(sb); kobject_put(&sbi->s_kobj); @@ -5643,8 +5513,15 @@ static int ext4_mark_recovery_complete(struct super_block *sb, if (err < 0) goto out; - if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { + if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || + ext4_has_feature_orphan_present(sb))) { + if (!ext4_orphan_file_empty(sb)) { + ext4_error(sb, "Orphan file not empty on read-only fs."); + err = -EFSCORRUPTED; + goto out; + } ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: @@ -5787,6 +5664,8 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. 
*/ ext4_clear_feature_journal_needs_recovery(sb); + if (ext4_orphan_file_empty(sb)) + ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); @@ -5809,6 +5688,8 @@ static int ext4_unfreeze(struct super_block *sb) if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ ext4_set_feature_journal_needs_recovery(sb); + if (ext4_has_feature_orphan_file(sb)) + ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); @@ -6012,7 +5893,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * around from a previously readonly bdev mount, * require a full umount/remount for now. */ - if (es->s_last_orphan) { + if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " @@ -6309,16 +6190,6 @@ static int ext4_write_info(struct super_block *sb, int type) return ret; } -/* - * Turn on quotas during mount time - we need to find - * the quota file and such... - */ -static int ext4_quota_on_mount(struct super_block *sb, int type) -{ - return dquot_quota_on_mount(sb, get_qf_name(sb, EXT4_SB(sb), type), - EXT4_SB(sb)->s_jquota_fmt, type); -} - static void lockdep_set_quota_inode(struct inode *inode, int subclass) { struct ext4_inode_info *ei = EXT4_I(inode); @@ -6448,7 +6319,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, } /* Enable usage tracking for all quota types. 
*/ -static int ext4_enable_quotas(struct super_block *sb) +int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { @@ -6606,7 +6477,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, sb, bh, EXT4_JTR_NONE); if (err) { brelse(bh); return err; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index dd05af983092..69109746e6e2 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -52,10 +52,20 @@ static const char *ext4_encrypted_get_link(struct dentry *dentry, return paddr; } +static int ext4_encrypted_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + const struct inode_operations ext4_encrypted_symlink_inode_operations = { .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, - .getattr = ext4_getattr, + .getattr = ext4_encrypted_symlink_getattr, .listxattr = ext4_listxattr, }; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6dd5c05c444a..1e0fc1ed845b 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -791,7 +791,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); - if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { + if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, + EXT4_JTR_NONE) == 0) { lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); @@ -1169,7 +1170,8 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, continue; } if (err > 0) { - err = ext4_journal_get_write_access(handle, bh); + err = ext4_journal_get_write_access(handle, + 
parent->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_warning_inode(ea_inode, "Re-get write access err=%d", @@ -1230,7 +1232,8 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, int error = 0; BUFFER_TRACE(bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, bh, + EXT4_JTR_NONE); if (error) goto out; @@ -1371,7 +1374,8 @@ retry: "ext4_getblk() return bh = NULL"); return -EFSCORRUPTED; } - ret = ext4_journal_get_write_access(handle, bh); + ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh, + EXT4_JTR_NONE); if (ret) goto out; @@ -1855,7 +1859,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (s->base) { BUFFER_TRACE(bs->bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, bs->bh); + error = ext4_journal_get_write_access(handle, sb, bs->bh, + EXT4_JTR_NONE); if (error) goto cleanup; lock_buffer(bs->bh); @@ -1987,8 +1992,9 @@ inserted: if (error) goto cleanup; BUFFER_TRACE(new_bh, "get_write_access"); - error = ext4_journal_get_write_access(handle, - new_bh); + error = ext4_journal_get_write_access( + handle, sb, new_bh, + EXT4_JTR_NONE); if (error) goto cleanup_dquot; lock_buffer(new_bh); @@ -2092,7 +2098,8 @@ getblk_failed: } lock_buffer(new_bh); - error = ext4_journal_get_create_access(handle, new_bh); + error = ext4_journal_get_create_access(handle, sb, + new_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(new_bh); error = -EIO; @@ -2848,7 +2855,8 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, goto cleanup; } - error = ext4_journal_get_write_access(handle, iloc.bh); + error = ext4_journal_get_write_access(handle, inode->i_sb, + iloc.bh, EXT4_JTR_NONE); if (error) { EXT4_ERROR_INODE(inode, "write access (error %d)", error); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 7669de7b49ce..7eea3cfd894d 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -105,6 +105,13 @@ config F2FS_FS_LZO help 
Support LZO compress algorithm, if unsure, say Y. +config F2FS_FS_LZORLE + bool "LZO-RLE compression support" + depends on F2FS_FS_LZO + default y + help + Support LZO-RLE compress algorithm, if unsure, say Y. + config F2FS_FS_LZ4 bool "LZ4 compression support" depends on F2FS_FS_COMPRESSION @@ -114,7 +121,6 @@ config F2FS_FS_LZ4 config F2FS_FS_LZ4HC bool "LZ4HC compression support" - depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZ4 default y help @@ -128,10 +134,11 @@ config F2FS_FS_ZSTD help Support ZSTD compress algorithm, if unsure, say Y. -config F2FS_FS_LZORLE - bool "LZO-RLE compression support" - depends on F2FS_FS_COMPRESSION - depends on F2FS_FS_LZO +config F2FS_IOSTAT + bool "F2FS IO statistics information" + depends on F2FS_FS default y help - Support LZO-RLE compress algorithm, if unsure, say Y. + Support getting IO statistics through sysfs and printing out periodic + IO statistics tracepoint events. You have to turn on "iostat_enable" + sysfs node to enable this feature. 
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index e5295746208b..8a7322d229e4 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -9,3 +9,4 @@ f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_FS_VERITY) += verity.o f2fs-$(CONFIG_F2FS_FS_COMPRESSION) += compress.o +f2fs-$(CONFIG_F2FS_IOSTAT) += iostat.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 239ad9453b99..16e826e01f09 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -196,8 +196,11 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, return acl; } -struct posix_acl *f2fs_get_acl(struct inode *inode, int type) +struct posix_acl *f2fs_get_acl(struct inode *inode, int type, bool rcu) { + if (rcu) + return ERR_PTR(-ECHILD); + return __f2fs_get_acl(inode, type, NULL); } diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 986fd1bc780b..a26e33cab4ff 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -33,7 +33,7 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL -extern struct posix_acl *f2fs_get_acl(struct inode *, int); +extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); extern int f2fs_set_acl(struct user_namespace *, struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6c208108d69c..83e9bc0f91ff 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -18,6 +18,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "iostat.h" #include <trace/events/f2fs.h> #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) @@ -465,16 +466,29 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; - struct ino_entry *e, *tmp; + struct ino_entry *e = NULL, *new = NULL; - tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS); + if (type == FLUSH_INO) { + rcu_read_lock(); + e = 
radix_tree_lookup(&im->ino_root, ino); + rcu_read_unlock(); + } + +retry: + if (!e) + new = f2fs_kmem_cache_alloc(ino_entry_slab, + GFP_NOFS, true, NULL); radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { - e = tmp; + if (!new) { + spin_unlock(&im->ino_lock); + goto retry; + } + e = new; if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) f2fs_bug_on(sbi, 1); @@ -492,8 +506,8 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, spin_unlock(&im->ino_lock); radix_tree_preload_end(); - if (e != tmp) - kmem_cache_free(ino_entry_slab, tmp); + if (new && e != new) + kmem_cache_free(ino_entry_slab, new); } static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -1289,12 +1303,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; - spin_lock_irqsave(&sbi->cp_lock, flags); + if (cpc->reason & CP_UMOUNT) { + if (le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) { + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to no space"); + } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && + f2fs_nat_bitmap_enabled(sbi)) { + f2fs_enable_nat_bits(sbi); + set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Rebuild and enable nat_bits"); + } + } - if ((cpc->reason & CP_UMOUNT) && - le32_to_cpu(ckpt->cp_pack_total_block_count) > - sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) - disable_nat_bits(sbi, false); + spin_lock_irqsave(&sbi->cp_lock, flags); if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); @@ -1480,7 +1502,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); /* write nat bits */ - if (enabled_nat_bits(sbi, cpc)) { + if ((cpc->reason & CP_UMOUNT) && + is_set_ckpt_flags(sbi, 
CP_NAT_BITS_FLAG)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; @@ -1639,8 +1662,11 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write cached NAT/SIT entries to NAT/SIT area */ err = f2fs_flush_nat_entries(sbi, cpc); - if (err) + if (err) { + f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); goto stop; + } f2fs_flush_sit_entries(sbi, cpc); @@ -1648,10 +1674,13 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_save_inmem_curseg(sbi); err = do_checkpoint(sbi, cpc); - if (err) + if (err) { + f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err); + f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); f2fs_release_discard_addrs(sbi); - else + } else { f2fs_clear_prefree_segments(sbi, cpc); + } f2fs_restore_inmem_curseg(sbi); stop: diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 455561826c7d..c1bf9ad4c220 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -28,7 +28,8 @@ static void *page_array_alloc(struct inode *inode, int nr) unsigned int size = sizeof(struct page *) * nr; if (likely(size <= sbi->page_array_slab_size)) - return kmem_cache_zalloc(sbi->page_array_slab, GFP_NOFS); + return f2fs_kmem_cache_alloc(sbi->page_array_slab, + GFP_F2FS_ZERO, false, F2FS_I_SB(inode)); return f2fs_kzalloc(sbi, size, GFP_NOFS); } @@ -898,6 +899,54 @@ static bool cluster_has_invalid_data(struct compress_ctx *cc) return false; } +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size; + bool compressed = dn->data_blkaddr == COMPRESS_ADDR; + int cluster_end = 0; + int i; + char *reason = ""; + + if (!compressed) + return false; + + /* [..., COMPR_ADDR, ...] 
*/ + if (dn->ofs_in_node % cluster_size) { + reason = "[*|C|*|*]"; + goto out; + } + + for (i = 1; i < cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + /* [COMPR_ADDR, ..., COMPR_ADDR] */ + if (blkaddr == COMPRESS_ADDR) { + reason = "[C|*|C|*]"; + goto out; + } + if (compressed) { + if (!__is_valid_data_blkaddr(blkaddr)) { + if (!cluster_end) + cluster_end = i; + continue; + } + /* [COMPR_ADDR, NULL_ADDR or NEW_ADDR, valid_blkaddr] */ + if (cluster_end) { + reason = "[C|N|N|V]"; + goto out; + } + } + } + return false; +out: + f2fs_warn(sbi, "access invalid cluster, ino:%lu, nid:%u, ofs_in_node:%u, reason:%s", + dn->inode->i_ino, dn->nid, dn->ofs_in_node, reason); + set_sbi_flag(sbi, SBI_NEED_FSCK); + return true; +} + static int __f2fs_cluster_blocks(struct inode *inode, unsigned int cluster_idx, bool compr) { @@ -915,6 +964,11 @@ static int __f2fs_cluster_blocks(struct inode *inode, goto fail; } + if (f2fs_sanity_check_cluster(&dn)) { + ret = -EFSCORRUPTED; + goto fail; + } + if (dn.data_blkaddr == COMPRESS_ADDR) { int i; @@ -1228,7 +1282,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, fio.version = ni.version; - cic = kmem_cache_zalloc(cic_entry_slab, GFP_NOFS); + cic = f2fs_kmem_cache_alloc(cic_entry_slab, GFP_F2FS_ZERO, false, sbi); if (!cic) goto out_put_dnode; @@ -1340,12 +1394,6 @@ out_destroy_crypt: for (--i; i >= 0; i--) fscrypt_finalize_bounce_page(&cc->cpages[i]); - for (i = 0; i < cc->nr_cpages; i++) { - if (!cc->cpages[i]) - continue; - f2fs_compress_free_page(cc->cpages[i]); - cc->cpages[i] = NULL; - } out_put_cic: kmem_cache_free(cic_entry_slab, cic); out_put_dnode: @@ -1356,6 +1404,12 @@ out_unlock_op: else f2fs_unlock_op(sbi); out_free: + for (i = 0; i < cc->nr_cpages; i++) { + if (!cc->cpages[i]) + continue; + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; + } page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; return -EAGAIN; @@ 
-1506,7 +1560,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc) pgoff_t start_idx = start_idx_of_cluster(cc); int i; - dic = kmem_cache_zalloc(dic_entry_slab, GFP_NOFS); + dic = f2fs_kmem_cache_alloc(dic_entry_slab, GFP_F2FS_ZERO, + false, F2FS_I_SB(cc->inode)); if (!dic) return ERR_PTR(-ENOMEM); @@ -1666,6 +1721,30 @@ void f2fs_put_page_dic(struct page *page) f2fs_put_dic(dic); } +/* + * check whether cluster blocks are contiguous, and add extent cache entry + * only if cluster blocks are logically and physically contiguous. + */ +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) +{ + bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR; + int i = compressed ? 1 : 0; + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { + block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + i); + + if (!__is_valid_data_blkaddr(blkaddr)) + break; + if (first_blkaddr + i - (compressed ? 1 : 0) != blkaddr) + return 0; + } + + return compressed ? 
i - 1 : i; +} + const struct address_space_operations f2fs_compress_aops = { .releasepage = f2fs_release_page, .invalidatepage = f2fs_invalidate_page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index eb222b35edef..f4fd6c246c9a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -25,6 +25,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "iostat.h" #include <trace/events/f2fs.h> #define NUM_PREALLOC_POST_READ_CTXS 128 @@ -116,6 +117,7 @@ struct bio_post_read_ctx { struct f2fs_sb_info *sbi; struct work_struct work; unsigned int enabled_steps; + block_t fs_blkaddr; }; static void f2fs_finish_read_bio(struct bio *bio) @@ -228,7 +230,7 @@ static void f2fs_handle_step_decompress(struct bio_post_read_ctx *ctx) struct bio_vec *bv; struct bvec_iter_all iter_all; bool all_compressed = true; - block_t blkaddr = SECTOR_TO_BLOCK(ctx->bio->bi_iter.bi_sector); + block_t blkaddr = ctx->fs_blkaddr; bio_for_each_segment_all(bv, ctx->bio, iter_all) { struct page *page = bv->bv_page; @@ -269,7 +271,10 @@ static void f2fs_post_read_work(struct work_struct *work) static void f2fs_read_end_io(struct bio *bio) { struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio)); - struct bio_post_read_ctx *ctx = bio->bi_private; + struct bio_post_read_ctx *ctx; + + iostat_update_and_unbind_ctx(bio, 0); + ctx = bio->bi_private; if (time_to_inject(sbi, FAULT_READ_IO)) { f2fs_show_injection_info(sbi, FAULT_READ_IO); @@ -291,10 +296,13 @@ static void f2fs_read_end_io(struct bio *bio) static void f2fs_write_end_io(struct bio *bio) { - struct f2fs_sb_info *sbi = bio->bi_private; + struct f2fs_sb_info *sbi; struct bio_vec *bvec; struct bvec_iter_all iter_all; + iostat_update_and_unbind_ctx(bio, 1); + sbi = bio->bi_private; + if (time_to_inject(sbi, FAULT_WRITE_IO)) { f2fs_show_injection_info(sbi, FAULT_WRITE_IO); bio->bi_status = BLK_STS_IOERR; @@ -398,6 +406,8 @@ static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) bio->bi_write_hint = 
f2fs_io_type_to_rw_hint(sbi, fio->type, fio->temp); } + iostat_alloc_and_bind_ctx(sbi, bio, NULL); + if (fio->io_wbc) wbc_init_bio(fio->io_wbc, bio); @@ -479,6 +489,8 @@ submit_io: trace_f2fs_submit_read_bio(sbi->sb, type, bio); else trace_f2fs_submit_write_bio(sbi->sb, type, bio); + + iostat_update_submit_ctx(bio, type); submit_bio(bio); } @@ -723,7 +735,7 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct bio_entry *be; - be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS, true, NULL); be->bio = bio; bio_get(bio); @@ -970,7 +982,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct bio *bio; - struct bio_post_read_ctx *ctx; + struct bio_post_read_ctx *ctx = NULL; unsigned int post_read_steps = 0; bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, @@ -1003,8 +1015,10 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, ctx->bio = bio; ctx->sbi = sbi; ctx->enabled_steps = post_read_steps; + ctx->fs_blkaddr = blkaddr; bio->bi_private = ctx; } + iostat_alloc_and_bind_ctx(sbi, bio, ctx); return bio; } @@ -1133,7 +1147,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei = {0, 0, 0}; + struct extent_info ei = {0, }; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -1150,7 +1164,7 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; int err; page = f2fs_grab_cache_page(mapping, index, for_write); @@ -1448,7 +1462,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, 
ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; @@ -1490,7 +1504,21 @@ next_dnode: if (err) { if (flag == F2FS_GET_BLOCK_BMAP) map->m_pblk = 0; + if (err == -ENOENT) { + /* + * There is one exceptional case that read_node_page() + * may return -ENOENT due to filesystem has been + * shutdown or cp_error, so force to convert error + * number to EIO for such case. + */ + if (map->m_may_create && + (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) || + f2fs_cp_error(sbi))) { + err = -EIO; + goto unlock_out; + } + err = 0; if (map->m_next_pgofs) *map->m_next_pgofs = @@ -1550,6 +1578,13 @@ next_block: map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { + if (f2fs_compressed_file(inode) && + f2fs_sanity_check_cluster(&dn) && + (flag != F2FS_GET_BLOCK_FIEMAP || + IS_ENABLED(CONFIG_F2FS_CHECK_FS))) { + err = -EFSCORRUPTED; + goto sync_out; + } if (flag == F2FS_GET_BLOCK_BMAP) { map->m_pblk = 0; goto sync_out; @@ -1843,8 +1878,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; - bool compr_cluster = false; + bool compr_cluster = false, compr_appended; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + unsigned int count_in_cluster = 0; loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { @@ -1892,15 +1928,17 @@ next: map.m_next_pgofs = &next_pgofs; map.m_seg_type = NO_CHECK_TYPE; - if (compr_cluster) - map.m_len = cluster_size - 1; + if (compr_cluster) { + map.m_lblk += 1; + map.m_len = cluster_size - count_in_cluster; + } ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) goto out; /* HOLE */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) { + if (!compr_cluster && !(map.m_flags & F2FS_MAP_FLAGS)) { start_blk = next_pgofs; if (blks_to_bytes(inode, start_blk) < blks_to_bytes(inode, @@ -1910,6 +1948,14 @@ next: flags |= 
FIEMAP_EXTENT_LAST; } + compr_appended = false; + /* In a case of compressed cluster, append this to the last extent */ + if (compr_cluster && ((map.m_flags & F2FS_MAP_UNWRITTEN) || + !(map.m_flags & F2FS_MAP_FLAGS))) { + compr_appended = true; + goto skip_fill; + } + if (size) { flags |= FIEMAP_EXTENT_MERGED; if (IS_ENCRYPTED(inode)) @@ -1926,38 +1972,36 @@ next: if (start_blk > last_blk) goto out; - if (compr_cluster) { - compr_cluster = false; - - - logical = blks_to_bytes(inode, start_blk - 1); - phys = blks_to_bytes(inode, map.m_pblk); - size = blks_to_bytes(inode, cluster_size); - - flags |= FIEMAP_EXTENT_ENCODED; - - start_blk += cluster_size - 1; - - if (start_blk > last_blk) - goto out; - - goto prep_next; - } - +skip_fill: if (map.m_pblk == COMPRESS_ADDR) { compr_cluster = true; - start_blk++; - goto prep_next; - } - - logical = blks_to_bytes(inode, start_blk); - phys = blks_to_bytes(inode, map.m_pblk); - size = blks_to_bytes(inode, map.m_len); - flags = 0; - if (map.m_flags & F2FS_MAP_UNWRITTEN) - flags = FIEMAP_EXTENT_UNWRITTEN; + count_in_cluster = 1; + } else if (compr_appended) { + unsigned int appended_blks = cluster_size - + count_in_cluster + 1; + size += blks_to_bytes(inode, appended_blks); + start_blk += appended_blks; + compr_cluster = false; + } else { + logical = blks_to_bytes(inode, start_blk); + phys = __is_valid_data_blkaddr(map.m_pblk) ? 
+ blks_to_bytes(inode, map.m_pblk) : 0; + size = blks_to_bytes(inode, map.m_len); + flags = 0; + + if (compr_cluster) { + flags = FIEMAP_EXTENT_ENCODED; + count_in_cluster += map.m_len; + if (count_in_cluster == cluster_size) { + compr_cluster = false; + size += blks_to_bytes(inode, 1); + } + } else if (map.m_flags & F2FS_MAP_UNWRITTEN) { + flags = FIEMAP_EXTENT_UNWRITTEN; + } - start_blk += bytes_to_blks(inode, size); + start_blk += bytes_to_blks(inode, size); + } prep_next: cond_resched(); @@ -2115,6 +2159,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, sector_t last_block_in_file; const unsigned blocksize = blks_to_bytes(inode, 1); struct decompress_io_ctx *dic = NULL; + struct extent_info ei = {0, }; + bool from_dnode = true; int i; int ret = 0; @@ -2137,6 +2183,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, continue; } unlock_page(page); + if (for_write) + put_page(page); cc->rpages[i] = NULL; cc->nr_rpages--; } @@ -2145,6 +2193,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (f2fs_cluster_is_empty(cc)) goto out; + if (f2fs_lookup_extent_cache(inode, start_idx, &ei)) + from_dnode = false; + + if (!from_dnode) + goto skip_reading_dnode; + set_new_dnode(&dn, inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE); if (ret) @@ -2152,11 +2206,13 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); +skip_reading_dnode: for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; - blkaddr = data_blkaddr(dn.inode, dn.node_page, - dn.ofs_in_node + i); + blkaddr = from_dnode ? 
data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i) : + ei.blk + i - 1; if (!__is_valid_data_blkaddr(blkaddr)) break; @@ -2166,6 +2222,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, goto out_put_dnode; } cc->nr_cpages++; + + if (!from_dnode && i >= ei.c_len) + break; } /* nothing to decompress */ @@ -2185,8 +2244,9 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, block_t blkaddr; struct bio_post_read_ctx *ctx; - blkaddr = data_blkaddr(dn.inode, dn.node_page, - dn.ofs_in_node + i + 1); + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, + dn.ofs_in_node + i + 1) : + ei.blk + i; f2fs_wait_on_block_writeback(inode, blkaddr); @@ -2220,7 +2280,7 @@ submit_and_realloc: if (bio_add_page(bio, page, blocksize, 0) < blocksize) goto submit_and_realloc; - ctx = bio->bi_private; + ctx = get_post_read_ctx(bio); ctx->enabled_steps |= STEP_DECOMPRESS; refcount_inc(&dic->refcnt); @@ -2231,13 +2291,15 @@ submit_and_realloc: *last_block_in_bio = blkaddr; } - f2fs_put_dnode(&dn); + if (from_dnode) + f2fs_put_dnode(&dn); *bio_ret = bio; return 0; out_put_dnode: - f2fs_put_dnode(&dn); + if (from_dnode) + f2fs_put_dnode(&dn); out: for (i = 0; i < cc->cluster_size; i++) { if (cc->rpages[i]) { @@ -2272,6 +2334,7 @@ static int f2fs_mpage_readpages(struct inode *inode, .nr_rpages = 0, .nr_cpages = 0, }; + pgoff_t nc_cluster_idx = NULL_CLUSTER; #endif unsigned nr_pages = rac ? 
readahead_count(rac) : 1; unsigned max_nr_pages = nr_pages; @@ -2304,12 +2367,23 @@ static int f2fs_mpage_readpages(struct inode *inode, if (ret) goto set_error_page; } - ret = f2fs_is_compressed_cluster(inode, page->index); - if (ret < 0) - goto set_error_page; - else if (!ret) - goto read_single_page; + if (cc.cluster_idx == NULL_CLUSTER) { + if (nc_cluster_idx == + page->index >> cc.log_cluster_size) { + goto read_single_page; + } + ret = f2fs_is_compressed_cluster(inode, page->index); + if (ret < 0) + goto set_error_page; + else if (!ret) { + nc_cluster_idx = + page->index >> cc.log_cluster_size; + goto read_single_page; + } + + nc_cluster_idx = NULL_CLUSTER; + } ret = f2fs_init_compress_ctx(&cc); if (ret) goto set_error_page; @@ -2498,6 +2572,8 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) return true; if (f2fs_is_atomic_file(inode)) return true; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + return true; /* swap file is migrating in aligned write mode */ if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) @@ -2530,7 +2606,7 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio) struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; struct node_info ni; bool ipu_force = false; int err = 0; @@ -3176,9 +3252,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, FS_CP_DATA_IO : FS_DATA_IO); } -static void f2fs_write_failed(struct address_space *mapping, loff_t to) +static void f2fs_write_failed(struct inode *inode, loff_t to) { - struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); if (IS_NOQUOTA(inode)) @@ -3187,12 +3262,12 @@ static void f2fs_write_failed(struct address_space *mapping, loff_t to) /* In the fs-verity case, f2fs_end_enable_verity() does the truncate */ if (to > i_size && !f2fs_verity_in_progress(inode)) { down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - 
filemap_invalidate_lock(mapping); + filemap_invalidate_lock(inode->i_mapping); truncate_pagecache(inode, i_size); f2fs_truncate_blocks(inode, i_size, true); - filemap_invalidate_unlock(mapping); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3206,7 +3281,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei = {0,0,0}; + struct extent_info ei = {0, }; int err = 0; int flag; @@ -3328,6 +3403,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *fsdata = NULL; + if (len == PAGE_SIZE) + goto repeat; + ret = f2fs_prepare_compress_overwrite(inode, pagep, index, fsdata); if (ret < 0) { @@ -3410,7 +3488,7 @@ repeat: fail: f2fs_put_page(page, 1); - f2fs_write_failed(mapping, pos + len); + f2fs_write_failed(inode, pos + len); if (drop_atomic) f2fs_drop_inmem_pages_all(sbi, false); return err; @@ -3552,7 +3630,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (f2fs_force_buffered_io(inode, iocb, iter)) return 0; - do_opu = allow_outplace_dio(inode, iocb, iter); + do_opu = rw == WRITE && f2fs_lfs_mode(sbi); trace_f2fs_direct_IO_enter(inode, offset, count, rw); @@ -3600,7 +3678,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, count - iov_iter_count(iter)); } else if (err < 0) { - f2fs_write_failed(mapping, offset + count); + f2fs_write_failed(inode, offset + count); } } else { if (err > 0) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 833325038ef3..8c50518475a9 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -323,11 +323,27 @@ get_cache: #endif } +static char *s_flag[] = { + [SBI_IS_DIRTY] = " fs_dirty", + [SBI_IS_CLOSE] = " closing", + [SBI_NEED_FSCK] = " need_fsck", + [SBI_POR_DOING] = " recovering", + [SBI_NEED_SB_WRITE] = " sb_dirty", + [SBI_NEED_CP] = " need_cp", + 
[SBI_IS_SHUTDOWN] = " shutdown", + [SBI_IS_RECOVERED] = " recovered", + [SBI_CP_DISABLED] = " cp_disabled", + [SBI_CP_DISABLED_QUICK] = " cp_disabled_quick", + [SBI_QUOTA_NEED_FLUSH] = " quota_need_flush", + [SBI_QUOTA_SKIP_FLUSH] = " quota_skip_flush", + [SBI_QUOTA_NEED_REPAIR] = " quota_need_repair", + [SBI_IS_RESIZEFS] = " resizefs", +}; + static int stat_show(struct seq_file *s, void *v) { struct f2fs_stat_info *si; - int i = 0; - int j; + int i = 0, j = 0; mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { @@ -337,7 +353,13 @@ static int stat_show(struct seq_file *s, void *v) si->sbi->sb->s_bdev, i++, f2fs_readonly(si->sbi->sb) ? "RO": "RW", is_set_ckpt_flags(si->sbi, CP_DISABLED_FLAG) ? - "Disabled": (f2fs_cp_error(si->sbi) ? "Error": "Good")); + "Disabled" : (f2fs_cp_error(si->sbi) ? "Error" : "Good")); + if (si->sbi->s_flag) { + seq_puts(s, "[SBI:"); + for_each_set_bit(j, &si->sbi->s_flag, 32) + seq_puts(s, s_flag[j]); + seq_puts(s, "]\n"); + } seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", @@ -450,6 +472,15 @@ static int stat_show(struct seq_file *s, void *v) si->data_segs, si->bg_data_segs); seq_printf(s, " - node segments : %d (%d)\n", si->node_segs, si->bg_node_segs); + seq_printf(s, " - Reclaimed segs : Normal (%d), Idle CB (%d), " + "Idle Greedy (%d), Idle AT (%d), " + "Urgent High (%d), Urgent Low (%d)\n", + si->sbi->gc_reclaimed_segs[GC_NORMAL], + si->sbi->gc_reclaimed_segs[GC_IDLE_CB], + si->sbi->gc_reclaimed_segs[GC_IDLE_GREEDY], + si->sbi->gc_reclaimed_segs[GC_IDLE_AT], + si->sbi->gc_reclaimed_segs[GC_URGENT_HIGH], + si->sbi->gc_reclaimed_segs[GC_URGENT_LOW]); seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, si->bg_data_blks + si->bg_node_blks); seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, @@ -611,7 +642,7 @@ void __init f2fs_create_root_stats(void) #ifdef CONFIG_DEBUG_FS f2fs_debugfs_root = 
debugfs_create_dir("f2fs", NULL); - debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, + debugfs_create_file("status", 0444, f2fs_debugfs_root, NULL, &stat_fops); #endif } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 456651682daf..1820e9c106f7 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -83,8 +83,8 @@ int f2fs_init_casefolded_name(const struct inode *dir, struct super_block *sb = dir->i_sb; if (IS_CASEFOLDED(dir)) { - fname->cf_name.name = kmem_cache_alloc(f2fs_cf_name_slab, - GFP_NOFS); + fname->cf_name.name = f2fs_kmem_cache_alloc(f2fs_cf_name_slab, + GFP_NOFS, false, F2FS_SB(sb)); if (!fname->cf_name.name) return -ENOMEM; fname->cf_name.len = utf8_casefold(sb->s_encoding, @@ -1000,6 +1000,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, struct f2fs_sb_info *sbi = F2FS_I_SB(d->inode); struct blk_plug plug; bool readdir_ra = sbi->readdir_ra == 1; + bool found_valid_dirent = false; int err = 0; bit_pos = ((unsigned long)ctx->pos % d->max); @@ -1014,13 +1015,15 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, de = &d->dentry[bit_pos]; if (de->name_len == 0) { + if (found_valid_dirent || !bit_pos) { + printk_ratelimited( + "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", + KERN_WARNING, sbi->sb->s_id, + le32_to_cpu(de->ino)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } bit_pos++; ctx->pos = start_pos + bit_pos; - printk_ratelimited( - "%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.", - KERN_WARNING, sbi->sb->s_id, - le32_to_cpu(de->ino)); - set_sbi_flag(sbi, SBI_NEED_FSCK); continue; } @@ -1063,6 +1066,7 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, f2fs_ra_node_page(sbi, le32_to_cpu(de->ino)); ctx->pos = start_pos + bit_pos; + found_valid_dirent = true; } out: if (readdir_ra) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 3ebf976a682d..866e72b29bd5 100644 --- a/fs/f2fs/extent_cache.c +++ 
b/fs/f2fs/extent_cache.c @@ -239,7 +239,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, { struct extent_node *en; - en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + en = f2fs_kmem_cache_alloc(extent_node_slab, GFP_ATOMIC, false, sbi); if (!en) return NULL; @@ -292,7 +292,8 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { - et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + et = f2fs_kmem_cache_alloc(extent_tree_slab, + GFP_NOFS, true, NULL); f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); memset(et, 0, sizeof(struct extent_tree)); et->ino = ino; @@ -661,6 +662,47 @@ static void f2fs_update_extent_tree_range(struct inode *inode, f2fs_mark_inode_dirty_sync(inode, true); } +#ifdef CONFIG_F2FS_FS_COMPRESSION +void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + struct extent_node *en = NULL; + struct extent_node *prev_en = NULL, *next_en = NULL; + struct extent_info ei; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + bool leftmost = false; + + trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, llen); + + /* it is safe here to check FI_NO_EXTENT w/o et->lock in ro image */ + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + return; + + write_lock(&et->lock); + + en = (struct extent_node *)f2fs_lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false, + &leftmost); + if (en) + goto unlock_out; + + set_extent_info(&ei, fofs, blkaddr, llen); + ei.c_len = c_len; + + if (!__try_merge_extent_node(sbi, et, &ei, prev_en, next_en)) + __insert_extent_tree(sbi, et, &ei, + insert_p, 
insert_parent, leftmost); +unlock_out: + write_unlock(&et->lock); +} +#endif + unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { struct extent_tree *et, *next; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 906b2c4b50e7..b339ae89c1ad 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -43,6 +43,7 @@ enum { FAULT_KVMALLOC, FAULT_PAGE_ALLOC, FAULT_PAGE_GET, + FAULT_ALLOC_BIO, /* it's obsolete due to bio_alloc() will never fail */ FAULT_ALLOC_NID, FAULT_ORPHAN, FAULT_BLOCK, @@ -53,6 +54,7 @@ enum { FAULT_CHECKPOINT, FAULT_DISCARD, FAULT_WRITE_IO, + FAULT_SLAB_ALLOC, FAULT_MAX, }; @@ -139,6 +141,11 @@ struct f2fs_mount_info { int fsync_mode; /* fsync policy */ int fs_mode; /* fs mode: LFS or ADAPTIVE */ int bggc_mode; /* bggc mode: off, on or sync */ + int discard_unit; /* + * discard command's offset/size should + * be aligned to this unit: block, + * segment or section + */ struct fscrypt_dummy_policy dummy_enc_policy; /* test dummy encryption */ block_t unusable_cap_perc; /* percentage for cap */ block_t unusable_cap; /* Amount of space allowed to be @@ -542,7 +549,7 @@ enum { */ }; -#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */ +#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ /* congestion wait timeout value, default: 20ms */ #define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20)) @@ -575,6 +582,9 @@ struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ u32 blk; /* start block address of the extent */ +#ifdef CONFIG_F2FS_FS_COMPRESSION + unsigned int c_len; /* physical extent length of compressed blocks */ +#endif }; struct extent_node { @@ -793,6 +803,9 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->fofs = fofs; ei->blk = blk; ei->len = len; +#ifdef CONFIG_F2FS_FS_COMPRESSION + ei->c_len = 0; +#endif } static inline bool __is_discard_mergeable(struct discard_info *back, @@ -817,6 +830,12 @@ 
static inline bool __is_discard_front_mergeable(struct discard_info *cur, static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (back->c_len && back->len != back->c_len) + return false; + if (front->c_len && front->len != front->c_len) + return false; +#endif return (back->fofs + back->len == front->fofs && back->blk + back->len == front->blk); } @@ -1252,6 +1271,7 @@ enum { GC_IDLE_AT, GC_URGENT_HIGH, GC_URGENT_LOW, + MAX_GC_MODE, }; enum { @@ -1297,6 +1317,12 @@ enum { */ }; +enum { + DISCARD_UNIT_BLOCK, /* basic discard unit is block */ + DISCARD_UNIT_SEGMENT, /* basic discard unit is segment */ + DISCARD_UNIT_SECTION, /* basic discard unit is section */ +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1686,14 +1712,6 @@ struct f2fs_sb_info { #endif spinlock_t stat_lock; /* lock for stat operations */ - /* For app/fs IO statistics */ - spinlock_t iostat_lock; - unsigned long long rw_iostat[NR_IO_TYPE]; - unsigned long long prev_rw_iostat[NR_IO_TYPE]; - bool iostat_enable; - unsigned long iostat_next_period; - unsigned int iostat_period_ms; - /* to attach REQ_META|REQ_FUA flags */ unsigned int data_io_flag; unsigned int node_io_flag; @@ -1732,6 +1750,12 @@ struct f2fs_sb_info { struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ + /* For reclaimed segs statistics per each GC mode */ + unsigned int gc_segment_mode; /* GC state for reclaimed segments */ + unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ + + unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. 
files in fadvise */ + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -1747,6 +1771,20 @@ struct f2fs_sb_info { unsigned int compress_watermark; /* cache page watermark */ atomic_t compress_page_hit; /* cache hit count */ #endif + +#ifdef CONFIG_F2FS_IOSTAT + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long rw_iostat[NR_IO_TYPE]; + unsigned long long prev_rw_iostat[NR_IO_TYPE]; + bool iostat_enable; + unsigned long iostat_next_period; + unsigned int iostat_period_ms; + + /* For io latency related statistics info in one iostat period */ + spinlock_t iostat_lat_lock; + struct iostat_lat_info *iostat_io_lat; +#endif }; struct f2fs_private_dio { @@ -2034,36 +2072,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock_irqrestore(&sbi->cp_lock, flags); } -static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) -{ - unsigned long flags; - unsigned char *nat_bits; - - /* - * In order to re-enable nat_bits we need to call fsck.f2fs by - * set_sbi_flag(sbi, SBI_NEED_FSCK). But it may give huge cost, - * so let's rely on regular fsck or unclean shutdown. - */ - - if (lock) - spin_lock_irqsave(&sbi->cp_lock, flags); - __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); - nat_bits = NM_I(sbi)->nat_bits; - NM_I(sbi)->nat_bits = NULL; - if (lock) - spin_unlock_irqrestore(&sbi->cp_lock, flags); - - kvfree(nat_bits); -} - -static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, - struct cp_control *cpc) -{ - bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - - return (cpc) ? 
(cpc->reason & CP_UMOUNT) && set : set; -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -2587,7 +2595,7 @@ static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, return kmem_cache_create(name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); } -static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, +static inline void *f2fs_kmem_cache_alloc_nofail(struct kmem_cache *cachep, gfp_t flags) { void *entry; @@ -2598,6 +2606,20 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, return entry; } +static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep, + gfp_t flags, bool nofail, struct f2fs_sb_info *sbi) +{ + if (nofail) + return f2fs_kmem_cache_alloc_nofail(cachep, flags); + + if (time_to_inject(sbi, FAULT_SLAB_ALLOC)) { + f2fs_show_injection_info(sbi, FAULT_SLAB_ALLOC); + return NULL; + } + + return kmem_cache_alloc(cachep, flags); +} + static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type) { if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) || @@ -3210,47 +3232,6 @@ static inline int get_inline_xattr_addrs(struct inode *inode) sizeof((f2fs_inode)->field)) \ <= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize))) \ -#define DEFAULT_IOSTAT_PERIOD_MS 3000 -#define MIN_IOSTAT_PERIOD_MS 100 -/* maximum period of iostat tracing is 1 day */ -#define MAX_IOSTAT_PERIOD_MS 8640000 - -static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) -{ - int i; - - spin_lock(&sbi->iostat_lock); - for (i = 0; i < NR_IO_TYPE; i++) { - sbi->rw_iostat[i] = 0; - sbi->prev_rw_iostat[i] = 0; - } - spin_unlock(&sbi->iostat_lock); -} - -extern void f2fs_record_iostat(struct f2fs_sb_info *sbi); - -static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, - enum iostat_type type, unsigned long long io_bytes) -{ - if (!sbi->iostat_enable) - return; - spin_lock(&sbi->iostat_lock); - sbi->rw_iostat[type] += io_bytes; - - if (type == APP_WRITE_IO || type == APP_DIRECT_IO) 
- sbi->rw_iostat[APP_BUFFERED_IO] = - sbi->rw_iostat[APP_WRITE_IO] - - sbi->rw_iostat[APP_DIRECT_IO]; - - if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) - sbi->rw_iostat[APP_BUFFERED_READ_IO] = - sbi->rw_iostat[APP_READ_IO] - - sbi->rw_iostat[APP_DIRECT_READ_IO]; - spin_unlock(&sbi->iostat_lock); - - f2fs_record_iostat(sbi); -} - #define __is_large_section(sbi) ((sbi)->segs_per_sec > 1) #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META) @@ -3417,6 +3398,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); @@ -3441,6 +3423,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); @@ -3464,6 +3447,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi); void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi); void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi); bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi); @@ -3986,6 +3970,9 @@ void 
f2fs_destroy_extent_cache(void); /* * sysfs.c */ +#define MIN_RA_MUL 2 +#define MAX_RA_MUL 256 + int __init f2fs_init_sysfs(void); void f2fs_exit_sysfs(void); int f2fs_register_sysfs(struct f2fs_sb_info *sbi); @@ -4040,18 +4027,23 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); int f2fs_write_multi_pages(struct compress_ctx *cc, int *submitted, struct writeback_control *wbc, enum iostat_type io_type); int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index); +void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len); int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, unsigned nr_pages, sector_t *last_block_in_bio, bool is_readahead, bool for_write); struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); void f2fs_put_page_dic(struct page *page); +unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn); int f2fs_init_compress_ctx(struct compress_ctx *cc); void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); @@ -4106,6 +4098,8 @@ static inline void f2fs_put_page_dic(struct page *page) { WARN_ON_ONCE(1); } +static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; } +static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; } static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { } static inline int 
f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return 0; } @@ -4121,6 +4115,9 @@ static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) +static inline void f2fs_update_extent_tree_range_compressed(struct inode *inode, + pgoff_t fofs, block_t blkaddr, unsigned int llen, + unsigned int c_len) { } #endif static inline void set_compress_context(struct inode *inode) @@ -4136,7 +4133,8 @@ static inline void set_compress_context(struct inode *inode) 1 << COMPRESS_CHKSUM : 0; F2FS_I(inode)->i_cluster_size = 1 << F2FS_I(inode)->i_log_cluster_size; - if (F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 && + if ((F2FS_I(inode)->i_compress_algorithm == COMPRESS_LZ4 || + F2FS_I(inode)->i_compress_algorithm == COMPRESS_ZSTD) && F2FS_OPTION(sbi).compress_level) F2FS_I(inode)->i_compress_flag |= F2FS_OPTION(sbi).compress_level << @@ -4304,16 +4302,6 @@ static inline int block_unaligned_IO(struct inode *inode, return align & blocksize_mask; } -static inline int allow_outplace_dio(struct inode *inode, - struct kiocb *iocb, struct iov_iter *iter) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int rw = iov_iter_rw(iter); - - return (f2fs_lfs_mode(sbi) && (rw == WRITE) && - !block_unaligned_IO(inode, iocb, iter)); -} - static inline bool f2fs_force_buffered_io(struct inode *inode, struct kiocb *iocb, struct iov_iter *iter) { @@ -4368,6 +4356,11 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi) return false; } +static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; +} + #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 1ff333755721..9c8ef33bd8d3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -23,6 +23,7 @@ 
#include <linux/nls.h> #include <linux/sched/signal.h> #include <linux/fileattr.h> +#include <linux/fadvise.h> #include "f2fs.h" #include "node.h" @@ -30,6 +31,7 @@ #include "xattr.h" #include "acl.h" #include "gc.h" +#include "iostat.h" #include <trace/events/f2fs.h> #include <uapi/linux/f2fs.h> @@ -258,8 +260,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, }; unsigned int seq_id = 0; - if (unlikely(f2fs_readonly(inode->i_sb) || - is_sbi_flag_set(sbi, SBI_CP_DISABLED))) + if (unlikely(f2fs_readonly(inode->i_sb))) return 0; trace_f2fs_sync_file_enter(inode); @@ -273,7 +274,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, ret = file_write_and_wait_range(file, start, end); clear_inode_flag(inode, FI_NEED_IPU); - if (ret) { + if (ret || is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret); return ret; } @@ -298,6 +299,18 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, f2fs_exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; + } else { + /* + * for OPU case, during fsync(), node can be persisted before + * data when lower device doesn't support write barrier, result + * in data corruption after SPO. + * So for strict fsync mode, force to use atomic write semantics + * to keep write order in between data/node and last node to + * avoid potential data corruption. + */ + if (F2FS_OPTION(sbi).fsync_mode == + FSYNC_MODE_STRICT && !atomic) + atomic = true; } go_write: /* @@ -737,6 +750,14 @@ int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock) return err; #ifdef CONFIG_F2FS_FS_COMPRESSION + /* + * For compressed file, after release compress blocks, don't allow write + * direct, but we should allow write direct after truncate to zero. 
+ */ + if (f2fs_compressed_file(inode) && !free_from + && is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) + clear_inode_flag(inode, FI_COMPRESS_RELEASED); + if (from != free_from) { err = f2fs_truncate_partial_cluster(inode, from, lock); if (err) @@ -1082,7 +1103,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } if (pg_start < pg_end) { - struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1092,16 +1112,15 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_end = (loff_t)pg_end << PAGE_SHIFT; down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - filemap_invalidate_lock(mapping); + filemap_invalidate_lock(inode->i_mapping); - truncate_inode_pages_range(mapping, blk_start, - blk_end - 1); + truncate_pagecache_range(inode, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = f2fs_truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); - filemap_invalidate_unlock(mapping); + filemap_invalidate_unlock(inode->i_mapping); up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); } } @@ -3473,8 +3492,8 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) released_blocks += ret; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_unlock(inode->i_mapping); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); out: inode_unlock(inode); @@ -3626,8 +3645,8 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) reserved_blocks += ret; } - up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_unlock(inode->i_mapping); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); @@ -4290,7 +4309,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * back to buffered IO. 
*/ if (!f2fs_force_buffered_io(inode, iocb, from) && - allow_outplace_dio(inode, iocb, from)) + f2fs_lfs_mode(F2FS_I_SB(inode))) goto write; } preallocated = true; @@ -4330,6 +4349,34 @@ out: return ret; } +static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, + int advice) +{ + struct inode *inode; + struct address_space *mapping; + struct backing_dev_info *bdi; + + if (advice == POSIX_FADV_SEQUENTIAL) { + inode = file_inode(filp); + if (S_ISFIFO(inode->i_mode)) + return -ESPIPE; + + mapping = filp->f_mapping; + if (!mapping || len < 0) + return -EINVAL; + + bdi = inode_to_bdi(mapping->host); + filp->f_ra.ra_pages = bdi->ra_pages * + F2FS_I_SB(inode)->seq_file_ra_mul; + spin_lock(&filp->f_lock); + filp->f_mode &= ~FMODE_RANDOM; + spin_unlock(&filp->f_lock); + return 0; + } + + return generic_fadvise(filp, offset, len, advice); +} + #ifdef CONFIG_COMPAT struct compat_f2fs_gc_range { u32 sync; @@ -4458,4 +4505,5 @@ const struct file_operations f2fs_file_operations = { #endif .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .fadvise = f2fs_file_fadvise, }; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0e42ee5f7770..77391e3b7d68 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -19,6 +19,7 @@ #include "node.h" #include "segment.h" #include "gc.h" +#include "iostat.h" #include <trace/events/f2fs.h> static struct kmem_cache *victim_entry_slab; @@ -371,7 +372,8 @@ static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi, struct atgc_management *am = &sbi->am; struct victim_entry *ve; - ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS); + ve = f2fs_kmem_cache_alloc(victim_entry_slab, + GFP_NOFS, true, NULL); ve->mtime = mtime; ve->segno = segno; @@ -849,7 +851,8 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) iput(inode); return; } - new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS); + new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, + GFP_NOFS, 
true, NULL); new_ie->inode = inode; f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie); @@ -1497,8 +1500,10 @@ next_step: int err; if (S_ISREG(inode->i_mode)) { - if (!down_write_trylock(&fi->i_gc_rwsem[READ])) + if (!down_write_trylock(&fi->i_gc_rwsem[READ])) { + sbi->skipped_gc_rwsem++; continue; + } if (!down_write_trylock( &fi->i_gc_rwsem[WRITE])) { sbi->skipped_gc_rwsem++; @@ -1646,6 +1651,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, force_migrate); stat_inc_seg_count(sbi, type, gc_type); + sbi->gc_reclaimed_segs[sbi->gc_mode]++; migrated++; freed: @@ -1747,7 +1753,7 @@ gc_more: round++; } - if (gc_type == FG_GC && seg_freed) + if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (sync) diff --git a/fs/f2fs/iostat.c b/fs/f2fs/iostat.c new file mode 100644 index 000000000000..cdcf54ae0db8 --- /dev/null +++ b/fs/f2fs/iostat.c @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * f2fs iostat support + * + * Copyright 2021 Google LLC + * Author: Daeho Jeong <daehojeong@google.com> + */ + +#include <linux/fs.h> +#include <linux/f2fs_fs.h> +#include <linux/seq_file.h> + +#include "f2fs.h" +#include "iostat.h" +#include <trace/events/f2fs.h> + +#define NUM_PREALLOC_IOSTAT_CTXS 128 +static struct kmem_cache *bio_iostat_ctx_cache; +static mempool_t *bio_iostat_ctx_pool; + +int __maybe_unused iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app write IOs */ + seq_puts(seq, "[WRITE]\n"); + seq_printf(seq, "app buffered: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_IO]); + + /* print fs write IOs */ + seq_printf(seq, "fs data: %-16llu\n", + 
sbi->rw_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->rw_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->rw_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->rw_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->rw_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->rw_iostat[FS_CP_META_IO]); + + /* print app read IOs */ + seq_puts(seq, "[READ]\n"); + seq_printf(seq, "app buffered: %-16llu\n", + sbi->rw_iostat[APP_BUFFERED_READ_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->rw_iostat[APP_DIRECT_READ_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->rw_iostat[APP_MAPPED_READ_IO]); + + /* print fs read IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->rw_iostat[FS_DATA_READ_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->rw_iostat[FS_GDATA_READ_IO]); + seq_printf(seq, "fs compr_data: %-16llu\n", + sbi->rw_iostat[FS_CDATA_READ_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->rw_iostat[FS_NODE_READ_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->rw_iostat[FS_META_READ_IO]); + + /* print other IOs */ + seq_puts(seq, "[OTHER]\n"); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->rw_iostat[FS_DISCARD]); + + return 0; +} + +static inline void __record_iostat_latency(struct f2fs_sb_info *sbi) +{ + int io, idx = 0; + unsigned int cnt; + struct f2fs_iostat_latency iostat_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + + spin_lock_irq(&sbi->iostat_lat_lock); + for (idx = 0; idx < MAX_IO_TYPE; idx++) { + for (io = 0; io < NR_PAGE_TYPE; io++) { + cnt = io_lat->bio_cnt[idx][io]; + iostat_lat[idx][io].peak_lat = + jiffies_to_msecs(io_lat->peak_lat[idx][io]); + iostat_lat[idx][io].cnt = cnt; + iostat_lat[idx][io].avg_lat = cnt ? 
+ jiffies_to_msecs(io_lat->sum_lat[idx][io]) / cnt : 0; + io_lat->sum_lat[idx][io] = 0; + io_lat->peak_lat[idx][io] = 0; + io_lat->bio_cnt[idx][io] = 0; + } + } + spin_unlock_irq(&sbi->iostat_lat_lock); + + trace_f2fs_iostat_latency(sbi, iostat_lat); +} + +static inline void f2fs_record_iostat(struct f2fs_sb_info *sbi) +{ + unsigned long long iostat_diff[NR_IO_TYPE]; + int i; + + if (time_is_after_jiffies(sbi->iostat_next_period)) + return; + + /* Need double check under the lock */ + spin_lock(&sbi->iostat_lock); + if (time_is_after_jiffies(sbi->iostat_next_period)) { + spin_unlock(&sbi->iostat_lock); + return; + } + sbi->iostat_next_period = jiffies + + msecs_to_jiffies(sbi->iostat_period_ms); + + for (i = 0; i < NR_IO_TYPE; i++) { + iostat_diff[i] = sbi->rw_iostat[i] - + sbi->prev_rw_iostat[i]; + sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; + } + spin_unlock(&sbi->iostat_lock); + + trace_f2fs_iostat(sbi, iostat_diff); + + __record_iostat_latency(sbi); +} + +void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) { + sbi->rw_iostat[i] = 0; + sbi->prev_rw_iostat[i] = 0; + } + spin_unlock(&sbi->iostat_lock); + + spin_lock_irq(&sbi->iostat_lat_lock); + memset(io_lat, 0, sizeof(struct iostat_lat_info)); + spin_unlock_irq(&sbi->iostat_lat_lock); +} + +void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + + spin_lock(&sbi->iostat_lock); + sbi->rw_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->rw_iostat[APP_BUFFERED_IO] = + sbi->rw_iostat[APP_WRITE_IO] - + sbi->rw_iostat[APP_DIRECT_IO]; + + if (type == APP_READ_IO || type == APP_DIRECT_READ_IO) + sbi->rw_iostat[APP_BUFFERED_READ_IO] = + sbi->rw_iostat[APP_READ_IO] - + sbi->rw_iostat[APP_DIRECT_READ_IO]; + spin_unlock(&sbi->iostat_lock); + + 
f2fs_record_iostat(sbi); +} + +static inline void __update_iostat_latency(struct bio_iostat_ctx *iostat_ctx, + int rw, bool is_sync) +{ + unsigned long ts_diff; + unsigned int iotype = iostat_ctx->type; + unsigned long flags; + struct f2fs_sb_info *sbi = iostat_ctx->sbi; + struct iostat_lat_info *io_lat = sbi->iostat_io_lat; + int idx; + + if (!sbi->iostat_enable) + return; + + ts_diff = jiffies - iostat_ctx->submit_ts; + if (iotype >= META_FLUSH) + iotype = META; + + if (rw == 0) { + idx = READ_IO; + } else { + if (is_sync) + idx = WRITE_SYNC_IO; + else + idx = WRITE_ASYNC_IO; + } + + spin_lock_irqsave(&sbi->iostat_lat_lock, flags); + io_lat->sum_lat[idx][iotype] += ts_diff; + io_lat->bio_cnt[idx][iotype]++; + if (ts_diff > io_lat->peak_lat[idx][iotype]) + io_lat->peak_lat[idx][iotype] = ts_diff; + spin_unlock_irqrestore(&sbi->iostat_lat_lock, flags); +} + +void iostat_update_and_unbind_ctx(struct bio *bio, int rw) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + bool is_sync = bio->bi_opf & REQ_SYNC; + + if (rw == 0) + bio->bi_private = iostat_ctx->post_read_ctx; + else + bio->bi_private = iostat_ctx->sbi; + __update_iostat_latency(iostat_ctx, rw, is_sync); + mempool_free(iostat_ctx, bio_iostat_ctx_pool); +} + +void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) +{ + struct bio_iostat_ctx *iostat_ctx; + /* Due to the mempool, this never fails. 
*/ + iostat_ctx = mempool_alloc(bio_iostat_ctx_pool, GFP_NOFS); + iostat_ctx->sbi = sbi; + iostat_ctx->submit_ts = 0; + iostat_ctx->type = 0; + iostat_ctx->post_read_ctx = ctx; + bio->bi_private = iostat_ctx; +} + +int __init f2fs_init_iostat_processing(void) +{ + bio_iostat_ctx_cache = + kmem_cache_create("f2fs_bio_iostat_ctx", + sizeof(struct bio_iostat_ctx), 0, 0, NULL); + if (!bio_iostat_ctx_cache) + goto fail; + bio_iostat_ctx_pool = + mempool_create_slab_pool(NUM_PREALLOC_IOSTAT_CTXS, + bio_iostat_ctx_cache); + if (!bio_iostat_ctx_pool) + goto fail_free_cache; + return 0; + +fail_free_cache: + kmem_cache_destroy(bio_iostat_ctx_cache); +fail: + return -ENOMEM; +} + +void f2fs_destroy_iostat_processing(void) +{ + mempool_destroy(bio_iostat_ctx_pool); + kmem_cache_destroy(bio_iostat_ctx_cache); +} + +int f2fs_init_iostat(struct f2fs_sb_info *sbi) +{ + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + spin_lock_init(&sbi->iostat_lat_lock); + sbi->iostat_enable = false; + sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; + sbi->iostat_io_lat = f2fs_kzalloc(sbi, sizeof(struct iostat_lat_info), + GFP_KERNEL); + if (!sbi->iostat_io_lat) + return -ENOMEM; + + return 0; +} + +void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) +{ + kfree(sbi->iostat_io_lat); +} diff --git a/fs/f2fs/iostat.h b/fs/f2fs/iostat.h new file mode 100644 index 000000000000..22a2d01f57ef --- /dev/null +++ b/fs/f2fs/iostat.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2021 Google LLC + * Author: Daeho Jeong <daehojeong@google.com> + */ +#ifndef __F2FS_IOSTAT_H__ +#define __F2FS_IOSTAT_H__ + +struct bio_post_read_ctx; + +#ifdef CONFIG_F2FS_IOSTAT + +#define DEFAULT_IOSTAT_PERIOD_MS 3000 +#define MIN_IOSTAT_PERIOD_MS 100 +/* maximum period of iostat tracing is 1 day */ +#define MAX_IOSTAT_PERIOD_MS 8640000 + +enum { + READ_IO, + WRITE_SYNC_IO, + WRITE_ASYNC_IO, + MAX_IO_TYPE, +}; + +struct iostat_lat_info { + unsigned long 
sum_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* sum of io latencies */ + unsigned long peak_lat[MAX_IO_TYPE][NR_PAGE_TYPE]; /* peak io latency */ + unsigned int bio_cnt[MAX_IO_TYPE][NR_PAGE_TYPE]; /* bio count */ +}; + +extern int __maybe_unused iostat_info_seq_show(struct seq_file *seq, + void *offset); +extern void f2fs_reset_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes); + +struct bio_iostat_ctx { + struct f2fs_sb_info *sbi; + unsigned long submit_ts; + enum page_type type; + struct bio_post_read_ctx *post_read_ctx; +}; + +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + iostat_ctx->submit_ts = jiffies; + iostat_ctx->type = type; +} + +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + struct bio_iostat_ctx *iostat_ctx = bio->bi_private; + + return iostat_ctx->post_read_ctx; +} + +extern void iostat_update_and_unbind_ctx(struct bio *bio, int rw); +extern void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx); +extern int f2fs_init_iostat_processing(void); +extern void f2fs_destroy_iostat_processing(void); +extern int f2fs_init_iostat(struct f2fs_sb_info *sbi); +extern void f2fs_destroy_iostat(struct f2fs_sb_info *sbi); +#else +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) {} +static inline void iostat_update_and_unbind_ctx(struct bio *bio, int rw) {} +static inline void iostat_alloc_and_bind_ctx(struct f2fs_sb_info *sbi, + struct bio *bio, struct bio_post_read_ctx *ctx) {} +static inline void iostat_update_submit_ctx(struct bio *bio, + enum page_type type) {} +static inline struct bio_post_read_ctx *get_post_read_ctx(struct bio *bio) +{ + return bio->bi_private; +} +static inline int f2fs_init_iostat_processing(void) { return 
0; } +static inline void f2fs_destroy_iostat_processing(void) {} +static inline int f2fs_init_iostat(struct f2fs_sb_info *sbi) { return 0; } +static inline void f2fs_destroy_iostat(struct f2fs_sb_info *sbi) {} +#endif +#endif /* __F2FS_IOSTAT_H__ */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e149c8c66a71..9c528e583c9d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1323,9 +1323,19 @@ static const char *f2fs_encrypted_get_link(struct dentry *dentry, return target; } +static int f2fs_encrypted_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + f2fs_getattr(mnt_userns, path, stat, request_mask, query_flags); + + return fscrypt_symlink_getattr(path, stat); +} + const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .get_link = f2fs_encrypted_get_link, - .getattr = f2fs_getattr, + .getattr = f2fs_encrypted_symlink_getattr, .setattr = f2fs_setattr, .listxattr = f2fs_listxattr, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0be9e2d7120e..e863136081b4 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -17,6 +17,7 @@ #include "node.h" #include "segment.h" #include "xattr.h" +#include "iostat.h" #include <trace/events/f2fs.h> #define on_f2fs_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) @@ -162,14 +163,13 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) return dst_page; } -static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail) +static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, + nid_t nid, bool no_fail) { struct nat_entry *new; - if (no_fail) - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); - else - new = kmem_cache_alloc(nat_entry_slab, GFP_F2FS_ZERO); + new = f2fs_kmem_cache_alloc(nat_entry_slab, + GFP_F2FS_ZERO, no_fail, sbi); if (new) { nat_set_nid(new, nid); nat_reset_flag(new); @@ -242,7 +242,8 @@ static struct nat_entry_set 
*__grab_nat_entry_set(struct f2fs_nm_info *nm_i, head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { - head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, + GFP_NOFS, true, NULL); INIT_LIST_HEAD(&head->entry_list); INIT_LIST_HEAD(&head->set_list); @@ -329,7 +330,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, unsigned long flags; unsigned int seq_id; - fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS); + fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, + GFP_NOFS, true, NULL); get_page(page); fn->page = page; @@ -428,7 +430,7 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *new, *e; - new = __alloc_nat_entry(nid, false); + new = __alloc_nat_entry(sbi, nid, false); if (!new) return; @@ -451,7 +453,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - struct nat_entry *new = __alloc_nat_entry(ni->nid, true); + struct nat_entry *new = __alloc_nat_entry(sbi, ni->nid, true); down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); @@ -552,7 +554,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, int i; ni->nid = nid; - +retry: /* Check nat cache */ down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); @@ -564,10 +566,19 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, return 0; } - memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + /* + * Check current segment summary by trying to grab journal_rwsem first. + * This sem is on the critical path on the checkpoint requiring the above + * nat_tree_lock. Therefore, we should retry, if we failed to grab here + * while not bothering checkpoint. 
+ */ + if (!rwsem_is_locked(&sbi->cp_global_sem)) { + down_read(&curseg->journal_rwsem); + } else if (!down_read_trylock(&curseg->journal_rwsem)) { + up_read(&nm_i->nat_tree_lock); + goto retry; + } - /* Check current segment summary */ - down_read(&curseg->journal_rwsem); i = f2fs_lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { ne = nat_in_journal(journal, i); @@ -832,6 +843,26 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; dn->data_blkaddr = f2fs_data_blkaddr(dn); + + if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) && + f2fs_sb_has_readonly(sbi)) { + unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn); + block_t blkaddr; + + if (!c_len) + goto out; + + blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == COMPRESS_ADDR) + blkaddr = data_blkaddr(dn->inode, dn->node_page, + dn->ofs_in_node + 1); + + f2fs_update_extent_tree_range_compressed(dn->inode, + index, blkaddr, + F2FS_I(dn->inode)->i_cluster_size, + c_len); + } +out: return 0; release_pages: @@ -1321,7 +1352,8 @@ static int read_node_page(struct page *page, int op_flags) if (err) return err; - if (unlikely(ni.blk_addr == NULL_ADDR) || + /* NEW_ADDR can be seen, after cp_error drops some dirty node pages */ + if (unlikely(ni.blk_addr == NULL_ADDR || ni.blk_addr == NEW_ADDR) || is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN)) { ClearPageUptodate(page); return -ENOENT; @@ -2181,6 +2213,24 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } +bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i; + bool ret = true; + + down_read(&nm_i->nat_tree_lock); + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) { + ret = false; + break; + } + } + up_read(&nm_i->nat_tree_lock); + + return ret; +} + static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, 
bool build) { @@ -2222,7 +2272,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, if (unlikely(f2fs_check_nid_range(sbi, nid))) return false; - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); + i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS, true, NULL); i->nid = nid; i->state = FREE_NID; @@ -2812,7 +2862,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = __alloc_nat_entry(nid, true); + ne = __alloc_nat_entry(sbi, nid, true); __init_nat_entry(nm_i, ne, &raw_ne, true); } @@ -2852,7 +2902,23 @@ add_out: list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, + unsigned int valid) +{ + if (valid == 0) { + __set_bit_le(nat_ofs, nm_i->empty_nat_bits); + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_ofs, nm_i->full_nat_bits); + else + __clear_bit_le(nat_ofs, nm_i->full_nat_bits); +} + +static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2861,7 +2927,7 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid = 0; int i = 0; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; if (nat_index == 0) { @@ -2872,17 +2938,36 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - if (valid == 0) { - __set_bit_le(nat_index, nm_i->empty_nat_bits); - __clear_bit_le(nat_index, nm_i->full_nat_bits); - return; + + __update_nat_bits(nm_i, nat_index, valid); +} + +void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs; + + 
down_read(&nm_i->nat_tree_lock); + + for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { + unsigned int valid = 0, nid_ofs = 0; + + /* handle nid zero due to it should never be used */ + if (unlikely(nat_ofs == 0)) { + valid = 1; + nid_ofs = 1; + } + + for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { + if (!test_bit_le(nid_ofs, + nm_i->free_nid_bitmap[nat_ofs])) + valid++; + } + + __update_nat_bits(nm_i, nat_ofs, valid); } - __clear_bit_le(nat_index, nm_i->empty_nat_bits); - if (valid == NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_index, nm_i->full_nat_bits); - else - __clear_bit_le(nat_index, nm_i->full_nat_bits); + up_read(&nm_i->nat_tree_lock); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -2901,7 +2986,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (enabled_nat_bits(sbi, cpc) || + if ((cpc->reason & CP_UMOUNT) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -2948,7 +3033,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - __update_nat_bits(sbi, start_nid, page); + update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -2979,7 +3064,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * during unmount, let's flush nat_bits before checking * nat_cnt[DIRTY_NAT]. */ - if (enabled_nat_bits(sbi, cpc)) { + if (cpc->reason & CP_UMOUNT) { down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); up_write(&nm_i->nat_tree_lock); @@ -2995,7 +3080,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. 
*/ - if (enabled_nat_bits(sbi, cpc) || + if (cpc->reason & CP_UMOUNT || !__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3032,15 +3117,18 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; - if (!enabled_nat_bits(sbi, NULL)) - return 0; - nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kvzalloc(sbi, nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + return 0; + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -3057,13 +3145,12 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - disable_nat_bits(sbi, true); + clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", + cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); return 0; } - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -3074,7 +3161,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!enabled_nat_bits(sbi, NULL)) + if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) return; for (i = 0; i < nm_i->nat_blocks; i++) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 695eacfe776c..04655511d7f5 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -91,7 +91,8 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, goto err_out; } - entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + 
entry = f2fs_kmem_cache_alloc(fsync_entry_slab, + GFP_F2FS_ZERO, true, NULL); entry->inode = inode; list_add_tail(&entry->list, head); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 15cc89eef28d..a135d2247415 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -20,6 +20,7 @@ #include "segment.h" #include "node.h" #include "gc.h" +#include "iostat.h" #include <trace/events/f2fs.h> #define __reverse_ffz(x) __reverse_ffs(~(x)) @@ -188,7 +189,8 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) set_page_private_atomic(page); - new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); + new = f2fs_kmem_cache_alloc(inmem_entry_slab, + GFP_NOFS, true, NULL); /* add atomic page indices to the list */ new->page = page; @@ -776,11 +778,22 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) return 0; for (i = 1; i < sbi->s_ndevs; i++) { + int count = DEFAULT_RETRY_IO_COUNT; + if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; - ret = __submit_flush_wait(sbi, FDEV(i).bdev); - if (ret) + + do { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + congestion_wait(BLK_RW_ASYNC, + DEFAULT_IO_TIMEOUT); + } while (ret && --count); + + if (ret) { + f2fs_stop_checkpoint(sbi, false); break; + } spin_lock(&sbi->dev_lock); f2fs_clear_bit(i, (char *)&sbi->dirty_device); @@ -990,7 +1003,7 @@ static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, pend_list = &dcc->pend_list[plist_idx(len)]; - dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; dc->lstart = lstart; @@ -1893,7 +1906,8 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } @@ -1918,7 
+1932,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; - if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi)) + if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) || + !f2fs_block_unit_discard(sbi)) return false; if (!force) { @@ -1949,7 +1964,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!de) { de = f2fs_kmem_cache_alloc(discard_entry_slab, - GFP_F2FS_ZERO); + GFP_F2FS_ZERO, true, NULL); de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); list_add_tail(&de->list, head); } @@ -2003,14 +2018,18 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); - bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); + bool section_alignment = F2FS_OPTION(sbi).discard_unit == + DISCARD_UNIT_SECTION; + + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + section_alignment = true; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; - if (need_align && end != -1) + if (section_alignment && end != -1) end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) @@ -2018,7 +2037,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); - if (need_align) { + if (section_alignment) { start = rounddown(start, sbi->segs_per_sec); end = roundup(end, sbi->segs_per_sec); } @@ -2056,6 +2075,9 @@ next: } mutex_unlock(&dirty_i->seglist_lock); + if (!f2fs_block_unit_discard(sbi)) + goto wakeup; + /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { unsigned int cur_pos = 0, next_pos, len, total_len = 0; @@ -2089,12 +2111,29 @@ skip: dcc->nr_discards -= total_len; } +wakeup: wake_up_discard_thread(sbi, false); } -static int create_discard_cmd_control(struct 
f2fs_sb_info *sbi) +int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int err = 0; + + if (!f2fs_realtime_discard_enable(sbi)) + return 0; + + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) + err = PTR_ERR(dcc->f2fs_issue_discard); + + return err; +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ struct discard_cmd_control *dcc; int err = 0, i; @@ -2108,6 +2147,11 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return -ENOMEM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + dcc->discard_granularity = sbi->blocks_per_seg; + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + dcc->discard_granularity = BLKS_PER_SEC(sbi); + INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); @@ -2127,13 +2171,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: - dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, - "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); - if (IS_ERR(dcc->f2fs_issue_discard)) { - err = PTR_ERR(dcc->f2fs_issue_discard); + err = f2fs_start_discard_thread(sbi); + if (err) { kfree(dcc); SM_I(sbi)->dcc_info = NULL; - return err; } return err; @@ -2255,7 +2296,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) del = 0; } - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* @@ -2297,7 +2339,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) } } - if (f2fs_test_and_clear_bit(offset, 
se->discard_map)) + if (f2fs_block_unit_discard(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -3563,7 +3606,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) goto drop_bio; } - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) { + if (f2fs_cp_error(sbi)) { err = -EIO; goto drop_bio; } @@ -4071,7 +4114,8 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = - f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS); + f2fs_kmem_cache_alloc(sit_entry_set_slab, + GFP_NOFS, true, NULL); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); @@ -4282,6 +4326,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) unsigned int sit_segs, start; char *src_bitmap, *bitmap; unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; + unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0; /* allocate memory for SIT information */ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); @@ -4304,9 +4349,9 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS - bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 4; + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); #else - bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * 3; + bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); #endif sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!sit_i->bitmap) @@ -4326,8 +4371,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap += SIT_VBLOCK_MAP_SIZE; #endif - sit_i->sentries[start].discard_map = bitmap; - bitmap += SIT_VBLOCK_MAP_SIZE; + if (discard_map) { + sit_i->sentries[start].discard_map = bitmap; + bitmap += SIT_VBLOCK_MAP_SIZE; + } } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -4489,17 +4536,19 @@ static int 
build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - /* build discard map only one time */ - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, - SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, - se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += - sbi->blocks_per_seg - - se->valid_blocks; + if (f2fs_block_unit_discard(sbi)) { + /* build discard map only one time */ + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (__is_large_section(sbi)) @@ -4535,13 +4584,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) if (IS_NODESEG(se->type)) total_node_blocks += se->valid_blocks; - if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { - memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); - } else { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks; - sbi->discard_blks -= se->valid_blocks; + if (f2fs_block_unit_discard(sbi)) { + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks; + sbi->discard_blks -= se->valid_blocks; + } } if (__is_large_section(sbi)) { @@ -5159,7 +5210,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - sm_info->min_seq_blocks = sbi->blocks_per_seg * sbi->segs_per_sec; + sm_info->min_seq_blocks = sbi->blocks_per_seg; sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h 
index 050230c70a53..89fff258727d 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -142,7 +142,7 @@ enum { }; /* - * In the victim_sel_policy->alloc_mode, there are two block allocation modes. + * In the victim_sel_policy->alloc_mode, there are three block allocation modes. * LFS writes data sequentially with cleaning operations. * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations. * AT_SSR (Age Threshold based Slack Space Recycle) merges fragments into @@ -155,7 +155,7 @@ enum { }; /* - * In the victim_sel_policy->gc_mode, there are two gc, aka cleaning, modes. + * In the victim_sel_policy->gc_mode, there are three gc, aka cleaning, modes. * GC_CB is based on cost-benefit algorithm. * GC_GREEDY is based on greedy algorithm. * GC_AT is based on age-threshold algorithm. diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ce2ab1b85c11..78ebc306ee2b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -33,6 +33,7 @@ #include "segment.h" #include "xattr.h" #include "gc.h" +#include "iostat.h" #define CREATE_TRACE_POINTS #include <trace/events/f2fs.h> @@ -56,6 +57,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_CHECKPOINT] = "checkpoint error", [FAULT_DISCARD] = "discard error", [FAULT_WRITE_IO] = "write IO error", + [FAULT_SLAB_ALLOC] = "slab alloc", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -155,6 +157,7 @@ enum { Opt_atgc, Opt_gc_merge, Opt_nogc_merge, + Opt_discard_unit, Opt_err, }; @@ -231,6 +234,7 @@ static match_table_t f2fs_tokens = { {Opt_atgc, "atgc"}, {Opt_gc_merge, "gc_merge"}, {Opt_nogc_merge, "nogc_merge"}, + {Opt_discard_unit, "discard_unit=%s"}, {Opt_err, NULL}, }; @@ -657,10 +661,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) return -EINVAL; break; case Opt_discard: + if (!f2fs_hw_support_discard(sbi)) { + f2fs_warn(sbi, "device does not support discard"); + break; + } set_opt(sbi, DISCARD); break; case Opt_nodiscard: - if 
(f2fs_sb_has_blkzoned(sbi)) { + if (f2fs_hw_should_discard(sbi)) { f2fs_warn(sbi, "discard is required for zoned block devices"); return -EINVAL; } @@ -1173,6 +1181,25 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_nogc_merge: clear_opt(sbi, GC_MERGE); break; + case Opt_discard_unit: + name = match_strdup(&args[0]); + if (!name) + return -ENOMEM; + if (!strcmp(name, "block")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_BLOCK; + } else if (!strcmp(name, "segment")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SEGMENT; + } else if (!strcmp(name, "section")) { + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1211,6 +1238,14 @@ default_check: return -EINVAL; } #endif + if (f2fs_sb_has_blkzoned(sbi)) { + if (F2FS_OPTION(sbi).discard_unit != + DISCARD_UNIT_SECTION) { + f2fs_info(sbi, "Zoned block device doesn't need small discard, set discard_unit=section by default"); + F2FS_OPTION(sbi).discard_unit = + DISCARD_UNIT_SECTION; + } + } #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_test_compress_extension(sbi)) { @@ -1271,7 +1306,8 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) { struct f2fs_inode_info *fi; - fi = kmem_cache_alloc(f2fs_inode_cachep, GFP_F2FS_ZERO); + fi = f2fs_kmem_cache_alloc(f2fs_inode_cachep, + GFP_F2FS_ZERO, false, F2FS_SB(sb)); if (!fi) return NULL; @@ -1541,6 +1577,7 @@ static void f2fs_put_super(struct super_block *sb) #endif fscrypt_free_dummy_policy(&F2FS_OPTION(sbi).dummy_enc_policy); destroy_percpu_info(sbi); + f2fs_destroy_iostat(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); #ifdef CONFIG_UNICODE @@ -1924,6 +1961,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, ATGC)) seq_puts(seq, ",atgc"); + + if (F2FS_OPTION(sbi).discard_unit == 
DISCARD_UNIT_BLOCK) + seq_printf(seq, ",discard_unit=%s", "block"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + seq_printf(seq, ",discard_unit=%s", "segment"); + else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) + seq_printf(seq, ",discard_unit=%s", "section"); + return 0; } @@ -1959,11 +2004,15 @@ static void default_options(struct f2fs_sb_info *sbi) F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - set_opt(sbi, DISCARD); - if (f2fs_sb_has_blkzoned(sbi)) + if (f2fs_hw_support_discard(sbi) || f2fs_hw_should_discard(sbi)) + set_opt(sbi, DISCARD); + if (f2fs_sb_has_blkzoned(sbi)) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; - else + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_SECTION; + } else { F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; + F2FS_OPTION(sbi).discard_unit = DISCARD_UNIT_BLOCK; + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -2038,8 +2087,17 @@ restore_flag: static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { + int retry = DEFAULT_RETRY_IO_COUNT; + /* we should flush all the data to keep data consistency */ - sync_inodes_sb(sbi->sb); + do { + sync_inodes_sb(sbi->sb); + cond_resched(); + congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); + + if (unlikely(retry < 0)) + f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); @@ -2060,12 +2118,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false, need_stop_gc = false; bool need_restart_ckpt = false, need_stop_ckpt = false; bool need_restart_flush = false, need_stop_flush = false; + bool need_restart_discard = false, need_stop_discard = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); + bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT); bool 
no_io_align = !F2FS_IO_ALIGNED(sbi); bool no_atgc = !test_opt(sbi, ATGC); + bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); - bool checkpoint_changed; + bool block_unit_discard = f2fs_block_unit_discard(sbi); + struct discard_cmd_control *dcc; #ifdef CONFIG_QUOTA int i, j; #endif @@ -2110,8 +2171,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err = parse_options(sb, data, true); if (err) goto restore_opts; - checkpoint_changed = - disable_checkpoint != test_opt(sbi, DISABLE_CHECKPOINT); /* * Previous and new state of filesystem is RO, @@ -2168,6 +2227,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (block_unit_discard != f2fs_block_unit_discard(sbi)) { + err = -EINVAL; + f2fs_warn(sbi, "switch discard_unit option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -2233,11 +2298,26 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_flush = true; } - if (checkpoint_changed) { + if (no_discard == !!test_opt(sbi, DISCARD)) { + if (test_opt(sbi, DISCARD)) { + err = f2fs_start_discard_thread(sbi); + if (err) + goto restore_flush; + need_stop_discard = true; + } else { + dcc = SM_I(sbi)->dcc_info; + f2fs_stop_discard_thread(sbi); + if (atomic_read(&dcc->discard_cmd_cnt)) + f2fs_issue_discard_timeout(sbi); + need_restart_discard = true; + } + } + + if (enable_checkpoint == !!test_opt(sbi, DISABLE_CHECKPOINT)) { if (test_opt(sbi, DISABLE_CHECKPOINT)) { err = f2fs_disable_checkpoint(sbi); if (err) - goto restore_flush; + goto restore_discard; } else { f2fs_enable_checkpoint(sbi); } @@ -2257,6 +2337,13 @@ skip: adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_discard: + if (need_restart_discard) { + 
if (f2fs_start_discard_thread(sbi)) + f2fs_warn(sbi, "discard has been stopped"); + } else if (need_stop_discard) { + f2fs_stop_discard_thread(sbi); + } restore_flush: if (need_restart_flush) { if (f2fs_create_flush_cmd_control(sbi)) @@ -2517,6 +2604,33 @@ static int f2fs_enable_quotas(struct super_block *sb) return 0; } +static int f2fs_quota_sync_file(struct f2fs_sb_info *sbi, int type) +{ + struct quota_info *dqopt = sb_dqopt(sbi->sb); + struct address_space *mapping = dqopt->files[type]->i_mapping; + int ret = 0; + + ret = dquot_writeback_dquots(sbi->sb, type); + if (ret) + goto out; + + ret = filemap_fdatawrite(mapping); + if (ret) + goto out; + + /* if we are using journalled quota */ + if (is_journalled_quota(sbi)) + goto out; + + ret = filemap_fdatawait(mapping); + + truncate_inode_pages(&dqopt->files[type]->i_data, 0); +out: + if (ret) + set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + return ret; +} + int f2fs_quota_sync(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -2525,56 +2639,41 @@ int f2fs_quota_sync(struct super_block *sb, int type) int ret; /* - * do_quotactl - * f2fs_quota_sync - * down_read(quota_sem) - * dquot_writeback_dquots() - * f2fs_dquot_commit - * block_operation - * down_read(quota_sem) - */ - f2fs_lock_op(sbi); - - down_read(&sbi->quota_sem); - ret = dquot_writeback_dquots(sb, type); - if (ret) - goto out; - - /* * Now when everything is written we can discard the pagecache so * that userspace sees the changes. 
*/ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - struct address_space *mapping; if (type != -1 && cnt != type) continue; - if (!sb_has_quota_active(sb, cnt)) - continue; - mapping = dqopt->files[cnt]->i_mapping; + if (!sb_has_quota_active(sb, type)) + return 0; - ret = filemap_fdatawrite(mapping); - if (ret) - goto out; + inode_lock(dqopt->files[cnt]); - /* if we are using journalled quota */ - if (is_journalled_quota(sbi)) - continue; + /* + * do_quotactl + * f2fs_quota_sync + * down_read(quota_sem) + * dquot_writeback_dquots() + * f2fs_dquot_commit + * block_operation + * down_read(quota_sem) + */ + f2fs_lock_op(sbi); + down_read(&sbi->quota_sem); - ret = filemap_fdatawait(mapping); - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); + ret = f2fs_quota_sync_file(sbi, cnt); + + up_read(&sbi->quota_sem); + f2fs_unlock_op(sbi); - inode_lock(dqopt->files[cnt]); - truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); + + if (ret) + break; } -out: - if (ret) - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); - up_read(&sbi->quota_sem); - f2fs_unlock_op(sbi); return ret; } @@ -3207,11 +3306,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } - if (le32_to_cpu(raw_super->cp_payload) > - (blocks_per_seg - F2FS_CP_PACKS)) { - f2fs_info(sbi, "Insane cp_payload (%u > %u)", + if (le32_to_cpu(raw_super->cp_payload) >= + (blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE)) { + f2fs_info(sbi, "Insane cp_payload (%u >= %u)", le32_to_cpu(raw_super->cp_payload), - blocks_per_seg - F2FS_CP_PACKS); + blocks_per_seg - F2FS_CP_PACKS - + NR_CURSEG_PERSIST_TYPE); return -EFSCORRUPTED; } @@ -3247,6 +3348,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int cp_pack_start_sum, cp_payload; block_t user_block_count, valid_user_blocks; block_t avail_node_count, valid_node_count; + unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks; int i, j; total = 
le32_to_cpu(raw_super->segment_count); @@ -3377,6 +3479,17 @@ skip_cross: return 1; } + nat_blocks = nat_segs << log_blocks_per_seg; + nat_bits_bytes = nat_blocks / BITS_PER_BYTE; + nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); + if (__is_set_ckpt_flags(ckpt, CP_NAT_BITS_FLAG) && + (cp_payload + F2FS_CP_PACKS + + NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) { + f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)", + cp_payload, nat_bits_blocks); + return -EFSCORRUPTED; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_err(sbi, "A bug case: need to run fsck"); return 1; @@ -3409,6 +3522,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->next_victim_seg[FG_GC] = NULL_SEGNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; sbi->migration_granularity = sbi->segs_per_sec; + sbi->seq_file_ra_mul = MIN_RA_MUL; sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -3768,7 +3882,8 @@ static void f2fs_tuning_parameters(struct f2fs_sb_info *sbi) /* adjust parameters according to the volume size */ if (sm_i->main_segments <= SMALL_VOLUME_SEGMENTS) { F2FS_OPTION(sbi).alloc_mode = ALLOC_MODE_REUSE; - sm_i->dcc_info->discard_granularity = 1; + if (f2fs_block_unit_discard(sbi)) + sm_i->dcc_info->discard_granularity = 1; sm_i->ipu_policy = 1 << F2FS_IPU_FORCE; } @@ -3889,11 +4004,6 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - /* init iostat info */ - spin_lock_init(&sbi->iostat_lock); - sbi->iostat_enable = false; - sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; - for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 
1 : NR_TEMP_TYPE; int j; @@ -3924,10 +4034,14 @@ try_onemore: init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); - err = init_percpu_info(sbi); + err = f2fs_init_iostat(sbi); if (err) goto free_bio_info; + err = init_percpu_info(sbi); + if (err) + goto free_iostat; + if (F2FS_IO_ALIGNED(sbi)) { sbi->write_io_dummy = mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); @@ -4259,6 +4373,8 @@ free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_percpu: destroy_percpu_info(sbi); +free_iostat: + f2fs_destroy_iostat(sbi); free_bio_info: for (i = 0; i < NR_PAGE_TYPE; i++) kvfree(sbi->write_io[i]); @@ -4401,9 +4517,12 @@ static int __init init_f2fs_fs(void) err = f2fs_init_post_read_processing(); if (err) goto free_root_stats; - err = f2fs_init_bio_entry_cache(); + err = f2fs_init_iostat_processing(); if (err) goto free_post_read; + err = f2fs_init_bio_entry_cache(); + if (err) + goto free_iostat; err = f2fs_init_bioset(); if (err) goto free_bio_enrty_cache; @@ -4425,6 +4544,8 @@ free_bioset: f2fs_destroy_bioset(); free_bio_enrty_cache: f2fs_destroy_bio_entry_cache(); +free_iostat: + f2fs_destroy_iostat_processing(); free_post_read: f2fs_destroy_post_read_processing(); free_root_stats: @@ -4459,6 +4580,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_compress_mempool(); f2fs_destroy_bioset(); f2fs_destroy_bio_entry_cache(); + f2fs_destroy_iostat_processing(); f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index daad532a4e2b..a32fe31c33b8 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -17,6 +17,7 @@ #include "f2fs.h" #include "segment.h" #include "gc.h" +#include "iostat.h" #include <trace/events/f2fs.h> static struct proc_dir_entry *f2fs_proc_root; @@ -307,6 +308,14 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); #endif + if (!strcmp(a->attr.name, "gc_segment_mode")) + return 
sysfs_emit(buf, "%u\n", sbi->gc_segment_mode); + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + return sysfs_emit(buf, "%u\n", + sbi->gc_reclaimed_segs[sbi->gc_segment_mode]); + } + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -343,7 +352,7 @@ static ssize_t __sbi_store(struct f2fs_attr *a, set = false; } - if (strlen(name) >= F2FS_EXTENSION_LEN) + if (!strlen(name) || strlen(name) >= F2FS_EXTENSION_LEN) return -EINVAL; down_write(&sbi->sb_lock); @@ -420,6 +429,8 @@ out: if (!strcmp(a->attr.name, "discard_granularity")) { if (t == 0 || t > MAX_PLIST_NUM) return -EINVAL; + if (!f2fs_block_unit_discard(sbi)) + return -EINVAL; if (t == *ui) return count; *ui = t; @@ -467,6 +478,7 @@ out: return count; } +#ifdef CONFIG_F2FS_IOSTAT if (!strcmp(a->attr.name, "iostat_enable")) { sbi->iostat_enable = !!t; if (!sbi->iostat_enable) @@ -482,6 +494,7 @@ out: spin_unlock(&sbi->iostat_lock); return count; } +#endif #ifdef CONFIG_F2FS_FS_COMPRESSION if (!strcmp(a->attr.name, "compr_written_block") || @@ -515,6 +528,29 @@ out: return count; } + if (!strcmp(a->attr.name, "gc_segment_mode")) { + if (t < MAX_GC_MODE) + sbi->gc_segment_mode = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "gc_reclaimed_segments")) { + if (t != 0) + return -EINVAL; + sbi->gc_reclaimed_segs[sbi->gc_segment_mode] = 0; + return count; + } + + if (!strcmp(a->attr.name, "seq_file_ra_mul")) { + if (t >= MIN_RA_MUL && t <= MAX_RA_MUL) + sbi->seq_file_ra_mul = t; + else + return -EINVAL; + return count; + } + *ui = (unsigned int)t; return count; @@ -667,8 +703,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, discard_idle_interval, F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_idle_interval, interval_time[GC_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, umount_discard_timeout, interval_time[UMOUNT_DISCARD_TIMEOUT]); +#ifdef CONFIG_F2FS_IOSTAT F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, 
iostat_period_ms, iostat_period_ms); +#endif F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, readdir_ra, readdir_ra); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_io_bytes, max_io_bytes); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_pin_file_thresh, gc_pin_file_threshold); @@ -740,6 +778,10 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_candidate_count, max_candidate_cou F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_weight, age_weight); F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); + #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { ATTR_LIST(gc_urgent_sleep_time), @@ -770,8 +812,10 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), ATTR_LIST(umount_discard_timeout), +#ifdef CONFIG_F2FS_IOSTAT ATTR_LIST(iostat_enable), ATTR_LIST(iostat_period_ms), +#endif ATTR_LIST(readdir_ra), ATTR_LIST(max_io_bytes), ATTR_LIST(gc_pin_file_thresh), @@ -812,6 +856,9 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(atgc_candidate_count), ATTR_LIST(atgc_age_weight), ATTR_LIST(atgc_age_threshold), + ATTR_LIST(seq_file_ra_mul), + ATTR_LIST(gc_segment_mode), + ATTR_LIST(gc_reclaimed_segments), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -1036,101 +1083,6 @@ static int __maybe_unused segment_bits_seq_show(struct seq_file *seq, return 0; } -void f2fs_record_iostat(struct f2fs_sb_info *sbi) -{ - unsigned long long iostat_diff[NR_IO_TYPE]; - int i; - - if (time_is_after_jiffies(sbi->iostat_next_period)) - return; - - /* Need double check under the lock */ - spin_lock(&sbi->iostat_lock); - if (time_is_after_jiffies(sbi->iostat_next_period)) { - spin_unlock(&sbi->iostat_lock); - return; - } - sbi->iostat_next_period = jiffies + - 
msecs_to_jiffies(sbi->iostat_period_ms); - - for (i = 0; i < NR_IO_TYPE; i++) { - iostat_diff[i] = sbi->rw_iostat[i] - - sbi->prev_rw_iostat[i]; - sbi->prev_rw_iostat[i] = sbi->rw_iostat[i]; - } - spin_unlock(&sbi->iostat_lock); - - trace_f2fs_iostat(sbi, iostat_diff); -} - -static int __maybe_unused iostat_info_seq_show(struct seq_file *seq, - void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - time64_t now = ktime_get_real_seconds(); - - if (!sbi->iostat_enable) - return 0; - - seq_printf(seq, "time: %-16llu\n", now); - - /* print app write IOs */ - seq_puts(seq, "[WRITE]\n"); - seq_printf(seq, "app buffered: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_IO]); - seq_printf(seq, "app direct: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_IO]); - seq_printf(seq, "app mapped: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_IO]); - - /* print fs write IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->rw_iostat[FS_GC_DATA_IO]); - seq_printf(seq, "fs gc node: %-16llu\n", - sbi->rw_iostat[FS_GC_NODE_IO]); - seq_printf(seq, "fs cp data: %-16llu\n", - sbi->rw_iostat[FS_CP_DATA_IO]); - seq_printf(seq, "fs cp node: %-16llu\n", - sbi->rw_iostat[FS_CP_NODE_IO]); - seq_printf(seq, "fs cp meta: %-16llu\n", - sbi->rw_iostat[FS_CP_META_IO]); - - /* print app read IOs */ - seq_puts(seq, "[READ]\n"); - seq_printf(seq, "app buffered: %-16llu\n", - sbi->rw_iostat[APP_BUFFERED_READ_IO]); - seq_printf(seq, "app direct: %-16llu\n", - sbi->rw_iostat[APP_DIRECT_READ_IO]); - seq_printf(seq, "app mapped: %-16llu\n", - sbi->rw_iostat[APP_MAPPED_READ_IO]); - - /* print fs read IOs */ - seq_printf(seq, "fs data: %-16llu\n", - sbi->rw_iostat[FS_DATA_READ_IO]); - seq_printf(seq, "fs gc data: %-16llu\n", - sbi->rw_iostat[FS_GDATA_READ_IO]); - 
seq_printf(seq, "fs compr_data: %-16llu\n", - sbi->rw_iostat[FS_CDATA_READ_IO]); - seq_printf(seq, "fs node: %-16llu\n", - sbi->rw_iostat[FS_NODE_READ_IO]); - seq_printf(seq, "fs meta: %-16llu\n", - sbi->rw_iostat[FS_META_READ_IO]); - - /* print other IOs */ - seq_puts(seq, "[OTHER]\n"); - seq_printf(seq, "fs discard: %-16llu\n", - sbi->rw_iostat[FS_DISCARD]); - - return 0; -} - static int __maybe_unused victim_bits_seq_show(struct seq_file *seq, void *offset) { @@ -1213,13 +1165,15 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); if (sbi->s_proc) { - proc_create_single_data("segment_info", S_IRUGO, sbi->s_proc, + proc_create_single_data("segment_info", 0444, sbi->s_proc, segment_info_seq_show, sb); - proc_create_single_data("segment_bits", S_IRUGO, sbi->s_proc, + proc_create_single_data("segment_bits", 0444, sbi->s_proc, segment_bits_seq_show, sb); - proc_create_single_data("iostat_info", S_IRUGO, sbi->s_proc, +#ifdef CONFIG_F2FS_IOSTAT + proc_create_single_data("iostat_info", 0444, sbi->s_proc, iostat_info_seq_show, sb); - proc_create_single_data("victim_bits", S_IRUGO, sbi->s_proc, +#endif + proc_create_single_data("victim_bits", 0444, sbi->s_proc, victim_bits_seq_show, sb); } return 0; @@ -1238,7 +1192,9 @@ put_sb_kobj: void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { if (sbi->s_proc) { +#ifdef CONFIG_F2FS_IOSTAT remove_proc_entry("iostat_info", sbi->s_proc); +#endif remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry("victim_bits", sbi->s_proc); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c8f34decbf8e..1d2d29dcd41c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -27,7 +27,8 @@ static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline) { if (likely(size == sbi->inline_xattr_slab_size)) { *is_inline = true; - return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS); + return 
f2fs_kmem_cache_alloc(sbi->inline_xattr_slab, + GFP_F2FS_ZERO, false, sbi); } *is_inline = false; return f2fs_kzalloc(sbi, size, GFP_NOFS); diff --git a/fs/fat/.kunitconfig b/fs/fat/.kunitconfig new file mode 100644 index 000000000000..0a6971dbeccb --- /dev/null +++ b/fs/fat/.kunitconfig @@ -0,0 +1,5 @@ +CONFIG_KUNIT=y +CONFIG_FAT_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_FAT_KUNIT_TEST=y diff --git a/fs/fat/Kconfig b/fs/fat/Kconfig index 66532a71e8fd..238cc55f84c4 100644 --- a/fs/fat/Kconfig +++ b/fs/fat/Kconfig @@ -77,7 +77,7 @@ config VFAT_FS config FAT_DEFAULT_CODEPAGE int "Default codepage for FAT" - depends on MSDOS_FS || VFAT_FS + depends on FAT_FS default 437 help This option should be set to the codepage of your FAT filesystems. @@ -115,3 +115,15 @@ config FAT_DEFAULT_UTF8 Say Y if you use UTF-8 encoding for file names, N otherwise. See <file:Documentation/filesystems/vfat.rst> for more information. + +config FAT_KUNIT_TEST + tristate "Unit Tests for FAT filesystems" if !KUNIT_ALL_TESTS + depends on KUNIT && FAT_FS + default KUNIT_ALL_TESTS + help + This builds the FAT KUnit tests + + For more information on KUnit and unit tests in general, please refer + to the KUnit documentation in Documentation/dev-tools/kunit + + If unsure, say N diff --git a/fs/fat/Makefile b/fs/fat/Makefile index 70645ce2f7fc..2b034112690d 100644 --- a/fs/fat/Makefile +++ b/fs/fat/Makefile @@ -10,3 +10,5 @@ obj-$(CONFIG_MSDOS_FS) += msdos.o fat-y := cache.o dir.o fatent.o file.o inode.o misc.o nfs.o vfat-y := namei_vfat.o msdos-y := namei_msdos.o + +obj-$(CONFIG_FAT_KUNIT_TEST) += fat_test.o diff --git a/fs/fat/fat_test.c b/fs/fat/fat_test.c new file mode 100644 index 000000000000..2dab4ca1d0d8 --- /dev/null +++ b/fs/fat/fat_test.c @@ -0,0 +1,196 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit tests for FAT filesystems. + * + * Copyright (C) 2020 Google LLC. 
+ * Author: David Gow <davidgow@google.com> + */ + +#include <kunit/test.h> + +#include "fat.h" + +static void fat_checksum_test(struct kunit *test) +{ + /* With no extension. */ + KUNIT_EXPECT_EQ(test, fat_checksum("VMLINUX "), (u8)44); + /* With 3-letter extension. */ + KUNIT_EXPECT_EQ(test, fat_checksum("README TXT"), (u8)115); + /* With short (1-letter) extension. */ + KUNIT_EXPECT_EQ(test, fat_checksum("ABCDEFGHA "), (u8)98); +} + +struct fat_timestamp_testcase { + const char *name; + struct timespec64 ts; + __le16 time; + __le16 date; + u8 cs; + int time_offset; +}; + +static struct fat_timestamp_testcase time_test_cases[] = { + { + .name = "Earliest possible UTC (1980-01-01 00:00:00)", + .ts = {.tv_sec = 315532800LL, .tv_nsec = 0L}, + .time = cpu_to_le16(0), + .date = cpu_to_le16(33), + .cs = 0, + .time_offset = 0, + }, + { + .name = "Latest possible UTC (2107-12-31 23:59:58)", + .ts = {.tv_sec = 4354819198LL, .tv_nsec = 0L}, + .time = cpu_to_le16(49021), + .date = cpu_to_le16(65439), + .cs = 0, + .time_offset = 0, + }, + { + .name = "Earliest possible (UTC-11) (== 1979-12-31 13:00:00 UTC)", + .ts = {.tv_sec = 315493200LL, .tv_nsec = 0L}, + .time = cpu_to_le16(0), + .date = cpu_to_le16(33), + .cs = 0, + .time_offset = 11 * 60, + }, + { + .name = "Latest possible (UTC+11) (== 2108-01-01 10:59:58 UTC)", + .ts = {.tv_sec = 4354858798LL, .tv_nsec = 0L}, + .time = cpu_to_le16(49021), + .date = cpu_to_le16(65439), + .cs = 0, + .time_offset = -11 * 60, + }, + { + .name = "Leap Day / Year (1996-02-29 00:00:00)", + .ts = {.tv_sec = 825552000LL, .tv_nsec = 0L}, + .time = cpu_to_le16(0), + .date = cpu_to_le16(8285), + .cs = 0, + .time_offset = 0, + }, + { + .name = "Year 2000 is leap year (2000-02-29 00:00:00)", + .ts = {.tv_sec = 951782400LL, .tv_nsec = 0L}, + .time = cpu_to_le16(0), + .date = cpu_to_le16(10333), + .cs = 0, + .time_offset = 0, + }, + { + .name = "Year 2100 not leap year (2100-03-01 00:00:00)", + .ts = {.tv_sec = 4107542400LL, .tv_nsec = 0L}, + .time = 
cpu_to_le16(0), + .date = cpu_to_le16(61537), + .cs = 0, + .time_offset = 0, + }, + { + .name = "Leap year + timezone UTC+1 (== 2004-02-29 00:30:00 UTC)", + .ts = {.tv_sec = 1078014600LL, .tv_nsec = 0L}, + .time = cpu_to_le16(48064), + .date = cpu_to_le16(12380), + .cs = 0, + .time_offset = -60, + }, + { + .name = "Leap year + timezone UTC-1 (== 2004-02-29 23:30:00 UTC)", + .ts = {.tv_sec = 1078097400LL, .tv_nsec = 0L}, + .time = cpu_to_le16(960), + .date = cpu_to_le16(12385), + .cs = 0, + .time_offset = 60, + }, + { + .name = "VFAT odd-second resolution (1999-12-31 23:59:59)", + .ts = {.tv_sec = 946684799LL, .tv_nsec = 0L}, + .time = cpu_to_le16(49021), + .date = cpu_to_le16(10143), + .cs = 100, + .time_offset = 0, + }, + { + .name = "VFAT 10ms resolution (1980-01-01 00:00:00:0010)", + .ts = {.tv_sec = 315532800LL, .tv_nsec = 10000000L}, + .time = cpu_to_le16(0), + .date = cpu_to_le16(33), + .cs = 1, + .time_offset = 0, + }, +}; + +static void time_testcase_desc(struct fat_timestamp_testcase *t, + char *desc) +{ + strscpy(desc, t->name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(fat_time, time_test_cases, time_testcase_desc); + +static void fat_time_fat2unix_test(struct kunit *test) +{ + static struct msdos_sb_info fake_sb; + struct timespec64 ts; + struct fat_timestamp_testcase *testcase = + (struct fat_timestamp_testcase *)test->param_value; + + fake_sb.options.tz_set = 1; + fake_sb.options.time_offset = testcase->time_offset; + + fat_time_fat2unix(&fake_sb, &ts, + testcase->time, + testcase->date, + testcase->cs); + KUNIT_EXPECT_EQ_MSG(test, + testcase->ts.tv_sec, + ts.tv_sec, + "Timestamp mismatch (seconds)\n"); + KUNIT_EXPECT_EQ_MSG(test, + testcase->ts.tv_nsec, + ts.tv_nsec, + "Timestamp mismatch (nanoseconds)\n"); +} + +static void fat_time_unix2fat_test(struct kunit *test) +{ + static struct msdos_sb_info fake_sb; + __le16 date, time; + u8 cs; + struct fat_timestamp_testcase *testcase = + (struct fat_timestamp_testcase *)test->param_value; + + 
fake_sb.options.tz_set = 1; + fake_sb.options.time_offset = testcase->time_offset; + + fat_time_unix2fat(&fake_sb, &testcase->ts, + &time, &date, &cs); + KUNIT_EXPECT_EQ_MSG(test, + le16_to_cpu(testcase->time), + le16_to_cpu(time), + "Time mismatch\n"); + KUNIT_EXPECT_EQ_MSG(test, + le16_to_cpu(testcase->date), + le16_to_cpu(date), + "Date mismatch\n"); + KUNIT_EXPECT_EQ_MSG(test, + testcase->cs, + cs, + "Centisecond mismatch\n"); +} + +static struct kunit_case fat_test_cases[] = { + KUNIT_CASE(fat_checksum_test), + KUNIT_CASE_PARAM(fat_time_fat2unix_test, fat_time_gen_params), + KUNIT_CASE_PARAM(fat_time_unix2fat_test, fat_time_gen_params), + {}, +}; + +static struct kunit_suite fat_test_suite = { + .name = "fat_test", + .test_cases = fat_test_cases, +}; + +kunit_test_suites(&fat_test_suite); + +MODULE_LICENSE("GPL v2"); diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 18a50a46b57f..91ca3c304211 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -230,6 +230,9 @@ void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec64 *ts, } } +/* Export fat_time_fat2unix() for the fat_test KUnit tests. */ +EXPORT_SYMBOL_GPL(fat_time_fat2unix); + /* Convert linear UNIX date to a FAT time/date pair. 
*/ void fat_time_unix2fat(struct msdos_sb_info *sbi, struct timespec64 *ts, __le16 *time, __le16 *date, u8 *time_cs) diff --git a/fs/fcntl.c b/fs/fcntl.c index 68added37c15..9c6c6a3e2de5 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -1051,7 +1051,8 @@ static int __init fcntl_init(void) __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", - sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); + sizeof(struct fasync_struct), 0, + SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } diff --git a/fs/file.c b/fs/file.c index 86dc9956af32..8627dacfc424 100644 --- a/fs/file.c +++ b/fs/file.c @@ -596,18 +596,32 @@ void fd_install(unsigned int fd, struct file *file) EXPORT_SYMBOL(fd_install); +/** + * pick_file - return file associated with fd + * @files: file struct to retrieve file from + * @fd: file descriptor to retrieve file for + * + * If this function returns an EINVAL error pointer the fd was beyond the + * current maximum number of file descriptors for that fdtable. + * + * Returns: The file associated with @fd, on error returns an error pointer. 
+ */ static struct file *pick_file(struct files_struct *files, unsigned fd) { - struct file *file = NULL; + struct file *file; struct fdtable *fdt; spin_lock(&files->file_lock); fdt = files_fdtable(files); - if (fd >= fdt->max_fds) + if (fd >= fdt->max_fds) { + file = ERR_PTR(-EINVAL); goto out_unlock; + } file = fdt->fd[fd]; - if (!file) + if (!file) { + file = ERR_PTR(-EBADF); goto out_unlock; + } rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); @@ -622,7 +636,7 @@ int close_fd(unsigned fd) struct file *file; file = pick_file(files, fd); - if (!file) + if (IS_ERR(file)) return -EBADF; return filp_close(file, files); @@ -663,11 +677,16 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, struct file *file; file = pick_file(cur_fds, fd++); - if (!file) + if (!IS_ERR(file)) { + /* found a valid file to close */ + filp_close(file, cur_fds); + cond_resched(); continue; + } - filp_close(file, cur_fds); - cond_resched(); + /* beyond the last fd in that table */ + if (PTR_ERR(file) == -EINVAL) + return; } } @@ -682,7 +701,6 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, */ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) { - unsigned int cur_max; struct task_struct *me = current; struct files_struct *cur_fds = me->files, *fds = NULL; @@ -692,26 +710,26 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) if (fd > max_fd) return -EINVAL; - rcu_read_lock(); - cur_max = files_fdtable(cur_fds)->max_fds; - rcu_read_unlock(); - - /* cap to last valid index into fdtable */ - cur_max--; - if (flags & CLOSE_RANGE_UNSHARE) { int ret; unsigned int max_unshare_fds = NR_OPEN_MAX; /* - * If the requested range is greater than the current maximum, - * we're closing everything so only copy all file descriptors - * beneath the lowest file descriptor. 
- * If the caller requested all fds to be made cloexec copy all - * of the file descriptors since they still want to use them. + * If the caller requested all fds to be made cloexec we always + * copy all of the file descriptors since they still want to + * use them. */ - if (!(flags & CLOSE_RANGE_CLOEXEC) && (max_fd >= cur_max)) - max_unshare_fds = fd; + if (!(flags & CLOSE_RANGE_CLOEXEC)) { + /* + * If the requested range is greater than the current + * maximum, we're closing everything so only copy all + * file descriptors beneath the lowest file descriptor. + */ + rcu_read_lock(); + if (max_fd >= last_fd(files_fdtable(cur_fds))) + max_unshare_fds = fd; + rcu_read_unlock(); + } ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds); if (ret) @@ -725,8 +743,6 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags) swap(cur_fds, fds); } - max_fd = min(max_fd, cur_max); - if (flags & CLOSE_RANGE_CLOEXEC) __range_cloexec(cur_fds, fd, max_fd); else @@ -1134,6 +1150,12 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) return new_fd; } +int receive_fd(struct file *file, unsigned int o_flags) +{ + return __receive_fd(file, NULL, o_flags); +} +EXPORT_SYMBOL_GPL(receive_fd); + static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; diff --git a/fs/filesystems.c b/fs/filesystems.c index 90b8d879fbaf..58b9067b2391 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -209,21 +209,28 @@ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) } #endif -int __init get_filesystem_list(char *buf) +int __init list_bdev_fs_names(char *buf, size_t size) { - int len = 0; - struct file_system_type * tmp; + struct file_system_type *p; + size_t len; + int count = 0; read_lock(&file_systems_lock); - tmp = file_systems; - while (tmp && len < PAGE_SIZE - 80) { - len += sprintf(buf+len, "%s\t%s\n", - (tmp->fs_flags & FS_REQUIRES_DEV) ? 
"" : "nodev", - tmp->name); - tmp = tmp->next; + for (p = file_systems; p; p = p->next) { + if (!(p->fs_flags & FS_REQUIRES_DEV)) + continue; + len = strlen(p->name) + 1; + if (len > size) { + pr_warn("%s: truncating file system list\n", __func__); + break; + } + memcpy(buf, p->name, len); + buf += len; + size -= len; + count++; } read_unlock(&file_systems_lock); - return len; + return count; } #ifdef CONFIG_PROC_FS diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 4c3370548982..81ec192ce067 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -406,6 +406,11 @@ static bool inode_do_switch_wbs(struct inode *inode, inc_wb_stat(new_wb, WB_WRITEBACK); } + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + atomic_dec(&old_wb->writeback_inodes); + atomic_inc(&new_wb->writeback_inodes); + } + wb_get(new_wb); /* @@ -1034,20 +1039,20 @@ restart: * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id - * @nr: number of pages to write, 0 for best-effort dirty flushing * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ -int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, +int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; + unsigned long dirty; int ret; /* lookup bdi and memcg */ @@ -1076,24 +1081,22 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr, } /* - * If @nr is zero, the caller is attempting to write out most of + * The caller is attempting to write out most of * the currently dirty pages. 
Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. + * + * BTW the memcg stats are flushed periodically and this is best-effort + * estimation, so some potential error is ok. */ - if (!nr) { - unsigned long filepages, headroom, dirty, writeback; - - mem_cgroup_wb_stats(wb, &filepages, &headroom, &dirty, - &writeback); - nr = dirty * 10 / 8; - } + dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); + dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); if (work) { - work->nr_pages = nr; + work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; @@ -1999,7 +2002,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { - unsigned long wb_start = jiffies; long nr_pages = work->nr_pages; unsigned long dirtied_before = jiffies; struct inode *inode; @@ -2053,8 +2055,6 @@ static long wb_writeback(struct bdi_writeback *wb, progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); - wb_update_bandwidth(wb, wb_start); - /* * Did we write something? Try for more * @@ -2730,23 +2730,6 @@ int write_inode_now(struct inode *inode, int sync) EXPORT_SYMBOL(write_inode_now); /** - * sync_inode - write an inode and its pages to disk. - * @inode: the inode to sync - * @wbc: controls the writeback mode - * - * sync_inode() will write an inode and its pages to disk. It will also - * correctly update the inode on its superblock's dirty inode lists and will - * update inode->i_state. - * - * The caller must have a ref on the inode. 
- */ -int sync_inode(struct inode *inode, struct writeback_control *wbc) -{ - return writeback_single_inode(inode, wbc); -} -EXPORT_SYMBOL(sync_inode); - -/** * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. @@ -2762,6 +2745,6 @@ int sync_inode_metadata(struct inode *inode, int wait) .nr_to_write = 0, /* metadata-only */ }; - return sync_inode(inode, &wbc); + return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(sync_inode_metadata); diff --git a/fs/fs_context.c b/fs/fs_context.c index de1985eae535..b7e43a780a62 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -254,7 +254,7 @@ static struct fs_context *alloc_fs_context(struct file_system_type *fs_type, struct fs_context *fc; int ret = -ENOMEM; - fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); + fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL_ACCOUNT); if (!fc) return ERR_PTR(-ENOMEM); @@ -649,7 +649,7 @@ const struct fs_context_operations legacy_fs_context_ops = { */ static int legacy_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL); + fc->fs_private = kzalloc(sizeof(struct legacy_fs_context), GFP_KERNEL_ACCOUNT); if (!fc->fs_private) return -ENOMEM; fc->ops = &legacy_fs_context_ops; diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index 427efa73b9bd..b313a978ae0a 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -14,6 +14,7 @@ config FSCACHE config FSCACHE_STATS bool "Gather statistical information on local caching" depends on FSCACHE && PROC_FS + select NETFS_STATS help This option causes statistical information to be gathered on local caching and exported through file: @@ -28,23 +29,6 @@ config FSCACHE_STATS See Documentation/filesystems/caching/fscache.rst for more information. 
-config FSCACHE_HISTOGRAM - bool "Gather latency information on local caching" - depends on FSCACHE && PROC_FS - help - This option causes latency information to be gathered on local - caching and exported through file: - - /proc/fs/fscache/histogram - - The generation of this histogram adds a certain amount of overhead to - execution as there are a number of points at which data is gathered, - and on a multi-CPU system these may be on cachelines that keep - bouncing between CPUs. On the other hand, the histogram may be - useful for debugging purposes. Saying 'N' here is recommended. - - See Documentation/filesystems/caching/fscache.rst for more information. - config FSCACHE_DEBUG bool "Debug FS-Cache" depends on FSCACHE @@ -54,10 +38,3 @@ config FSCACHE_DEBUG enabled by setting bits in /sys/modules/fscache/parameter/debug. See Documentation/filesystems/caching/fscache.rst for more information. - -config FSCACHE_OBJECT_LIST - bool "Maintain global object list for debugging purposes" - depends on FSCACHE && PROC_FS - help - Maintain a global list of active fscache objects that can be - retrieved through /proc/fs/fscache/objects for debugging purposes diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile index 3b2ffa93ac18..03a871d689bb 100644 --- a/fs/fscache/Makefile +++ b/fs/fscache/Makefile @@ -16,7 +16,5 @@ fscache-y := \ fscache-$(CONFIG_PROC_FS) += proc.o fscache-$(CONFIG_FSCACHE_STATS) += stats.o -fscache-$(CONFIG_FSCACHE_HISTOGRAM) += histogram.o -fscache-$(CONFIG_FSCACHE_OBJECT_LIST) += object-list.o obj-$(CONFIG_FSCACHE) := fscache.o diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index fcc136361415..bd4f44c1cce0 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -116,7 +116,7 @@ struct fscache_cache *fscache_select_cache_for_object( cache = NULL; spin_unlock(&cookie->lock); - _leave(" = %p [parent]", cache); + _leave(" = %s [parent]", cache ? 
cache->tag->name : "NULL"); return cache; } @@ -152,14 +152,14 @@ struct fscache_cache *fscache_select_cache_for_object( if (test_bit(FSCACHE_IOERROR, &tag->cache->flags)) return NULL; - _leave(" = %p [specific]", tag->cache); + _leave(" = %s [specific]", tag->name); return tag->cache; no_preference: /* netfs has no preference - just select first cache */ cache = list_entry(fscache_cache_list.next, struct fscache_cache, link); - _leave(" = %p [first]", cache); + _leave(" = %s [first]", cache->tag->name); return cache; } @@ -261,7 +261,6 @@ int fscache_add_cache(struct fscache_cache *cache, spin_lock(&cache->object_list_lock); list_add_tail(&ifsdef->cache_link, &cache->object_list); spin_unlock(&cache->object_list_lock); - fscache_objlist_add(ifsdef); /* add the cache's netfs definition index object to the top level index * cookie as a known backing object */ @@ -270,7 +269,7 @@ int fscache_add_cache(struct fscache_cache *cache, hlist_add_head(&ifsdef->cookie_link, &fscache_fsdef_index.backing_objects); - atomic_inc(&fscache_fsdef_index.usage); + refcount_inc(&fscache_fsdef_index.ref); /* done */ spin_unlock(&fscache_fsdef_index.lock); @@ -335,7 +334,7 @@ static void fscache_withdraw_all_objects(struct fscache_cache *cache, struct fscache_object, cache_link); list_move_tail(&object->cache_link, dying_objects); - _debug("withdraw %p", object->cookie); + _debug("withdraw %x", object->cookie->debug_id); /* This must be done under object_list_lock to prevent * a race with fscache_drop_object(). 
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 751bc5b1cddf..cd42be646ed3 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -19,6 +19,8 @@ static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); #define fscache_cookie_hash_shift 15 static struct hlist_bl_head fscache_cookie_hash[1 << fscache_cookie_hash_shift]; +static LIST_HEAD(fscache_cookies); +static DEFINE_RWLOCK(fscache_cookies_lock); static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, loff_t object_size); @@ -29,21 +31,29 @@ static int fscache_attach_object(struct fscache_cookie *cookie, static void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) { - struct hlist_node *object; + struct fscache_object *object; + struct hlist_node *o; const u8 *k; unsigned loop; - pr_err("%c-cookie c=%p [p=%p fl=%lx nc=%u na=%u]\n", - prefix, cookie, cookie->parent, cookie->flags, + pr_err("%c-cookie c=%08x [p=%08x fl=%lx nc=%u na=%u]\n", + prefix, + cookie->debug_id, + cookie->parent ? cookie->parent->debug_id : 0, + cookie->flags, atomic_read(&cookie->n_children), atomic_read(&cookie->n_active)); - pr_err("%c-cookie d=%p n=%p\n", - prefix, cookie->def, cookie->netfs_data); - - object = READ_ONCE(cookie->backing_objects.first); - if (object) - pr_err("%c-cookie o=%p\n", - prefix, hlist_entry(object, struct fscache_object, cookie_link)); + pr_err("%c-cookie d=%p{%s} n=%p\n", + prefix, + cookie->def, + cookie->def ? cookie->def->name : "?", + cookie->netfs_data); + + o = READ_ONCE(cookie->backing_objects.first); + if (o) { + object = hlist_entry(o, struct fscache_object, cookie_link); + pr_err("%c-cookie o=%u\n", prefix, object->debug_id); + } pr_err("%c-key=[%u] '", prefix, cookie->key_len); k = (cookie->key_len <= sizeof(cookie->inline_key)) ? 
@@ -57,6 +67,9 @@ void fscache_free_cookie(struct fscache_cookie *cookie) { if (cookie) { BUG_ON(!hlist_empty(&cookie->backing_objects)); + write_lock(&fscache_cookies_lock); + list_del(&cookie->proc_link); + write_unlock(&fscache_cookies_lock); if (cookie->aux_len > sizeof(cookie->inline_aux)) kfree(cookie->aux); if (cookie->key_len > sizeof(cookie->inline_key)) @@ -74,10 +87,8 @@ void fscache_free_cookie(struct fscache_cookie *cookie) static int fscache_set_key(struct fscache_cookie *cookie, const void *index_key, size_t index_key_len) { - unsigned long long h; u32 *buf; int bufs; - int i; bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf)); @@ -91,17 +102,7 @@ static int fscache_set_key(struct fscache_cookie *cookie, } memcpy(buf, index_key, index_key_len); - - /* Calculate a hash and combine this with the length in the first word - * or first half word - */ - h = (unsigned long)cookie->parent; - h += index_key_len + cookie->type; - - for (i = 0; i < bufs; i++) - h += buf[i]; - - cookie->key_hash = h ^ (h >> 32); + cookie->key_hash = fscache_hash(0, buf, bufs); return 0; } @@ -129,6 +130,8 @@ static long fscache_compare_cookie(const struct fscache_cookie *a, return memcmp(ka, kb, a->key_len); } +static atomic_t fscache_cookie_debug_id = ATOMIC_INIT(1); + /* * Allocate a cookie. */ @@ -161,8 +164,9 @@ struct fscache_cookie *fscache_alloc_cookie( goto nomem; } - atomic_set(&cookie->usage, 1); + refcount_set(&cookie->ref, 1); atomic_set(&cookie->n_children, 0); + cookie->debug_id = atomic_inc_return(&fscache_cookie_debug_id); /* We keep the active count elevated until relinquishment to prevent an * attempt to wake up every time the object operations queue quiesces. 
@@ -181,6 +185,10 @@ struct fscache_cookie *fscache_alloc_cookie( /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); + + write_lock(&fscache_cookies_lock); + list_add_tail(&cookie->proc_link, &fscache_cookies); + write_unlock(&fscache_cookies_lock); return cookie; nomem: @@ -217,8 +225,8 @@ struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *candidate) collision: if (test_and_set_bit(FSCACHE_COOKIE_ACQUIRED, &cursor->flags)) { - trace_fscache_cookie(cursor, fscache_cookie_collision, - atomic_read(&cursor->usage)); + trace_fscache_cookie(cursor->debug_id, refcount_read(&cursor->ref), + fscache_cookie_collision); pr_err("Duplicate cookie detected\n"); fscache_print_cookie(cursor, 'O'); fscache_print_cookie(candidate, 'N'); @@ -297,7 +305,8 @@ struct fscache_cookie *__fscache_acquire_cookie( cookie = fscache_hash_cookie(candidate); if (!cookie) { - trace_fscache_cookie(candidate, fscache_cookie_discard, 1); + trace_fscache_cookie(candidate->debug_id, 1, + fscache_cookie_discard); goto out; } @@ -355,7 +364,7 @@ void __fscache_enable_cookie(struct fscache_cookie *cookie, bool (*can_enable)(void *data), void *data) { - _enter("%p", cookie); + _enter("%x", cookie->debug_id); trace_fscache_enable(cookie); @@ -452,10 +461,8 @@ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie, /* we may be required to wait for lookup to complete at this point */ if (!fscache_defer_lookup) { - _debug("non-deferred lookup %p", &cookie->flags); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, TASK_UNINTERRUPTIBLE); - _debug("complete"); if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) goto unavailable; } @@ -480,7 +487,7 @@ static int fscache_alloc_object(struct fscache_cache *cache, struct fscache_object *object; int ret; - _enter("%p,%p{%s}", cache, cookie, cookie->def->name); + _enter("%s,%x{%s}", cache->tag->name, 
cookie->debug_id, cookie->def->name); spin_lock(&cookie->lock); hlist_for_each_entry(object, &cookie->backing_objects, @@ -600,8 +607,6 @@ static int fscache_attach_object(struct fscache_cookie *cookie, /* Attach to the cookie. The object already has a ref on it. */ hlist_add_head(&object->cookie_link, &cookie->backing_objects); - - fscache_objlist_add(object); ret = 0; cant_attach_object: @@ -658,7 +663,7 @@ EXPORT_SYMBOL(__fscache_invalidate); */ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) { - _enter("%p", cookie); + _enter("%x", cookie->debug_id); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, TASK_UNINTERRUPTIBLE); @@ -713,7 +718,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, struct fscache_object *object; bool awaken = false; - _enter("%p,%u", cookie, invalidate); + _enter("%x,%u", cookie->debug_id, invalidate); trace_fscache_disable(cookie); @@ -803,8 +808,8 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, return; } - _enter("%p{%s,%p,%d},%d", - cookie, cookie->def->name, cookie->netfs_data, + _enter("%x{%s,%d},%d", + cookie->debug_id, cookie->def->name, atomic_read(&cookie->n_active), retire); trace_fscache_relinquish(cookie, retire); @@ -821,13 +826,12 @@ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, BUG_ON(!radix_tree_empty(&cookie->stores)); if (cookie->parent) { - ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); + ASSERTCMP(refcount_read(&cookie->parent->ref), >, 0); ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); atomic_dec(&cookie->parent->n_children); } /* Dispose of the netfs's link to the cookie */ - ASSERTCMP(atomic_read(&cookie->usage), >, 0); fscache_cookie_put(cookie, fscache_cookie_put_relinquish); _leave(""); @@ -857,17 +861,17 @@ void fscache_cookie_put(struct fscache_cookie *cookie, enum fscache_cookie_trace where) { struct fscache_cookie *parent; - int usage; + int ref; - _enter("%p", cookie); + _enter("%x", cookie->debug_id); do { - usage 
= atomic_dec_return(&cookie->usage); - trace_fscache_cookie(cookie, where, usage); + unsigned int cookie_debug_id = cookie->debug_id; + bool zero = __refcount_dec_and_test(&cookie->ref, &ref); - if (usage > 0) + trace_fscache_cookie(cookie_debug_id, ref - 1, where); + if (!zero) return; - BUG_ON(usage < 0); parent = cookie->parent; fscache_unhash_cookie(cookie); @@ -881,6 +885,19 @@ void fscache_cookie_put(struct fscache_cookie *cookie, } /* + * Get a reference to a cookie. + */ +struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + int ref; + + __refcount_inc(&cookie->ref, &ref); + trace_fscache_cookie(cookie->debug_id, ref + 1, where); + return cookie; +} + +/* * check the consistency between the netfs inode and the backing cache * * NOTE: it only serves no-index type @@ -958,3 +975,97 @@ inconsistent: return -ESTALE; } EXPORT_SYMBOL(__fscache_check_consistency); + +/* + * Generate a list of extant cookies in /proc/fs/fscache/cookies + */ +static int fscache_cookies_seq_show(struct seq_file *m, void *v) +{ + struct fscache_cookie *cookie; + unsigned int keylen = 0, auxlen = 0; + char _type[3], *type; + u8 *p; + + if (v == &fscache_cookies) { + seq_puts(m, + "COOKIE PARENT USAGE CHILD ACT TY FL DEF NETFS_DATA\n" + "======== ======== ===== ===== === == === ================ ==========\n" + ); + return 0; + } + + cookie = list_entry(v, struct fscache_cookie, proc_link); + + switch (cookie->type) { + case 0: + type = "IX"; + break; + case 1: + type = "DT"; + break; + default: + snprintf(_type, sizeof(_type), "%02u", + cookie->type); + type = _type; + break; + } + + seq_printf(m, + "%08x %08x %5u %5u %3u %s %03lx %-16s %px", + cookie->debug_id, + cookie->parent ? 
cookie->parent->debug_id : 0, + refcount_read(&cookie->ref), + atomic_read(&cookie->n_children), + atomic_read(&cookie->n_active), + type, + cookie->flags, + cookie->def->name, + cookie->netfs_data); + + keylen = cookie->key_len; + auxlen = cookie->aux_len; + + if (keylen > 0 || auxlen > 0) { + seq_puts(m, " "); + p = keylen <= sizeof(cookie->inline_key) ? + cookie->inline_key : cookie->key; + for (; keylen > 0; keylen--) + seq_printf(m, "%02x", *p++); + if (auxlen > 0) { + seq_puts(m, ", "); + p = auxlen <= sizeof(cookie->inline_aux) ? + cookie->inline_aux : cookie->aux; + for (; auxlen > 0; auxlen--) + seq_printf(m, "%02x", *p++); + } + } + + seq_puts(m, "\n"); + return 0; +} + +static void *fscache_cookies_seq_start(struct seq_file *m, loff_t *_pos) + __acquires(fscache_cookies_lock) +{ + read_lock(&fscache_cookies_lock); + return seq_list_start_head(&fscache_cookies, *_pos); +} + +static void *fscache_cookies_seq_next(struct seq_file *m, void *v, loff_t *_pos) +{ + return seq_list_next(v, &fscache_cookies, _pos); +} + +static void fscache_cookies_seq_stop(struct seq_file *m, void *v) + __releases(rcu) +{ + read_unlock(&fscache_cookies_lock); +} + + +const struct seq_operations fscache_cookies_seq_ops = { + .start = fscache_cookies_seq_start, + .next = fscache_cookies_seq_next, + .stop = fscache_cookies_seq_stop, + .show = fscache_cookies_seq_show, +}; diff --git a/fs/fscache/fsdef.c b/fs/fscache/fsdef.c index 09ed8795ad86..0402673c680e 100644 --- a/fs/fscache/fsdef.c +++ b/fs/fscache/fsdef.c @@ -45,7 +45,8 @@ static struct fscache_cookie_def fscache_fsdef_index_def = { }; struct fscache_cookie fscache_fsdef_index = { - .usage = ATOMIC_INIT(1), + .debug_id = 1, + .ref = REFCOUNT_INIT(1), .n_active = ATOMIC_INIT(1), .lock = __SPIN_LOCK_UNLOCKED(fscache_fsdef_index.lock), .backing_objects = HLIST_HEAD_INIT, diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c deleted file mode 100644 index 4e5beeaaf454..000000000000 --- a/fs/fscache/histogram.c +++ 
/dev/null @@ -1,87 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* FS-Cache latency histogram - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL THREAD -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include "internal.h" - -atomic_t fscache_obj_instantiate_histogram[HZ]; -atomic_t fscache_objs_histogram[HZ]; -atomic_t fscache_ops_histogram[HZ]; -atomic_t fscache_retrieval_delay_histogram[HZ]; -atomic_t fscache_retrieval_histogram[HZ]; - -/* - * display the time-taken histogram - */ -static int fscache_histogram_show(struct seq_file *m, void *v) -{ - unsigned long index; - unsigned n[5], t; - - switch ((unsigned long) v) { - case 1: - seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n"); - return 0; - case 2: - seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n"); - return 0; - default: - index = (unsigned long) v - 3; - n[0] = atomic_read(&fscache_obj_instantiate_histogram[index]); - n[1] = atomic_read(&fscache_ops_histogram[index]); - n[2] = atomic_read(&fscache_objs_histogram[index]); - n[3] = atomic_read(&fscache_retrieval_delay_histogram[index]); - n[4] = atomic_read(&fscache_retrieval_histogram[index]); - if (!(n[0] | n[1] | n[2] | n[3] | n[4])) - return 0; - - t = (index * 1000) / HZ; - - seq_printf(m, "%4lu 0.%03u %9u %9u %9u %9u %9u\n", - index, t, n[0], n[1], n[2], n[3], n[4]); - return 0; - } -} - -/* - * set up the iterator to start reading from the first line - */ -static void *fscache_histogram_start(struct seq_file *m, loff_t *_pos) -{ - if ((unsigned long long)*_pos >= HZ + 2) - return NULL; - if (*_pos == 0) - *_pos = 1; - return (void *)(unsigned long) *_pos; -} - -/* - * move to the next line - */ -static void *fscache_histogram_next(struct seq_file *m, void *v, loff_t *pos) -{ - (*pos)++; - return (unsigned long long)*pos > HZ + 2 ? 
- NULL : (void *)(unsigned long) *pos; -} - -/* - * clean up after reading - */ -static void fscache_histogram_stop(struct seq_file *m, void *v) -{ -} - -const struct seq_operations fscache_histogram_ops = { - .start = fscache_histogram_start, - .stop = fscache_histogram_stop, - .next = fscache_histogram_next, - .show = fscache_histogram_show, -}; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index c483863b740a..c3e4804b8fcb 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -45,6 +45,7 @@ extern struct fscache_cache *fscache_select_cache_for_object( * cookie.c */ extern struct kmem_cache *fscache_cookie_jar; +extern const struct seq_operations fscache_cookies_seq_ops; extern void fscache_free_cookie(struct fscache_cookie *); extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, @@ -53,9 +54,18 @@ extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *, const void *, size_t, void *, loff_t); extern struct fscache_cookie *fscache_hash_cookie(struct fscache_cookie *); +extern struct fscache_cookie *fscache_cookie_get(struct fscache_cookie *, + enum fscache_cookie_trace); extern void fscache_cookie_put(struct fscache_cookie *, enum fscache_cookie_trace); +static inline void fscache_cookie_see(struct fscache_cookie *cookie, + enum fscache_cookie_trace where) +{ + trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref), + where); +} + /* * fsdef.c */ @@ -63,30 +73,6 @@ extern struct fscache_cookie fscache_fsdef_index; extern struct fscache_cookie_def fscache_fsdef_netfs_def; /* - * histogram.c - */ -#ifdef CONFIG_FSCACHE_HISTOGRAM -extern atomic_t fscache_obj_instantiate_histogram[HZ]; -extern atomic_t fscache_objs_histogram[HZ]; -extern atomic_t fscache_ops_histogram[HZ]; -extern atomic_t fscache_retrieval_delay_histogram[HZ]; -extern atomic_t fscache_retrieval_histogram[HZ]; - -static inline void fscache_hist(atomic_t histogram[], unsigned long start_jif) -{ - unsigned long jif = jiffies - 
start_jif; - if (jif >= HZ) - jif = HZ - 1; - atomic_inc(&histogram[jif]); -} - -extern const struct seq_operations fscache_histogram_ops; - -#else -#define fscache_hist(hist, start_jif) do {} while (0) -#endif - -/* * main.c */ extern unsigned fscache_defer_lookup; @@ -97,6 +83,8 @@ extern struct workqueue_struct *fscache_object_wq; extern struct workqueue_struct *fscache_op_wq; DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); +extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); + static inline bool fscache_object_congested(void) { return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); @@ -108,19 +96,6 @@ static inline bool fscache_object_congested(void) extern void fscache_enqueue_object(struct fscache_object *); /* - * object-list.c - */ -#ifdef CONFIG_FSCACHE_OBJECT_LIST -extern const struct proc_ops fscache_objlist_proc_ops; - -extern void fscache_objlist_add(struct fscache_object *); -extern void fscache_objlist_remove(struct fscache_object *); -#else -#define fscache_objlist_add(object) do {} while(0) -#define fscache_objlist_remove(object) do {} while(0) -#endif - -/* * operation.c */ extern int fscache_submit_exclusive_op(struct fscache_object *, @@ -320,14 +295,6 @@ static inline void fscache_raise_event(struct fscache_object *object, fscache_enqueue_object(object); } -static inline void fscache_cookie_get(struct fscache_cookie *cookie, - enum fscache_cookie_trace where) -{ - int usage = atomic_inc_return(&cookie->usage); - - trace_fscache_cookie(cookie, where, usage); -} - /* * get an extra reference to a netfs retrieval context */ diff --git a/fs/fscache/main.c b/fs/fscache/main.c index c1e6cc9091aa..4207f98e405f 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -94,6 +94,45 @@ static struct ctl_table fscache_sysctls_root[] = { #endif /* + * Mixing scores (in bits) for (7,20): + * Input delta: 1-bit 2-bit + * 1 round: 330.3 9201.6 + * 2 rounds: 1246.4 25475.4 + * 3 rounds: 1907.1 
31295.1 + * 4 rounds: 2042.3 31718.6 + * Perfect: 2048 31744 + * (32*64) (32*31/2 * 64) + */ +#define HASH_MIX(x, y, a) \ + ( x ^= (a), \ + y ^= x, x = rol32(x, 7),\ + x += y, y = rol32(y,20),\ + y *= 9 ) + +static inline unsigned int fold_hash(unsigned long x, unsigned long y) +{ + /* Use arch-optimized multiply if one exists */ + return __hash_32(y ^ __hash_32(x)); +} + +/* + * Generate a hash. This is derived from full_name_hash(), but we want to be + * sure it is arch independent and that it doesn't change as bits of the + * computed hash value might appear on disk. The caller also guarantees that + * the hashed data will be a series of aligned 32-bit words. + */ +unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) +{ + unsigned int a, x = 0, y = salt; + + for (; n; n--) { + a = *data++; + HASH_MIX(x, y, a); + } + return fold_hash(x, y); +} + +/* * initialise the fs caching module */ static int __init fscache_init(void) diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index cce92216fa28..d6bdb7b5e723 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -37,7 +37,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) if (!cookie) goto already_registered; if (cookie != candidate) { - trace_fscache_cookie(candidate, fscache_cookie_discard, 1); + trace_fscache_cookie(candidate->debug_id, 1, fscache_cookie_discard); fscache_free_cookie(candidate); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c deleted file mode 100644 index e106a1a1600d..000000000000 --- a/fs/fscache/object-list.c +++ /dev/null @@ -1,414 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* Global fscache object list maintainer and viewer - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. 
- * Written by David Howells (dhowells@redhat.com) - */ - -#define FSCACHE_DEBUG_LEVEL COOKIE -#include <linux/module.h> -#include <linux/proc_fs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/key.h> -#include <keys/user-type.h> -#include "internal.h" - -static struct rb_root fscache_object_list; -static DEFINE_RWLOCK(fscache_object_list_lock); - -struct fscache_objlist_data { - unsigned long config; /* display configuration */ -#define FSCACHE_OBJLIST_CONFIG_KEY 0x00000001 /* show object keys */ -#define FSCACHE_OBJLIST_CONFIG_AUX 0x00000002 /* show object auxdata */ -#define FSCACHE_OBJLIST_CONFIG_COOKIE 0x00000004 /* show objects with cookies */ -#define FSCACHE_OBJLIST_CONFIG_NOCOOKIE 0x00000008 /* show objects without cookies */ -#define FSCACHE_OBJLIST_CONFIG_BUSY 0x00000010 /* show busy objects */ -#define FSCACHE_OBJLIST_CONFIG_IDLE 0x00000020 /* show idle objects */ -#define FSCACHE_OBJLIST_CONFIG_PENDWR 0x00000040 /* show objects with pending writes */ -#define FSCACHE_OBJLIST_CONFIG_NOPENDWR 0x00000080 /* show objects without pending writes */ -#define FSCACHE_OBJLIST_CONFIG_READS 0x00000100 /* show objects with active reads */ -#define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ -#define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ -#define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ -#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ -#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ -}; - -/* - * Add an object to the object list - * - we use the address of the fscache_object structure as the key into the - * tree - */ -void fscache_objlist_add(struct fscache_object *obj) -{ - struct fscache_object *xobj; - struct rb_node **p = &fscache_object_list.rb_node, *parent = NULL; - - ASSERT(RB_EMPTY_NODE(&obj->objlist_link)); - - write_lock(&fscache_object_list_lock); - - while 
(*p) { - parent = *p; - xobj = rb_entry(parent, struct fscache_object, objlist_link); - - if (obj < xobj) - p = &(*p)->rb_left; - else if (obj > xobj) - p = &(*p)->rb_right; - else - BUG(); - } - - rb_link_node(&obj->objlist_link, parent, p); - rb_insert_color(&obj->objlist_link, &fscache_object_list); - - write_unlock(&fscache_object_list_lock); -} - -/* - * Remove an object from the object list. - */ -void fscache_objlist_remove(struct fscache_object *obj) -{ - if (RB_EMPTY_NODE(&obj->objlist_link)) - return; - - write_lock(&fscache_object_list_lock); - - BUG_ON(RB_EMPTY_ROOT(&fscache_object_list)); - rb_erase(&obj->objlist_link, &fscache_object_list); - - write_unlock(&fscache_object_list_lock); -} - -/* - * find the object in the tree on or after the specified index - */ -static struct fscache_object *fscache_objlist_lookup(loff_t *_pos) -{ - struct fscache_object *pobj, *obj = NULL, *minobj = NULL; - struct rb_node *p; - unsigned long pos; - - if (*_pos >= (unsigned long) ERR_PTR(-ENOENT)) - return NULL; - pos = *_pos; - - /* banners (can't represent line 0 by pos 0 as that would involve - * returning a NULL pointer) */ - if (pos == 0) - return (struct fscache_object *)(long)++(*_pos); - if (pos < 3) - return (struct fscache_object *)pos; - - pobj = (struct fscache_object *)pos; - p = fscache_object_list.rb_node; - while (p) { - obj = rb_entry(p, struct fscache_object, objlist_link); - if (pobj < obj) { - if (!minobj || minobj > obj) - minobj = obj; - p = p->rb_left; - } else if (pobj > obj) { - p = p->rb_right; - } else { - minobj = obj; - break; - } - obj = NULL; - } - - if (!minobj) - *_pos = (unsigned long) ERR_PTR(-ENOENT); - else if (minobj != obj) - *_pos = (unsigned long) minobj; - return minobj; -} - -/* - * set up the iterator to start reading from the first line - */ -static void *fscache_objlist_start(struct seq_file *m, loff_t *_pos) - __acquires(&fscache_object_list_lock) -{ - read_lock(&fscache_object_list_lock); - return 
fscache_objlist_lookup(_pos); -} - -/* - * move to the next line - */ -static void *fscache_objlist_next(struct seq_file *m, void *v, loff_t *_pos) -{ - (*_pos)++; - return fscache_objlist_lookup(_pos); -} - -/* - * clean up after reading - */ -static void fscache_objlist_stop(struct seq_file *m, void *v) - __releases(&fscache_object_list_lock) -{ - read_unlock(&fscache_object_list_lock); -} - -/* - * display an object - */ -static int fscache_objlist_show(struct seq_file *m, void *v) -{ - struct fscache_objlist_data *data = m->private; - struct fscache_object *obj = v; - struct fscache_cookie *cookie; - unsigned long config = data->config; - char _type[3], *type; - u8 *p; - - if ((unsigned long) v == 1) { - seq_puts(m, "OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS" - " EM EV FL S" - " | NETFS_COOKIE_DEF TY FL NETFS_DATA"); - if (config & (FSCACHE_OBJLIST_CONFIG_KEY | - FSCACHE_OBJLIST_CONFIG_AUX)) - seq_puts(m, " "); - if (config & FSCACHE_OBJLIST_CONFIG_KEY) - seq_puts(m, "OBJECT_KEY"); - if ((config & (FSCACHE_OBJLIST_CONFIG_KEY | - FSCACHE_OBJLIST_CONFIG_AUX)) == - (FSCACHE_OBJLIST_CONFIG_KEY | FSCACHE_OBJLIST_CONFIG_AUX)) - seq_puts(m, ", "); - if (config & FSCACHE_OBJLIST_CONFIG_AUX) - seq_puts(m, "AUX_DATA"); - seq_puts(m, "\n"); - return 0; - } - - if ((unsigned long) v == 2) { - seq_puts(m, "======== ======== ==== ===== === === === == =====" - " == == == =" - " | ================ == == ================"); - if (config & (FSCACHE_OBJLIST_CONFIG_KEY | - FSCACHE_OBJLIST_CONFIG_AUX)) - seq_puts(m, " ================"); - seq_puts(m, "\n"); - return 0; - } - - /* filter out any unwanted objects */ -#define FILTER(criterion, _yes, _no) \ - do { \ - unsigned long yes = FSCACHE_OBJLIST_CONFIG_##_yes; \ - unsigned long no = FSCACHE_OBJLIST_CONFIG_##_no; \ - if (criterion) { \ - if (!(config & yes)) \ - return 0; \ - } else { \ - if (!(config & no)) \ - return 0; \ - } \ - } while(0) - - cookie = obj->cookie; - if (~config) { - FILTER(cookie->def, - COOKIE, 
NOCOOKIE); - FILTER(fscache_object_is_active(obj) || - obj->n_ops != 0 || - obj->n_obj_ops != 0 || - obj->flags || - !list_empty(&obj->dependents), - BUSY, IDLE); - FILTER(test_bit(FSCACHE_OBJECT_PENDING_WRITE, &obj->flags), - PENDWR, NOPENDWR); - FILTER(atomic_read(&obj->n_reads), - READS, NOREADS); - FILTER(obj->events & obj->event_mask, - EVENTS, NOEVENTS); - FILTER(work_busy(&obj->work), WORK, NOWORK); - } - - seq_printf(m, - "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %2lx %1x | ", - obj->debug_id, - obj->parent ? obj->parent->debug_id : -1, - obj->state->short_name, - obj->n_children, - obj->n_ops, - obj->n_obj_ops, - obj->n_in_progress, - obj->n_exclusive, - atomic_read(&obj->n_reads), - obj->event_mask, - obj->events, - obj->flags, - work_busy(&obj->work)); - - if (fscache_use_cookie(obj)) { - uint16_t keylen = 0, auxlen = 0; - - switch (cookie->type) { - case 0: - type = "IX"; - break; - case 1: - type = "DT"; - break; - default: - snprintf(_type, sizeof(_type), "%02u", - cookie->type); - type = _type; - break; - } - - seq_printf(m, "%-16s %s %2lx %16p", - cookie->def->name, - type, - cookie->flags, - cookie->netfs_data); - - if (config & FSCACHE_OBJLIST_CONFIG_KEY) - keylen = cookie->key_len; - - if (config & FSCACHE_OBJLIST_CONFIG_AUX) - auxlen = cookie->aux_len; - - if (keylen > 0 || auxlen > 0) { - seq_puts(m, " "); - p = keylen <= sizeof(cookie->inline_key) ? - cookie->inline_key : cookie->key; - for (; keylen > 0; keylen--) - seq_printf(m, "%02x", *p++); - if (auxlen > 0) { - if (config & FSCACHE_OBJLIST_CONFIG_KEY) - seq_puts(m, ", "); - p = auxlen <= sizeof(cookie->inline_aux) ? 
- cookie->inline_aux : cookie->aux; - for (; auxlen > 0; auxlen--) - seq_printf(m, "%02x", *p++); - } - } - - seq_puts(m, "\n"); - fscache_unuse_cookie(obj); - } else { - seq_puts(m, "<no_netfs>\n"); - } - return 0; -} - -static const struct seq_operations fscache_objlist_ops = { - .start = fscache_objlist_start, - .stop = fscache_objlist_stop, - .next = fscache_objlist_next, - .show = fscache_objlist_show, -}; - -/* - * get the configuration for filtering the list - */ -static void fscache_objlist_config(struct fscache_objlist_data *data) -{ -#ifdef CONFIG_KEYS - const struct user_key_payload *confkey; - unsigned long config; - struct key *key; - const char *buf; - int len; - - key = request_key(&key_type_user, "fscache:objlist", NULL); - if (IS_ERR(key)) - goto no_config; - - config = 0; - rcu_read_lock(); - - confkey = user_key_payload_rcu(key); - if (!confkey) { - /* key was revoked */ - rcu_read_unlock(); - key_put(key); - goto no_config; - } - - buf = confkey->data; - - for (len = confkey->datalen - 1; len >= 0; len--) { - switch (buf[len]) { - case 'K': config |= FSCACHE_OBJLIST_CONFIG_KEY; break; - case 'A': config |= FSCACHE_OBJLIST_CONFIG_AUX; break; - case 'C': config |= FSCACHE_OBJLIST_CONFIG_COOKIE; break; - case 'c': config |= FSCACHE_OBJLIST_CONFIG_NOCOOKIE; break; - case 'B': config |= FSCACHE_OBJLIST_CONFIG_BUSY; break; - case 'b': config |= FSCACHE_OBJLIST_CONFIG_IDLE; break; - case 'W': config |= FSCACHE_OBJLIST_CONFIG_PENDWR; break; - case 'w': config |= FSCACHE_OBJLIST_CONFIG_NOPENDWR; break; - case 'R': config |= FSCACHE_OBJLIST_CONFIG_READS; break; - case 'r': config |= FSCACHE_OBJLIST_CONFIG_NOREADS; break; - case 'S': config |= FSCACHE_OBJLIST_CONFIG_WORK; break; - case 's': config |= FSCACHE_OBJLIST_CONFIG_NOWORK; break; - } - } - - rcu_read_unlock(); - key_put(key); - - if (!(config & (FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE))) - config |= FSCACHE_OBJLIST_CONFIG_COOKIE | FSCACHE_OBJLIST_CONFIG_NOCOOKIE; - if 
(!(config & (FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE))) - config |= FSCACHE_OBJLIST_CONFIG_BUSY | FSCACHE_OBJLIST_CONFIG_IDLE; - if (!(config & (FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR))) - config |= FSCACHE_OBJLIST_CONFIG_PENDWR | FSCACHE_OBJLIST_CONFIG_NOPENDWR; - if (!(config & (FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS))) - config |= FSCACHE_OBJLIST_CONFIG_READS | FSCACHE_OBJLIST_CONFIG_NOREADS; - if (!(config & (FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS))) - config |= FSCACHE_OBJLIST_CONFIG_EVENTS | FSCACHE_OBJLIST_CONFIG_NOEVENTS; - if (!(config & (FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK))) - config |= FSCACHE_OBJLIST_CONFIG_WORK | FSCACHE_OBJLIST_CONFIG_NOWORK; - - data->config = config; - return; - -no_config: -#endif - data->config = ULONG_MAX; -} - -/* - * open "/proc/fs/fscache/objects" to provide a list of active objects - * - can be configured by a user-defined key added to the caller's keyrings - */ -static int fscache_objlist_open(struct inode *inode, struct file *file) -{ - struct fscache_objlist_data *data; - - data = __seq_open_private(file, &fscache_objlist_ops, sizeof(*data)); - if (!data) - return -ENOMEM; - - /* get the configuration key */ - fscache_objlist_config(data); - - return 0; -} - -/* - * clean up on close - */ -static int fscache_objlist_release(struct inode *inode, struct file *file) -{ - struct seq_file *m = file->private_data; - - kfree(m->private); - m->private = NULL; - return seq_release(inode, file); -} - -const struct proc_ops fscache_objlist_proc_ops = { - .proc_open = fscache_objlist_open, - .proc_read = seq_read, - .proc_lseek = seq_lseek, - .proc_release = fscache_objlist_release, -}; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index cb2146e02cd5..f346a78f4bd6 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -277,13 +277,10 @@ static void fscache_object_work_func(struct work_struct *work) { 
struct fscache_object *object = container_of(work, struct fscache_object, work); - unsigned long start; _enter("{OBJ%x}", object->debug_id); - start = jiffies; fscache_object_sm_dispatcher(object); - fscache_hist(fscache_objs_histogram, start); fscache_put_object(object, fscache_obj_put_work); } @@ -436,7 +433,6 @@ static const struct fscache_state *fscache_parent_ready(struct fscache_object *o spin_lock(&parent->lock); parent->n_ops++; parent->n_obj_ops++; - object->lookup_jif = jiffies; spin_unlock(&parent->lock); _leave(""); @@ -522,7 +518,6 @@ void fscache_object_lookup_negative(struct fscache_object *object) set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); clear_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); - _debug("wake up lookup %p", &cookie->flags); clear_bit_unlock(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); wake_up_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP); } @@ -596,7 +591,6 @@ static const struct fscache_state *fscache_object_available(struct fscache_objec object->cache->ops->lookup_complete(object); fscache_stat_d(&fscache_n_cop_lookup_complete); - fscache_hist(fscache_obj_instantiate_histogram, object->lookup_jif); fscache_stat(&fscache_n_object_avail); _leave(""); @@ -799,8 +793,6 @@ static void fscache_put_object(struct fscache_object *object, */ void fscache_object_destroy(struct fscache_object *object) { - fscache_objlist_remove(object); - /* We can get rid of the cookie now */ fscache_cookie_put(object->cookie, fscache_cookie_put_object); object->cookie = NULL; diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 4a5651d4904e..433877107700 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -616,7 +616,6 @@ void fscache_op_work_func(struct work_struct *work) { struct fscache_operation *op = container_of(work, struct fscache_operation, work); - unsigned long start; _enter("{OBJ%x OP%x,%d}", op->object->debug_id, op->debug_id, atomic_read(&op->usage)); @@ -624,9 +623,7 @@ void fscache_op_work_func(struct 
work_struct *work) trace_fscache_op(op->object->cookie, op, fscache_op_work); ASSERT(op->processor != NULL); - start = jiffies; op->processor(op); - fscache_hist(fscache_ops_histogram, start); fscache_put_operation(op); _leave(""); diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 991b0a871744..27df94ef0e0b 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -289,7 +289,6 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, atomic_read(&op->n_pages), ==, 0); - fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) fscache_put_context(op->cookie, op->context); @@ -324,7 +323,6 @@ struct fscache_retrieval *fscache_alloc_retrieval( op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; - op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); /* Pin the netfs read context in case we need to do the actual netfs @@ -340,8 +338,6 @@ struct fscache_retrieval *fscache_alloc_retrieval( */ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) { - unsigned long jif; - _enter(""); if (!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) { @@ -351,7 +347,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) fscache_stat(&fscache_n_retrievals_wait); - jif = jiffies; if (wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, TASK_INTERRUPTIBLE) != 0) { fscache_stat(&fscache_n_retrievals_intr); @@ -362,7 +357,6 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) ASSERT(!test_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)); smp_rmb(); - fscache_hist(fscache_retrieval_delay_histogram, jif); _leave(" = 0 [dly]"); return 0; } diff --git a/fs/fscache/proc.c b/fs/fscache/proc.c index 90a7bc22f7e1..061df8f61ffc 100644 --- a/fs/fscache/proc.c +++ b/fs/fscache/proc.c @@ -21,18 +21,16 @@ int __init fscache_proc_init(void) if (!proc_mkdir("fs/fscache", NULL)) goto error_dir; + if 
(!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL, + &fscache_cookies_seq_ops)) + goto error_cookies; + #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL, fscache_stats_show)) goto error_stats; #endif -#ifdef CONFIG_FSCACHE_HISTOGRAM - if (!proc_create_seq("fs/fscache/histogram", S_IFREG | 0444, NULL, - &fscache_histogram_ops)) - goto error_histogram; -#endif - #ifdef CONFIG_FSCACHE_OBJECT_LIST if (!proc_create("fs/fscache/objects", S_IFREG | 0444, NULL, &fscache_objlist_proc_ops)) @@ -45,14 +43,12 @@ int __init fscache_proc_init(void) #ifdef CONFIG_FSCACHE_OBJECT_LIST error_objects: #endif -#ifdef CONFIG_FSCACHE_HISTOGRAM - remove_proc_entry("fs/fscache/histogram", NULL); -error_histogram: -#endif #ifdef CONFIG_FSCACHE_STATS remove_proc_entry("fs/fscache/stats", NULL); error_stats: #endif + remove_proc_entry("fs/fscache/cookies", NULL); +error_cookies: remove_proc_entry("fs/fscache", NULL); error_dir: _leave(" = -ENOMEM"); @@ -67,11 +63,9 @@ void fscache_proc_cleanup(void) #ifdef CONFIG_FSCACHE_OBJECT_LIST remove_proc_entry("fs/fscache/objects", NULL); #endif -#ifdef CONFIG_FSCACHE_HISTOGRAM - remove_proc_entry("fs/fscache/histogram", NULL); -#endif #ifdef CONFIG_FSCACHE_STATS remove_proc_entry("fs/fscache/stats", NULL); #endif + remove_proc_entry("fs/fscache/cookies", NULL); remove_proc_entry("fs/fscache", NULL); } diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index 52b165319be1..337cb29a8dd5 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -11,7 +11,7 @@ #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> -struct posix_acl *fuse_get_acl(struct inode *inode, int type) +struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu) { struct fuse_conn *fc = get_fuse_conn(inode); int size; @@ -19,6 +19,9 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) void *value = NULL; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + if (fuse_is_bad(inode)) return ERR_PTR(-EIO); 
diff --git a/fs/fuse/control.c b/fs/fuse/control.c index cc7e94d73c6c..000d2e5627e9 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -328,7 +328,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) drop_nlink(d_inode(fuse_control_sb->s_root)); } -static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx) +static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) { static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; @@ -354,18 +354,18 @@ static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fctx) return 0; } -static int fuse_ctl_get_tree(struct fs_context *fc) +static int fuse_ctl_get_tree(struct fs_context *fsc) { - return get_tree_single(fc, fuse_ctl_fill_super); + return get_tree_single(fsc, fuse_ctl_fill_super); } static const struct fs_context_operations fuse_ctl_context_ops = { .get_tree = fuse_ctl_get_tree, }; -static int fuse_ctl_init_fs_context(struct fs_context *fc) +static int fuse_ctl_init_fs_context(struct fs_context *fsc) { - fc->ops = &fuse_ctl_context_ops; + fsc->ops = &fuse_ctl_context_ops; return 0; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 1c8f79b3dd06..dde341a6388a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -288,10 +288,10 @@ void fuse_request_end(struct fuse_req *req) /* * test_and_set_bit() implies smp_mb() between bit - * changing and below intr_entry check. Pairs with + * changing and below FR_INTERRUPTED check. Pairs with * smp_mb() from queue_interrupt(). 
*/ - if (!list_empty(&req->intr_entry)) { + if (test_bit(FR_INTERRUPTED, &req->flags)) { spin_lock(&fiq->lock); list_del_init(&req->intr_entry); spin_unlock(&fiq->lock); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 621a662c19fb..11404f8c21c7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -198,12 +198,11 @@ void fuse_finish_open(struct inode *inode, struct file *file) struct fuse_file *ff = file->private_data; struct fuse_conn *fc = get_fuse_conn(inode); - if (!(ff->open_flags & FOPEN_KEEP_CACHE)) - invalidate_inode_pages2(inode->i_mapping); if (ff->open_flags & FOPEN_STREAM) stream_open(inode, file); else if (ff->open_flags & FOPEN_NONSEEKABLE) nonseekable_open(inode, file); + if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) { struct fuse_inode *fi = get_fuse_inode(inode); @@ -211,10 +210,14 @@ void fuse_finish_open(struct inode *inode, struct file *file) fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, 0); spin_unlock(&fi->lock); + truncate_pagecache(inode, 0); fuse_invalidate_attr(inode); if (fc->writeback_cache) file_update_time(file); + } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { + invalidate_inode_pages2(inode->i_mapping); } + if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); } @@ -389,6 +392,7 @@ struct fuse_writepage_args { struct list_head queue_entry; struct fuse_writepage_args *next; struct inode *inode; + struct fuse_sync_bucket *bucket; }; static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, @@ -1608,6 +1612,9 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; int i; + if (wpa->bucket) + fuse_sync_bucket_dec(wpa->bucket); + for (i = 0; i < ap->num_pages; i++) __free_page(ap->pages[i]); @@ -1813,8 +1820,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, fuse_writepage_free(wpa); } -static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, - 
struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) { struct fuse_file *ff = NULL; @@ -1829,22 +1835,20 @@ static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, return ff; } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = __fuse_write_file_get(fc, fi); + struct fuse_file *ff = __fuse_write_file_get(fi); WARN_ON(!ff); return ff; } int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff; int err; - ff = __fuse_write_file_get(fc, fi); + ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) fuse_file_put(ff, false, false); @@ -1871,6 +1875,20 @@ static struct fuse_writepage_args *fuse_writepage_args_alloc(void) } +static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) +{ + if (!fc->sync_fs) + return; + + rcu_read_lock(); + /* Prevent resurrection of dead bucket in unlikely race with syncfs */ + do { + wpa->bucket = rcu_dereference(fc->curr_bucket); + } while (unlikely(!atomic_inc_not_zero(&wpa->bucket->count))); + rcu_read_unlock(); +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; @@ -1894,10 +1912,11 @@ static int fuse_writepage_locked(struct page *page) goto err_free; error = -EIO; - wpa->ia.ff = fuse_write_file_get(fc, fi); + wpa->ia.ff = fuse_write_file_get(fi); if (!wpa->ia.ff) goto err_nofile; + fuse_writepage_add_to_bucket(fc, wpa); fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); copy_highpage(tmp_page, page); @@ -2113,7 +2132,7 @@ static int fuse_writepages_fill(struct page *page, if (!data->ff) { err = -EIO; - data->ff = fuse_write_file_get(fc, fi); + data->ff = fuse_write_file_get(fi); if 
(!data->ff) goto out_unlock; } @@ -2148,6 +2167,8 @@ static int fuse_writepages_fill(struct page *page, __free_page(tmp_page); goto out_unlock; } + fuse_writepage_add_to_bucket(fc, wpa); + data->max_pages = 1; ap = &wpa->ia.ap; @@ -2881,7 +2902,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end) { - int err = filemap_write_and_wait_range(inode->i_mapping, start, end); + int err = filemap_write_and_wait_range(inode->i_mapping, start, -1); if (!err) fuse_sync_writes(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6fb639b97ea8..319596df5dc6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -482,6 +482,7 @@ struct fuse_dev { struct fuse_fs_context { int fd; + struct file *file; unsigned int rootmode; kuid_t user_id; kgid_t group_id; @@ -508,6 +509,13 @@ struct fuse_fs_context { void **fudptr; }; +struct fuse_sync_bucket { + /* count is a possible scalability bottleneck */ + atomic_t count; + wait_queue_head_t waitq; + struct rcu_head rcu; +}; + /** * A Fuse connection. 
* @@ -800,6 +808,9 @@ struct fuse_conn { /** List of filesystems using this connection */ struct list_head mounts; + + /* New writepages go into this bucket */ + struct fuse_sync_bucket __rcu *curr_bucket; }; /* @@ -903,6 +914,15 @@ static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, descs[i].length = PAGE_SIZE - descs[i].offset; } +static inline void fuse_sync_bucket_dec(struct fuse_sync_bucket *bucket) +{ + /* Need RCU protection to prevent use after free after the decrement */ + rcu_read_lock(); + if (atomic_dec_and_test(&bucket->count)) + wake_up(&bucket->waitq); + rcu_read_unlock(); +} + /** Device operations */ extern const struct file_operations fuse_dev_operations; @@ -1209,7 +1229,7 @@ extern const struct xattr_handler *fuse_acl_xattr_handlers[]; extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; -struct posix_acl *fuse_get_acl(struct inode *inode, int type); +struct posix_acl *fuse_get_acl(struct inode *inode, int type, bool rcu); int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e07e429f32e1..36cd03114b6d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -137,12 +137,12 @@ static void fuse_evict_inode(struct inode *inode) } } -static int fuse_reconfigure(struct fs_context *fc) +static int fuse_reconfigure(struct fs_context *fsc) { - struct super_block *sb = fc->root->d_sb; + struct super_block *sb = fsc->root->d_sb; sync_filesystem(sb); - if (fc->sb_flags & SB_MANDLOCK) + if (fsc->sb_flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -505,6 +505,57 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) return err; } +static struct fuse_sync_bucket *fuse_sync_bucket_alloc(void) +{ + struct fuse_sync_bucket *bucket; + + bucket = kzalloc(sizeof(*bucket), GFP_KERNEL | __GFP_NOFAIL); + if (bucket) { + init_waitqueue_head(&bucket->waitq); + /* Initial active count */ + 
atomic_set(&bucket->count, 1); + } + return bucket; +} + +static void fuse_sync_fs_writes(struct fuse_conn *fc) +{ + struct fuse_sync_bucket *bucket, *new_bucket; + int count; + + new_bucket = fuse_sync_bucket_alloc(); + spin_lock(&fc->lock); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + count = atomic_read(&bucket->count); + WARN_ON(count < 1); + /* No outstanding writes? */ + if (count == 1) { + spin_unlock(&fc->lock); + kfree(new_bucket); + return; + } + + /* + * Completion of new bucket depends on completion of this bucket, so add + * one more count. + */ + atomic_inc(&new_bucket->count); + rcu_assign_pointer(fc->curr_bucket, new_bucket); + spin_unlock(&fc->lock); + /* + * Drop initial active count. At this point if all writes in this and + * ancestor buckets complete, the count will go to zero and this task + * will be woken up. + */ + atomic_dec(&bucket->count); + + wait_event(bucket->waitq, atomic_read(&bucket->count) == 0); + + /* Drop temp count on descendant bucket */ + fuse_sync_bucket_dec(new_bucket); + kfree_rcu(bucket, rcu); +} + static int fuse_sync_fs(struct super_block *sb, int wait) { struct fuse_mount *fm = get_fuse_mount_super(sb); @@ -527,6 +578,8 @@ static int fuse_sync_fs(struct super_block *sb, int wait) if (!fc->sync_fs) return 0; + fuse_sync_fs_writes(fc); + memset(&inarg, 0, sizeof(inarg)); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); @@ -572,38 +625,38 @@ static const struct fs_parameter_spec fuse_fs_parameters[] = { {} }; -static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) +static int fuse_parse_param(struct fs_context *fsc, struct fs_parameter *param) { struct fs_parse_result result; - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; int opt; - if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + if (fsc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { /* * Ignore options coming from mount(MS_REMOUNT) for backward * compatibility. 
*/ - if (fc->oldapi) + if (fsc->oldapi) return 0; - return invalfc(fc, "No changes allowed in reconfigure"); + return invalfc(fsc, "No changes allowed in reconfigure"); } - opt = fs_parse(fc, fuse_fs_parameters, param, &result); + opt = fs_parse(fsc, fuse_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case OPT_SOURCE: - if (fc->source) - return invalfc(fc, "Multiple sources specified"); - fc->source = param->string; + if (fsc->source) + return invalfc(fsc, "Multiple sources specified"); + fsc->source = param->string; param->string = NULL; break; case OPT_SUBTYPE: if (ctx->subtype) - return invalfc(fc, "Multiple subtypes specified"); + return invalfc(fsc, "Multiple subtypes specified"); ctx->subtype = param->string; param->string = NULL; return 0; @@ -615,22 +668,22 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_ROOTMODE: if (!fuse_valid_type(result.uint_32)) - return invalfc(fc, "Invalid rootmode"); + return invalfc(fsc, "Invalid rootmode"); ctx->rootmode = result.uint_32; ctx->rootmode_present = true; break; case OPT_USER_ID: - ctx->user_id = make_kuid(fc->user_ns, result.uint_32); + ctx->user_id = make_kuid(fsc->user_ns, result.uint_32); if (!uid_valid(ctx->user_id)) - return invalfc(fc, "Invalid user_id"); + return invalfc(fsc, "Invalid user_id"); ctx->user_id_present = true; break; case OPT_GROUP_ID: - ctx->group_id = make_kgid(fc->user_ns, result.uint_32); + ctx->group_id = make_kgid(fsc->user_ns, result.uint_32); if (!gid_valid(ctx->group_id)) - return invalfc(fc, "Invalid group_id"); + return invalfc(fsc, "Invalid group_id"); ctx->group_id_present = true; break; @@ -648,7 +701,7 @@ static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) case OPT_BLKSIZE: if (!ctx->is_bdev) - return invalfc(fc, "blksize only supported for fuseblk"); + return invalfc(fsc, "blksize only supported for fuseblk"); ctx->blksize = result.uint_32; break; @@ -659,9 +712,9 @@ static int 
fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } -static void fuse_free_fc(struct fs_context *fc) +static void fuse_free_fsc(struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; if (ctx) { kfree(ctx->subtype); @@ -762,6 +815,7 @@ void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { struct fuse_iqueue *fiq = &fc->iq; + struct fuse_sync_bucket *bucket; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); @@ -769,6 +823,11 @@ void fuse_conn_put(struct fuse_conn *fc) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + if (bucket) { + WARN_ON(atomic_read(&bucket->count) != 1); + kfree(bucket); + } fc->release(fc); } } @@ -1417,6 +1476,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_flags & SB_MANDLOCK) goto err; + rcu_assign_pointer(fc->curr_bucket, fuse_sync_bucket_alloc()); fuse_sb_defaults(sb); if (ctx->is_bdev) { @@ -1508,34 +1568,33 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; - struct file *file; int err; struct fuse_conn *fc; struct fuse_mount *fm; - err = -EINVAL; - file = fget(ctx->fd); - if (!file) - goto err; + if (!ctx->file || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; /* * Require mount to happen from the same user namespace which * opened /dev/fuse to prevent potential attacks. 
*/ - if ((file->f_op != &fuse_dev_operations) || - (file->f_cred->user_ns != sb->s_user_ns)) - goto err_fput; - ctx->fudptr = &file->private_data; + err = -EINVAL; + if ((ctx->file->f_op != &fuse_dev_operations) || + (ctx->file->f_cred->user_ns != sb->s_user_ns)) + goto err; + ctx->fudptr = &ctx->file->private_data; fc = kmalloc(sizeof(*fc), GFP_KERNEL); err = -ENOMEM; if (!fc) - goto err_fput; + goto err; fm = kzalloc(sizeof(*fm), GFP_KERNEL); if (!fm) { kfree(fc); - goto err_fput; + goto err; } fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); @@ -1546,12 +1605,8 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) err = fuse_fill_super_common(sb, ctx); if (err) goto err_put_conn; - /* - * atomic_dec_and_test() in fput() provides the necessary - * memory barrier for file->private_data to be visible on all - * CPUs after this - */ - fput(file); + /* file->private_data shall be visible on all CPUs after this */ + smp_mb(); fuse_send_init(get_fuse_mount_super(sb)); return 0; @@ -1559,30 +1614,68 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) fuse_conn_put(fc); kfree(fm); sb->s_fs_info = NULL; - err_fput: - fput(file); err: return err; } -static int fuse_get_tree(struct fs_context *fc) +/* + * This is the path where user supplied an already initialized fuse dev. In + * this case never create a new super if the old one is gone. 
+ */ +static int fuse_set_no_super(struct super_block *sb, struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + return -ENOTCONN; +} - if (!ctx->fd_present || !ctx->rootmode_present || - !ctx->user_id_present || !ctx->group_id_present) - return -EINVAL; +static int fuse_test_super(struct super_block *sb, struct fs_context *fsc) +{ -#ifdef CONFIG_BLOCK - if (ctx->is_bdev) - return get_tree_bdev(fc, fuse_fill_super); -#endif + return fsc->sget_key == get_fuse_conn_super(sb); +} + +static int fuse_get_tree(struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_dev *fud; + struct super_block *sb; + int err; - return get_tree_nodev(fc, fuse_fill_super); + if (ctx->fd_present) + ctx->file = fget(ctx->fd); + + if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { + err = get_tree_bdev(fsc, fuse_fill_super); + goto out_fput; + } + /* + * While block dev mount can be initialized with a dummy device fd + * (found by device name), normal fuse mounts can't + */ + if (!ctx->file) + return -EINVAL; + + /* + * Allow creating a fuse mount with an already initialized fuse + * connection + */ + fud = READ_ONCE(ctx->file->private_data); + if (ctx->file->f_op == &fuse_dev_operations && fud) { + fsc->sget_key = fud->fc; + sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); + err = PTR_ERR_OR_ZERO(sb); + if (!IS_ERR(sb)) + fsc->root = dget(sb->s_root); + } else { + err = get_tree_nodev(fsc, fuse_fill_super); + } +out_fput: + if (ctx->file) + fput(ctx->file); + return err; } static const struct fs_context_operations fuse_context_ops = { - .free = fuse_free_fc, + .free = fuse_free_fsc, .parse_param = fuse_parse_param, .reconfigure = fuse_reconfigure, .get_tree = fuse_get_tree, @@ -1591,7 +1684,7 @@ static const struct fs_context_operations fuse_context_ops = { /* * Set up the filesystem mount context. 
*/ -static int fuse_init_fs_context(struct fs_context *fc) +static int fuse_init_fs_context(struct fs_context *fsc) { struct fuse_fs_context *ctx; @@ -1604,14 +1697,14 @@ static int fuse_init_fs_context(struct fs_context *fc) ctx->legacy_opts_show = true; #ifdef CONFIG_BLOCK - if (fc->fs_type == &fuseblk_fs_type) { + if (fsc->fs_type == &fuseblk_fs_type) { ctx->is_bdev = true; ctx->destroy = true; } #endif - fc->fs_private = ctx; - fc->ops = &fuse_context_ops; + fsc->fs_private = ctx; + fsc->ops = &fuse_context_ops; return 0; } diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 8f52cdaa8445..0ad89c6629d7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -97,14 +97,14 @@ static const struct fs_parameter_spec virtio_fs_parameters[] = { {} }; -static int virtio_fs_parse_param(struct fs_context *fc, +static int virtio_fs_parse_param(struct fs_context *fsc, struct fs_parameter *param) { struct fs_parse_result result; - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; int opt; - opt = fs_parse(fc, virtio_fs_parameters, param, &result); + opt = fs_parse(fsc, virtio_fs_parameters, param, &result); if (opt < 0) return opt; @@ -119,9 +119,9 @@ static int virtio_fs_parse_param(struct fs_context *fc, return 0; } -static void virtio_fs_free_fc(struct fs_context *fc) +static void virtio_fs_free_fsc(struct fs_context *fsc) { - struct fuse_fs_context *ctx = fc->fs_private; + struct fuse_fs_context *ctx = fsc->fs_private; kfree(ctx); } @@ -1488,7 +1488,7 @@ out_err: } static const struct fs_context_operations virtio_fs_context_ops = { - .free = virtio_fs_free_fc, + .free = virtio_fs_free_fsc, .parse_param = virtio_fs_parse_param, .get_tree = virtio_fs_get_tree, }; diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 9165d70ead07..734d1f05d823 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -57,13 +57,16 @@ static struct posix_acl *__gfs2_get_acl(struct inode *inode, int type) return acl; } -struct posix_acl 
*gfs2_get_acl(struct inode *inode, int type) +struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; bool need_unlock = false; struct posix_acl *acl; + if (rcu) + return ERR_PTR(-ECHILD); + if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { int ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index eccc6a43326c..cd180ca7c959 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -11,7 +11,7 @@ #define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) -extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); +extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type, bool rcu); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 81d8f064126e..005e920f5d4a 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -574,10 +574,9 @@ void adjust_fs_space(struct inode *inode) { struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - struct buffer_head *m_bh, *l_bh; + struct buffer_head *m_bh; u64 fs_total, new_free; if (gfs2_trans_begin(sdp, 2 * RES_STATFS, 0) != 0) @@ -600,11 +599,7 @@ void adjust_fs_space(struct inode *inode) (unsigned long long)new_free); gfs2_statfs_change(sdp, new_free, new_free, 0); - if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) - goto out2; - update_statfs(sdp, m_bh, l_bh); - brelse(l_bh); -out2: + update_statfs(sdp, m_bh); brelse(m_bh); out: sdp->sd_rindex_uptodate = 0; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ed8b67b21718..5414c2c33580 100644 --- 
a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1002,7 +1002,7 @@ static void gfs2_write_unlock(struct inode *inode) } static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, - unsigned len, struct iomap *iomap) + unsigned len) { unsigned int blockmask = i_blocksize(inode) - 1; struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -1013,8 +1013,7 @@ static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, } static void gfs2_iomap_page_done(struct inode *inode, loff_t pos, - unsigned copied, struct page *page, - struct iomap *iomap) + unsigned copied, struct page *page) { struct gfs2_trans *tr = current->journal_info; struct gfs2_inode *ip = GFS2_I(inode); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 1f3902ecdded..e0eaa9cf9fb6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1494,12 +1494,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh) list_del_init(&gh->gh_list); clear_bit(HIF_HOLDER, &gh->gh_iflags); - if (find_first_holder(gl) == NULL) { - if (list_empty(&gl->gl_holders) && - !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags)) - fast_path = 1; - } + if (list_empty(&gl->gl_holders) && + !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && + !test_bit(GLF_DEMOTE, &gl->gl_flags)) + fast_path = 1; + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) gfs2_glock_add_to_lru(gl); @@ -2077,8 +2076,6 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'H'; if (test_bit(HIF_WAIT, &iflags)) *p++ = 'W'; - if (test_bit(HIF_FIRST, &iflags)) - *p++ = 'F'; *p = 0; return buf; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 54d3fbeb3002..79c621c7863d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -33,16 +33,18 @@ extern struct workqueue_struct *gfs2_control_wq; static void gfs2_ail_error(struct gfs2_glock *gl, const struct buffer_head *bh) { - fs_err(gl->gl_name.ln_sbd, + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + + fs_err(sdp, "AIL buffer %p: blocknr %llu state 0x%08lx 
mapping %p page " "state 0x%lx\n", bh, (unsigned long long)bh->b_blocknr, bh->b_state, bh->b_page->mapping, bh->b_page->flags); - fs_err(gl->gl_name.ln_sbd, "AIL glock %u:%llu mapping %p\n", + fs_err(sdp, "AIL glock %u:%llu mapping %p\n", gl->gl_name.ln_type, gl->gl_name.ln_number, gfs2_glock2aspace(gl)); - gfs2_lm(gl->gl_name.ln_sbd, "AIL error\n"); - gfs2_withdraw(gl->gl_name.ln_sbd); + gfs2_lm(sdp, "AIL error\n"); + gfs2_withdraw_delayed(sdp); } /** @@ -610,16 +612,13 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl) j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); - if (error) - gfs2_consist(sdp); - if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) - gfs2_consist(sdp); - - /* Initialize some head of the log stuff */ - if (!gfs2_withdrawn(sdp)) { - sdp->sd_log_sequence = head.lh_sequence + 1; - gfs2_log_pointers_init(sdp, head.lh_blkno); - } + if (gfs2_assert_withdraw_delayed(sdp, !error)) + return error; + if (gfs2_assert_withdraw_delayed(sdp, head.lh_flags & + GFS2_LOG_HEAD_UNMOUNT)) + return -EIO; + sdp->sd_log_sequence = head.lh_sequence + 1; + gfs2_log_pointers_init(sdp, head.lh_blkno); } return 0; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index e6f820f146cb..0fe49770166e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -253,7 +253,6 @@ struct gfs2_lkstats { enum { /* States */ HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ - HIF_FIRST = 7, HIF_WAIT = 10, }; @@ -768,6 +767,7 @@ struct gfs2_sbd { struct gfs2_glock *sd_jinode_gl; struct gfs2_holder sd_sc_gh; + struct buffer_head *sd_sc_bh; struct gfs2_holder sd_qc_gh; struct completion sd_journal_ready; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6e15434b23ac..3130f85d2b3f 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1985,8 +1985,8 @@ static int gfs2_setattr(struct user_namespace *mnt_userns, if (error) goto out; - error = -EPERM; - if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + error = may_setattr(&init_user_ns, 
inode, attr->ia_valid); + if (error) goto error; error = setattr_prepare(&init_user_ns, dentry, attr); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index dac040162ecc..50578f881e6d 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -299,6 +299,11 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); + /* don't want to call dlm if we've unmounted the lock protocol */ + if (test_bit(DFL_UNMOUNT, &ls->ls_recover_flags)) { + gfs2_glock_free(gl); + return; + } /* don't want to skip dlm_unlock writing the lvb when lock has one */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 42c15cfc0821..f0ee3ff6f9a8 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -594,7 +594,7 @@ void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, { unsigned int blks = tr->tr_reserved; unsigned int revokes = tr->tr_revokes; - unsigned int revoke_blks = 0; + unsigned int revoke_blks; *extra_revokes = 0; if (revokes) { diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 8ee05d25dfa6..ca0bb3a73912 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -761,6 +761,32 @@ static void buf_lo_before_scan(struct gfs2_jdesc *jd, jd->jd_replayed_blocks = 0; } +#define obsolete_rgrp_replay \ +"Replaying 0x%llx from jid=%d/0x%llx but we already have a bh!\n" +#define obsolete_rgrp_replay2 \ +"busy:%d, pinned:%d rg_gen:0x%llx, j_gen:0x%llx\n" + +static void obsolete_rgrp(struct gfs2_jdesc *jd, struct buffer_head *bh_log, + u64 blkno) +{ + struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_rgrp *jrgd = (struct gfs2_rgrp *)bh_log->b_data; + + rgd = gfs2_blk2rgrpd(sdp, blkno, false); + if (rgd && rgd->rd_addr == blkno && + rgd->rd_bits && rgd->rd_bits->bi_bh) { + fs_info(sdp, obsolete_rgrp_replay, (unsigned long long)blkno, + jd->jd_jid, bh_log->b_blocknr); + fs_info(sdp, obsolete_rgrp_replay2, + buffer_busy(rgd->rd_bits->bi_bh) ? 
1 : 0, + buffer_pinned(rgd->rd_bits->bi_bh), + rgd->rd_igeneration, + be64_to_cpu(jrgd->rg_igeneration)); + gfs2_dump_glock(NULL, rgd->rd_gl, true); + } +} + static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, struct gfs2_log_descriptor *ld, __be64 *ptr, int pass) @@ -799,21 +825,9 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh_ip->b_data; - if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG)) { - struct gfs2_rgrpd *rgd; - - rgd = gfs2_blk2rgrpd(sdp, blkno, false); - if (rgd && rgd->rd_addr == blkno && - rgd->rd_bits && rgd->rd_bits->bi_bh) { - fs_info(sdp, "Replaying 0x%llx but we " - "already have a bh!\n", - (unsigned long long)blkno); - fs_info(sdp, "busy:%d, pinned:%d\n", - buffer_busy(rgd->rd_bits->bi_bh) ? 1 : 0, - buffer_pinned(rgd->rd_bits->bi_bh)); - gfs2_dump_glock(NULL, rgd->rd_gl, true); - } - } + if (mh->mh_type == cpu_to_be32(GFS2_METATYPE_RG)) + obsolete_rgrp(jd, bh_log, blkno); + mark_buffer_dirty(bh_ip); } brelse(bh_log); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 7c9619997355..72d30a682ece 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -258,8 +258,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head *bh, *bhs[2]; int num = 0; - if (unlikely(gfs2_withdrawn(sdp)) && - (!sdp->sd_jdesc || gl != sdp->sd_jinode_gl)) { + if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) { *bhp = NULL; return -EIO; } @@ -317,7 +316,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) { - if (unlikely(gfs2_withdrawn(sdp))) + if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) return -EIO; wait_on_buffer(bh); @@ -328,7 +327,7 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) gfs2_io_error_bh_wd(sdp, bh); return -EIO; } - if (unlikely(gfs2_withdrawn(sdp))) + if (unlikely(gfs2_withdrawn(sdp)) && 
!gfs2_withdraw_in_prog(sdp)) return -EIO; return 0; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 5f4504dd0875..7f8410d8fdc1 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -614,6 +614,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; } + d_mark_dontcache(jd->jd_inode); spin_lock(&sdp->sd_jindex_spin); jd->jd_jid = sdp->sd_journals++; jip = GFS2_I(jd->jd_inode); @@ -677,6 +678,7 @@ static int init_statfs(struct gfs2_sbd *sdp) error = PTR_ERR(lsi->si_sc_inode); fs_err(sdp, "can't find local \"sc\" file#%u: %d\n", jd->jd_jid, error); + kfree(lsi); goto free_local; } lsi->si_jid = jd->jd_jid; @@ -695,8 +697,16 @@ static int init_statfs(struct gfs2_sbd *sdp) fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); goto free_local; } + /* read in the local statfs buffer - other nodes don't change it. */ + error = gfs2_meta_inode_buffer(ip, &sdp->sd_sc_bh); + if (error) { + fs_err(sdp, "Cannot read in local statfs: %d\n", error); + goto unlock_sd_gh; + } return 0; +unlock_sd_gh: + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); free_local: free_local_statfs_inodes(sdp); iput(pn); @@ -710,6 +720,7 @@ out: static void uninit_statfs(struct gfs2_sbd *sdp) { if (!sdp->sd_args.ar_spectator) { + brelse(sdp->sd_sc_bh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); free_local_statfs_inodes(sdp); } @@ -1088,6 +1099,34 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp) kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); } +static int init_threads(struct gfs2_sbd *sdp) +{ + struct task_struct *p; + int error = 0; + + p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start logd thread: %d\n", error); + return error; + } + sdp->sd_logd_process = p; + + p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); + if (IS_ERR(p)) { + error = PTR_ERR(p); + fs_err(sdp, "can't start quotad thread: %d\n", error); + goto fail; + } + sdp->sd_quotad_process = p; + return 0; + +fail: + 
kthread_stop(sdp->sd_logd_process); + sdp->sd_logd_process = NULL; + return error; +} + /** * gfs2_fill_super - Read in superblock * @sb: The VFS superblock @@ -1216,6 +1255,14 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) goto fail_per_node; } + if (!sb_rdonly(sb)) { + error = init_threads(sdp); + if (error) { + gfs2_withdraw_delayed(sdp); + goto fail_per_node; + } + } + error = gfs2_freeze_lock(sdp, &freeze_gh, 0); if (error) goto fail_per_node; @@ -1225,6 +1272,12 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) gfs2_freeze_unlock(&freeze_gh); if (error) { + if (sdp->sd_quotad_process) + kthread_stop(sdp->sd_quotad_process); + sdp->sd_quotad_process = NULL; + if (sdp->sd_logd_process) + kthread_stop(sdp->sd_logd_process); + sdp->sd_logd_process = NULL; fs_err(sdp, "can't make FS RW: %d\n", error); goto fail_per_node; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 4d4ceb0b6903..6e00d15ef0a8 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -119,34 +119,6 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) return 0; } -static int init_threads(struct gfs2_sbd *sdp) -{ - struct task_struct *p; - int error = 0; - - p = kthread_run(gfs2_logd, sdp, "gfs2_logd"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start logd thread: %d\n", error); - return error; - } - sdp->sd_logd_process = p; - - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); - if (IS_ERR(p)) { - error = PTR_ERR(p); - fs_err(sdp, "can't start quotad thread: %d\n", error); - goto fail; - } - sdp->sd_quotad_process = p; - return 0; - -fail: - kthread_stop(sdp->sd_logd_process); - sdp->sd_logd_process = NULL; - return error; -} - /** * gfs2_make_fs_rw - Turn a Read-Only FS into a Read-Write one * @sdp: the filesystem @@ -161,26 +133,17 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) struct gfs2_log_header_host head; int error; - error = init_threads(sdp); - if (error) { - gfs2_withdraw_delayed(sdp); - return error; - } - 
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); - if (gfs2_withdrawn(sdp)) { - error = -EIO; - goto fail; - } + if (gfs2_withdrawn(sdp)) + return -EIO; error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); if (error || gfs2_withdrawn(sdp)) - goto fail; + return error; if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { gfs2_consist(sdp); - error = -EIO; - goto fail; + return -EIO; } /* Initialize some head of the log stuff */ @@ -188,20 +151,8 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) gfs2_log_pointers_init(sdp, head.lh_blkno); error = gfs2_quota_init(sdp); - if (error || gfs2_withdrawn(sdp)) - goto fail; - - set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - - return 0; - -fail: - if (sdp->sd_quotad_process) - kthread_stop(sdp->sd_quotad_process); - sdp->sd_quotad_process = NULL; - if (sdp->sd_logd_process) - kthread_stop(sdp->sd_logd_process); - sdp->sd_logd_process = NULL; + if (!error && !gfs2_withdrawn(sdp)) + set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); return error; } @@ -227,9 +178,8 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - struct buffer_head *m_bh, *l_bh; + struct buffer_head *m_bh; struct gfs2_holder gh; int error; @@ -248,21 +198,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp) sizeof(struct gfs2_dinode)); spin_unlock(&sdp->sd_statfs_spin); } else { - error = gfs2_meta_inode_buffer(l_ip, &l_bh); - if (error) - goto out_m_bh; - spin_lock(&sdp->sd_statfs_spin); gfs2_statfs_change_in(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); - gfs2_statfs_change_in(l_sc, l_bh->b_data + + gfs2_statfs_change_in(l_sc, sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode)); spin_unlock(&sdp->sd_statfs_spin); - brelse(l_bh); } -out_m_bh: brelse(m_bh); out: gfs2_glock_dq_uninit(&gh); @@ -275,22 +219,17 @@ void gfs2_statfs_change(struct gfs2_sbd 
*sdp, s64 total, s64 free, struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct buffer_head *l_bh; s64 x, y; int need_sync = 0; - int error; - - error = gfs2_meta_inode_buffer(l_ip, &l_bh); - if (error) - return; - gfs2_trans_add_meta(l_ip->i_gl, l_bh); + gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh); spin_lock(&sdp->sd_statfs_spin); l_sc->sc_total += total; l_sc->sc_free += free; l_sc->sc_dinodes += dinodes; - gfs2_statfs_change_out(l_sc, l_bh->b_data + sizeof(struct gfs2_dinode)); + gfs2_statfs_change_out(l_sc, sdp->sd_sc_bh->b_data + + sizeof(struct gfs2_dinode)); if (sdp->sd_args.ar_statfs_percent) { x = 100 * l_sc->sc_free; y = m_sc->sc_free * sdp->sd_args.ar_statfs_percent; @@ -299,20 +238,18 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, } spin_unlock(&sdp->sd_statfs_spin); - brelse(l_bh); if (need_sync) gfs2_wake_up_statfs(sdp); } -void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, - struct buffer_head *l_bh) +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - gfs2_trans_add_meta(l_ip->i_gl, l_bh); + gfs2_trans_add_meta(l_ip->i_gl, sdp->sd_sc_bh); gfs2_trans_add_meta(m_ip->i_gl, m_bh); spin_lock(&sdp->sd_statfs_spin); @@ -320,7 +257,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, m_sc->sc_free += l_sc->sc_free; m_sc->sc_dinodes += l_sc->sc_dinodes; memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); - memset(l_bh->b_data + sizeof(struct gfs2_dinode), + memset(sdp->sd_sc_bh->b_data + sizeof(struct gfs2_dinode), 0, sizeof(struct gfs2_statfs_change)); gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct 
gfs2_dinode)); spin_unlock(&sdp->sd_statfs_spin); @@ -330,11 +267,10 @@ int gfs2_statfs_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; struct gfs2_holder gh; - struct buffer_head *m_bh, *l_bh; + struct buffer_head *m_bh; int error; error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, @@ -355,21 +291,15 @@ int gfs2_statfs_sync(struct super_block *sb, int type) } spin_unlock(&sdp->sd_statfs_spin); - error = gfs2_meta_inode_buffer(l_ip, &l_bh); - if (error) - goto out_bh; - error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); if (error) - goto out_bh2; + goto out_bh; - update_statfs(sdp, m_bh, l_bh); + update_statfs(sdp, m_bh); sdp->sd_statfs_force_sync = 0; gfs2_trans_end(sdp); -out_bh2: - brelse(l_bh); out_bh: brelse(m_bh); out_unlock: @@ -675,6 +605,7 @@ restart: gfs2_glock_dq_uninit(&sdp->sd_journal_gh); if (gfs2_holder_initialized(&sdp->sd_jinode_gh)) gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + brelse(sdp->sd_sc_bh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); free_local_statfs_inodes(sdp); @@ -1016,7 +947,7 @@ static int gfs2_drop_inode(struct inode *inode) gfs2_glock_hold(gl); if (!gfs2_queue_delete_work(gl, 0)) gfs2_glock_queue_put(gl); - return false; + return 0; } return generic_drop_inode(inode); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index ec4affb33ed5..58d13fd77aed 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -43,8 +43,7 @@ extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf); extern void gfs2_statfs_change_out(const struct gfs2_statfs_change_host *sc, void *buf); -extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, - struct buffer_head *l_bh); +extern void update_statfs(struct 
gfs2_sbd *sdp, struct buffer_head *m_bh); extern int gfs2_statfs_sync(struct super_block *sb, int type); extern void gfs2_freeze_func(struct work_struct *work); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index f4325b44956d..cf345a86ef67 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -278,6 +278,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) goto skip_recovery; } sdp->sd_jdesc->jd_inode = inode; + d_mark_dontcache(inode); /* * Now wait until recovery is complete. @@ -295,7 +296,7 @@ skip_recovery: fs_warn(sdp, "Journal recovery complete for jid %d.\n", sdp->sd_lockstruct.ls_jid); else - fs_warn(sdp, "Journal recovery skipped for %d until next " + fs_warn(sdp, "Journal recovery skipped for jid %d until next " "mount.\n", sdp->sd_lockstruct.ls_jid); fs_warn(sdp, "Glock dequeues delayed: %lu\n", sdp->sd_glock_dqs_held); sdp->sd_glock_dqs_held = 0; diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 69e1a0ae5a4d..78ec190f4155 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -218,6 +218,11 @@ static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp) !test_bit(SDF_WITHDRAWN, &sdp->sd_flags); } +static inline bool gfs2_withdraw_in_prog(struct gfs2_sbd *sdp) +{ + return test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags); +} + #define gfs2_tune_get(sdp, field) \ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 7d0c3dbb2898..d5c9d886cd9f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -381,6 +381,7 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, static const struct file_operations hostfs_file_fops = { .llseek = generic_file_llseek, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, diff --git a/fs/inode.c b/fs/inode.c index 84c528cd1955..37710ca863b5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -770,7 +770,7 
@@ static enum lru_status inode_lru_isolate(struct list_head *item, return LRU_ROTATE; } - if (inode_has_buffers(inode) || inode->i_data.nrpages) { + if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(lru_lock); diff --git a/fs/internal.h b/fs/internal.h index 2bb444600852..3cd065c8a66b 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -18,7 +18,7 @@ struct user_namespace; struct pipe_inode_info; /* - * block_dev.c + * block/bdev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); @@ -48,8 +48,8 @@ static inline int emergency_thaw_bdev(struct super_block *sb) /* * buffer.c */ -extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, - get_block_t *get_block, struct iomap *iomap); +int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block, const struct iomap *iomap); /* * char_dev.c diff --git a/fs/io-wq.c b/fs/io-wq.c index cd9bd095fb1b..6c55362c1f99 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -23,8 +23,7 @@ enum { IO_WORKER_F_UP = 1, /* up and active */ IO_WORKER_F_RUNNING = 2, /* account as running */ IO_WORKER_F_FREE = 4, /* worker on free list */ - IO_WORKER_F_FIXED = 8, /* static idle worker */ - IO_WORKER_F_BOUND = 16, /* is doing bounded work */ + IO_WORKER_F_BOUND = 8, /* is doing bounded work */ }; enum { @@ -32,7 +31,7 @@ enum { }; enum { - IO_WQE_FLAG_STALLED = 1, /* stalled on hash */ + IO_ACCT_STALLED_BIT = 0, /* stalled on hash */ }; /* @@ -55,7 +54,10 @@ struct io_worker { struct callback_head create_work; int create_index; - struct rcu_head rcu; + union { + struct rcu_head rcu; + struct work_struct work; + }; }; #if BITS_PER_LONG == 64 @@ -71,25 +73,24 @@ struct io_wqe_acct { unsigned max_workers; int index; atomic_t nr_running; + struct io_wq_work_list work_list; + unsigned long flags; }; enum { IO_WQ_ACCT_BOUND, IO_WQ_ACCT_UNBOUND, + IO_WQ_ACCT_NR, }; /* * Per-node worker thread pool */ struct io_wqe 
{ - struct { - raw_spinlock_t lock; - struct io_wq_work_list work_list; - unsigned flags; - } ____cacheline_aligned_in_smp; + raw_spinlock_t lock; + struct io_wqe_acct acct[2]; int node; - struct io_wqe_acct acct[2]; struct hlist_nulls_head free_list; struct list_head all_list; @@ -133,8 +134,11 @@ struct io_cb_cancel_data { bool cancel_all; }; -static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first); +static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index); static void io_wqe_dec_running(struct io_worker *worker); +static bool io_acct_cancel_pending_work(struct io_wqe *wqe, + struct io_wqe_acct *acct, + struct io_cb_cancel_data *match); static bool io_worker_get(struct io_worker *worker) { @@ -195,11 +199,10 @@ static void io_worker_exit(struct io_worker *worker) do_exit(0); } -static inline bool io_wqe_run_queue(struct io_wqe *wqe) - __must_hold(wqe->lock) +static inline bool io_acct_run_queue(struct io_wqe_acct *acct) { - if (!wq_list_empty(&wqe->work_list) && - !(wqe->flags & IO_WQE_FLAG_STALLED)) + if (!wq_list_empty(&acct->work_list) && + !test_bit(IO_ACCT_STALLED_BIT, &acct->flags)) return true; return false; } @@ -208,7 +211,8 @@ static inline bool io_wqe_run_queue(struct io_wqe *wqe) * Check head of free list for an available worker. If one isn't available, * caller must create one. 
*/ -static bool io_wqe_activate_free_worker(struct io_wqe *wqe) +static bool io_wqe_activate_free_worker(struct io_wqe *wqe, + struct io_wqe_acct *acct) __must_hold(RCU) { struct hlist_nulls_node *n; @@ -222,6 +226,10 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe) hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) { if (!io_worker_get(worker)) continue; + if (io_wqe_get_acct(worker) != acct) { + io_worker_release(worker); + continue; + } if (wake_up_process(worker->task)) { io_worker_release(worker); return true; @@ -236,9 +244,9 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe) * We need a worker. If we find a free one, we're good. If not, and we're * below the max number of workers, create one. */ -static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) +static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) { - bool ret; + bool do_create = false; /* * Most likely an attempt to queue unbounded work on an io_wq that @@ -247,27 +255,19 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) if (unlikely(!acct->max_workers)) pr_warn_once("io-wq is not configured for unbound workers"); - rcu_read_lock(); - ret = io_wqe_activate_free_worker(wqe); - rcu_read_unlock(); - - if (!ret) { - bool do_create = false, first = false; - - raw_spin_lock(&wqe->lock); - if (acct->nr_workers < acct->max_workers) { - if (!acct->nr_workers) - first = true; - acct->nr_workers++; - do_create = true; - } - raw_spin_unlock(&wqe->lock); - if (do_create) { - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - create_io_worker(wqe->wq, wqe, acct->index, first); - } + raw_spin_lock(&wqe->lock); + if (acct->nr_workers < acct->max_workers) { + acct->nr_workers++; + do_create = true; } + raw_spin_unlock(&wqe->lock); + if (do_create) { + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); + return create_io_worker(wqe->wq, wqe, acct->index); + } + 
+ return true; } static void io_wqe_inc_running(struct io_worker *worker) @@ -283,7 +283,7 @@ static void create_worker_cb(struct callback_head *cb) struct io_wq *wq; struct io_wqe *wqe; struct io_wqe_acct *acct; - bool do_create = false, first = false; + bool do_create = false; worker = container_of(cb, struct io_worker, create_work); wqe = worker->wqe; @@ -291,14 +291,12 @@ static void create_worker_cb(struct callback_head *cb) acct = &wqe->acct[worker->create_index]; raw_spin_lock(&wqe->lock); if (acct->nr_workers < acct->max_workers) { - if (!acct->nr_workers) - first = true; acct->nr_workers++; do_create = true; } raw_spin_unlock(&wqe->lock); if (do_create) { - create_io_worker(wq, wqe, worker->create_index, first); + create_io_worker(wq, wqe, worker->create_index); } else { atomic_dec(&acct->nr_running); io_worker_ref_put(wq); @@ -307,9 +305,11 @@ static void create_worker_cb(struct callback_head *cb) io_worker_release(worker); } -static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker, - struct io_wqe_acct *acct) +static bool io_queue_worker_create(struct io_worker *worker, + struct io_wqe_acct *acct, + task_work_func_t func) { + struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; /* raced with exit, just ignore create call */ @@ -327,16 +327,17 @@ static void io_queue_worker_create(struct io_wqe *wqe, struct io_worker *worker, test_and_set_bit_lock(0, &worker->create_state)) goto fail_release; - init_task_work(&worker->create_work, create_worker_cb); + init_task_work(&worker->create_work, func); worker->create_index = acct->index; if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) - return; + return true; clear_bit_unlock(0, &worker->create_state); fail_release: io_worker_release(worker); fail: atomic_dec(&acct->nr_running); io_worker_ref_put(wq); + return false; } static void io_wqe_dec_running(struct io_worker *worker) @@ -348,10 +349,10 @@ static void io_wqe_dec_running(struct io_worker *worker) if 
(!(worker->flags & IO_WORKER_F_UP)) return; - if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) { + if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) { atomic_inc(&acct->nr_running); atomic_inc(&wqe->wq->worker_refs); - io_queue_worker_create(wqe, worker, acct); + io_queue_worker_create(worker, acct, create_worker_cb); } } @@ -363,29 +364,10 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, struct io_wq_work *work) __must_hold(wqe->lock) { - bool worker_bound, work_bound; - - BUILD_BUG_ON((IO_WQ_ACCT_UNBOUND ^ IO_WQ_ACCT_BOUND) != 1); - if (worker->flags & IO_WORKER_F_FREE) { worker->flags &= ~IO_WORKER_F_FREE; hlist_nulls_del_init_rcu(&worker->nulls_node); } - - /* - * If worker is moving from bound to unbound (or vice versa), then - * ensure we update the running accounting. - */ - worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; - work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; - if (worker_bound != work_bound) { - int index = work_bound ? IO_WQ_ACCT_UNBOUND : IO_WQ_ACCT_BOUND; - io_wqe_dec_running(worker); - worker->flags ^= IO_WORKER_F_BOUND; - wqe->acct[index].nr_workers--; - wqe->acct[index ^ 1].nr_workers++; - io_wqe_inc_running(worker); - } } /* @@ -413,7 +395,7 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) { struct io_wq *wq = wqe->wq; - spin_lock(&wq->hash->wait.lock); + spin_lock_irq(&wq->hash->wait.lock); if (list_empty(&wqe->wait.entry)) { __add_wait_queue(&wq->hash->wait, &wqe->wait); if (!test_bit(hash, &wq->hash->map)) { @@ -421,48 +403,26 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) list_del_init(&wqe->wait.entry); } } - spin_unlock(&wq->hash->wait.lock); -} - -/* - * We can always run the work if the worker is currently the same type as - * the work (eg both are bound, or both are unbound). If they are not the - * same, only allow it if incrementing the worker count would be allowed. 
- */ -static bool io_worker_can_run_work(struct io_worker *worker, - struct io_wq_work *work) -{ - struct io_wqe_acct *acct; - - if (!(worker->flags & IO_WORKER_F_BOUND) != - !(work->flags & IO_WQ_WORK_UNBOUND)) - return true; - - /* not the same type, check if we'd go over the limit */ - acct = io_work_get_acct(worker->wqe, work); - return acct->nr_workers < acct->max_workers; + spin_unlock_irq(&wq->hash->wait.lock); } -static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, - struct io_worker *worker, - bool *stalled) +static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, + struct io_worker *worker) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; unsigned int stall_hash = -1U; + struct io_wqe *wqe = worker->wqe; - wq_list_for_each(node, prev, &wqe->work_list) { + wq_list_for_each(node, prev, &acct->work_list) { unsigned int hash; work = container_of(node, struct io_wq_work, list); - if (!io_worker_can_run_work(worker, work)) - break; - /* not hashed, can run anytime */ if (!io_wq_is_hashed(work)) { - wq_list_del(&wqe->work_list, node, prev); + wq_list_del(&acct->work_list, node, prev); return work; } @@ -473,7 +433,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, /* hashed, can run if not already running */ if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { wqe->hash_tail[hash] = NULL; - wq_list_cut(&wqe->work_list, &tail->list, prev); + wq_list_cut(&acct->work_list, &tail->list, prev); return work; } if (stall_hash == -1U) @@ -483,10 +443,14 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe, } if (stall_hash != -1U) { + /* + * Set this before dropping the lock to avoid racing with new + * work being added and clearing the stalled bit. 
+ */ + set_bit(IO_ACCT_STALLED_BIT, &acct->flags); raw_spin_unlock(&wqe->lock); io_wait_on_hash(wqe, stall_hash); raw_spin_lock(&wqe->lock); - *stalled = true; } return NULL; @@ -520,13 +484,13 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work); static void io_worker_handle_work(struct io_worker *worker) __releases(wqe->lock) { + struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state); do { struct io_wq_work *work; - bool stalled; get_next: /* * If we got some work, mark us as busy. If we didn't, but @@ -535,12 +499,9 @@ get_next: * can't make progress, any work completion or insertion will * clear the stalled flag. */ - stalled = false; - work = io_get_next_work(wqe, worker, &stalled); + work = io_get_next_work(acct, worker); if (work) __io_worker_busy(wqe, worker, work); - else if (stalled) - wqe->flags |= IO_WQE_FLAG_STALLED; raw_spin_unlock(&wqe->lock); if (!work) @@ -572,10 +533,10 @@ get_next: if (hash != -1U && !next_hashed) { clear_bit(hash, &wq->hash->map); + clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); if (wq_has_sleeper(&wq->hash->wait)) wake_up(&wq->hash->wait); raw_spin_lock(&wqe->lock); - wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) goto get_next; @@ -590,8 +551,10 @@ get_next: static int io_wqe_worker(void *data) { struct io_worker *worker = data; + struct io_wqe_acct *acct = io_wqe_get_acct(worker); struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; + bool last_timeout = false; char buf[TASK_COMM_LEN]; worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); @@ -605,10 +568,17 @@ static int io_wqe_worker(void *data) set_current_state(TASK_INTERRUPTIBLE); loop: raw_spin_lock(&wqe->lock); - if (io_wqe_run_queue(wqe)) { + if (io_acct_run_queue(acct)) { io_worker_handle_work(worker); goto loop; } + /* timed out, exit unless we're the last worker */ + if 
(last_timeout && acct->nr_workers > 1) { + raw_spin_unlock(&wqe->lock); + __set_current_state(TASK_RUNNING); + break; + } + last_timeout = false; __io_worker_idle(wqe, worker); raw_spin_unlock(&wqe->lock); if (io_flush_signals()) @@ -619,13 +589,11 @@ loop: if (!get_signal(&ksig)) continue; - break; - } - if (ret) + if (fatal_signal_pending(current)) + break; continue; - /* timed out, exit unless we're the fixed worker */ - if (!(worker->flags & IO_WORKER_F_FIXED)) - break; + } + last_timeout = !ret; } if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) { @@ -676,51 +644,131 @@ void io_wq_worker_sleeping(struct task_struct *tsk) raw_spin_unlock(&worker->wqe->lock); } -static void create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index, bool first) +static void io_init_new_worker(struct io_wqe *wqe, struct io_worker *worker, + struct task_struct *tsk) +{ + tsk->pf_io_worker = worker; + worker->task = tsk; + set_cpus_allowed_ptr(tsk, wqe->cpu_mask); + tsk->flags |= PF_NO_SETAFFINITY; + + raw_spin_lock(&wqe->lock); + hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); + list_add_tail_rcu(&worker->all_list, &wqe->all_list); + worker->flags |= IO_WORKER_F_FREE; + raw_spin_unlock(&wqe->lock); + wake_up_new_task(tsk); +} + +static bool io_wq_work_match_all(struct io_wq_work *work, void *data) +{ + return true; +} + +static inline bool io_should_retry_thread(long err) +{ + switch (err) { + case -EAGAIN: + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + return true; + default: + return false; + } +} + +static void create_worker_cont(struct callback_head *cb) { - struct io_wqe_acct *acct = &wqe->acct[index]; struct io_worker *worker; struct task_struct *tsk; + struct io_wqe *wqe; - __set_current_state(TASK_RUNNING); + worker = container_of(cb, struct io_worker, create_work); + clear_bit_unlock(0, &worker->create_state); + wqe = worker->wqe; + tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + if (!IS_ERR(tsk)) { + 
io_init_new_worker(wqe, worker, tsk); + io_worker_release(worker); + return; + } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + struct io_wqe_acct *acct = io_wqe_get_acct(worker); - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); - if (!worker) - goto fail; + atomic_dec(&acct->nr_running); + raw_spin_lock(&wqe->lock); + acct->nr_workers--; + if (!acct->nr_workers) { + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_all, + .cancel_all = true, + }; - refcount_set(&worker->ref, 1); - worker->nulls_node.pprev = NULL; - worker->wqe = wqe; - spin_lock_init(&worker->lock); - init_completion(&worker->ref_done); + while (io_acct_cancel_pending_work(wqe, acct, &match)) + raw_spin_lock(&wqe->lock); + } + raw_spin_unlock(&wqe->lock); + io_worker_ref_put(wqe->wq); + kfree(worker); + return; + } - tsk = create_io_thread(io_wqe_worker, worker, wqe->node); - if (IS_ERR(tsk)) { + /* re-create attempts grab a new worker ref, drop the existing one */ + io_worker_release(worker); + schedule_work(&worker->work); +} + +static void io_workqueue_create(struct work_struct *work) +{ + struct io_worker *worker = container_of(work, struct io_worker, work); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + + if (!io_queue_worker_create(worker, acct, create_worker_cont)) { + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); kfree(worker); + } +} + +static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +{ + struct io_wqe_acct *acct = &wqe->acct[index]; + struct io_worker *worker; + struct task_struct *tsk; + + __set_current_state(TASK_RUNNING); + + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); + if (!worker) { fail: atomic_dec(&acct->nr_running); raw_spin_lock(&wqe->lock); acct->nr_workers--; raw_spin_unlock(&wqe->lock); io_worker_ref_put(wq); - return; + return false; } - tsk->pf_io_worker = worker; - worker->task = tsk; - set_cpus_allowed_ptr(tsk, wqe->cpu_mask); - tsk->flags |= 
PF_NO_SETAFFINITY; + refcount_set(&worker->ref, 1); + worker->wqe = wqe; + spin_lock_init(&worker->lock); + init_completion(&worker->ref_done); - raw_spin_lock(&wqe->lock); - hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); - list_add_tail_rcu(&worker->all_list, &wqe->all_list); - worker->flags |= IO_WORKER_F_FREE; if (index == IO_WQ_ACCT_BOUND) worker->flags |= IO_WORKER_F_BOUND; - if (first && (worker->flags & IO_WORKER_F_BOUND)) - worker->flags |= IO_WORKER_F_FIXED; - raw_spin_unlock(&wqe->lock); - wake_up_new_task(tsk); + + tsk = create_io_thread(io_wqe_worker, worker, wqe->node); + if (!IS_ERR(tsk)) { + io_init_new_worker(wqe, worker, tsk); + } else if (!io_should_retry_thread(PTR_ERR(tsk))) { + kfree(worker); + goto fail; + } else { + INIT_WORK(&worker->work, io_workqueue_create); + schedule_work(&worker->work); + } + + return true; } /* @@ -755,11 +803,6 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } -static bool io_wq_work_match_all(struct io_wq_work *work, void *data) -{ - return true; -} - static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) { struct io_wq *wq = wqe->wq; @@ -773,12 +816,13 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work) { + struct io_wqe_acct *acct = io_work_get_acct(wqe, work); unsigned int hash; struct io_wq_work *tail; if (!io_wq_is_hashed(work)) { append: - wq_list_add_tail(&work->list, &wqe->work_list); + wq_list_add_tail(&work->list, &acct->work_list); return; } @@ -788,13 +832,19 @@ append: if (!tail) goto append; - wq_list_add_after(&work->list, &tail->list, &wqe->work_list); + wq_list_add_after(&work->list, &tail->list, &acct->work_list); +} + +static bool io_wq_work_match_item(struct io_wq_work *work, void *data) +{ + return work == data; } static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = 
io_work_get_acct(wqe, work); - bool do_wake; + unsigned work_flags = work->flags; + bool do_create; /* * If io-wq is exiting for this task, or if the request has explicitly @@ -808,13 +858,36 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) raw_spin_lock(&wqe->lock); io_wqe_insert_work(wqe, work); - wqe->flags &= ~IO_WQE_FLAG_STALLED; - do_wake = (work->flags & IO_WQ_WORK_CONCURRENT) || - !atomic_read(&acct->nr_running); + clear_bit(IO_ACCT_STALLED_BIT, &acct->flags); + + rcu_read_lock(); + do_create = !io_wqe_activate_free_worker(wqe, acct); + rcu_read_unlock(); + raw_spin_unlock(&wqe->lock); - if (do_wake) - io_wqe_wake_worker(wqe, acct); + if (do_create && ((work_flags & IO_WQ_WORK_CONCURRENT) || + !atomic_read(&acct->nr_running))) { + bool did_create; + + did_create = io_wqe_create_worker(wqe, acct); + if (likely(did_create)) + return; + + raw_spin_lock(&wqe->lock); + /* fatal condition, failed to create the first worker */ + if (!acct->nr_workers) { + struct io_cb_cancel_data match = { + .fn = io_wq_work_match_item, + .data = work, + .cancel_all = false, + }; + + if (io_acct_cancel_pending_work(wqe, acct, &match)) + raw_spin_lock(&wqe->lock); + } + raw_spin_unlock(&wqe->lock); + } } void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work) @@ -859,6 +932,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe, struct io_wq_work *work, struct io_wq_work_node *prev) { + struct io_wqe_acct *acct = io_work_get_acct(wqe, work); unsigned int hash = io_get_work_hash(work); struct io_wq_work *prev_work = NULL; @@ -870,18 +944,18 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe, else wqe->hash_tail[hash] = NULL; } - wq_list_del(&wqe->work_list, &work->list, prev); + wq_list_del(&acct->work_list, &work->list, prev); } -static void io_wqe_cancel_pending_work(struct io_wqe *wqe, - struct io_cb_cancel_data *match) +static bool io_acct_cancel_pending_work(struct io_wqe *wqe, + struct io_wqe_acct *acct, + struct 
io_cb_cancel_data *match) + __releases(wqe->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work; -retry: - raw_spin_lock(&wqe->lock); - wq_list_for_each(node, prev, &wqe->work_list) { + wq_list_for_each(node, prev, &acct->work_list) { work = container_of(node, struct io_wq_work, list); if (!match->fn(work, match->data)) continue; @@ -889,11 +963,27 @@ retry: raw_spin_unlock(&wqe->lock); io_run_cancel(work, wqe); match->nr_pending++; - if (!match->cancel_all) - return; - /* not safe to continue after unlock */ - goto retry; + return true; + } + + return false; +} + +static void io_wqe_cancel_pending_work(struct io_wqe *wqe, + struct io_cb_cancel_data *match) +{ + int i; +retry: + raw_spin_lock(&wqe->lock); + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + struct io_wqe_acct *acct = io_get_acct(wqe, i == 0); + + if (io_acct_cancel_pending_work(wqe, acct, match)) { + if (match->cancel_all) + goto retry; + return; + } } raw_spin_unlock(&wqe->lock); } @@ -954,18 +1044,24 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, int sync, void *key) { struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); + int i; list_del_init(&wait->entry); rcu_read_lock(); - io_wqe_activate_free_worker(wqe); + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + struct io_wqe_acct *acct = &wqe->acct[i]; + + if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags)) + io_wqe_activate_free_worker(wqe, acct); + } rcu_read_unlock(); return 1; } struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { - int ret, node; + int ret, node, i; struct io_wq *wq; if (WARN_ON_ONCE(!data->free_work || !data->do_work)) @@ -1000,18 +1096,20 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) cpumask_copy(wqe->cpu_mask, cpumask_of_node(node)); wq->wqes[node] = wqe; wqe->node = alloc_node; - wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND; - wqe->acct[IO_WQ_ACCT_UNBOUND].index = IO_WQ_ACCT_UNBOUND; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = 
bounded; - atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); - atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); - wqe->wait.func = io_wqe_hash_wake; INIT_LIST_HEAD(&wqe->wait.entry); + wqe->wait.func = io_wqe_hash_wake; + for (i = 0; i < IO_WQ_ACCT_NR; i++) { + struct io_wqe_acct *acct = &wqe->acct[i]; + + acct->index = i; + atomic_set(&acct->nr_running, 0); + INIT_WQ_LIST(&acct->work_list); + } wqe->wq = wq; raw_spin_lock_init(&wqe->lock); - INIT_WQ_LIST(&wqe->work_list); INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0); INIT_LIST_HEAD(&wqe->all_list); } @@ -1038,7 +1136,7 @@ static bool io_task_work_match(struct callback_head *cb, void *data) { struct io_worker *worker; - if (cb->func != create_worker_cb) + if (cb->func != create_worker_cb && cb->func != create_worker_cont) return false; worker = container_of(cb, struct io_worker, create_work); return worker->wqe->wq == data; @@ -1059,9 +1157,14 @@ static void io_wq_exit_workers(struct io_wq *wq) while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { struct io_worker *worker; + struct io_wqe_acct *acct; worker = container_of(cb, struct io_worker, create_work); - atomic_dec(&worker->wqe->acct[worker->create_index].nr_running); + acct = io_wqe_get_acct(worker); + atomic_dec(&acct->nr_running); + raw_spin_lock(&worker->wqe->lock); + acct->nr_workers--; + raw_spin_unlock(&worker->wqe->lock); io_worker_ref_put(wq); clear_bit_unlock(0, &worker->create_state); io_worker_release(worker); @@ -1193,7 +1296,7 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) for_each_node(node) { struct io_wqe_acct *acct; - for (i = 0; i < 2; i++) { + for (i = 0; i < IO_WQ_ACCT_NR; i++) { acct = &wq->wqes[node]->acct[i]; prev = max_t(int, acct->max_workers, prev); if (new_count[i]) diff --git a/fs/io_uring.c b/fs/io_uring.c index 6f35b1285865..16fb7436043c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1021,6 
+1021,7 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_WRITE] = { .needs_file = 1, + .hash_reg_file = 1, .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, @@ -1481,6 +1482,8 @@ static void io_kill_timeout(struct io_kiocb *req, int status) struct io_timeout_data *io = req->async_data; if (hrtimer_try_to_cancel(&io->timer) != -1) { + if (status) + req_set_fail(req); atomic_set(&req->ctx->cq_timeouts, atomic_read(&req->ctx->cq_timeouts) + 1); list_del_init(&req->timeout.list); @@ -1618,8 +1621,11 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) { + /* see waitqueue_active() comment */ + smp_mb(); + if (ctx->flags & IORING_SETUP_SQPOLL) { - if (wq_has_sleeper(&ctx->cq_wait)) + if (waitqueue_active(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); } if (io_should_trigger_evfd(ctx)) @@ -1851,6 +1857,17 @@ static void io_req_complete_failed(struct io_kiocb *req, long res) io_req_complete_post(req, res, 0); } +static void io_req_complete_fail_submit(struct io_kiocb *req) +{ + /* + * We don't submit, fail them all, for that replace hardlinks with + * normal links. Extra REQ_F_LINK is tolerated. + */ + req->flags &= ~REQ_F_HARDLINK; + req->flags |= REQ_F_LINK; + io_req_complete_failed(req, req->result); +} + /* * Don't initialise the fields below on every allocation, but do that in * advance and keep them valid across allocations. 
@@ -2119,6 +2136,9 @@ static void tctx_task_work(struct callback_head *cb) while (1) { struct io_wq_work_node *node; + if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr) + io_submit_flush_completions(ctx); + spin_lock_irq(&tctx->task_lock); node = tctx->task_list.first; INIT_WQ_LIST(&tctx->task_list); @@ -2673,7 +2693,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, { if (__io_complete_rw_common(req, res)) return; - __io_req_complete(req, 0, req->result, io_put_rw_kbuf(req)); + __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) @@ -3410,6 +3430,12 @@ static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) return -EINVAL; } +static bool need_read_all(struct io_kiocb *req) +{ + return req->flags & REQ_F_ISREG || + S_ISBLK(file_inode(req->file)->i_mode); +} + static int io_read(struct io_kiocb *req, unsigned int issue_flags) { struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; @@ -3459,12 +3485,13 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (req->flags & REQ_F_NOWAIT) goto done; /* some cases will consume bytes even on error returns */ + iov_iter_reexpand(iter, iter->count + iter->truncated); iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = 0; } else if (ret == -EIOCBQUEUED) { goto out_free; } else if (ret <= 0 || ret == io_size || !force_nonblock || - (req->flags & REQ_F_NOWAIT) || !(req->flags & REQ_F_ISREG)) { + (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ goto done; } @@ -3598,6 +3625,7 @@ done: } else { copy_iov: /* some cases will consume bytes even on error returns */ + iov_iter_reexpand(iter, iter->count + iter->truncated); iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); return ret ?: -EAGAIN; @@ -5249,7 +5277,7 
@@ static void io_poll_remove_double(struct io_kiocb *req) } } -static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) +static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) __must_hold(&req->ctx->completion_lock) { struct io_ring_ctx *ctx = req->ctx; @@ -5271,10 +5299,19 @@ static bool io_poll_complete(struct io_kiocb *req, __poll_t mask) if (flags & IORING_CQE_F_MORE) ctx->cq_extra++; - io_commit_cqring(ctx); return !(flags & IORING_CQE_F_MORE); } +static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask) + __must_hold(&req->ctx->completion_lock) +{ + bool done; + + done = __io_poll_complete(req, mask); + io_commit_cqring(req->ctx); + return done; +} + static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; @@ -5285,7 +5322,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } else { bool done; - done = io_poll_complete(req, req->result); + done = __io_poll_complete(req, req->result); if (done) { io_poll_remove_double(req); hash_del(&req->hash_node); @@ -5293,6 +5330,7 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) req->result = 0; add_wait_queue(req->poll.head, &req->poll.wait); } + io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); @@ -6398,6 +6436,11 @@ static bool io_drain_req(struct io_kiocb *req) int ret; u32 seq; + if (req->flags & REQ_F_FAIL) { + io_req_complete_fail_submit(req); + return true; + } + /* * If we need to drain a request in the middle of a link, drain the * head request and the next request/link after the current link. 
@@ -6914,7 +6957,7 @@ static inline void io_queue_sqe(struct io_kiocb *req) if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { __io_queue_sqe(req); } else if (req->flags & REQ_F_FAIL) { - io_req_complete_failed(req, req->result); + io_req_complete_fail_submit(req); } else { int ret = io_req_prep_async(req); @@ -10498,26 +10541,53 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, void __user *arg) { - struct io_uring_task *tctx = current->io_uring; + struct io_uring_task *tctx = NULL; + struct io_sq_data *sqd = NULL; __u32 new_count[2]; int i, ret; - if (!tctx || !tctx->io_wq) - return -EINVAL; if (copy_from_user(new_count, arg, sizeof(new_count))) return -EFAULT; for (i = 0; i < ARRAY_SIZE(new_count); i++) if (new_count[i] > INT_MAX) return -EINVAL; + if (ctx->flags & IORING_SETUP_SQPOLL) { + sqd = ctx->sq_data; + if (sqd) { + /* + * Observe the correct sqd->lock -> ctx->uring_lock + * ordering. Fine to drop uring_lock here, we hold + * a ref to the ctx. 
+ */ + mutex_unlock(&ctx->uring_lock); + mutex_lock(&sqd->lock); + mutex_lock(&ctx->uring_lock); + tctx = sqd->thread->io_uring; + } + } else { + tctx = current->io_uring; + } + + ret = -EINVAL; + if (!tctx || !tctx->io_wq) + goto err; + ret = io_wq_max_workers(tctx->io_wq, new_count); if (ret) - return ret; + goto err; + + if (sqd) + mutex_unlock(&sqd->lock); if (copy_to_user(arg, new_count, sizeof(new_count))) return -EFAULT; return 0; +err: + if (sqd) + mutex_unlock(&sqd->lock); + return ret; } static bool io_register_op_must_quiesce(int op) @@ -10795,7 +10865,7 @@ static int __init io_uring_init(void) BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); - BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int)); + BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); diff --git a/fs/ioctl.c b/fs/ioctl.c index eea8267ae1f2..504e69578112 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -614,6 +614,14 @@ static int fileattr_set_prepare(struct inode *inode, if ((old_ma->fsx_xflags ^ fa->fsx_xflags) & FS_XFLAG_PROJINHERIT) return -EINVAL; + } else { + /* + * Caller is allowed to change the project ID. If it is being + * changed, make sure that the new value is valid. + */ + if (old_ma->fsx_projid != fa->fsx_projid && + !projid_valid(make_kprojid(&init_user_ns, fa->fsx_projid))) + return -EINVAL; } /* Check extent size hints. 
*/ diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index eef2722d93a1..4143a3ff89db 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -9,9 +9,9 @@ ccflags-y += -I $(srctree)/$(src) # needed for trace events obj-$(CONFIG_FS_IOMAP) += iomap.o iomap-y += trace.o \ - apply.o \ buffered-io.o \ direct-io.o \ fiemap.o \ + iter.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c deleted file mode 100644 index 26ab6563181f..000000000000 --- a/fs/iomap/apply.c +++ /dev/null @@ -1,99 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. - */ -#include <linux/module.h> -#include <linux/compiler.h> -#include <linux/fs.h> -#include <linux/iomap.h> -#include "trace.h" - -/* - * Execute a iomap write on a segment of the mapping that spans a - * contiguous range of pages that have identical block mapping state. - * - * This avoids the need to map pages individually, do individual allocations - * for each page and most importantly avoid the need for filesystem specific - * locking per page. Instead, all the operations are amortised over the entire - * range of pages. It is assumed that the filesystems will lock whatever - * resources they require in the iomap_begin call, and release them in the - * iomap_end call. - */ -loff_t -iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, - const struct iomap_ops *ops, void *data, iomap_actor_t actor) -{ - struct iomap iomap = { .type = IOMAP_HOLE }; - struct iomap srcmap = { .type = IOMAP_HOLE }; - loff_t written = 0, ret; - u64 end; - - trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_); - - /* - * Need to map a range from start position for length bytes. This can - * span multiple pages - it is only guaranteed to return a range of a - * single type of pages (e.g. all into a hole, all mapped or all - * unwritten). Failure at this point has nothing to undo. 
- * - * If allocation is required for this range, reserve the space now so - * that the allocation is guaranteed to succeed later on. Once we copy - * the data into the page cache pages, then we cannot fail otherwise we - * expose transient stale data. If the reserve fails, we can safely - * back out at this point as there is nothing to undo. - */ - ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap); - if (ret) - return ret; - if (WARN_ON(iomap.offset > pos)) { - written = -EIO; - goto out; - } - if (WARN_ON(iomap.length == 0)) { - written = -EIO; - goto out; - } - - trace_iomap_apply_dstmap(inode, &iomap); - if (srcmap.type != IOMAP_HOLE) - trace_iomap_apply_srcmap(inode, &srcmap); - - /* - * Cut down the length to the one actually provided by the filesystem, - * as it might not be able to give us the whole size that we requested. - */ - end = iomap.offset + iomap.length; - if (srcmap.type != IOMAP_HOLE) - end = min(end, srcmap.offset + srcmap.length); - if (pos + length > end) - length = end - pos; - - /* - * Now that we have guaranteed that the space allocation will succeed, - * we can do the copy-in page by page without having to worry about - * failures exposing transient data. - * - * To support COW operations, we read in data for partially blocks from - * the srcmap if the file system filled it in. In that case we the - * length needs to be limited to the earlier of the ends of the iomaps. - * If the file system did not provide a srcmap we pass in the normal - * iomap into the actors so that they don't need to have special - * handling for the two cases. - */ - written = actor(inode, pos, length, data, &iomap, - srcmap.type != IOMAP_HOLE ? &srcmap : &iomap); - -out: - /* - * Now the data has been copied, commit the range we've copied. This - * should not fail unless the filesystem has had a fatal error. - */ - if (ops->iomap_end) { - ret = ops->iomap_end(inode, pos, length, - written > 0 ? 
written : 0, - flags, &iomap); - } - - return written ? written : ret; -} diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 87ccb3438bec..9cc5798423d1 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -36,7 +36,7 @@ static inline struct iomap_page *to_iomap_page(struct page *page) { /* * per-block data is stored in the head page. Callers should - * not be dealing with tail pages (and if they are, they can + * not be dealing with tail pages, and if they are, they can * call thp_head() first. */ VM_BUG_ON_PGFLAGS(PageTail(page), page); @@ -98,7 +98,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, unsigned last = (poff + plen - 1) >> block_bits; /* - * If the block size is smaller than the page size we need to check the + * If the block size is smaller than the page size, we need to check the * per-block uptodate status and adjust the offset and length if needed * to avoid reading in already uptodate ranges. */ @@ -126,7 +126,7 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop, } /* - * If the extent spans the block that contains the i_size we need to + * If the extent spans the block that contains the i_size, we need to * handle both halves separately so that we properly zero data in the * page cache for blocks that are entirely outside of i_size. 
*/ @@ -205,60 +205,67 @@ struct iomap_readpage_ctx { struct readahead_control *rac; }; -static void -iomap_read_inline_data(struct inode *inode, struct page *page, - struct iomap *iomap) +static loff_t iomap_read_inline_data(const struct iomap_iter *iter, + struct page *page) { - size_t size = i_size_read(inode); + const struct iomap *iomap = iomap_iter_srcmap(iter); + size_t size = i_size_read(iter->inode) - iomap->offset; + size_t poff = offset_in_page(iomap->offset); void *addr; if (PageUptodate(page)) - return; - - BUG_ON(page_has_private(page)); - BUG_ON(page->index); - BUG_ON(size > PAGE_SIZE - offset_in_page(iomap->inline_data)); - - addr = kmap_atomic(page); + return PAGE_SIZE - poff; + + if (WARN_ON_ONCE(size > PAGE_SIZE - poff)) + return -EIO; + if (WARN_ON_ONCE(size > PAGE_SIZE - + offset_in_page(iomap->inline_data))) + return -EIO; + if (WARN_ON_ONCE(size > iomap->length)) + return -EIO; + if (poff > 0) + iomap_page_create(iter->inode, page); + + addr = kmap_local_page(page) + poff; memcpy(addr, iomap->inline_data, size); - memset(addr + size, 0, PAGE_SIZE - size); - kunmap_atomic(addr); - SetPageUptodate(page); + memset(addr + size, 0, PAGE_SIZE - poff - size); + kunmap_local(addr); + iomap_set_range_uptodate(page, poff, PAGE_SIZE - poff); + return PAGE_SIZE - poff; } -static inline bool iomap_block_needs_zeroing(struct inode *inode, - struct iomap *iomap, loff_t pos) +static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, + loff_t pos) { - return iomap->type != IOMAP_MAPPED || - (iomap->flags & IOMAP_F_NEW) || - pos >= i_size_read(inode); + const struct iomap *srcmap = iomap_iter_srcmap(iter); + + return srcmap->type != IOMAP_MAPPED || + (srcmap->flags & IOMAP_F_NEW) || + pos >= i_size_read(iter->inode); } -static loff_t -iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_readpage_iter(const struct iomap_iter *iter, + struct 
iomap_readpage_ctx *ctx, loff_t offset) { - struct iomap_readpage_ctx *ctx = data; + const struct iomap *iomap = &iter->iomap; + loff_t pos = iter->pos + offset; + loff_t length = iomap_length(iter) - offset; struct page *page = ctx->cur_page; struct iomap_page *iop; - bool same_page = false, is_contig = false; loff_t orig_pos = pos; unsigned poff, plen; sector_t sector; - if (iomap->type == IOMAP_INLINE) { - WARN_ON_ONCE(pos); - iomap_read_inline_data(inode, page, iomap); - return PAGE_SIZE; - } + if (iomap->type == IOMAP_INLINE) + return min(iomap_read_inline_data(iter, page), length); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(inode, page); - iomap_adjust_read_range(inode, iop, &pos, length, &poff, &plen); + iop = iomap_page_create(iter->inode, page); + iomap_adjust_read_range(iter->inode, iop, &pos, length, &poff, &plen); if (plen == 0) goto done; - if (iomap_block_needs_zeroing(inode, iomap, pos)) { + if (iomap_block_needs_zeroing(iter, pos)) { zero_user(page, poff, plen); iomap_set_range_uptodate(page, poff, plen); goto done; @@ -268,16 +275,10 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (iop) atomic_add(plen, &iop->read_bytes_pending); - /* Try to merge into a previous segment if we can */ sector = iomap_sector(iomap, pos); - if (ctx->bio && bio_end_sector(ctx->bio) == sector) { - if (__bio_try_merge_page(ctx->bio, page, plen, poff, - &same_page)) - goto done; - is_contig = true; - } - - if (!is_contig || bio_full(ctx->bio, plen)) { + if (!ctx->bio || + bio_end_sector(ctx->bio) != sector || + bio_add_page(ctx->bio, page, plen, poff) != plen) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); @@ -301,13 +302,12 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ctx->bio->bi_iter.bi_sector = sector; bio_set_dev(ctx->bio, iomap->bdev); ctx->bio->bi_end_io = 
iomap_read_end_io; + __bio_add_page(ctx->bio, page, plen, poff); } - - bio_add_page(ctx->bio, page, plen, poff); done: /* * Move the caller beyond our range so that it keeps making progress. - * For that we have to include any leading non-uptodate ranges, but + * For that, we have to include any leading non-uptodate ranges, but * we can skip trailing ones as they will be handled in the next * iteration. */ @@ -317,23 +317,23 @@ done: int iomap_readpage(struct page *page, const struct iomap_ops *ops) { - struct iomap_readpage_ctx ctx = { .cur_page = page }; - struct inode *inode = page->mapping->host; - unsigned poff; - loff_t ret; + struct iomap_iter iter = { + .inode = page->mapping->host, + .pos = page_offset(page), + .len = PAGE_SIZE, + }; + struct iomap_readpage_ctx ctx = { + .cur_page = page, + }; + int ret; trace_iomap_readpage(page->mapping->host, 1); - for (poff = 0; poff < PAGE_SIZE; poff += ret) { - ret = iomap_apply(inode, page_offset(page) + poff, - PAGE_SIZE - poff, 0, ops, &ctx, - iomap_readpage_actor); - if (ret <= 0) { - WARN_ON_ONCE(ret == 0); - SetPageError(page); - break; - } - } + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_readpage_iter(&iter, &ctx, 0); + + if (ret < 0) + SetPageError(page); if (ctx.bio) { submit_bio(ctx.bio); @@ -344,23 +344,22 @@ iomap_readpage(struct page *page, const struct iomap_ops *ops) } /* - * Just like mpage_readahead and block_read_full_page we always + * Just like mpage_readahead and block_read_full_page, we always * return 0 and just mark the page as PageError on errors. This - * should be cleaned up all through the stack eventually. + * should be cleaned up throughout the stack eventually. 
*/ return 0; } EXPORT_SYMBOL_GPL(iomap_readpage); -static loff_t -iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_readahead_iter(const struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx) { - struct iomap_readpage_ctx *ctx = data; + loff_t length = iomap_length(iter); loff_t done, ret; for (done = 0; done < length; done += ret) { - if (ctx->cur_page && offset_in_page(pos + done) == 0) { + if (ctx->cur_page && offset_in_page(iter->pos + done) == 0) { if (!ctx->cur_page_in_bio) unlock_page(ctx->cur_page); put_page(ctx->cur_page); @@ -370,8 +369,7 @@ iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, ctx->cur_page = readahead_page(ctx->rac); ctx->cur_page_in_bio = false; } - ret = iomap_readpage_actor(inode, pos + done, length - done, - ctx, iomap, srcmap); + ret = iomap_readpage_iter(iter, ctx, done); } return done; @@ -394,25 +392,19 @@ iomap_readahead_actor(struct inode *inode, loff_t pos, loff_t length, */ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { - struct inode *inode = rac->mapping->host; - loff_t pos = readahead_pos(rac); - size_t length = readahead_length(rac); + struct iomap_iter iter = { + .inode = rac->mapping->host, + .pos = readahead_pos(rac), + .len = readahead_length(rac), + }; struct iomap_readpage_ctx ctx = { .rac = rac, }; - trace_iomap_readahead(inode, readahead_count(rac)); + trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); - while (length > 0) { - ssize_t ret = iomap_apply(inode, pos, length, 0, ops, - &ctx, iomap_readahead_actor); - if (ret <= 0) { - WARN_ON_ONCE(ret == 0); - break; - } - pos += ret; - length -= ret; - } + while (iomap_iter(&iter, ops) > 0) + iter.processed = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); @@ -467,7 +459,7 @@ iomap_releasepage(struct page *page, gfp_t gfp_mask) /* * mm accommodates an old ext3 case where clean 
pages might not have had * the dirty bit cleared. Thus, it can send actual dirty pages to - * ->releasepage() via shrink_active_list(), skip those here. + * ->releasepage() via shrink_active_list(); skip those here. */ if (PageDirty(page) || PageWriteback(page)) return 0; @@ -482,7 +474,7 @@ iomap_invalidatepage(struct page *page, unsigned int offset, unsigned int len) trace_iomap_invalidatepage(page->mapping->host, offset, len); /* - * If we are invalidating the entire page, clear the dirty state from it + * If we're invalidating the entire page, clear the dirty state from it * and release it to avoid unnecessary buildup of the LRU. */ if (offset == 0 && len == PAGE_SIZE) { @@ -516,10 +508,6 @@ iomap_migrate_page(struct address_space *mapping, struct page *newpage, EXPORT_SYMBOL_GPL(iomap_migrate_page); #endif /* CONFIG_MIGRATION */ -enum { - IOMAP_WRITE_F_UNSHARE = (1 << 0), -}; - static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -535,7 +523,7 @@ iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) static int iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, - unsigned plen, struct iomap *iomap) + unsigned plen, const struct iomap *iomap) { struct bio_vec bvec; struct bio bio; @@ -548,12 +536,12 @@ iomap_read_page_sync(loff_t block_start, struct page *page, unsigned poff, return submit_bio_wait(&bio); } -static int -__iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, - struct page *page, struct iomap *srcmap) +static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, + unsigned len, struct page *page) { - struct iomap_page *iop = iomap_page_create(inode, page); - loff_t block_size = i_blocksize(inode); + const struct iomap *srcmap = iomap_iter_srcmap(iter); + struct iomap_page *iop = iomap_page_create(iter->inode, page); + loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + 
len, block_size); unsigned from = offset_in_page(pos), to = from + len, poff, plen; @@ -563,18 +551,18 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, ClearPageError(page); do { - iomap_adjust_read_range(inode, iop, &block_start, + iomap_adjust_read_range(iter->inode, iop, &block_start, block_end - block_start, &poff, &plen); if (plen == 0) break; - if (!(flags & IOMAP_WRITE_F_UNSHARE) && + if (!(iter->flags & IOMAP_UNSHARE) && (from <= poff || from >= poff + plen) && (to <= poff || to >= poff + plen)) continue; - if (iomap_block_needs_zeroing(inode, srcmap, block_start)) { - if (WARN_ON_ONCE(flags & IOMAP_WRITE_F_UNSHARE)) + if (iomap_block_needs_zeroing(iter, block_start)) { + if (WARN_ON_ONCE(iter->flags & IOMAP_UNSHARE)) return -EIO; zero_user_segments(page, poff, from, to, poff + plen); } else { @@ -589,41 +577,54 @@ __iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, int flags, return 0; } -static int -iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, struct iomap *iomap, struct iomap *srcmap) +static int iomap_write_begin_inline(const struct iomap_iter *iter, + struct page *page) { - const struct iomap_page_ops *page_ops = iomap->page_ops; + int ret; + + /* needs more work for the tailpacking case; disable for now */ + if (WARN_ON_ONCE(iomap_iter_srcmap(iter)->offset != 0)) + return -EIO; + ret = iomap_read_inline_data(iter, page); + if (ret < 0) + return ret; + return 0; +} + +static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, + unsigned len, struct page **pagep) +{ + const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + const struct iomap *srcmap = iomap_iter_srcmap(iter); struct page *page; int status = 0; - BUG_ON(pos + len > iomap->offset + iomap->length); - if (srcmap != iomap) + BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); + if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); 
if (fatal_signal_pending(current)) return -EINTR; if (page_ops && page_ops->page_prepare) { - status = page_ops->page_prepare(inode, pos, len, iomap); + status = page_ops->page_prepare(iter->inode, pos, len); if (status) return status; } - page = grab_cache_page_write_begin(inode->i_mapping, pos >> PAGE_SHIFT, - AOP_FLAG_NOFS); + page = grab_cache_page_write_begin(iter->inode->i_mapping, + pos >> PAGE_SHIFT, AOP_FLAG_NOFS); if (!page) { status = -ENOMEM; goto out_no_page; } if (srcmap->type == IOMAP_INLINE) - iomap_read_inline_data(inode, page, srcmap); - else if (iomap->flags & IOMAP_F_BUFFER_HEAD) + status = iomap_write_begin_inline(iter, page); + else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(page, pos, len, NULL, srcmap); else - status = __iomap_write_begin(inode, pos, len, flags, page, - srcmap); + status = __iomap_write_begin(iter, pos, len, page); if (unlikely(status)) goto out_unlock; @@ -634,11 +635,11 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, out_unlock: unlock_page(page); put_page(page); - iomap_write_failed(inode, pos, len); + iomap_write_failed(iter->inode, pos, len); out_no_page: if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, 0, NULL, iomap); + page_ops->page_done(iter->inode, pos, 0, NULL); return status; } @@ -650,13 +651,13 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, /* * The blocks that were entirely written will now be uptodate, so we * don't have to worry about a readpage reading them and overwriting a - * partial write. However if we have encountered a short write and only + * partial write. However, if we've encountered a short write and only * partially written into a block, it will not be marked uptodate, so a * readpage might come in and destroy our partial write. 
* - * Do the simplest thing, and just treat any short write to a non - * uptodate page as a zero-length write, and force the caller to redo - * the whole thing. + * Do the simplest thing and just treat any short write to a + * non-uptodate page as a zero-length write, and force the caller to + * redo the whole thing. */ if (unlikely(copied < len && !PageUptodate(page))) return 0; @@ -665,39 +666,40 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, return copied; } -static size_t iomap_write_end_inline(struct inode *inode, struct page *page, - struct iomap *iomap, loff_t pos, size_t copied) +static size_t iomap_write_end_inline(const struct iomap_iter *iter, + struct page *page, loff_t pos, size_t copied) { + const struct iomap *iomap = &iter->iomap; void *addr; WARN_ON_ONCE(!PageUptodate(page)); - BUG_ON(pos + copied > PAGE_SIZE - offset_in_page(iomap->inline_data)); + BUG_ON(!iomap_inline_data_valid(iomap)); flush_dcache_page(page); - addr = kmap_atomic(page); - memcpy(iomap->inline_data + pos, addr + pos, copied); - kunmap_atomic(addr); + addr = kmap_local_page(page) + pos; + memcpy(iomap_inline_data(iomap, pos), addr, copied); + kunmap_local(addr); - mark_inode_dirty(inode); + mark_inode_dirty(iter->inode); return copied; } /* Returns the number of bytes copied. May be 0. Cannot be an errno. 
*/ -static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, - size_t copied, struct page *page, struct iomap *iomap, - struct iomap *srcmap) +static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, + size_t copied, struct page *page) { - const struct iomap_page_ops *page_ops = iomap->page_ops; - loff_t old_size = inode->i_size; + const struct iomap_page_ops *page_ops = iter->iomap.page_ops; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t old_size = iter->inode->i_size; size_t ret; if (srcmap->type == IOMAP_INLINE) { - ret = iomap_write_end_inline(inode, page, iomap, pos, copied); + ret = iomap_write_end_inline(iter, page, pos, copied); } else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { - ret = block_write_end(NULL, inode->i_mapping, pos, len, copied, - page, NULL); + ret = block_write_end(NULL, iter->inode->i_mapping, pos, len, + copied, page, NULL); } else { - ret = __iomap_write_end(inode, pos, len, copied, page); + ret = __iomap_write_end(iter->inode, pos, len, copied, page); } /* @@ -706,29 +708,28 @@ static size_t iomap_write_end(struct inode *inode, loff_t pos, size_t len, * preferably after I/O completion so that no stale data is exposed. 
*/ if (pos + ret > old_size) { - i_size_write(inode, pos + ret); - iomap->flags |= IOMAP_F_SIZE_CHANGED; + i_size_write(iter->inode, pos + ret); + iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } unlock_page(page); if (old_size < pos) - pagecache_isize_extended(inode, old_size, pos); + pagecache_isize_extended(iter->inode, old_size, pos); if (page_ops && page_ops->page_done) - page_ops->page_done(inode, pos, ret, page, iomap); + page_ops->page_done(iter->inode, pos, ret, page); put_page(page); if (ret < len) - iomap_write_failed(inode, pos, len); + iomap_write_failed(iter->inode, pos, len); return ret; } -static loff_t -iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - struct iov_iter *i = data; - long status = 0; + loff_t length = iomap_length(iter); + loff_t pos = iter->pos; ssize_t written = 0; + long status = 0; do { struct page *page; @@ -744,7 +745,7 @@ again: bytes = length; /* - * Bring in the user page that we will copy from _first_. + * Bring in the user page that we'll copy from _first_. * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. 
@@ -754,18 +755,16 @@ again: break; } - status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, - srcmap); + status = iomap_write_begin(iter, pos, bytes, &page); if (unlikely(status)) break; - if (mapping_writably_mapped(inode->i_mapping)) + if (mapping_writably_mapped(iter->inode->i_mapping)) flush_dcache_page(page); copied = copy_page_from_iter_atomic(page, offset, bytes, i); - status = iomap_write_end(inode, pos, bytes, copied, page, iomap, - srcmap); + status = iomap_write_end(iter, pos, bytes, copied, page); if (unlikely(copied != status)) iov_iter_revert(i, copied - status); @@ -786,36 +785,38 @@ again: written += status; length -= status; - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (iov_iter_count(i) && length); return written ? written : status; } ssize_t -iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, +iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, const struct iomap_ops *ops) { - struct inode *inode = iocb->ki_filp->f_mapping->host; - loff_t pos = iocb->ki_pos, ret = 0, written = 0; - - while (iov_iter_count(iter)) { - ret = iomap_apply(inode, pos, iov_iter_count(iter), - IOMAP_WRITE, ops, iter, iomap_write_actor); - if (ret <= 0) - break; - pos += ret; - written += ret; - } + struct iomap_iter iter = { + .inode = iocb->ki_filp->f_mapping->host, + .pos = iocb->ki_pos, + .len = iov_iter_count(i), + .flags = IOMAP_WRITE, + }; + int ret; - return written ? 
written : ret; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_write_iter(&iter, i); + if (iter.pos == iocb->ki_pos) + return ret; + return iter.pos - iocb->ki_pos; } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); -static loff_t -iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_unshare_iter(struct iomap_iter *iter) { + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); long status = 0; loff_t written = 0; @@ -831,13 +832,11 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct page *page; - status = iomap_write_begin(inode, pos, bytes, - IOMAP_WRITE_F_UNSHARE, &page, iomap, srcmap); + status = iomap_write_begin(iter, pos, bytes, &page); if (unlikely(status)) return status; - status = iomap_write_end(inode, pos, bytes, bytes, page, iomap, - srcmap); + status = iomap_write_end(iter, pos, bytes, bytes, page); if (WARN_ON_ONCE(status == 0)) return -EIO; @@ -847,7 +846,7 @@ iomap_unshare_actor(struct inode *inode, loff_t pos, loff_t length, void *data, written += status; length -= status; - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(iter->inode->i_mapping); } while (length); return written; @@ -857,44 +856,43 @@ int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, const struct iomap_ops *ops) { - loff_t ret; - - while (len) { - ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, - iomap_unshare_actor); - if (ret <= 0) - return ret; - pos += ret; - len -= ret; - } + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_WRITE | IOMAP_UNSHARE, + }; + int ret; - return 0; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = 
iomap_unshare_iter(&iter); + return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); -static s64 iomap_zero(struct inode *inode, loff_t pos, u64 length, - struct iomap *iomap, struct iomap *srcmap) +static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length) { struct page *page; int status; unsigned offset = offset_in_page(pos); unsigned bytes = min_t(u64, PAGE_SIZE - offset, length); - status = iomap_write_begin(inode, pos, bytes, 0, &page, iomap, srcmap); + status = iomap_write_begin(iter, pos, bytes, &page); if (status) return status; zero_user(page, offset, bytes); mark_page_accessed(page); - return iomap_write_end(inode, pos, bytes, bytes, page, iomap, srcmap); + return iomap_write_end(iter, pos, bytes, bytes, page); } -static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, - loff_t length, void *data, struct iomap *iomap, - struct iomap *srcmap) +static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - bool *did_zero = data; + struct iomap *iomap = &iter->iomap; + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); loff_t written = 0; /* already zeroed? we're done. 
*/ @@ -904,10 +902,10 @@ static loff_t iomap_zero_range_actor(struct inode *inode, loff_t pos, do { s64 bytes; - if (IS_DAX(inode)) + if (IS_DAX(iter->inode)) bytes = dax_iomap_zero(pos, length, iomap); else - bytes = iomap_zero(inode, pos, length, iomap, srcmap); + bytes = __iomap_zero_iter(iter, pos, length); if (bytes < 0) return bytes; @@ -925,19 +923,17 @@ int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, const struct iomap_ops *ops) { - loff_t ret; - - while (len > 0) { - ret = iomap_apply(inode, pos, len, IOMAP_ZERO, - ops, did_zero, iomap_zero_range_actor); - if (ret <= 0) - return ret; - - pos += ret; - len -= ret; - } + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .len = len, + .flags = IOMAP_ZERO, + }; + int ret; - return 0; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_zero_iter(&iter, did_zero); + return ret; } EXPORT_SYMBOL_GPL(iomap_zero_range); @@ -955,15 +951,15 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t -iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_page_mkwrite_iter(struct iomap_iter *iter, + struct page *page) { - struct page *page = data; + loff_t length = iomap_length(iter); int ret; - if (iomap->flags & IOMAP_F_BUFFER_HEAD) { - ret = __block_write_begin_int(page, pos, length, NULL, iomap); + if (iter->iomap.flags & IOMAP_F_BUFFER_HEAD) { + ret = __block_write_begin_int(page, iter->pos, length, NULL, + &iter->iomap); if (ret) return ret; block_commit_write(page, 0, length); @@ -977,29 +973,24 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length, vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) { + struct iomap_iter iter = { + .inode = file_inode(vmf->vma->vm_file), + .flags = IOMAP_WRITE | IOMAP_FAULT, + }; struct page *page = 
vmf->page; - struct inode *inode = file_inode(vmf->vma->vm_file); - unsigned long length; - loff_t offset; ssize_t ret; lock_page(page); - ret = page_mkwrite_check_truncate(page, inode); + ret = page_mkwrite_check_truncate(page, iter.inode); if (ret < 0) goto out_unlock; - length = ret; - - offset = page_offset(page); - while (length > 0) { - ret = iomap_apply(inode, offset, length, - IOMAP_WRITE | IOMAP_FAULT, ops, page, - iomap_page_mkwrite_actor); - if (unlikely(ret <= 0)) - goto out_unlock; - offset += ret; - length -= ret; - } + iter.pos = page_offset(page); + iter.len = ret; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_page_mkwrite_iter(&iter, page); + if (ret < 0) + goto out_unlock; wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: @@ -1016,7 +1007,7 @@ iomap_finish_page_writeback(struct inode *inode, struct page *page, if (error) { SetPageError(page); - mapping_set_error(inode->i_mapping, -EIO); + mapping_set_error(inode->i_mapping, error); } WARN_ON_ONCE(i_blocks_per_page(inode, page) > 1 && !iop); @@ -1153,7 +1144,7 @@ static void iomap_writepage_end_bio(struct bio *bio) * Submit the final bio for an ioend. * * If @error is non-zero, it means that we have a situation where some part of - * the submission process has failed after we have marked paged for writeback + * the submission process has failed after we've marked pages for writeback * and unlocked them. In this situation, we need to fail the bio instead of * submitting it. This typically only happens on a filesystem shutdown. */ @@ -1168,7 +1159,7 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, error = wpc->ops->prepare_ioend(ioend, error); if (error) { /* - * If we are failing the IO now, just mark the ioend with an + * If we're failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately * as there is only one reference to the ioend at this point in * time. 
@@ -1210,7 +1201,7 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, /* * Allocate a new bio, and chain the old bio to the new one. * - * Note that we have to do perform the chaining in this unintuitive order + * Note that we have to perform the chaining in this unintuitive order * so that the bi_private linkage is set up in the right direction for the * traversal in iomap_finish_ioend(). */ @@ -1249,7 +1240,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, /* * Test to see if we have an existing ioend structure that we could append to - * first, otherwise finish off the current ioend and start another. + * first; otherwise finish off the current ioend and start another. */ static void iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, @@ -1259,7 +1250,6 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, sector_t sector = iomap_sector(&wpc->iomap, offset); unsigned len = i_blocksize(inode); unsigned poff = offset & (PAGE_SIZE - 1); - bool merged, same_page = false; if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, offset, sector)) { if (wpc->ioend) @@ -1267,19 +1257,13 @@ iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, wpc->ioend = iomap_alloc_ioend(inode, wpc, offset, sector, wbc); } - merged = __bio_try_merge_page(wpc->ioend->io_bio, page, len, poff, - &same_page); - if (iop) - atomic_add(len, &iop->write_bytes_pending); - - if (!merged) { - if (bio_full(wpc->ioend->io_bio, len)) { - wpc->ioend->io_bio = - iomap_chain_bio(wpc->ioend->io_bio); - } - bio_add_page(wpc->ioend->io_bio, page, len, poff); + if (bio_add_page(wpc->ioend->io_bio, page, len, poff) != len) { + wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); + __bio_add_page(wpc->ioend->io_bio, page, len, poff); } + if (iop) + atomic_add(len, &iop->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, page, len); } @@ -1287,9 +1271,9 @@ 
iomap_add_to_ioend(struct inode *inode, loff_t offset, struct page *page, /* * We implement an immediate ioend submission policy here to avoid needing to * chain multiple ioends and hence nest mempool allocations which can violate - * forward progress guarantees we need to provide. The current ioend we are - * adding blocks to is cached on the writepage context, and if the new block - * does not append to the cached ioend it will create a new ioend and cache that + * the forward progress guarantees we need to provide. The current ioend we're + * adding blocks to is cached in the writepage context, and if the new block + * doesn't append to the cached ioend, it will create a new ioend and cache that * instead. * * If a new ioend is created and cached, the old ioend is returned and queued @@ -1351,7 +1335,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, if (unlikely(error)) { /* * Let the filesystem know what portion of the current page - * failed to map. If the page wasn't been added to ioend, it + * failed to map. If the page hasn't been added to ioend, it * won't be affected by I/O completion and we must unlock it * now. */ @@ -1368,7 +1352,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, unlock_page(page); /* - * Preserve the original error if there was one, otherwise catch + * Preserve the original error if there was one; catch * submission errors here and propagate into subsequent ioend * submissions. */ @@ -1395,8 +1379,8 @@ done: /* * Write out a dirty page. * - * For delalloc space on the page we need to allocate space and flush it. - * For unwritten space on the page we need to start the conversion to + * For delalloc space on the page, we need to allocate space and flush it. + * For unwritten space on the page, we need to start the conversion to * regular allocated space. 
*/ static int @@ -1411,7 +1395,7 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) trace_iomap_writepage(inode, page_offset(page), PAGE_SIZE); /* - * Refuse to write the page out if we are called from reclaim context. + * Refuse to write the page out if we're called from reclaim context. * * This avoids stack overflows when called from deeply used stacks in * random callers for direct reclaim or memcg reclaim. We explicitly @@ -1456,20 +1440,20 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) unsigned offset_into_page = offset & (PAGE_SIZE - 1); /* - * Skip the page if it is fully outside i_size, e.g. due to a - * truncate operation that is in progress. We must redirty the + * Skip the page if it's fully outside i_size, e.g. due to a + * truncate operation that's in progress. We must redirty the * page so that reclaim stops reclaiming it. Otherwise * iomap_vm_releasepage() is called on it and gets confused. * - * Note that the end_index is unsigned long, it would overflow - * if the given offset is greater than 16TB on 32-bit system - * and if we do check the page is fully outside i_size or not - * via "if (page->index >= end_index + 1)" as "end_index + 1" - * will be evaluated to 0. Hence this page will be redirtied - * and be written out repeatedly which would result in an - * infinite loop, the user program that perform this operation - * will hang. Instead, we can verify this situation by checking - * if the page to write is totally beyond the i_size or if it's + * Note that the end_index is unsigned long. If the given + * offset is greater than 16TB on a 32-bit system then if we + * checked if the page is fully outside i_size with + * "if (page->index >= end_index + 1)", "end_index + 1" would + * overflow and evaluate to 0. Hence this page would be + * redirtied and written out repeatedly, which would result in + * an infinite loop; the user program performing this operation + * would hang. 
Instead, we can detect this situation by + * checking if the page is totally beyond i_size or if its * offset is just equal to the EOF. */ if (page->index > end_index || diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9398b8c31323..4ecd255e0511 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -59,19 +59,17 @@ int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) } EXPORT_SYMBOL_GPL(iomap_dio_iopoll); -static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, - struct bio *bio, loff_t pos) +static void iomap_dio_submit_bio(const struct iomap_iter *iter, + struct iomap_dio *dio, struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); if (dio->iocb->ki_flags & IOCB_HIPRI) bio_set_polled(bio, dio->iocb); - dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev); if (dio->dops && dio->dops->submit_io) - dio->submit.cookie = dio->dops->submit_io( - file_inode(dio->iocb->ki_filp), - iomap, bio, pos); + dio->submit.cookie = dio->dops->submit_io(iter, bio, pos); else dio->submit.cookie = submit_bio(bio); } @@ -181,24 +179,23 @@ static void iomap_dio_bio_end_io(struct bio *bio) } } -static void -iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, - unsigned len) +static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, + loff_t pos, unsigned len) { struct page *page = ZERO_PAGE(0); int flags = REQ_SYNC | REQ_IDLE; struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); - bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = iomap_sector(iomap, pos); + bio_set_dev(bio, iter->iomap.bdev); + bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos); bio->bi_private = dio; bio->bi_end_io = 
iomap_dio_bio_end_io; get_page(page); __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, flags); - iomap_dio_submit_bio(dio, iomap, bio, pos); + iomap_dio_submit_bio(iter, dio, bio, pos); } /* @@ -206,8 +203,8 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, * mapping, and whether or not we want FUA. Note that we can end up * clearing the WRITE_FUA flag in the dio request. */ -static inline unsigned int -iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua) +static inline unsigned int iomap_dio_bio_opflags(struct iomap_dio *dio, + const struct iomap *iomap, bool use_fua) { unsigned int opflags = REQ_SYNC | REQ_IDLE; @@ -229,13 +226,16 @@ iomap_dio_bio_opflags(struct iomap_dio *dio, struct iomap *iomap, bool use_fua) return opflags; } -static loff_t -iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, - struct iomap_dio *dio, struct iomap *iomap) +static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, + struct iomap_dio *dio) { + const struct iomap *iomap = &iter->iomap; + struct inode *inode = iter->inode; unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); unsigned int fs_block_size = i_blocksize(inode), pad; unsigned int align = iov_iter_alignment(dio->submit.iter); + loff_t length = iomap_length(iter); + loff_t pos = iter->pos; unsigned int bio_opf; struct bio *bio; bool need_zeroout = false; @@ -286,7 +286,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos - pad, pad); + iomap_dio_zero(iter, dio, pos - pad, pad); } /* @@ -339,7 +339,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length, nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); - iomap_dio_submit_bio(dio, iomap, bio, pos); + iomap_dio_submit_bio(iter, dio, bio, pos); pos += n; } while (nr_pages); 
@@ -355,7 +355,7 @@ zero_tail: /* zero out from the end of the write to the end of the block */ pad = pos & (fs_block_size - 1); if (pad) - iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + iomap_dio_zero(iter, dio, pos, fs_block_size - pad); } out: /* Undo iter limitation to current extent */ @@ -365,65 +365,67 @@ out: return ret; } -static loff_t -iomap_dio_hole_actor(loff_t length, struct iomap_dio *dio) +static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, + struct iomap_dio *dio) { - length = iov_iter_zero(length, dio->submit.iter); + loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); + dio->size += length; return length; } -static loff_t -iomap_dio_inline_actor(struct inode *inode, loff_t pos, loff_t length, - struct iomap_dio *dio, struct iomap *iomap) +static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, + struct iomap_dio *dio) { + const struct iomap *iomap = &iomi->iomap; struct iov_iter *iter = dio->submit.iter; + void *inline_data = iomap_inline_data(iomap, iomi->pos); + loff_t length = iomap_length(iomi); + loff_t pos = iomi->pos; size_t copied; - BUG_ON(pos + length > PAGE_SIZE - offset_in_page(iomap->inline_data)); + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) + return -EIO; if (dio->flags & IOMAP_DIO_WRITE) { - loff_t size = inode->i_size; + loff_t size = iomi->inode->i_size; if (pos > size) - memset(iomap->inline_data + size, 0, pos - size); - copied = copy_from_iter(iomap->inline_data + pos, length, iter); + memset(iomap_inline_data(iomap, size), 0, pos - size); + copied = copy_from_iter(inline_data, length, iter); if (copied) { if (pos + copied > size) - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); + i_size_write(iomi->inode, pos + copied); + mark_inode_dirty(iomi->inode); } } else { - copied = copy_to_iter(iomap->inline_data + pos, length, iter); + copied = copy_to_iter(inline_data, length, iter); } dio->size += copied; return copied; } -static loff_t 
-iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_dio_iter(const struct iomap_iter *iter, + struct iomap_dio *dio) { - struct iomap_dio *dio = data; - - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_HOLE: if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) return -EIO; - return iomap_dio_hole_actor(length, dio); + return iomap_dio_hole_iter(iter, dio); case IOMAP_UNWRITTEN: if (!(dio->flags & IOMAP_DIO_WRITE)) - return iomap_dio_hole_actor(length, dio); - return iomap_dio_bio_actor(inode, pos, length, dio, iomap); + return iomap_dio_hole_iter(iter, dio); + return iomap_dio_bio_iter(iter, dio); case IOMAP_MAPPED: - return iomap_dio_bio_actor(inode, pos, length, dio, iomap); + return iomap_dio_bio_iter(iter, dio); case IOMAP_INLINE: - return iomap_dio_inline_actor(inode, pos, length, dio, iomap); + return iomap_dio_inline_iter(iter, dio); case IOMAP_DELALLOC: /* * DIO is not serialised against mmap() access at all, and so * if the page_mkwrite occurs between the writeback and the - * iomap_apply() call in the DIO path, then it will see the + * iomap_iter() call in the DIO path, then it will see the * DELALLOC block that the page-mkwrite allocated. */ pr_warn_ratelimited("Direct I/O collision with buffered writes! 
File: %pD4 Comm: %.20s\n", @@ -454,16 +456,19 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); - size_t count = iov_iter_count(iter); - loff_t pos = iocb->ki_pos; - loff_t end = iocb->ki_pos + count - 1, ret = 0; + struct iomap_iter iomi = { + .inode = inode, + .pos = iocb->ki_pos, + .len = iov_iter_count(iter), + .flags = IOMAP_DIRECT, + }; + loff_t end = iomi.pos + iomi.len - 1, ret = 0; bool wait_for_completion = is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT); - unsigned int iomap_flags = IOMAP_DIRECT; struct blk_plug plug; struct iomap_dio *dio; - if (!count) + if (!iomi.len) return NULL; dio = kmalloc(sizeof(*dio), GFP_KERNEL); @@ -484,29 +489,30 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.last_queue = NULL; if (iov_iter_rw(iter) == READ) { - if (pos >= dio->i_size) + if (iomi.pos >= dio->i_size) goto out_free_dio; if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, pos, end)) { + if (filemap_range_needs_writeback(mapping, iomi.pos, + end)) { ret = -EAGAIN; goto out_free_dio; } - iomap_flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_NOWAIT; } if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { - iomap_flags |= IOMAP_WRITE; + iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, pos, end)) { + if (filemap_range_has_page(mapping, iomi.pos, end)) { ret = -EAGAIN; goto out_free_dio; } - iomap_flags |= IOMAP_NOWAIT; + iomi.flags |= IOMAP_NOWAIT; } /* for data sync or sync, we need sync completion processing */ @@ -525,12 +531,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; - if (pos >= dio->i_size || pos + count > dio->i_size) + if (iomi.pos >= dio->i_size || + iomi.pos + iomi.len > dio->i_size) goto out_free_dio; - 
iomap_flags |= IOMAP_OVERWRITE_ONLY; + iomi.flags |= IOMAP_OVERWRITE_ONLY; } - ret = filemap_write_and_wait_range(mapping, pos, end); + ret = filemap_write_and_wait_range(mapping, iomi.pos, end); if (ret) goto out_free_dio; @@ -540,9 +547,10 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * If this invalidation fails, let the caller fall back to * buffered I/O. */ - if (invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, - end >> PAGE_SHIFT)) { - trace_iomap_dio_invalidate_fail(inode, pos, count); + if (invalidate_inode_pages2_range(mapping, + iomi.pos >> PAGE_SHIFT, end >> PAGE_SHIFT)) { + trace_iomap_dio_invalidate_fail(inode, iomi.pos, + iomi.len); ret = -ENOTBLK; goto out_free_dio; } @@ -557,31 +565,23 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, inode_dio_begin(inode); blk_start_plug(&plug); - do { - ret = iomap_apply(inode, pos, count, iomap_flags, ops, dio, - iomap_dio_actor); - if (ret <= 0) { - /* magic error code to fall back to buffered I/O */ - if (ret == -ENOTBLK) { - wait_for_completion = true; - ret = 0; - } - break; - } - pos += ret; - - if (iov_iter_rw(iter) == READ && pos >= dio->i_size) { - /* - * We only report that we've read data up to i_size. - * Revert iter to a state corresponding to that as - * some callers (such as splice code) rely on it. - */ - iov_iter_revert(iter, pos - dio->i_size); - break; - } - } while ((count = iov_iter_count(iter)) > 0); + while ((ret = iomap_iter(&iomi, ops)) > 0) + iomi.processed = iomap_dio_iter(&iomi, dio); blk_finish_plug(&plug); + /* + * We only report that we've read data up to i_size. + * Revert iter to a state corresponding to that as some callers (such + * as the splice code) rely on it. 
+ */ + if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size) + iov_iter_revert(iter, iomi.pos - dio->i_size); + + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) { + wait_for_completion = true; + ret = 0; + } if (ret < 0) iomap_dio_set_error(dio, ret); diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index aab070df4a21..66cf267c68ae 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -8,13 +8,8 @@ #include <linux/iomap.h> #include <linux/fiemap.h> -struct fiemap_ctx { - struct fiemap_extent_info *fi; - struct iomap prev; -}; - static int iomap_to_fiemap(struct fiemap_extent_info *fi, - struct iomap *iomap, u32 flags) + const struct iomap *iomap, u32 flags) { switch (iomap->type) { case IOMAP_HOLE: @@ -43,24 +38,22 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, iomap->length, flags); } -static loff_t -iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, - struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, + struct fiemap_extent_info *fi, struct iomap *prev) { - struct fiemap_ctx *ctx = data; - loff_t ret = length; + int ret; - if (iomap->type == IOMAP_HOLE) - return length; + if (iter->iomap.type == IOMAP_HOLE) + return iomap_length(iter); - ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0); - ctx->prev = *iomap; + ret = iomap_to_fiemap(fi, prev, 0); + *prev = iter->iomap; switch (ret) { case 0: /* success */ - return length; + return iomap_length(iter); case 1: /* extent array full */ return 0; - default: + default: /* error */ return ret; } } @@ -68,73 +61,63 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, u64 start, u64 len, const 
struct iomap_ops *ops) { - struct fiemap_ctx ctx; - loff_t ret; - - memset(&ctx, 0, sizeof(ctx)); - ctx.fi = fi; - ctx.prev.type = IOMAP_HOLE; + struct iomap_iter iter = { + .inode = inode, + .pos = start, + .len = len, + .flags = IOMAP_REPORT, + }; + struct iomap prev = { + .type = IOMAP_HOLE, + }; + int ret; - ret = fiemap_prep(inode, fi, start, &len, 0); + ret = fiemap_prep(inode, fi, start, &iter.len, 0); if (ret) return ret; - while (len > 0) { - ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, - iomap_fiemap_actor); - /* inode with no (attribute) mapping will give ENOENT */ - if (ret == -ENOENT) - break; - if (ret < 0) - return ret; - if (ret == 0) - break; - - start += ret; - len -= ret; - } + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_fiemap_iter(&iter, fi, &prev); - if (ctx.prev.type != IOMAP_HOLE) { - ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST); + if (prev.type != IOMAP_HOLE) { + ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); if (ret < 0) return ret; } + /* inode with no (attribute) mapping will give ENOENT */ + if (ret < 0 && ret != -ENOENT) + return ret; return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); -static loff_t -iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) -{ - sector_t *bno = data, addr; - - if (iomap->type == IOMAP_MAPPED) { - addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; - *bno = addr; - } - return 0; -} - /* legacy ->bmap interface. 0 is the error return (!) 
*/ sector_t iomap_bmap(struct address_space *mapping, sector_t bno, const struct iomap_ops *ops) { - struct inode *inode = mapping->host; - loff_t pos = bno << inode->i_blkbits; - unsigned blocksize = i_blocksize(inode); + struct iomap_iter iter = { + .inode = mapping->host, + .pos = (loff_t)bno << mapping->host->i_blkbits, + .len = i_blocksize(mapping->host), + .flags = IOMAP_REPORT, + }; + const unsigned int blkshift = mapping->host->i_blkbits - SECTOR_SHIFT; int ret; if (filemap_write_and_wait(mapping)) return 0; bno = 0; - ret = iomap_apply(inode, pos, blocksize, 0, ops, &bno, - iomap_bmap_actor); + while ((ret = iomap_iter(&iter, ops)) > 0) { + if (iter.iomap.type == IOMAP_MAPPED) + bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; + /* leave iter.processed unset to abort loop */ + } if (ret) return 0; + return bno; } EXPORT_SYMBOL_GPL(iomap_bmap); diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c new file mode 100644 index 000000000000..a1c7592d2ade --- /dev/null +++ b/fs/iomap/iter.c @@ -0,0 +1,80 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Red Hat, Inc. + * Copyright (c) 2016-2021 Christoph Hellwig. 
+ */ +#include <linux/fs.h> +#include <linux/iomap.h> +#include "trace.h" + +static inline int iomap_iter_advance(struct iomap_iter *iter) +{ + /* handle the previous iteration (if any) */ + if (iter->iomap.length) { + if (iter->processed <= 0) + return iter->processed; + if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) + return -EIO; + iter->pos += iter->processed; + iter->len -= iter->processed; + if (!iter->len) + return 0; + } + + /* clear the state for the next iteration */ + iter->processed = 0; + memset(&iter->iomap, 0, sizeof(iter->iomap)); + memset(&iter->srcmap, 0, sizeof(iter->srcmap)); + return 1; +} + +static inline void iomap_iter_done(struct iomap_iter *iter) +{ + WARN_ON_ONCE(iter->iomap.offset > iter->pos); + WARN_ON_ONCE(iter->iomap.length == 0); + WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); + + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); + if (iter->srcmap.type != IOMAP_HOLE) + trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); +} + +/** + * iomap_iter - iterate over ranges in a file + * @iter: iteration structure + * @ops: iomap ops provided by the file system + * + * Iterate over filesystem-provided space mappings for the provided file range. + * + * This function handles cleanup of resources acquired for iteration when the + * filesystem indicates there are no more space mappings, which means that this + * function must be called in a loop that continues as long as it returns a + * positive value. If 0 or a negative value is returned, the caller must not + * return to the loop body. Within a loop body, there are two ways to break out + * of the loop body: leave @iter.processed unchanged, or set it to a negative + * errno. + */ +int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) +{ + int ret; + + if (iter->iomap.length && ops->iomap_end) { + ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), + iter->processed > 0 ? 
iter->processed : 0, + iter->flags, &iter->iomap); + if (ret < 0 && !iter->processed) + return ret; + } + + trace_iomap_iter(iter, ops, _RET_IP_); + ret = iomap_iter_advance(iter); + if (ret <= 0) + return ret; + + ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, + &iter->iomap, &iter->srcmap); + if (ret < 0) + return ret; + iomap_iter_done(iter); + return 1; +} diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index ce6fb810854f..a845c012b50c 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2017 Red Hat, Inc. - * Copyright (c) 2018 Christoph Hellwig. + * Copyright (c) 2018-2021 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -10,21 +10,20 @@ #include <linux/pagemap.h> #include <linux/pagevec.h> -static loff_t -iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, + loff_t *hole_pos) { - loff_t offset = start; + loff_t length = iomap_length(iter); - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_UNWRITTEN: - offset = mapping_seek_hole_data(inode->i_mapping, start, - start + length, SEEK_HOLE); - if (offset == start + length) + *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, + iter->pos, iter->pos + length, SEEK_HOLE); + if (*hole_pos == iter->pos + length) return length; - fallthrough; + return 0; case IOMAP_HOLE: - *(loff_t *)data = offset; + *hole_pos = iter->pos; return 0; default: return length; @@ -32,70 +31,73 @@ iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, } loff_t -iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) { loff_t size = i_size_read(inode); - loff_t ret; + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .flags = 
IOMAP_REPORT, + }; + int ret; /* Nothing to be found before or beyond the end of the file. */ - if (offset < 0 || offset >= size) + if (pos < 0 || pos >= size) return -ENXIO; - while (offset < size) { - ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT, - ops, &offset, iomap_seek_hole_actor); - if (ret < 0) - return ret; - if (ret == 0) - break; - offset += ret; - } - - return offset; + iter.len = size - pos; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_seek_hole_iter(&iter, &pos); + if (ret < 0) + return ret; + if (iter.len) /* found hole before EOF */ + return pos; + return size; } EXPORT_SYMBOL_GPL(iomap_seek_hole); -static loff_t -iomap_seek_data_actor(struct inode *inode, loff_t start, loff_t length, - void *data, struct iomap *iomap, struct iomap *srcmap) +static loff_t iomap_seek_data_iter(const struct iomap_iter *iter, + loff_t *hole_pos) { - loff_t offset = start; + loff_t length = iomap_length(iter); - switch (iomap->type) { + switch (iter->iomap.type) { case IOMAP_HOLE: return length; case IOMAP_UNWRITTEN: - offset = mapping_seek_hole_data(inode->i_mapping, start, - start + length, SEEK_DATA); - if (offset < 0) + *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, + iter->pos, iter->pos + length, SEEK_DATA); + if (*hole_pos < 0) return length; - fallthrough; + return 0; default: - *(loff_t *)data = offset; + *hole_pos = iter->pos; return 0; } } loff_t -iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops) +iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops) { loff_t size = i_size_read(inode); - loff_t ret; + struct iomap_iter iter = { + .inode = inode, + .pos = pos, + .flags = IOMAP_REPORT, + }; + int ret; /* Nothing to be found before or beyond the end of the file. 
*/ - if (offset < 0 || offset >= size) + if (pos < 0 || pos >= size) return -ENXIO; - while (offset < size) { - ret = iomap_apply(inode, offset, size - offset, IOMAP_REPORT, - ops, &offset, iomap_seek_data_actor); - if (ret < 0) - return ret; - if (ret == 0) - return offset; - offset += ret; - } - + iter.len = size - pos; + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_seek_data_iter(&iter, &pos); + if (ret < 0) + return ret; + if (iter.len) /* found data before EOF */ + return pos; /* We've reached the end of the file without finding data */ return -ENXIO; } diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index 6250ca6a1f85..5fc0ac36dee3 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -31,11 +31,16 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) { struct iomap *iomap = &isi->iomap; unsigned long nr_pages; + unsigned long max_pages; uint64_t first_ppage; uint64_t first_ppage_reported; uint64_t next_ppage; int error; + if (unlikely(isi->nr_pages >= isi->sis->max)) + return 0; + max_pages = isi->sis->max - isi->nr_pages; + /* * Round the start up and the end down so that the physical * extent aligns to a page boundary. @@ -48,6 +53,7 @@ static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; + nr_pages = min(nr_pages, max_pages); /* * Calculate how much swap space we're adding; the first page contains @@ -88,13 +94,9 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) * swap only cares about contiguous page-aligned physical extents and makes no * distinction between written and unwritten extents. 
*/ -static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, - loff_t count, void *data, struct iomap *iomap, - struct iomap *srcmap) +static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, + struct iomap *iomap, struct iomap_swapfile_info *isi) { - struct iomap_swapfile_info *isi = data; - int error; - switch (iomap->type) { case IOMAP_MAPPED: case IOMAP_UNWRITTEN: @@ -125,12 +127,12 @@ static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, isi->iomap.length += iomap->length; } else { /* Otherwise, add the retained iomap and store this one. */ - error = iomap_swapfile_add_extent(isi); + int error = iomap_swapfile_add_extent(isi); if (error) return error; memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); } - return count; + return iomap_length(iter); } /* @@ -141,16 +143,19 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, struct file *swap_file, sector_t *pagespan, const struct iomap_ops *ops) { + struct inode *inode = swap_file->f_mapping->host; + struct iomap_iter iter = { + .inode = inode, + .pos = 0, + .len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE), + .flags = IOMAP_REPORT, + }; struct iomap_swapfile_info isi = { .sis = sis, .lowest_ppage = (sector_t)-1ULL, .file = swap_file, }; - struct address_space *mapping = swap_file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos = 0; - loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); - loff_t ret; + int ret; /* * Persist all file mapping metadata so that we won't have any @@ -160,15 +165,10 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, if (ret) return ret; - while (len > 0) { - ret = iomap_apply(inode, pos, len, IOMAP_REPORT, - ops, &isi, iomap_swapfile_activate_actor); - if (ret <= 0) - return ret; - - pos += ret; - len -= ret; - } + while ((ret = iomap_iter(&iter, ops)) > 0) + iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi); + if (ret < 0) + return ret; if (isi.iomap.length) { ret = 
iomap_swapfile_add_extent(&isi); diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index fdc7ae388476..65e39785c284 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -1,9 +1,18 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* - * Copyright (c) 2009-2019 Christoph Hellwig + * Copyright (c) 2009-2021 Christoph Hellwig * - * NOTE: none of these tracepoints shall be consider a stable kernel ABI + * NOTE: none of these tracepoints shall be considered a stable kernel ABI * as they can change at any time. + * + * Current conventions for printing numbers measuring specific units: + * + * offset: byte offset into a subcomponent of a file operation + * pos: file offset, in bytes + * length: length of a file operation, in bytes + * ino: inode number + * + * Numbers describing space allocations should be formatted in hexadecimal. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM iomap @@ -42,14 +51,14 @@ DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, - TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), + TP_PROTO(struct inode *inode, loff_t off, u64 len), TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, size) - __field(unsigned long, offset) - __field(unsigned int, length) + __field(loff_t, offset) + __field(u64, length) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; @@ -58,8 +67,7 @@ DECLARE_EVENT_CLASS(iomap_range_class, __entry->offset = off; __entry->length = len; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx " - "length %x", + TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx length 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -69,7 +77,7 @@ DECLARE_EVENT_CLASS(iomap_range_class, #define DEFINE_RANGE_EVENT(name) \ DEFINE_EVENT(iomap_range_class, name, \ - TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\ + TP_PROTO(struct inode *inode, loff_t off, u64 len),\ 
TP_ARGS(inode, off, len)) DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_releasepage); @@ -122,8 +130,8 @@ DECLARE_EVENT_CLASS(iomap_class, __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), - TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld " - "length %llu type %s flags %s", + TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx " + "length 0x%llx type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), @@ -138,36 +146,32 @@ DECLARE_EVENT_CLASS(iomap_class, DEFINE_EVENT(iomap_class, name, \ TP_PROTO(struct inode *inode, struct iomap *iomap), \ TP_ARGS(inode, iomap)) -DEFINE_IOMAP_EVENT(iomap_apply_dstmap); -DEFINE_IOMAP_EVENT(iomap_apply_srcmap); +DEFINE_IOMAP_EVENT(iomap_iter_dstmap); +DEFINE_IOMAP_EVENT(iomap_iter_srcmap); -TRACE_EVENT(iomap_apply, - TP_PROTO(struct inode *inode, loff_t pos, loff_t length, - unsigned int flags, const void *ops, void *actor, - unsigned long caller), - TP_ARGS(inode, pos, length, flags, ops, actor, caller), +TRACE_EVENT(iomap_iter, + TP_PROTO(struct iomap_iter *iter, const void *ops, + unsigned long caller), + TP_ARGS(iter, ops, caller), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, pos) - __field(loff_t, length) + __field(u64, length) __field(unsigned int, flags) __field(const void *, ops) - __field(void *, actor) __field(unsigned long, caller) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->pos = pos; - __entry->length = length; - __entry->flags = flags; + __entry->dev = iter->inode->i_sb->s_dev; + __entry->ino = iter->inode->i_ino; + __entry->pos = iter->pos; + __entry->length = iomap_length(iter); + __entry->flags = iter->flags; __entry->ops = ops; - __entry->actor = actor; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) " - "ops %ps caller %pS actor %ps", 
+ TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, @@ -175,8 +179,7 @@ TRACE_EVENT(iomap_apply, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, - (void *)__entry->caller, - __entry->actor) + (void *)__entry->caller) ); #endif /* _IOMAP_TRACE_H */ diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index d47a0d96bf30..8ca3527189f8 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -179,8 +179,8 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) if (!jbd2_journal_has_csum_v2or3(j)) return 1; - tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize - - sizeof(struct jbd2_journal_block_tail)); + tail = (struct jbd2_journal_block_tail *)((char *)buf + + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); provided = tail->t_checksum; tail->t_checksum = 0; calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); @@ -196,7 +196,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) static int count_tags(journal_t *journal, struct buffer_head *bh) { char * tagp; - journal_block_tag_t * tag; + journal_block_tag_t tag; int nr = 0, size = journal->j_blocksize; int tag_bytes = journal_tag_bytes(journal); @@ -206,14 +206,14 @@ static int count_tags(journal_t *journal, struct buffer_head *bh) tagp = &bh->b_data[sizeof(journal_header_t)]; while ((tagp - bh->b_data + tag_bytes) <= size) { - tag = (journal_block_tag_t *) tagp; + memcpy(&tag, tagp, sizeof(tag)); nr++; tagp += tag_bytes; - if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) + if (!(tag.t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID))) tagp += 16; - if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) + if (tag.t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG)) break; } @@ -433,9 +433,9 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) } static int jbd2_block_tag_csum_verify(journal_t 
*j, journal_block_tag_t *tag, + journal_block_tag3_t *tag3, void *buf, __u32 sequence) { - journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; __u32 csum32; __be32 seq; @@ -496,7 +496,7 @@ static int do_one_pass(journal_t *journal, while (1) { int flags; char * tagp; - journal_block_tag_t * tag; + journal_block_tag_t tag; struct buffer_head * obh; struct buffer_head * nbh; @@ -613,8 +613,8 @@ static int do_one_pass(journal_t *journal, <= journal->j_blocksize - descr_csum_size) { unsigned long io_block; - tag = (journal_block_tag_t *) tagp; - flags = be16_to_cpu(tag->t_flags); + memcpy(&tag, tagp, sizeof(tag)); + flags = be16_to_cpu(tag.t_flags); io_block = next_log_block++; wrap(journal, next_log_block); @@ -632,7 +632,7 @@ static int do_one_pass(journal_t *journal, J_ASSERT(obh != NULL); blocknr = read_tag_block(journal, - tag); + &tag); /* If the block has been * revoked, then we're all done @@ -647,8 +647,8 @@ static int do_one_pass(journal_t *journal, /* Look for block corruption */ if (!jbd2_block_tag_csum_verify( - journal, tag, obh->b_data, - be32_to_cpu(tmp->h_sequence))) { + journal, &tag, (journal_block_tag3_t *)tagp, + obh->b_data, be32_to_cpu(tmp->h_sequence))) { brelse(obh); success = -EFSBADCRC; printk(KERN_ERR "JBD2: Invalid " @@ -760,7 +760,6 @@ static int do_one_pass(journal_t *journal, */ jbd_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n", next_commit_ID); - err = 0; brelse(bh); goto done; } @@ -897,7 +896,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, { jbd2_journal_revoke_header_t *header; int offset, max; - int csum_size = 0; + unsigned csum_size = 0; __u32 rcount; int record_len = 4; diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 8804e126805f..6a3caedd2285 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -223,9 +223,15 @@ static void sub_reserved_credits(journal_t *journal, int blocks) * with j_state_lock held for reading. 
Returns 0 if handle joined the running * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and * caller must retry. + * + * Note: because j_state_lock may be dropped depending on the return + * value, we need to fake out sparse so it doesn't complain about a + * locking imbalance. Callers of add_transaction_credits will need to + * make a similar accommodation. */ static int add_transaction_credits(journal_t *journal, int blocks, int rsv_blocks) +__must_hold(&journal->j_state_lock) { transaction_t *t = journal->j_running_transaction; int needed; @@ -238,6 +244,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, if (t->t_state != T_RUNNING) { WARN_ON_ONCE(t->t_state >= T_FLUSH); wait_transaction_locked(journal); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -266,10 +273,12 @@ static int add_transaction_credits(journal_t *journal, int blocks, wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + total <= journal->j_max_transaction_buffers); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } wait_transaction_locked(journal); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -293,6 +302,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, journal->j_max_transaction_buffers) __jbd2_log_wait_for_space(journal); write_unlock(&journal->j_state_lock); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } @@ -310,6 +320,7 @@ static int add_transaction_credits(journal_t *journal, int blocks, wait_event(journal->j_wait_reserved, atomic_read(&journal->j_reserved_credits) + rsv_blocks <= journal->j_max_transaction_buffers / 2); + __acquire(&journal->j_state_lock); /* fake out sparse */ return 1; } return 0; @@ -413,8 +424,14 @@ repeat: if (!handle->h_reserved) { /* We may have dropped j_state_lock - restart in that case */ - if (add_transaction_credits(journal, blocks, rsv_blocks)) + if 
(add_transaction_credits(journal, blocks, rsv_blocks)) { + /* + * add_transaction_credits releases + * j_state_lock on a non-zero return + */ + __release(&journal->j_state_lock); goto repeat; + } } else { /* * We have handle reserved so we are allowed to join T_LOCKED @@ -1404,7 +1421,7 @@ void jbd2_journal_set_triggers(struct buffer_head *bh, { struct journal_head *jh = jbd2_journal_grab_journal_head(bh); - if (WARN_ON(!jh)) + if (WARN_ON_ONCE(!jh)) return; jh->b_triggers = type; jbd2_journal_put_journal_head(jh); diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 55a79df70d24..e945e3484788 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -173,12 +173,15 @@ static void *jffs2_acl_to_medium(const struct posix_acl *acl, size_t *size) return ERR_PTR(-EINVAL); } -struct posix_acl *jffs2_get_acl(struct inode *inode, int type) +struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu) { struct posix_acl *acl; char *value = NULL; int rc, xprefix; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: xprefix = JFFS2_XPREFIX_ACL_ACCESS; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 62c50da9d493..9d9fb7cf093e 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -27,7 +27,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -struct posix_acl *jffs2_get_acl(struct inode *inode, int type); +struct posix_acl *jffs2_get_acl(struct inode *inode, int type, bool rcu); int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 43c285c3d2a7..a653f34c6e26 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -14,13 +14,16 @@ #include "jfs_xattr.h" #include "jfs_acl.h" -struct posix_acl *jfs_get_acl(struct inode *inode, int type) +struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu) { struct posix_acl *acl; char *ea_name; int size; char *value = NULL; 
+ if (rcu) + return ERR_PTR(-ECHILD); + switch(type) { case ACL_TYPE_ACCESS: ea_name = XATTR_NAME_POSIX_ACL_ACCESS; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 7ae389a7a366..3de40286d31f 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -7,7 +7,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -struct posix_acl *jfs_get_acl(struct inode *inode, int type); +struct posix_acl *jfs_get_acl(struct inode *inode, int type, bool rcu); int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 33166ec90a11..ba581429bf7b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,7 @@ #include "kernfs-internal.h" -DEFINE_MUTEX(kernfs_mutex); +DECLARE_RWSEM(kernfs_rwsem); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ @@ -26,7 +26,7 @@ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ static bool kernfs_active(struct kernfs_node *kn) { - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held(&kernfs_rwsem); return atomic_read(&kn->active) >= 0; } @@ -340,7 +340,7 @@ static int kernfs_sd_compare(const struct kernfs_node *left, * @kn->parent->dir.children. * * Locking: - * mutex_lock(kernfs_mutex) + * kernfs_rwsem held exclusive * * RETURNS: * 0 on susccess -EEXIST on failure. @@ -372,6 +372,7 @@ static int kernfs_link_sibling(struct kernfs_node *kn) /* successfully added, account subdir number */ if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; + kernfs_inc_rev(kn->parent); return 0; } @@ -385,7 +386,7 @@ static int kernfs_link_sibling(struct kernfs_node *kn) * removed, %false if @kn wasn't on the rbtree. 
* * Locking: - * mutex_lock(kernfs_mutex) + * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { @@ -394,6 +395,7 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; + kernfs_inc_rev(kn->parent); rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); @@ -455,14 +457,14 @@ void kernfs_put_active(struct kernfs_node *kn) * return after draining is complete. */ static void kernfs_drain(struct kernfs_node *kn) - __releases(&kernfs_mutex) __acquires(&kernfs_mutex) + __releases(&kernfs_rwsem) __acquires(&kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); @@ -481,7 +483,7 @@ static void kernfs_drain(struct kernfs_node *kn) kernfs_drain_open_files(kn); - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); } /** @@ -720,7 +722,7 @@ int kernfs_add_one(struct kernfs_node *kn) bool has_ns; int ret; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); @@ -751,7 +753,7 @@ int kernfs_add_one(struct kernfs_node *kn) ps_iattr->ia_mtime = ps_iattr->ia_ctime; } - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. 
@@ -765,7 +767,7 @@ int kernfs_add_one(struct kernfs_node *kn) return 0; out_unlock: - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return ret; } @@ -786,7 +788,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held(&kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", @@ -818,7 +820,7 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, size_t len; char *p, *name; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_read(&kernfs_rwsem); /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */ spin_lock_irq(&kernfs_rename_lock); @@ -858,10 +860,10 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, { struct kernfs_node *kn; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return kn; } @@ -882,10 +884,10 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, { struct kernfs_node *kn; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return kn; } @@ -1037,12 +1039,34 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - /* Always perform fresh lookup for negatives */ - if (d_really_is_negative(dentry)) - goto out_bad_unlocked; + /* Negative hashed dentry? */ + if (d_really_is_negative(dentry)) { + struct kernfs_node *parent; + + /* If the kernfs parent node has changed discard and + * proceed to ->lookup. 
+ */ + down_read(&kernfs_rwsem); + spin_lock(&dentry->d_lock); + parent = kernfs_dentry_node(dentry->d_parent); + if (parent) { + if (kernfs_dir_changed(parent, dentry)) { + spin_unlock(&dentry->d_lock); + up_read(&kernfs_rwsem); + return 0; + } + } + spin_unlock(&dentry->d_lock); + up_read(&kernfs_rwsem); + + /* The kernfs parent node hasn't changed, leave the + * dentry negative and return success. + */ + return 1; + } kn = kernfs_dentry_node(dentry); - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) @@ -1061,11 +1085,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return 1; out_bad: - mutex_unlock(&kernfs_mutex); -out_bad_unlocked: + up_read(&kernfs_rwsem); return 0; } @@ -1077,37 +1100,29 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { - struct dentry *ret; struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; - struct inode *inode; + struct inode *inode = NULL; const void *ns = NULL; - mutex_lock(&kernfs_mutex); - + down_read(&kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); - - /* no such entry */ - if (!kn || !kernfs_active(kn)) { - ret = NULL; - goto out_unlock; - } - /* attach dentry and inode */ - inode = kernfs_get_inode(dir->i_sb, kn); - if (!inode) { - ret = ERR_PTR(-ENOMEM); - goto out_unlock; + if (kn && kernfs_active(kn)) { + inode = kernfs_get_inode(dir->i_sb, kn); + if (!inode) + inode = ERR_PTR(-ENOMEM); } + /* Needed only for negative dentry validation */ + if (!inode) + kernfs_set_rev(parent, dentry); + up_read(&kernfs_rwsem); - /* instantiate and hash dentry */ - ret = d_splice_alias(inode, dentry); - out_unlock: - mutex_unlock(&kernfs_mutex); - return ret; + /* 
instantiate and hash (possibly negative) dentry */ + return d_splice_alias(inode, dentry); } static int kernfs_iop_mkdir(struct user_namespace *mnt_userns, @@ -1227,7 +1242,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, { struct rb_node *rbn; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) @@ -1263,7 +1278,7 @@ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { @@ -1277,14 +1292,14 @@ void kernfs_activate(struct kernfs_node *kn) pos->flags |= KERNFS_ACTIVATED; } - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; - lockdep_assert_held(&kernfs_mutex); + lockdep_assert_held_write(&kernfs_rwsem); /* * Short-circuit if non-root @kn has already finished removal. @@ -1307,7 +1322,7 @@ static void __kernfs_remove(struct kernfs_node *kn) pos = kernfs_leftmost_descendant(kn); /* - * kernfs_drain() drops kernfs_mutex temporarily and @pos's + * kernfs_drain() drops kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. @@ -1354,9 +1369,9 @@ static void __kernfs_remove(struct kernfs_node *kn) */ void kernfs_remove(struct kernfs_node *kn) { - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); __kernfs_remove(kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); } /** @@ -1443,17 +1458,17 @@ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. 
When the removal * is complete, SUICIDED is set and the active ref is restored - * while holding kernfs_mutex. The ones which lost arbitration - * waits for SUICDED && drained which can happen only after the - * enclosing kernfs operation which executed the winning instance - * of kernfs_remove_self() finished. + * while kernfs_rwsem is held exclusive. The ones which lost + * arbitration wait for SUICIDED && drained which can happen only + * after the enclosing kernfs operation which executed the winning + * instance of kernfs_remove_self() finished. */ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; @@ -1471,9 +1486,9 @@ bool kernfs_remove_self(struct kernfs_node *kn) atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); schedule(); - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); @@ -1481,12 +1496,12 @@ bool kernfs_remove_self(struct kernfs_node *kn) } /* - * This must be done while holding kernfs_mutex; otherwise, waiting - * for SUICIDED && deactivated could finish prematurely. + * This must be done while kernfs_rwsem is held exclusive; otherwise, + * waiting for SUICIDED && deactivated could finish prematurely. 
*/ kernfs_unbreak_active_protection(kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return ret; } @@ -1510,13 +1525,13 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, return -ENOENT; } - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) __kernfs_remove(kn); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); if (kn) return 0; @@ -1542,7 +1557,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, if (!kn->parent) return -EINVAL; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || @@ -1596,7 +1611,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, error = 0; out: - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return error; } @@ -1671,7 +1686,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; @@ -1688,12 +1703,12 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) file->private_data = pos; kernfs_get(pos); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); } - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index c75719312147..60e2a86c535e 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -860,7 +860,7 @@ repeat: spin_unlock_irq(&kernfs_notify_lock); /* kick fsnotify */ - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; @@ -898,7 +898,7 @@ repeat: iput(inode); } - 
mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); kernfs_put(kn); goto repeat; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 26f2aa3586f9..c0eae1725435 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -100,9 +100,9 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); ret = __kernfs_setattr(kn, iattr); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return ret; } @@ -116,7 +116,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (!kn) return -EINVAL; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) goto out; @@ -129,7 +129,7 @@ int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, setattr_copy(&init_user_ns, inode, iattr); out: - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); return error; } @@ -185,11 +185,13 @@ int kernfs_iop_getattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); + spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); - mutex_unlock(&kernfs_mutex); - generic_fillattr(&init_user_ns, inode, stat); + spin_unlock(&inode->i_lock); + up_read(&kernfs_rwsem); + return 0; } @@ -272,17 +274,21 @@ int kernfs_iop_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { struct kernfs_node *kn; + int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); + spin_lock(&inode->i_lock); kernfs_refresh_inode(kn, inode); - mutex_unlock(&kernfs_mutex); + ret = generic_permission(&init_user_ns, inode, mask); + spin_unlock(&inode->i_lock); + up_read(&kernfs_rwsem); - return generic_permission(&init_user_ns, inode, mask); + return ret; } int kernfs_xattr_get(struct 
kernfs_node *kn, const char *name, diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index ccc3b44f6306..f9cc912c31e1 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -13,6 +13,7 @@ #include <linux/lockdep.h> #include <linux/fs.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/xattr.h> #include <linux/kernfs.h> @@ -69,7 +70,7 @@ struct kernfs_super_info { */ const void *ns; - /* anchored at kernfs_root->supers, protected by kernfs_mutex */ + /* anchored at kernfs_root->supers, protected by kernfs_rwsem */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) @@ -81,6 +82,25 @@ static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) return d_inode(dentry)->i_private; } +static inline void kernfs_set_rev(struct kernfs_node *parent, + struct dentry *dentry) +{ + dentry->d_time = parent->dir.rev; +} + +static inline void kernfs_inc_rev(struct kernfs_node *parent) +{ + parent->dir.rev++; +} + +static inline bool kernfs_dir_changed(struct kernfs_node *parent, + struct dentry *dentry) +{ + if (parent->dir.rev != dentry->d_time) + return true; + return false; +} + extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; @@ -102,7 +122,7 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ -extern struct mutex kernfs_mutex; +extern struct rw_semaphore kernfs_rwsem; extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 9dc7e7a64e10..f2f909d09f52 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -255,9 +255,9 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_shrink.seeks = 0; /* get root inode, initialize and unlock it */ - 
mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); inode = kernfs_get_inode(sb, info->root->kn); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); if (!inode) { pr_debug("kernfs: could not get root inode\n"); return -ENOMEM; @@ -344,9 +344,9 @@ int kernfs_get_tree(struct fs_context *fc) } sb->s_flags |= SB_ACTIVE; - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); list_add(&info->node, &info->root->supers); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); } fc->root = dget(sb->s_root); @@ -372,9 +372,9 @@ void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); - mutex_lock(&kernfs_mutex); + down_write(&kernfs_rwsem); list_del(&info->node); - mutex_unlock(&kernfs_mutex); + up_write(&kernfs_rwsem); /* * Remove the superblock from fs_supers/s_instances diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 5432883d819f..c8f8e41b8411 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -116,9 +116,9 @@ static int kernfs_getlink(struct inode *inode, char *path) struct kernfs_node *target = kn->symlink.target_kn; int error; - mutex_lock(&kernfs_mutex); + down_read(&kernfs_rwsem); error = kernfs_get_target_path(parent, target, path); - mutex_unlock(&kernfs_mutex); + up_read(&kernfs_rwsem); return error; } diff --git a/fs/ksmbd/ndr.c b/fs/ksmbd/ndr.c index 2243a2c64b37..8317f7ca402b 100644 --- a/fs/ksmbd/ndr.c +++ b/fs/ksmbd/ndr.c @@ -28,37 +28,60 @@ static int try_to_realloc_ndr_blob(struct ndr *n, size_t sz) return 0; } -static void ndr_write_int16(struct ndr *n, __u16 value) +static int ndr_write_int16(struct ndr *n, __u16 value) { - if (n->length <= n->offset + sizeof(value)) - try_to_realloc_ndr_blob(n, sizeof(value)); + if (n->length <= n->offset + sizeof(value)) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sizeof(value)); + if (ret) + return ret; + } *(__le16 *)ndr_get_field(n) = cpu_to_le16(value); n->offset += sizeof(value); + return 0; } -static void ndr_write_int32(struct ndr 
*n, __u32 value) +static int ndr_write_int32(struct ndr *n, __u32 value) { - if (n->length <= n->offset + sizeof(value)) - try_to_realloc_ndr_blob(n, sizeof(value)); + if (n->length <= n->offset + sizeof(value)) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sizeof(value)); + if (ret) + return ret; + } *(__le32 *)ndr_get_field(n) = cpu_to_le32(value); n->offset += sizeof(value); + return 0; } -static void ndr_write_int64(struct ndr *n, __u64 value) +static int ndr_write_int64(struct ndr *n, __u64 value) { - if (n->length <= n->offset + sizeof(value)) - try_to_realloc_ndr_blob(n, sizeof(value)); + if (n->length <= n->offset + sizeof(value)) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sizeof(value)); + if (ret) + return ret; + } *(__le64 *)ndr_get_field(n) = cpu_to_le64(value); n->offset += sizeof(value); + return 0; } static int ndr_write_bytes(struct ndr *n, void *value, size_t sz) { - if (n->length <= n->offset + sz) - try_to_realloc_ndr_blob(n, sz); + if (n->length <= n->offset + sz) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sz); + if (ret) + return ret; + } memcpy(ndr_get_field(n), value, sz); n->offset += sz; @@ -70,8 +93,13 @@ static int ndr_write_string(struct ndr *n, char *value) size_t sz; sz = strlen(value) + 1; - if (n->length <= n->offset + sz) - try_to_realloc_ndr_blob(n, sz); + if (n->length <= n->offset + sz) { + int ret; + + ret = try_to_realloc_ndr_blob(n, sz); + if (ret) + return ret; + } memcpy(ndr_get_field(n), value, sz); n->offset += sz; @@ -81,9 +109,14 @@ static int ndr_write_string(struct ndr *n, char *value) static int ndr_read_string(struct ndr *n, void *value, size_t sz) { - int len = strnlen(ndr_get_field(n), sz); + int len; - memcpy(value, ndr_get_field(n), len); + if (n->offset + sz > n->length) + return -EINVAL; + + len = strnlen(ndr_get_field(n), sz); + if (value) + memcpy(value, ndr_get_field(n), len); len++; n->offset += len; n->offset = ALIGN(n->offset, 2); @@ -92,41 +125,52 @@ static int ndr_read_string(struct 
ndr *n, void *value, size_t sz) static int ndr_read_bytes(struct ndr *n, void *value, size_t sz) { - memcpy(value, ndr_get_field(n), sz); + if (n->offset + sz > n->length) + return -EINVAL; + + if (value) + memcpy(value, ndr_get_field(n), sz); n->offset += sz; return 0; } -static __u16 ndr_read_int16(struct ndr *n) +static int ndr_read_int16(struct ndr *n, __u16 *value) { - __u16 ret; + if (n->offset + sizeof(__u16) > n->length) + return -EINVAL; - ret = le16_to_cpu(*(__le16 *)ndr_get_field(n)); + if (value) + *value = le16_to_cpu(*(__le16 *)ndr_get_field(n)); n->offset += sizeof(__u16); - return ret; + return 0; } -static __u32 ndr_read_int32(struct ndr *n) +static int ndr_read_int32(struct ndr *n, __u32 *value) { - __u32 ret; + if (n->offset + sizeof(__u32) > n->length) + return -EINVAL; - ret = le32_to_cpu(*(__le32 *)ndr_get_field(n)); + if (value) + *value = le32_to_cpu(*(__le32 *)ndr_get_field(n)); n->offset += sizeof(__u32); - return ret; + return 0; } -static __u64 ndr_read_int64(struct ndr *n) +static int ndr_read_int64(struct ndr *n, __u64 *value) { - __u64 ret; + if (n->offset + sizeof(__u64) > n->length) + return -EINVAL; - ret = le64_to_cpu(*(__le64 *)ndr_get_field(n)); + if (value) + *value = le64_to_cpu(*(__le64 *)ndr_get_field(n)); n->offset += sizeof(__u64); - return ret; + return 0; } int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) { char hex_attr[12] = {0}; + int ret; n->offset = 0; n->length = 1024; @@ -136,97 +180,161 @@ int ndr_encode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) if (da->version == 3) { snprintf(hex_attr, 10, "0x%x", da->attr); - ndr_write_string(n, hex_attr); + ret = ndr_write_string(n, hex_attr); } else { - ndr_write_string(n, ""); + ret = ndr_write_string(n, ""); } - ndr_write_int16(n, da->version); - ndr_write_int32(n, da->version); + if (ret) + return ret; + + ret = ndr_write_int16(n, da->version); + if (ret) + return ret; + + ret = ndr_write_int32(n, da->version); + if (ret) + return ret; + + ret = 
ndr_write_int32(n, da->flags); + if (ret) + return ret; + + ret = ndr_write_int32(n, da->attr); + if (ret) + return ret; - ndr_write_int32(n, da->flags); - ndr_write_int32(n, da->attr); if (da->version == 3) { - ndr_write_int32(n, da->ea_size); - ndr_write_int64(n, da->size); - ndr_write_int64(n, da->alloc_size); + ret = ndr_write_int32(n, da->ea_size); + if (ret) + return ret; + ret = ndr_write_int64(n, da->size); + if (ret) + return ret; + ret = ndr_write_int64(n, da->alloc_size); } else { - ndr_write_int64(n, da->itime); + ret = ndr_write_int64(n, da->itime); } - ndr_write_int64(n, da->create_time); + if (ret) + return ret; + + ret = ndr_write_int64(n, da->create_time); + if (ret) + return ret; + if (da->version == 3) - ndr_write_int64(n, da->change_time); - return 0; + ret = ndr_write_int64(n, da->change_time); + return ret; } int ndr_decode_dos_attr(struct ndr *n, struct xattr_dos_attrib *da) { - char *hex_attr; - int version2; - - hex_attr = kzalloc(n->length, GFP_KERNEL); - if (!hex_attr) - return -ENOMEM; + char hex_attr[12]; + unsigned int version2; + int ret; n->offset = 0; - ndr_read_string(n, hex_attr, n->length); - kfree(hex_attr); - da->version = ndr_read_int16(n); + ret = ndr_read_string(n, hex_attr, sizeof(hex_attr)); + if (ret) + return ret; + + ret = ndr_read_int16(n, &da->version); + if (ret) + return ret; if (da->version != 3 && da->version != 4) { pr_err("v%d version is not supported\n", da->version); return -EINVAL; } - version2 = ndr_read_int32(n); + ret = ndr_read_int32(n, &version2); + if (ret) + return ret; + if (da->version != version2) { pr_err("ndr version mismatched(version: %d, version2: %d)\n", da->version, version2); return -EINVAL; } - ndr_read_int32(n); - da->attr = ndr_read_int32(n); + ret = ndr_read_int32(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int32(n, &da->attr); + if (ret) + return ret; + if (da->version == 4) { - da->itime = ndr_read_int64(n); - da->create_time = ndr_read_int64(n); + ret = ndr_read_int64(n, 
&da->itime); + if (ret) + return ret; + + ret = ndr_read_int64(n, &da->create_time); } else { - ndr_read_int32(n); - ndr_read_int64(n); - ndr_read_int64(n); - da->create_time = ndr_read_int64(n); - ndr_read_int64(n); + ret = ndr_read_int32(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int64(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int64(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int64(n, &da->create_time); + if (ret) + return ret; + + ret = ndr_read_int64(n, NULL); } - return 0; + return ret; } static int ndr_encode_posix_acl_entry(struct ndr *n, struct xattr_smb_acl *acl) { - int i; + int i, ret; + + ret = ndr_write_int32(n, acl->count); + if (ret) + return ret; - ndr_write_int32(n, acl->count); n->offset = ALIGN(n->offset, 8); - ndr_write_int32(n, acl->count); - ndr_write_int32(n, 0); + ret = ndr_write_int32(n, acl->count); + if (ret) + return ret; + + ret = ndr_write_int32(n, 0); + if (ret) + return ret; for (i = 0; i < acl->count; i++) { n->offset = ALIGN(n->offset, 8); - ndr_write_int16(n, acl->entries[i].type); - ndr_write_int16(n, acl->entries[i].type); + ret = ndr_write_int16(n, acl->entries[i].type); + if (ret) + return ret; + + ret = ndr_write_int16(n, acl->entries[i].type); + if (ret) + return ret; if (acl->entries[i].type == SMB_ACL_USER) { n->offset = ALIGN(n->offset, 8); - ndr_write_int64(n, acl->entries[i].uid); + ret = ndr_write_int64(n, acl->entries[i].uid); } else if (acl->entries[i].type == SMB_ACL_GROUP) { n->offset = ALIGN(n->offset, 8); - ndr_write_int64(n, acl->entries[i].gid); + ret = ndr_write_int64(n, acl->entries[i].gid); } + if (ret) + return ret; /* push permission */ - ndr_write_int32(n, acl->entries[i].perm); + ret = ndr_write_int32(n, acl->entries[i].perm); } - return 0; + return ret; } int ndr_encode_posix_acl(struct ndr *n, @@ -235,7 +343,8 @@ int ndr_encode_posix_acl(struct ndr *n, struct xattr_smb_acl *acl, struct xattr_smb_acl *def_acl) { - int ref_id = 0x00020000; + unsigned int ref_id = 
0x00020000; + int ret; n->offset = 0; n->length = 1024; @@ -245,35 +354,46 @@ int ndr_encode_posix_acl(struct ndr *n, if (acl) { /* ACL ACCESS */ - ndr_write_int32(n, ref_id); + ret = ndr_write_int32(n, ref_id); ref_id += 4; } else { - ndr_write_int32(n, 0); + ret = ndr_write_int32(n, 0); } + if (ret) + return ret; if (def_acl) { /* DEFAULT ACL ACCESS */ - ndr_write_int32(n, ref_id); + ret = ndr_write_int32(n, ref_id); ref_id += 4; } else { - ndr_write_int32(n, 0); + ret = ndr_write_int32(n, 0); } - - ndr_write_int64(n, from_kuid(user_ns, inode->i_uid)); - ndr_write_int64(n, from_kgid(user_ns, inode->i_gid)); - ndr_write_int32(n, inode->i_mode); + if (ret) + return ret; + + ret = ndr_write_int64(n, from_kuid(&init_user_ns, i_uid_into_mnt(user_ns, inode))); + if (ret) + return ret; + ret = ndr_write_int64(n, from_kgid(&init_user_ns, i_gid_into_mnt(user_ns, inode))); + if (ret) + return ret; + ret = ndr_write_int32(n, inode->i_mode); + if (ret) + return ret; if (acl) { - ndr_encode_posix_acl_entry(n, acl); - if (def_acl) - ndr_encode_posix_acl_entry(n, def_acl); + ret = ndr_encode_posix_acl_entry(n, acl); + if (def_acl && !ret) + ret = ndr_encode_posix_acl_entry(n, def_acl); } - return 0; + return ret; } int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) { - int ref_id = 0x00020004; + unsigned int ref_id = 0x00020004; + int ret; n->offset = 0; n->length = 2048; @@ -281,36 +401,65 @@ int ndr_encode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) if (!n->data) return -ENOMEM; - ndr_write_int16(n, acl->version); - ndr_write_int32(n, acl->version); - ndr_write_int16(n, 2); - ndr_write_int32(n, ref_id); + ret = ndr_write_int16(n, acl->version); + if (ret) + return ret; + + ret = ndr_write_int32(n, acl->version); + if (ret) + return ret; + + ret = ndr_write_int16(n, 2); + if (ret) + return ret; + + ret = ndr_write_int32(n, ref_id); + if (ret) + return ret; /* push hash type and hash 64bytes */ - ndr_write_int16(n, acl->hash_type); - ndr_write_bytes(n, 
acl->hash, XATTR_SD_HASH_SIZE); - ndr_write_bytes(n, acl->desc, acl->desc_len); - ndr_write_int64(n, acl->current_time); - ndr_write_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + ret = ndr_write_int16(n, acl->hash_type); + if (ret) + return ret; - /* push ndr for security descriptor */ - ndr_write_bytes(n, acl->sd_buf, acl->sd_size); + ret = ndr_write_bytes(n, acl->hash, XATTR_SD_HASH_SIZE); + if (ret) + return ret; - return 0; + ret = ndr_write_bytes(n, acl->desc, acl->desc_len); + if (ret) + return ret; + + ret = ndr_write_int64(n, acl->current_time); + if (ret) + return ret; + + ret = ndr_write_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + if (ret) + return ret; + + /* push ndr for security descriptor */ + ret = ndr_write_bytes(n, acl->sd_buf, acl->sd_size); + return ret; } int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) { - int version2; + unsigned int version2; + int ret; n->offset = 0; - acl->version = ndr_read_int16(n); + ret = ndr_read_int16(n, &acl->version); + if (ret) + return ret; if (acl->version != 4) { pr_err("v%d version is not supported\n", acl->version); return -EINVAL; } - version2 = ndr_read_int32(n); + ret = ndr_read_int32(n, &version2); + if (ret) + return ret; if (acl->version != version2) { pr_err("ndr version mismatched(version: %d, version2: %d)\n", acl->version, version2); @@ -318,11 +467,22 @@ int ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) } /* Read Level */ - ndr_read_int16(n); + ret = ndr_read_int16(n, NULL); + if (ret) + return ret; + /* Read Ref Id */ - ndr_read_int32(n); - acl->hash_type = ndr_read_int16(n); - ndr_read_bytes(n, acl->hash, XATTR_SD_HASH_SIZE); + ret = ndr_read_int32(n, NULL); + if (ret) + return ret; + + ret = ndr_read_int16(n, &acl->hash_type); + if (ret) + return ret; + + ret = ndr_read_bytes(n, acl->hash, XATTR_SD_HASH_SIZE); + if (ret) + return ret; ndr_read_bytes(n, acl->desc, 10); if (strncmp(acl->desc, "posix_acl", 9)) { @@ -331,15 +491,20 @@ int 
ndr_decode_v4_ntacl(struct ndr *n, struct xattr_ntacl *acl) } /* Read Time */ - ndr_read_int64(n); + ret = ndr_read_int64(n, NULL); + if (ret) + return ret; + /* Read Posix ACL hash */ - ndr_read_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + ret = ndr_read_bytes(n, acl->posix_acl_hash, XATTR_SD_HASH_SIZE); + if (ret) + return ret; + acl->sd_size = n->length - n->offset; acl->sd_buf = kzalloc(acl->sd_size, GFP_KERNEL); if (!acl->sd_buf) return -ENOMEM; - ndr_read_bytes(n, acl->sd_buf, acl->sd_size); - - return 0; + ret = ndr_read_bytes(n, acl->sd_buf, acl->sd_size); + return ret; } diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 6ace6c2f22dc..16b6236d1bd2 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1614,9 +1614,11 @@ void create_posix_rsp_buf(char *cc, struct ksmbd_file *fp) buf->nlink = cpu_to_le32(inode->i_nlink); buf->reparse_tag = cpu_to_le32(fp->volatile_id); buf->mode = cpu_to_le32(inode->i_mode); - id_to_sid(from_kuid(user_ns, inode->i_uid), + id_to_sid(from_kuid_munged(&init_user_ns, + i_uid_into_mnt(user_ns, inode)), SIDNFS_USER, (struct smb_sid *)&buf->SidBuffer[0]); - id_to_sid(from_kgid(user_ns, inode->i_gid), + id_to_sid(from_kgid_munged(&init_user_ns, + i_gid_into_mnt(user_ns, inode)), SIDNFS_GROUP, (struct smb_sid *)&buf->SidBuffer[20]); } diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index d329ea49fa14..c86164dc70bb 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -2381,10 +2381,12 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work, le32_to_cpu(sd_buf->ccontext.DataLength), true); } -static void ksmbd_acls_fattr(struct smb_fattr *fattr, struct inode *inode) +static void ksmbd_acls_fattr(struct smb_fattr *fattr, + struct user_namespace *mnt_userns, + struct inode *inode) { - fattr->cf_uid = inode->i_uid; - fattr->cf_gid = inode->i_gid; + fattr->cf_uid = i_uid_into_mnt(mnt_userns, inode); + fattr->cf_gid = i_gid_into_mnt(mnt_userns, inode); fattr->cf_mode = inode->i_mode; fattr->cf_acls = NULL; 
fattr->cf_dacls = NULL; @@ -2893,7 +2895,7 @@ int smb2_open(struct ksmbd_work *work) struct smb_ntsd *pntsd; int pntsd_size, ace_num = 0; - ksmbd_acls_fattr(&fattr, inode); + ksmbd_acls_fattr(&fattr, user_ns, inode); if (fattr.cf_acls) ace_num = fattr.cf_acls->a_count; if (fattr.cf_dacls) @@ -3324,7 +3326,6 @@ static int dentry_name(struct ksmbd_dir_info *d_info, int info_level) */ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, struct ksmbd_dir_info *d_info, - struct user_namespace *user_ns, struct ksmbd_kstat *ksmbd_kstat) { int next_entry_offset = 0; @@ -3478,9 +3479,9 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, S_ISDIR(ksmbd_kstat->kstat->mode) ? ATTR_DIRECTORY_LE : ATTR_ARCHIVE_LE; if (d_info->hide_dot_file && d_info->name[0] == '.') posix_info->DosAttributes |= ATTR_HIDDEN_LE; - id_to_sid(from_kuid(user_ns, ksmbd_kstat->kstat->uid), + id_to_sid(from_kuid_munged(&init_user_ns, ksmbd_kstat->kstat->uid), SIDNFS_USER, (struct smb_sid *)&posix_info->SidBuffer[0]); - id_to_sid(from_kgid(user_ns, ksmbd_kstat->kstat->gid), + id_to_sid(from_kgid_munged(&init_user_ns, ksmbd_kstat->kstat->gid), SIDNFS_GROUP, (struct smb_sid *)&posix_info->SidBuffer[20]); memcpy(posix_info->name, conv_name, conv_len); posix_info->name_len = cpu_to_le32(conv_len); @@ -3543,9 +3544,9 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) return -EINVAL; lock_dir(priv->dir_fp); - dent = lookup_one_len(priv->d_info->name, - priv->dir_fp->filp->f_path.dentry, - priv->d_info->name_len); + dent = lookup_one(user_ns, priv->d_info->name, + priv->dir_fp->filp->f_path.dentry, + priv->d_info->name_len); unlock_dir(priv->dir_fp); if (IS_ERR(dent)) { @@ -3571,7 +3572,6 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) rc = smb2_populate_readdir_entry(priv->work->conn, priv->info_level, priv->d_info, - user_ns, &ksmbd_kstat); dput(dent); if (rc) @@ -5008,7 +5008,7 @@ static int 
smb2_get_info_sec(struct ksmbd_work *work, user_ns = file_mnt_user_ns(fp->filp); inode = file_inode(fp->filp); - ksmbd_acls_fattr(&fattr, inode); + ksmbd_acls_fattr(&fattr, user_ns, inode); if (test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_ACL_XATTR)) @@ -5246,7 +5246,9 @@ int smb2_echo(struct ksmbd_work *work) return 0; } -static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp, +static int smb2_rename(struct ksmbd_work *work, + struct ksmbd_file *fp, + struct user_namespace *user_ns, struct smb2_file_rename_info *file_info, struct nls_table *local_nls) { @@ -5310,7 +5312,7 @@ static int smb2_rename(struct ksmbd_work *work, struct ksmbd_file *fp, if (rc) goto out; - rc = ksmbd_vfs_setxattr(file_mnt_user_ns(fp->filp), + rc = ksmbd_vfs_setxattr(user_ns, fp->filp->f_path.dentry, xattr_stream_name, NULL, 0, 0); @@ -5438,11 +5440,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, { struct smb2_file_all_info *file_info; struct iattr attrs; - struct iattr temp_attrs; + struct timespec64 ctime; struct file *filp; struct inode *inode; struct user_namespace *user_ns; - int rc; + int rc = 0; if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE)) return -EACCES; @@ -5462,11 +5464,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, } if (file_info->ChangeTime) { - temp_attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); - attrs.ia_ctime = temp_attrs.ia_ctime; + attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); + ctime = attrs.ia_ctime; attrs.ia_valid |= ATTR_CTIME; } else { - temp_attrs.ia_ctime = inode->i_ctime; + ctime = inode->i_ctime; } if (file_info->LastWriteTime) { @@ -5505,13 +5507,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, rc = 0; } - /* - * HACK : set ctime here to avoid ctime changed - * when file_info->ChangeTime is zero. 
- */ - attrs.ia_ctime = temp_attrs.ia_ctime; - attrs.ia_valid |= ATTR_CTIME; - if (attrs.ia_valid) { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = d_inode(dentry); @@ -5519,17 +5514,15 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EACCES; - rc = setattr_prepare(user_ns, dentry, &attrs); - if (rc) - return -EINVAL; - inode_lock(inode); - setattr_copy(user_ns, inode, &attrs); - attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(user_ns, dentry, &attrs, NULL); + if (!rc) { + inode->i_ctime = ctime; + mark_inode_dirty(inode); + } inode_unlock(inode); } - return 0; + return rc; } static int set_file_allocation_info(struct ksmbd_work *work, @@ -5624,6 +5617,7 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, char *buf) { + struct user_namespace *user_ns; struct ksmbd_file *parent_fp; struct dentry *parent; struct dentry *dentry = fp->filp->f_path.dentry; @@ -5634,11 +5628,12 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, return -EACCES; } + user_ns = file_mnt_user_ns(fp->filp); if (ksmbd_stream_fd(fp)) goto next; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(parent, dentry); + ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); if (ret) { dput(parent); return ret; @@ -5655,7 +5650,7 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, } } next: - return smb2_rename(work, fp, + return smb2_rename(work, fp, user_ns, (struct smb2_file_rename_info *)buf, work->sess->conn->local_nls); } @@ -7116,8 +7111,8 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, netdev->ethtool_ops->get_link_ksettings(netdev, &cmd); speed = cmd.base.speed; } else { - pr_err("%s %s\n", netdev->name, - "speed is unknown, defaulting to 1Gb/sec"); + ksmbd_debug(SMB, "%s %s\n", netdev->name, + "speed is unknown, 
defaulting to 1Gb/sec"); speed = SPEED_1000; } diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index b108b918ec84..43d3123d8b62 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -291,7 +291,6 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, char *search_pattern, int (*fn)(struct ksmbd_conn *, int, struct ksmbd_dir_info *, - struct user_namespace *, struct ksmbd_kstat *)) { int i, rc = 0; @@ -322,8 +321,7 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, user_ns, dir->filp->f_path.dentry->d_parent, &ksmbd_kstat); - rc = fn(conn, info_level, d_info, - user_ns, &ksmbd_kstat); + rc = fn(conn, info_level, d_info, &ksmbd_kstat); if (rc) break; if (d_info->out_buf_len <= 0) diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index eb667d85558e..57c667c1be06 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -511,7 +511,6 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int (*fn)(struct ksmbd_conn *, int, struct ksmbd_dir_info *, - struct user_namespace *, struct ksmbd_kstat *)); int ksmbd_extract_shortname(struct ksmbd_conn *conn, diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 5456e3ad943e..0a95cdec8c80 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -274,24 +274,34 @@ static int sid_to_id(struct user_namespace *user_ns, uid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - if (id > 0) { - uid = make_kuid(user_ns, id); - if (uid_valid(uid) && kuid_has_mapping(user_ns, uid)) { - fattr->cf_uid = uid; - rc = 0; - } + /* + * Translate raw sid into kuid in the server's user + * namespace. + */ + uid = make_kuid(&init_user_ns, id); + + /* If this is an idmapped mount, apply the idmapping. 
*/ + uid = kuid_from_mnt(user_ns, uid); + if (uid_valid(uid)) { + fattr->cf_uid = uid; + rc = 0; } } else { kgid_t gid; gid_t id; id = le32_to_cpu(psid->sub_auth[psid->num_subauth - 1]); - if (id > 0) { - gid = make_kgid(user_ns, id); - if (gid_valid(gid) && kgid_has_mapping(user_ns, gid)) { - fattr->cf_gid = gid; - rc = 0; - } + /* + * Translate raw sid into kgid in the server's user + * namespace. + */ + gid = make_kgid(&init_user_ns, id); + + /* If this is an idmapped mount, apply the idmapping. */ + gid = kgid_from_mnt(user_ns, gid); + if (gid_valid(gid)) { + fattr->cf_gid = gid; + rc = 0; } } @@ -587,14 +597,14 @@ static void set_posix_acl_entries_dacl(struct user_namespace *user_ns, uid_t uid; unsigned int sid_type = SIDOWNER; - uid = from_kuid(user_ns, pace->e_uid); + uid = posix_acl_uid_translate(user_ns, pace); if (!uid) sid_type = SIDUNIX_USER; id_to_sid(uid, sid_type, sid); } else if (pace->e_tag == ACL_GROUP) { gid_t gid; - gid = from_kgid(user_ns, pace->e_gid); + gid = posix_acl_gid_translate(user_ns, pace); id_to_sid(gid, SIDUNIX_GROUP, sid); } else if (pace->e_tag == ACL_OTHER && !nt_aces_num) { smb_copy_sid(sid, &sid_everyone); @@ -653,12 +663,12 @@ posix_default_acl: if (pace->e_tag == ACL_USER) { uid_t uid; - uid = from_kuid(user_ns, pace->e_uid); + uid = posix_acl_uid_translate(user_ns, pace); id_to_sid(uid, SIDCREATOR_OWNER, sid); } else if (pace->e_tag == ACL_GROUP) { gid_t gid; - gid = from_kgid(user_ns, pace->e_gid); + gid = posix_acl_gid_translate(user_ns, pace); id_to_sid(gid, SIDCREATOR_GROUP, sid); } else { kfree(sid); @@ -723,7 +733,7 @@ static void set_mode_dacl(struct user_namespace *user_ns, } /* owner RID */ - uid = from_kuid(user_ns, fattr->cf_uid); + uid = from_kuid(&init_user_ns, fattr->cf_uid); if (uid) sid = &server_conf.domain_sid; else @@ -739,7 +749,7 @@ static void set_mode_dacl(struct user_namespace *user_ns, ace_size = fill_ace_for_sid(pace, &sid_unix_groups, ACCESS_ALLOWED, 0, fattr->cf_mode, 0070); 
pace->sid.sub_auth[pace->sid.num_subauth++] = - cpu_to_le32(from_kgid(user_ns, fattr->cf_gid)); + cpu_to_le32(from_kgid(&init_user_ns, fattr->cf_gid)); pace->size = cpu_to_le16(ace_size + 4); size += le16_to_cpu(pace->size); pace = (struct smb_ace *)((char *)pndace + size); @@ -880,7 +890,7 @@ int build_sec_desc(struct user_namespace *user_ns, if (!nowner_sid_ptr) return -ENOMEM; - uid = from_kuid(user_ns, fattr->cf_uid); + uid = from_kuid(&init_user_ns, fattr->cf_uid); if (!uid) sid_type = SIDUNIX_USER; id_to_sid(uid, sid_type, nowner_sid_ptr); @@ -891,7 +901,7 @@ int build_sec_desc(struct user_namespace *user_ns, return -ENOMEM; } - gid = from_kgid(user_ns, fattr->cf_gid); + gid = from_kgid(&init_user_ns, fattr->cf_gid); id_to_sid(gid, SIDUNIX_GROUP, ngroup_sid_ptr); offset = sizeof(struct smb_ntsd); @@ -1234,11 +1244,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, struct path *path, pa_entry = posix_acls->a_entries; for (i = 0; i < posix_acls->a_count; i++, pa_entry++) { if (pa_entry->e_tag == ACL_USER) - id = from_kuid(user_ns, - pa_entry->e_uid); + id = posix_acl_uid_translate(user_ns, pa_entry); else if (pa_entry->e_tag == ACL_GROUP) - id = from_kgid(user_ns, - pa_entry->e_gid); + id = posix_acl_gid_translate(user_ns, pa_entry); else continue; @@ -1322,22 +1330,31 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, newattrs.ia_valid |= ATTR_MODE; newattrs.ia_mode = (inode->i_mode & ~0777) | (fattr.cf_mode & 0777); - inode_lock(inode); - rc = notify_change(user_ns, path->dentry, &newattrs, NULL); - inode_unlock(inode); - if (rc) - goto out; - ksmbd_vfs_remove_acl_xattrs(user_ns, path->dentry); /* Update posix acls */ if (IS_ENABLED(CONFIG_FS_POSIX_ACL) && fattr.cf_dacls) { rc = set_posix_acl(user_ns, inode, ACL_TYPE_ACCESS, fattr.cf_acls); - if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) + if (rc < 0) + ksmbd_debug(SMB, + "Set posix acl(ACL_TYPE_ACCESS) failed, rc : %d\n", + rc); + if (S_ISDIR(inode->i_mode) && fattr.cf_dacls) { 
rc = set_posix_acl(user_ns, inode, ACL_TYPE_DEFAULT, fattr.cf_dacls); + if (rc) + ksmbd_debug(SMB, + "Set posix acl(ACL_TYPE_DEFAULT) failed, rc : %d\n", + rc); + } } + inode_lock(inode); + rc = notify_change(user_ns, path->dentry, &newattrs, NULL); + inode_unlock(inode); + if (rc) + goto out; + /* Check it only calling from SD BUFFER context */ if (type_check && !(le16_to_cpu(pntsd->type) & DACL_PRESENT)) goto out; diff --git a/fs/ksmbd/smbacl.h b/fs/ksmbd/smbacl.h index 940f686a1d95..73e08cad412b 100644 --- a/fs/ksmbd/smbacl.h +++ b/fs/ksmbd/smbacl.h @@ -209,4 +209,29 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, bool type_check); void id_to_sid(unsigned int cid, uint sidtype, struct smb_sid *ssid); void ksmbd_init_domain(u32 *sub_auth); + +static inline uid_t posix_acl_uid_translate(struct user_namespace *mnt_userns, + struct posix_acl_entry *pace) +{ + kuid_t kuid; + + /* If this is an idmapped mount, apply the idmapping. */ + kuid = kuid_into_mnt(mnt_userns, pace->e_uid); + + /* Translate the kuid into a userspace id ksmbd would see. */ + return from_kuid(&init_user_ns, kuid); +} + +static inline gid_t posix_acl_gid_translate(struct user_namespace *mnt_userns, + struct posix_acl_entry *pace) +{ + kgid_t kgid; + + /* If this is an idmapped mount, apply the idmapping. */ + kgid = kgid_into_mnt(mnt_userns, pace->e_gid); + + /* Translate the kgid into a userspace id ksmbd would see. 
*/ + return from_kgid(&init_user_ns, kgid); +} + #endif /* _SMBACL_H */ diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 58f530056ac0..52b2556e76b1 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -1168,7 +1168,7 @@ static int smb_direct_post_send_data(struct smb_direct_transport *t, pr_err("failed to map buffer\n"); ret = -ENOMEM; goto err; - } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES - 1) { + } else if (sg_cnt + msg->num_sge > SMB_DIRECT_MAX_SEND_SGES) { pr_err("buffer not fitted into sges\n"); ret = -E2BIG; ib_dma_unmap_sg(t->cm_id->device, sg, sg_cnt, diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index aee28ee6b19c..b047f2980d96 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -69,14 +69,15 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work, * * the reference count of @parent isn't incremented. */ -int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child) +int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent, + struct dentry *child) { struct dentry *dentry; int ret = 0; inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); - dentry = lookup_one_len(child->d_name.name, parent, - child->d_name.len); + dentry = lookup_one(user_ns, child->d_name.name, parent, + child->d_name.len); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out_err; @@ -102,7 +103,7 @@ int ksmbd_vfs_may_delete(struct user_namespace *user_ns, int ret; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(parent, dentry); + ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); if (ret) { dput(parent); return ret; @@ -137,7 +138,7 @@ int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, *daccess |= FILE_EXECUTE_LE; parent = dget_parent(dentry); - ret = ksmbd_vfs_lock_parent(parent, dentry); + ret = ksmbd_vfs_lock_parent(user_ns, parent, dentry); if (ret) { dput(parent); return ret; @@ -197,6 +198,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char 
*name, umode_t mode) */ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) { + struct user_namespace *user_ns; struct path path; struct dentry *dentry; int err; @@ -210,16 +212,16 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) return err; } + user_ns = mnt_user_ns(path.mnt); mode |= S_IFDIR; - err = vfs_mkdir(mnt_user_ns(path.mnt), d_inode(path.dentry), - dentry, mode); + err = vfs_mkdir(user_ns, d_inode(path.dentry), dentry, mode); if (err) { goto out; } else if (d_unhashed(dentry)) { struct dentry *d; - d = lookup_one_len(dentry->d_name.name, dentry->d_parent, - dentry->d_name.len); + d = lookup_one(user_ns, dentry->d_name.name, dentry->d_parent, + dentry->d_name.len); if (IS_ERR(d)) { err = PTR_ERR(d); goto out; @@ -582,6 +584,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) */ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) { + struct user_namespace *user_ns; struct path path; struct dentry *parent; int err; @@ -601,8 +604,9 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) return err; } + user_ns = mnt_user_ns(path.mnt); parent = dget_parent(path.dentry); - err = ksmbd_vfs_lock_parent(parent, path.dentry); + err = ksmbd_vfs_lock_parent(user_ns, parent, path.dentry); if (err) { dput(parent); path_put(&path); @@ -616,14 +620,12 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) } if (S_ISDIR(d_inode(path.dentry)->i_mode)) { - err = vfs_rmdir(mnt_user_ns(path.mnt), d_inode(parent), - path.dentry); + err = vfs_rmdir(user_ns, d_inode(parent), path.dentry); if (err && err != -ENOTEMPTY) ksmbd_debug(VFS, "%s: rmdir failed, err %d\n", name, err); } else { - err = vfs_unlink(mnt_user_ns(path.mnt), d_inode(parent), - path.dentry, NULL); + err = vfs_unlink(user_ns, d_inode(parent), path.dentry, NULL); if (err) ksmbd_debug(VFS, "%s: unlink failed, err %d\n", name, err); @@ -748,7 +750,8 @@ static int __ksmbd_vfs_rename(struct ksmbd_work *work, if 
(ksmbd_override_fsids(work)) return -ENOMEM; - dst_dent = lookup_one_len(dst_name, dst_dent_parent, strlen(dst_name)); + dst_dent = lookup_one(dst_user_ns, dst_name, dst_dent_parent, + strlen(dst_name)); err = PTR_ERR(dst_dent); if (IS_ERR(dst_dent)) { pr_err("lookup failed %s [%d]\n", dst_name, err); @@ -779,6 +782,7 @@ out: int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, char *newname) { + struct user_namespace *user_ns; struct path dst_path; struct dentry *src_dent_parent, *dst_dent_parent; struct dentry *src_dent, *trap_dent, *src_child; @@ -808,8 +812,9 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, trap_dent = lock_rename(src_dent_parent, dst_dent_parent); dget(src_dent); dget(dst_dent_parent); - src_child = lookup_one_len(src_dent->d_name.name, src_dent_parent, - src_dent->d_name.len); + user_ns = file_mnt_user_ns(fp->filp); + src_child = lookup_one(user_ns, src_dent->d_name.name, src_dent_parent, + src_dent->d_name.len); if (IS_ERR(src_child)) { err = PTR_ERR(src_child); goto out_lock; @@ -823,7 +828,7 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, dput(src_child); err = __ksmbd_vfs_rename(work, - file_mnt_user_ns(fp->filp), + user_ns, src_dent_parent, src_dent, mnt_user_ns(dst_path.mnt), @@ -1109,7 +1114,7 @@ int ksmbd_vfs_unlink(struct user_namespace *user_ns, { int err = 0; - err = ksmbd_vfs_lock_parent(dir, dentry); + err = ksmbd_vfs_lock_parent(user_ns, dir, dentry); if (err) return err; dget(dentry); @@ -1385,14 +1390,14 @@ static struct xattr_smb_acl *ksmbd_vfs_make_xattr_posix_acl(struct user_namespac switch (pa_entry->e_tag) { case ACL_USER: xa_entry->type = SMB_ACL_USER; - xa_entry->uid = from_kuid(user_ns, pa_entry->e_uid); + xa_entry->uid = posix_acl_uid_translate(user_ns, pa_entry); break; case ACL_USER_OBJ: xa_entry->type = SMB_ACL_USER_OBJ; break; case ACL_GROUP: xa_entry->type = SMB_ACL_GROUP; - xa_entry->gid = from_kgid(user_ns, pa_entry->e_gid); + xa_entry->gid 
= posix_acl_gid_translate(user_ns, pa_entry); break; case ACL_GROUP_OBJ: xa_entry->type = SMB_ACL_GROUP_OBJ; diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index cb0cba0d5d07..85db50abdb24 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -107,7 +107,8 @@ struct ksmbd_kstat { __le32 file_attributes; }; -int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child); +int ksmbd_vfs_lock_parent(struct user_namespace *user_ns, struct dentry *parent, + struct dentry *child); int ksmbd_vfs_may_delete(struct user_namespace *user_ns, struct dentry *dentry); int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, struct dentry *dentry, __le32 *daccess); diff --git a/fs/ksmbd/vfs_cache.c b/fs/ksmbd/vfs_cache.c index 92d8c61ffd2a..29c1db66bd0f 100644 --- a/fs/ksmbd/vfs_cache.c +++ b/fs/ksmbd/vfs_cache.c @@ -666,22 +666,6 @@ void ksmbd_free_global_file_table(void) ksmbd_destroy_file_table(&global_ft); } -int ksmbd_file_table_flush(struct ksmbd_work *work) -{ - struct ksmbd_file *fp = NULL; - unsigned int id; - int ret; - - read_lock(&work->sess->file_table.lock); - idr_for_each_entry(work->sess->file_table.idr, fp, id) { - ret = ksmbd_vfs_fsync(work, fp->volatile_id, KSMBD_NO_FID); - if (ret) - break; - } - read_unlock(&work->sess->file_table.lock); - return ret; -} - int ksmbd_init_file_table(struct ksmbd_file_table *ft) { ft->idr = kzalloc(sizeof(struct idr), GFP_KERNEL); diff --git a/fs/ksmbd/vfs_cache.h b/fs/ksmbd/vfs_cache.h index 70dfe6a99f13..448576fbe4b7 100644 --- a/fs/ksmbd/vfs_cache.h +++ b/fs/ksmbd/vfs_cache.h @@ -152,7 +152,6 @@ void ksmbd_close_session_fds(struct ksmbd_work *work); int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode); int ksmbd_init_global_file_table(void); void ksmbd_free_global_file_table(void); -int ksmbd_file_table_flush(struct ksmbd_work *work); void ksmbd_set_fd_limit(unsigned long limit); /* diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2de048f80eb8..b632be3ad57b 100644 --- a/fs/lockd/svc.c +++ 
b/fs/lockd/svc.c @@ -584,7 +584,7 @@ static struct ctl_table nlm_sysctls[] = { .data = &nsm_use_hostnames, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dobool, }, { .procname = "nsm_local_state", @@ -649,6 +649,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp) switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: case RPC_AUTH_UNIX: + rqstp->rq_auth_stat = rpc_auth_ok; if (rqstp->rq_proc == 0) return SVC_OK; if (is_callback(rqstp->rq_proc)) { @@ -659,6 +660,7 @@ static int lockd_authenticate(struct svc_rqst *rqstp) } return svc_set_client(rqstp); } + rqstp->rq_auth_stat = rpc_autherr_badcred; return SVC_DENIED; } diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4c10fb5138f1..e10ae2c41279 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -40,12 +40,15 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call. */ if (filp != NULL) { - if ((error = nlm_lookup_file(rqstp, &file, &lock->fh)) != 0) + int mode = lock_to_openmode(&lock->fl); + + error = nlm_lookup_file(rqstp, &file, lock); + if (error) goto no_locks; *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_file = file->f_file; + lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 61d3cc2283dc..e9b85d8fd5fe 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -31,6 +31,7 @@ #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> #include <linux/kthread.h> +#include <linux/exportfs.h> #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -395,28 +396,10 @@ nlmsvc_release_lockowner(struct nlm_lock *lock) nlmsvc_put_lockowner(lock->fl.fl_owner); } -static void nlmsvc_locks_copy_lock(struct file_lock *new, struct file_lock *fl) -{ - struct nlm_lockowner 
*nlm_lo = (struct nlm_lockowner *)fl->fl_owner; - new->fl_owner = nlmsvc_get_lockowner(nlm_lo); -} - -static void nlmsvc_locks_release_private(struct file_lock *fl) -{ - nlmsvc_put_lockowner((struct nlm_lockowner *)fl->fl_owner); -} - -static const struct file_lock_operations nlmsvc_lock_ops = { - .fl_copy_lock = nlmsvc_locks_copy_lock, - .fl_release_private = nlmsvc_locks_release_private, -}; - void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host, pid_t pid) { fl->fl_owner = nlmsvc_find_lockowner(host, pid); - if (fl->fl_owner != NULL) - fl->fl_ops = &nlmsvc_lock_ops; } /* @@ -488,17 +471,24 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_cookie *cookie, int reclaim) { struct nlm_block *block = NULL; + struct inode *inode = nlmsvc_file_inode(file); int error; + int mode; + int async_block = 0; __be32 ret; dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + inode->i_sb->s_id, inode->i_ino, lock->fl.fl_type, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end, wait); + if (inode->i_sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS) { + async_block = wait; + wait = 0; + } + /* Lock file against concurrent access */ mutex_lock(&file->f_mutex); /* Get existing block (in case client is busy-waiting) @@ -542,7 +532,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + mode = lock_to_openmode(&lock->fl); + error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; dprintk("lockd: vfs_lock_file returned %d\n", error); @@ -558,7 +549,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, */ if (wait) break; - ret = nlm_lck_denied; + ret = async_block ? 
nlm_lck_blocked : nlm_lck_denied; goto out; case FILE_LOCK_DEFERRED: if (wait) @@ -595,12 +586,13 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_lock *conflock, struct nlm_cookie *cookie) { int error; + int mode; __be32 ret; struct nlm_lockowner *test_owner; dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -613,7 +605,8 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, /* If there's a conflicting lock, remember to clean up the test lock */ test_owner = (struct nlm_lockowner *)lock->fl.fl_owner; - error = vfs_test_lock(file->f_file, &lock->fl); + mode = lock_to_openmode(&lock->fl); + error = vfs_test_lock(file->f_file[mode], &lock->fl); if (error) { /* We can't currently deal with deferred test requests */ if (error == FILE_LOCK_DEFERRED) @@ -634,7 +627,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, conflock->caller = "somehost"; /* FIXME */ conflock->len = strlen(conflock->caller); conflock->oh.len = 0; /* don't return OH info */ - conflock->svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid; + conflock->svid = lock->fl.fl_pid; conflock->fl.fl_type = lock->fl.fl_type; conflock->fl.fl_start = lock->fl.fl_start; conflock->fl.fl_end = lock->fl.fl_end; @@ -659,11 +652,11 @@ out: __be32 nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock) { - int error; + int error = 0; dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -672,7 +665,12 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock 
*lock) nlmsvc_cancel_blocked(net, file, lock); lock->fl.fl_type = F_UNLCK; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + if (file->f_file[O_RDONLY]) + error = vfs_lock_file(file->f_file[O_RDONLY], F_SETLK, + &lock->fl, NULL); + if (file->f_file[O_WRONLY]) + error = vfs_lock_file(file->f_file[O_WRONLY], F_SETLK, + &lock->fl, NULL); return (error < 0)? nlm_lck_denied_nolocks : nlm_granted; } @@ -689,10 +687,11 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l { struct nlm_block *block; int status = 0; + int mode; dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n", - locks_inode(file->f_file)->i_sb->s_id, - locks_inode(file->f_file)->i_ino, + nlmsvc_file_inode(file)->i_sb->s_id, + nlmsvc_file_inode(file)->i_ino, lock->fl.fl_pid, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -704,7 +703,8 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); if (block != NULL) { - vfs_cancel_lock(block->b_file->f_file, + mode = lock_to_openmode(&lock->fl); + vfs_cancel_lock(block->b_file->f_file[mode], &block->b_call->a_args.lock.fl); status = nlmsvc_unlink_block(block); nlmsvc_release_block(block); @@ -788,9 +788,21 @@ nlmsvc_notify_blocked(struct file_lock *fl) printk(KERN_WARNING "lockd: notification for unknown block!\n"); } +static fl_owner_t nlmsvc_get_owner(fl_owner_t owner) +{ + return nlmsvc_get_lockowner(owner); +} + +static void nlmsvc_put_owner(fl_owner_t owner) +{ + nlmsvc_put_lockowner(owner); +} + const struct lock_manager_operations nlmsvc_lock_operations = { .lm_notify = nlmsvc_notify_blocked, .lm_grant = nlmsvc_grant_deferred, + .lm_get_owner = nlmsvc_get_owner, + .lm_put_owner = nlmsvc_put_owner, }; /* @@ -809,6 +821,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) { struct nlm_file *file = block->b_file; struct nlm_lock *lock = &block->b_call->a_args.lock; + int mode; int error; loff_t 
fl_start, fl_end; @@ -834,7 +847,8 @@ nlmsvc_grant_blocked(struct nlm_block *block) lock->fl.fl_flags |= FL_SLEEP; fl_start = lock->fl.fl_start; fl_end = lock->fl.fl_end; - error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); + mode = lock_to_openmode(&lock->fl); + error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL); lock->fl.fl_flags &= ~FL_SLEEP; lock->fl.fl_start = fl_start; lock->fl.fl_end = fl_end; diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 4ae4b63b5392..99696d3f6dd6 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -55,6 +55,7 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, struct nlm_host *host = NULL; struct nlm_file *file = NULL; struct nlm_lock *lock = &argp->lock; + int mode; __be32 error = 0; /* nfsd callbacks must have been installed for this procedure */ @@ -69,13 +70,14 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp, /* Obtain file pointer. Not used by FREE_ALL call. */ if (filp != NULL) { - error = cast_status(nlm_lookup_file(rqstp, &file, &lock->fh)); + error = cast_status(nlm_lookup_file(rqstp, &file, lock)); if (error != 0) goto no_locks; *filp = file; /* Set up the missing parts of the file_lock structure */ - lock->fl.fl_file = file->f_file; + mode = lock_to_openmode(&lock->fl); + lock->fl.fl_file = file->f_file[mode]; lock->fl.fl_pid = current->tgid; lock->fl.fl_lmops = &nlmsvc_lock_operations; nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 028fc152da22..cb3a7512c33e 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -45,7 +45,7 @@ static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) static inline void nlm_debug_print_file(char *msg, struct nlm_file *file) { - struct inode *inode = locks_inode(file->f_file); + struct inode *inode = nlmsvc_file_inode(file); dprintk("lockd: %s %s/%ld\n", msg, inode->i_sb->s_id, inode->i_ino); @@ -71,56 +71,75 @@ static 
inline unsigned int file_hash(struct nfs_fh *f) return tmp & (FILE_NRHASH - 1); } +int lock_to_openmode(struct file_lock *lock) +{ + return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY; +} + +/* + * Open the file. Note that if we're reexporting, for example, + * this could block the lockd thread for a while. + * + * We have to make sure we have the right credential to open + * the file. + */ +static __be32 nlm_do_fopen(struct svc_rqst *rqstp, + struct nlm_file *file, int mode) +{ + struct file **fp = &file->f_file[mode]; + __be32 nfserr; + + if (*fp) + return 0; + nfserr = nlmsvc_ops->fopen(rqstp, &file->f_handle, fp, mode); + if (nfserr) + dprintk("lockd: open failed (error %d)\n", nfserr); + return nfserr; +} + /* * Lookup file info. If it doesn't exist, create a file info struct * and open a (VFS) file for the given inode. - * - * FIXME: - * Note that we open the file O_RDONLY even when creating write locks. - * This is not quite right, but for now, we assume the client performs - * the proper R/W checking. 
*/ __be32 nlm_lookup_file(struct svc_rqst *rqstp, struct nlm_file **result, - struct nfs_fh *f) + struct nlm_lock *lock) { struct nlm_file *file; unsigned int hash; __be32 nfserr; + int mode; - nlm_debug_print_fh("nlm_lookup_file", f); + nlm_debug_print_fh("nlm_lookup_file", &lock->fh); - hash = file_hash(f); + hash = file_hash(&lock->fh); + mode = lock_to_openmode(&lock->fl); /* Lock file table */ mutex_lock(&nlm_file_mutex); hlist_for_each_entry(file, &nlm_files[hash], f_list) - if (!nfs_compare_fh(&file->f_handle, f)) + if (!nfs_compare_fh(&file->f_handle, &lock->fh)) { + mutex_lock(&file->f_mutex); + nfserr = nlm_do_fopen(rqstp, file, mode); + mutex_unlock(&file->f_mutex); goto found; - - nlm_debug_print_fh("creating file for", f); + } + nlm_debug_print_fh("creating file for", &lock->fh); nfserr = nlm_lck_denied_nolocks; file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) - goto out_unlock; + goto out_free; - memcpy(&file->f_handle, f, sizeof(struct nfs_fh)); + memcpy(&file->f_handle, &lock->fh, sizeof(struct nfs_fh)); mutex_init(&file->f_mutex); INIT_HLIST_NODE(&file->f_list); INIT_LIST_HEAD(&file->f_blocks); - /* Open the file. Note that this must not sleep for too long, else - * we would lock up lockd:-) So no NFS re-exports, folks. - * - * We have to make sure we have the right credential to open - * the file. 
- */ - if ((nfserr = nlmsvc_ops->fopen(rqstp, f, &file->f_file)) != 0) { - dprintk("lockd: open failed (error %d)\n", nfserr); - goto out_free; - } + nfserr = nlm_do_fopen(rqstp, file, mode); + if (nfserr) + goto out_unlock; hlist_add_head(&file->f_list, &nlm_files[hash]); @@ -128,7 +147,6 @@ found: dprintk("lockd: found file %p (count %d)\n", file, file->f_count); *result = file; file->f_count++; - nfserr = 0; out_unlock: mutex_unlock(&nlm_file_mutex); @@ -148,13 +166,34 @@ nlm_delete_file(struct nlm_file *file) nlm_debug_print_file("closing file", file); if (!hlist_unhashed(&file->f_list)) { hlist_del(&file->f_list); - nlmsvc_ops->fclose(file->f_file); + if (file->f_file[O_RDONLY]) + nlmsvc_ops->fclose(file->f_file[O_RDONLY]); + if (file->f_file[O_WRONLY]) + nlmsvc_ops->fclose(file->f_file[O_WRONLY]); kfree(file); } else { printk(KERN_WARNING "lockd: attempt to release unknown file!\n"); } } +static int nlm_unlock_files(struct nlm_file *file) +{ + struct file_lock lock; + struct file *f; + + lock.fl_type = F_UNLCK; + lock.fl_start = 0; + lock.fl_end = OFFSET_MAX; + for (f = file->f_file[0]; f <= file->f_file[1]; f++) { + if (f && vfs_lock_file(f, F_SETLK, &lock, NULL) < 0) { + pr_warn("lockd: unlock failure in %s:%d\n", + __FILE__, __LINE__); + return 1; + } + } + return 0; +} + /* * Loop over all locks on the given file and perform the specified * action. 
@@ -182,17 +221,10 @@ again: lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host; if (match(lockhost, host)) { - struct file_lock lock = *fl; spin_unlock(&flctx->flc_lock); - lock.fl_type = F_UNLCK; - lock.fl_start = 0; - lock.fl_end = OFFSET_MAX; - if (vfs_lock_file(file->f_file, F_SETLK, &lock, NULL) < 0) { - printk("lockd: unlock failure in %s:%d\n", - __FILE__, __LINE__); + if (nlm_unlock_files(file)) return 1; - } goto again; } } @@ -246,6 +278,15 @@ nlm_file_inuse(struct nlm_file *file) return 0; } +static void nlm_close_files(struct nlm_file *file) +{ + struct file *f; + + for (f = file->f_file[0]; f <= file->f_file[1]; f++) + if (f) + nlmsvc_ops->fclose(f); +} + /* * Loop over all files in the file table. */ @@ -276,7 +317,7 @@ nlm_traverse_files(void *data, nlm_host_match_fn_t match, if (list_empty(&file->f_blocks) && !file->f_locks && !file->f_shares && !file->f_count) { hlist_del(&file->f_list); - nlmsvc_ops->fclose(file->f_file); + nlm_close_files(file); kfree(file); } } @@ -410,12 +451,13 @@ nlmsvc_invalidate_all(void) nlm_traverse_files(NULL, nlmsvc_is_client, NULL); } + static int nlmsvc_match_sb(void *datap, struct nlm_file *file) { struct super_block *sb = datap; - return sb == locks_inode(file->f_file)->i_sb; + return sb == nlmsvc_file_inode(file)->i_sb; } /** diff --git a/fs/namei.c b/fs/namei.c index 32351c045bae..95a881e0552b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2598,8 +2598,9 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_one_len_common(const char *name, struct dentry *base, - int len, struct qstr *this) +static int lookup_one_common(struct user_namespace *mnt_userns, + const char *name, struct dentry *base, int len, + struct qstr *this) { this->name = name; this->len = len; @@ -2627,7 +2628,7 @@ static int lookup_one_len_common(const char *name, struct dentry *base, return err; } - return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC); + return 
inode_permission(mnt_userns, base->d_inode, MAY_EXEC); } /** @@ -2651,7 +2652,7 @@ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_len_common(name, base, len, &this); + err = lookup_one_common(&init_user_ns, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2678,7 +2679,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_len_common(name, base, len, &this); + err = lookup_one_common(&init_user_ns, name, base, len, &this); if (err) return ERR_PTR(err); @@ -2688,6 +2689,36 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) EXPORT_SYMBOL(lookup_one_len); /** + * lookup_one - filesystem helper to lookup single pathname component + * @mnt_userns: user namespace of the mount the lookup is performed from + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * The caller must hold base->i_mutex. + */ +struct dentry *lookup_one(struct user_namespace *mnt_userns, const char *name, + struct dentry *base, int len) +{ + struct dentry *dentry; + struct qstr this; + int err; + + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); + + err = lookup_one_common(mnt_userns, name, base, len, &this); + if (err) + return ERR_PTR(err); + + dentry = lookup_dcache(&this, base, 0); + return dentry ? 
dentry : __lookup_slow(&this, base, 0); +} +EXPORT_SYMBOL(lookup_one); + +/** * lookup_one_len_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from @@ -2706,7 +2737,7 @@ struct dentry *lookup_one_len_unlocked(const char *name, int err; struct dentry *ret; - err = lookup_one_len_common(name, base, len, &this); + err = lookup_one_common(&init_user_ns, name, base, len, &this); if (err) return ERR_PTR(err); @@ -4058,7 +4089,9 @@ int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, return -EPERM; inode_lock(target); - if (is_local_mountpoint(dentry)) + if (IS_SWAPFILE(target)) + error = -EPERM; + else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); @@ -4566,6 +4599,10 @@ int vfs_rename(struct renamedata *rd) else if (target) inode_lock(target); + error = -EPERM; + if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) + goto out; + error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; diff --git a/fs/namespace.c b/fs/namespace.c index 20caa4b4c539..659a8f39c61a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -203,7 +203,8 @@ static struct mount *alloc_vfsmnt(const char *name) goto out_free_cache; if (name) { - mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); + mnt->mnt_devname = kstrdup_const(name, + GFP_KERNEL_ACCOUNT); if (!mnt->mnt_devname) goto out_free_id; } @@ -2694,6 +2695,78 @@ out: return ret; } +static int do_set_group(struct path *from_path, struct path *to_path) +{ + struct mount *from, *to; + int err; + + from = real_mount(from_path->mnt); + to = real_mount(to_path->mnt); + + namespace_lock(); + + err = -EINVAL; + /* To and From must be mounted */ + if (!is_mounted(&from->mnt)) + goto out; + if (!is_mounted(&to->mnt)) + goto out; + + err = -EPERM; + /* We should be allowed to modify mount namespaces of both mounts */ + if 
(!ns_capable(from->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + if (!ns_capable(to->mnt_ns->user_ns, CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + /* To and From paths should be mount roots */ + if (from_path->dentry != from_path->mnt->mnt_root) + goto out; + if (to_path->dentry != to_path->mnt->mnt_root) + goto out; + + /* Setting sharing groups is only allowed across same superblock */ + if (from->mnt.mnt_sb != to->mnt.mnt_sb) + goto out; + + /* From mount root should be wider than To mount root */ + if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) + goto out; + + /* From mount should not have locked children in place of To's root */ + if (has_locked_children(from, to->mnt.mnt_root)) + goto out; + + /* Setting sharing groups is only allowed on private mounts */ + if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) + goto out; + + /* From should not be private */ + if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) + goto out; + + if (IS_MNT_SLAVE(from)) { + struct mount *m = from->mnt_master; + + list_add(&to->mnt_slave, &m->mnt_slave_list); + to->mnt_master = m; + } + + if (IS_MNT_SHARED(from)) { + to->mnt_group_id = from->mnt_group_id; + list_add(&to->mnt_share, &from->mnt_share); + lock_mount_hash(); + set_mnt_shared(to); + unlock_mount_hash(); + } + + err = 0; +out: + namespace_unlock(); + return err; +} + static int do_move_mount(struct path *old_path, struct path *new_path) { struct mnt_namespace *ns; @@ -3298,7 +3371,7 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a if (!ucounts) return ERR_PTR(-ENOSPC); - new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); + new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL_ACCOUNT); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); @@ -3678,7 +3751,10 @@ SYSCALL_DEFINE5(move_mount, if (ret < 0) goto out_to; - ret = do_move_mount(&from_path, &to_path); + if (flags & MOVE_MOUNT_SET_GROUP) + ret = do_set_group(&from_path, &to_path); + else + ret = 
do_move_mount(&from_path, &to_path); out_to: path_put(&to_path); @@ -4231,7 +4307,7 @@ void __init mnt_init(void) int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 7817ad94a6ba..86d856de1389 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -429,6 +429,8 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) */ static int nfs_callback_authenticate(struct svc_rqst *rqstp) { + rqstp->rq_auth_stat = rpc_autherr_badcred; + switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) @@ -439,6 +441,8 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) if (svc_is_backchannel(rqstp)) return SVC_DENIED; } + + rqstp->rq_auth_stat = rpc_auth_ok; return SVC_OK; } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index c5348ba81129..4c48d85f6517 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -63,11 +63,10 @@ static __be32 nfs4_callback_null(struct svc_rqst *rqstp) return htonl(NFS4_OK); } -static int nfs4_decode_void(struct svc_rqst *rqstp, __be32 *p) -{ - return xdr_argsize_check(rqstp, p); -} - +/* + * svc_process_common() looks for an XDR encoder to know when + * not to drop a Reply. 
+ */ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p) { return xdr_ressize_check(rqstp, p); @@ -864,17 +863,16 @@ preprocess_nfs4_op(unsigned int op_nr, struct callback_op **op) } static __be32 process_op(int nop, struct svc_rqst *rqstp, - struct xdr_stream *xdr_in, void *argp, - struct xdr_stream *xdr_out, void *resp, - struct cb_process_state *cps) + struct cb_process_state *cps) { + struct xdr_stream *xdr_out = &rqstp->rq_res_stream; struct callback_op *op = &callback_ops[0]; unsigned int op_nr; __be32 status; long maxlen; __be32 res; - status = decode_op_hdr(xdr_in, &op_nr); + status = decode_op_hdr(&rqstp->rq_arg_stream, &op_nr); if (unlikely(status)) return status; @@ -904,9 +902,11 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp, maxlen = xdr_out->end - xdr_out->p; if (maxlen > 0 && maxlen < PAGE_SIZE) { - status = op->decode_args(rqstp, xdr_in, argp); + status = op->decode_args(rqstp, &rqstp->rq_arg_stream, + rqstp->rq_argp); if (likely(status == 0)) - status = op->process_op(argp, resp, cps); + status = op->process_op(rqstp->rq_argp, rqstp->rq_resp, + cps); } else status = htonl(NFS4ERR_RESOURCE); @@ -915,7 +915,7 @@ encode_hdr: if (unlikely(res)) return res; if (op->encode_res != NULL && status == 0) - status = op->encode_res(rqstp, xdr_out, resp); + status = op->encode_res(rqstp, xdr_out, rqstp->rq_resp); return status; } @@ -926,22 +926,15 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) { struct cb_compound_hdr_arg hdr_arg = { 0 }; struct cb_compound_hdr_res hdr_res = { NULL }; - struct xdr_stream xdr_in, xdr_out; - __be32 *p, status; struct cb_process_state cps = { .drc_status = 0, .clp = NULL, .net = SVC_NET(rqstp), }; unsigned int nops = 0; + __be32 status; - xdr_init_decode(&xdr_in, &rqstp->rq_arg, - rqstp->rq_arg.head[0].iov_base, NULL); - - p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); - xdr_init_encode(&xdr_out, &rqstp->rq_res, p, NULL); - - status = 
decode_compound_hdr_arg(&xdr_in, &hdr_arg); + status = decode_compound_hdr_arg(&rqstp->rq_arg_stream, &hdr_arg); if (status == htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; @@ -961,15 +954,13 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) cps.minorversion = hdr_arg.minorversion; hdr_res.taglen = hdr_arg.taglen; hdr_res.tag = hdr_arg.tag; - if (encode_compound_hdr_res(&xdr_out, &hdr_res) != 0) { + if (encode_compound_hdr_res(&rqstp->rq_res_stream, &hdr_res) != 0) { if (cps.clp) nfs_put_client(cps.clp); return rpc_system_err; } while (status == 0 && nops != hdr_arg.nops) { - status = process_op(nops, rqstp, &xdr_in, - rqstp->rq_argp, &xdr_out, rqstp->rq_resp, - &cps); + status = process_op(nops, rqstp, &cps); nops++; } @@ -988,7 +979,20 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp) out_invalidcred: pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n"); - return svc_return_autherr(rqstp, rpc_autherr_badcred); + rqstp->rq_auth_stat = rpc_autherr_badcred; + return rpc_success; +} + +static int +nfs_callback_dispatch(struct svc_rqst *rqstp, __be32 *statp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; + + svcxdr_init_decode(rqstp); + svcxdr_init_encode(rqstp); + + *statp = procp->pc_func(rqstp); + return 1; } /* @@ -1057,7 +1061,6 @@ static struct callback_op callback_ops[] = { static const struct svc_procedure nfs4_callback_procedures1[] = { [CB_NULL] = { .pc_func = nfs4_callback_null, - .pc_decode = nfs4_decode_void, .pc_encode = nfs4_encode_void, .pc_xdrressize = 1, .pc_name = "NULL", @@ -1079,7 +1082,7 @@ const struct svc_version nfs4_callback_version1 = { .vs_proc = nfs4_callback_procedures1, .vs_count = nfs4_callback_count1, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, - .vs_dispatch = NULL, + .vs_dispatch = nfs_callback_dispatch, .vs_hidden = true, .vs_need_cong_ctrl = true, }; @@ -1091,7 +1094,7 @@ const struct svc_version nfs4_callback_version4 = { .vs_proc = nfs4_callback_procedures1, .vs_count = 
nfs4_callback_count4, .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, - .vs_dispatch = NULL, + .vs_dispatch = nfs_callback_dispatch, .vs_hidden = true, .vs_need_cong_ctrl = true, }; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 330f65727c45..23e165d5ec9c 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -179,6 +179,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; + clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; clp->cl_net = get_net(cl_init->net); clp->cl_principal = "*"; @@ -540,6 +541,7 @@ int nfs_create_rpc_client(struct nfs_client *clp, clnt->cl_principal = clp->cl_principal; clp->cl_rpcclient = clnt; + clnt->cl_max_connect = clp->cl_max_connect; return 0; } EXPORT_SYMBOL_GPL(nfs_create_rpc_client); diff --git a/fs/nfs/export.c b/fs/nfs/export.c index 37a1a88df771..d772c20bbfd1 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -180,5 +180,5 @@ const struct export_operations nfs_export_ops = { .fetch_iversion = nfs_fetch_iversion, .flags = EXPORT_OP_NOWCC|EXPORT_OP_NOSUBTREECHK| EXPORT_OP_CLOSE_BEFORE_UNLINK|EXPORT_OP_REMOTE_FS| - EXPORT_OP_NOATOMIC_ATTR, + EXPORT_OP_NOATOMIC_ATTR|EXPORT_OP_SYNC_LOCKS, }; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 514be5d28d70..aa353fd58240 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -806,6 +806,9 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) nfs_inc_stats(inode, NFSIOS_VFSLOCK); + if (fl->fl_flags & FL_RECLAIM) + return -ENOGRACE; + if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL) is_local = 1; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index d95c9a39bc70..0d444a90f513 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -60,6 +60,7 @@ enum nfs_param { Opt_mountvers, Opt_namelen, Opt_nconnect, + Opt_max_connect, Opt_port, Opt_posix, Opt_proto, @@ -158,6 +159,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_u32 
("mountvers", Opt_mountvers), fsparam_u32 ("namlen", Opt_namelen), fsparam_u32 ("nconnect", Opt_nconnect), + fsparam_u32 ("max_connect", Opt_max_connect), fsparam_string("nfsvers", Opt_vers), fsparam_u32 ("port", Opt_port), fsparam_flag_no("posix", Opt_posix), @@ -770,6 +772,11 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, goto out_of_bounds; ctx->nfs_server.nconnect = result.uint_32; break; + case Opt_max_connect: + if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_TRANSPORTS) + goto out_of_bounds; + ctx->nfs_server.max_connect = result.uint_32; + break; case Opt_lookupcache: switch (result.uint_32) { case Opt_lookupcache_all: diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index a36af04188c2..66fc936834f2 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -67,6 +67,7 @@ struct nfs_client_initdata { int proto; u32 minorversion; unsigned int nconnect; + unsigned int max_connect; struct net *net; const struct rpc_timeout *timeparms; const struct cred *cred; @@ -121,6 +122,7 @@ struct nfs_fs_context { int port; unsigned short protocol; unsigned short nconnect; + unsigned short max_connect; unsigned short export_path_len; } nfs_server; diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index c8a192802dda..03a4e679fd99 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -11,7 +11,7 @@ * nfs3acl.c */ #ifdef CONFIG_NFS_V3_ACL -extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); +extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu); extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 9ec560aa4a50..93de0b58647a 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -44,7 +44,7 @@ static void nfs3_abort_get_acl(struct posix_acl **p) cmpxchg(p, sentinel, ACL_NOT_CACHED); } -struct posix_acl *nfs3_get_acl(struct 
inode *inode, int type) +struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu) { struct nfs_server *server = NFS_SERVER(inode); struct page *pages[NFSACL_MAXPAGES] = { }; @@ -62,6 +62,9 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) }; int status, count; + if (rcu) + return ERR_PTR(-ECHILD); + if (!nfs_server_capable(inode, NFS_CAP_ACLS)) return ERR_PTR(-EOPNOTSUPP); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2299446b3b89..f7524310ddf4 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -49,8 +49,7 @@ nfs3_async_handle_jukebox(struct rpc_task *task, struct inode *inode) { if (task->tk_status != -EJUKEBOX) return 0; - if (task->tk_status == -EJUKEBOX) - nfs_inc_stats(inode, NFSIOS_DELAY); + nfs_inc_stats(inode, NFSIOS_DELAY); task->tk_status = 0; rpc_restart_call(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 28431acd1230..af57332503be 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -402,6 +402,33 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) return nfs4_init_callback(clp); } +static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) +{ + struct sockaddr_storage clp_addr, old_addr; + struct sockaddr *clp_sap = (struct sockaddr *)&clp_addr; + struct sockaddr *old_sap = (struct sockaddr *)&old_addr; + size_t clp_salen; + struct xprt_create xprt_args = { + .ident = old->cl_proto, + .net = old->cl_net, + .servername = old->cl_hostname, + }; + + if (clp->cl_proto != old->cl_proto) + return; + clp_salen = rpc_peeraddr(clp->cl_rpcclient, clp_sap, sizeof(clp_addr)); + rpc_peeraddr(old->cl_rpcclient, old_sap, sizeof(old_addr)); + + if (clp_addr.ss_family != old_addr.ss_family) + return; + + xprt_args.dstaddr = clp_sap; + xprt_args.addrlen = clp_salen; + + rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args, + rpc_clnt_test_and_add_xprt, NULL); +} + /** * nfs4_init_client - Initialise an NFS4 client record * @@ 
-436,6 +463,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, * won't try to use it. */ nfs_mark_client_ready(clp, -EPERM); + if (old->cl_mvops->session_trunk) + nfs4_add_trunk(clp, old); } clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); nfs_put_client(clp); @@ -865,6 +894,7 @@ static int nfs4_set_client(struct nfs_server *server, const char *ip_addr, int proto, const struct rpc_timeout *timeparms, u32 minorversion, unsigned int nconnect, + unsigned int max_connect, struct net *net) { struct nfs_client_initdata cl_init = { @@ -883,6 +913,8 @@ static int nfs4_set_client(struct nfs_server *server, if (minorversion == 0) __set_bit(NFS_CS_REUSEPORT, &cl_init.init_flags); + else + cl_init.max_connect = max_connect; if (proto == XPRT_TRANSPORT_TCP) cl_init.nconnect = nconnect; @@ -952,8 +984,10 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, return ERR_PTR(-EINVAL); cl_init.hostname = buf; - if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) + if (mds_clp->cl_nconnect > 1 && ds_proto == XPRT_TRANSPORT_TCP) { cl_init.nconnect = mds_clp->cl_nconnect; + cl_init.max_connect = NFS_MAX_TRANSPORTS; + } if (mds_srv->flags & NFS_MOUNT_NORESVPORT) __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); @@ -1122,6 +1156,7 @@ static int nfs4_init_server(struct nfs_server *server, struct fs_context *fc) &timeparms, ctx->minorversion, ctx->nfs_server.nconnect, + ctx->nfs_server.max_connect, fc->net_ns); if (error < 0) return error; @@ -1211,6 +1246,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, parent_client->cl_nconnect, + parent_client->cl_max_connect, parent_client->cl_net); if (!error) goto init_server; @@ -1226,6 +1262,7 @@ struct nfs_server *nfs4_create_referral_server(struct fs_context *fc) parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, parent_client->cl_nconnect, + parent_client->cl_max_connect, 
parent_client->cl_net); if (error < 0) goto error; @@ -1323,7 +1360,7 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, error = nfs4_set_client(server, hostname, sap, salen, buf, clp->cl_proto, clnt->cl_timeout, clp->cl_minorversion, - clp->cl_nconnect, net); + clp->cl_nconnect, clp->cl_max_connect, net); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); if (error != 0) { nfs_server_insert_lists(server); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index c820de58a661..c91565227ea2 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -158,13 +158,11 @@ static ssize_t __nfs4_copy_file_range(struct file *file_in, loff_t pos_in, sync = true; retry: if (!nfs42_files_from_same_server(file_in, file_out)) { - /* for inter copy, if copy size if smaller than 12 RPC - * payloads, fallback to traditional copy. There are - * 14 RPCs during an NFSv4.x mount between source/dest - * servers. + /* + * for inter copy, if copy size is too small + * then fallback to generic copy. 
*/ - if (sync || - count <= 14 * NFS_SERVER(file_inode(file_in))->rsize) + if (sync) return -EOPNOTSUPP; cn_resp = kzalloc(sizeof(struct nfs42_copy_notify_res), GFP_NOFS); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ef14ea0b6ab8..7c9090a28e5c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -335,7 +335,7 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq) { - if (pnfs_seqid_is_newer(newseq, lo->plh_barrier)) + if (pnfs_seqid_is_newer(newseq, lo->plh_barrier) || !lo->plh_barrier) lo->plh_barrier = newseq; } @@ -347,11 +347,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, iomode = IOMODE_ANY; lo->plh_return_iomode = iomode; set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (seq != 0) { - WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); + /* + * We must set lo->plh_return_seq to avoid livelocks with + * pnfs_layout_need_return() + */ + if (seq == 0) + seq = be32_to_cpu(lo->plh_stateid.seqid); + if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq)) lo->plh_return_seq = seq; - pnfs_barrier_update(lo, seq); - } + pnfs_barrier_update(lo, seq); } static void @@ -592,10 +596,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) inode = lo->plh_inode; if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { - if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { - spin_unlock(&inode->i_lock); - return; - } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); if (pnfs_cache_lseg_for_layoutreturn(lo, lseg)) @@ -1000,7 +1000,7 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo, { u32 seqid = be32_to_cpu(stateid->seqid); - return !pnfs_seqid_is_newer(seqid, lo->plh_barrier) && lo->plh_barrier; + return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid); } /* lget is set to 1 if called from inside send_layoutget call chain */ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 
9f39e0a1a38b..08d6cc57cbc3 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -293,15 +293,19 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = data; + struct inode *inode = page_file_mapping(page)->host; + unsigned int rsize = NFS_SERVER(inode)->rsize; struct nfs_page *new; - unsigned int len; + unsigned int len, aligned_len; int error; len = nfs_page_length(page); if (len == 0) return nfs_return_empty_page(page); - new = nfs_create_request(desc->ctx, page, 0, len); + aligned_len = min_t(unsigned int, ALIGN(len, rsize), PAGE_SIZE); + + new = nfs_create_request(desc->ctx, page, 0, aligned_len); if (IS_ERR(new)) goto out_error; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index fe58525cfed4..e65c83494c05 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -480,6 +480,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (clp->cl_nconnect > 0) seq_printf(m, ",nconnect=%u", clp->cl_nconnect); if (version == 4) { + if (clp->cl_max_connect > 1) + seq_printf(m, ",max_connect=%u", clp->cl_max_connect); if (nfss->port != NFS_PORT) seq_printf(m, ",port=%u", nfss->port); } else diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index f229172652be..6e9ea4ee0f73 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -109,7 +109,7 @@ config NFSD_SCSILAYOUT depends on NFSD_V4 && BLOCK select NFSD_PNFS select EXPORTFS_BLOCK_OPS - select BLK_SCSI_REQUEST + select SCSI_COMMON help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 3f5b3d7b62b7..606fa155c28a 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -25,9 +25,11 @@ * Note: we hold the dentry use count while the file is open. 
*/ static __be32 -nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) +nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, + int mode) { __be32 nfserr; + int access; struct svc_fh fh; /* must initialize before using! but maxsize doesn't matter */ @@ -36,7 +38,9 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp) memcpy((char*)&fh.fh_handle.fh_base, f->data, f->size); fh.fh_export = NULL; - nfserr = nfsd_open(rqstp, &fh, S_IFREG, NFSD_MAY_LOCK, filp); + access = (mode == O_WRONLY) ? NFSD_MAY_WRITE : NFSD_MAY_READ; + access |= NFSD_MAY_LOCK; + nfserr = nfsd_open(rqstp, &fh, S_IFREG, access, filp); fh_put(&fh); /* We return nlm error codes as nlm doesn't know * about nfsd, but nfsd does know about nlm.. diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8313e1dbb5dc..42356416f0a0 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2687,9 +2687,9 @@ static void force_expire_client(struct nfs4_client *clp) trace_nfsd_clid_admin_expired(&clp->cl_clientid); - spin_lock(&clp->cl_lock); + spin_lock(&nn->client_lock); clp->cl_time = 0; - spin_unlock(&clp->cl_lock); + spin_unlock(&nn->client_lock); wait_event(expiry_wq, atomic_read(&clp->cl_rpc_users) == 0); spin_lock(&nn->client_lock); @@ -6821,6 +6821,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; + struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -6842,6 +6843,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } + sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -6887,10 +6889,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!locks_in_grace(net) && lock->lk_reclaim) goto out; + if (lock->lk_reclaim) + fl_flags |= 
FL_RECLAIM; + fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) && + !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: @@ -6902,7 +6908,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fl_type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) && + !(sb->s_export_op->flags & EXPORT_OP_SYNC_LOCKS)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: @@ -7022,8 +7029,7 @@ out: /* * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, * so we do a temporary open here just to get an open file to pass to - * vfs_test_lock. (Arguably perhaps test_lock should be done with an - * inode operation.) + * vfs_test_lock. */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { @@ -7038,7 +7044,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct NFSD_MAY_READ)); if (err) goto out; + lock->fl_file = nf->nf_file; err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + lock->fl_file = NULL; out: fh_unlock(fhp); nfsd_file_put(nf); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 60d7c59e7935..90fcd6178823 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -881,6 +881,7 @@ nfserrno (int errno) { nfserr_serverfault, -ENFILE }, { nfserr_io, -EUCLEAN }, { nfserr_perm, -ENOKEY }, + { nfserr_no_grace, -ENOGRACE}, }; int i; diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index adaec43548d1..538520957a81 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -400,18 +400,16 @@ TRACE_EVENT(nfsd_dirent, TP_STRUCT__entry( __field(u32, fh_hash) __field(u64, ino) - __field(int, len) - __dynamic_array(unsigned char, name, namlen) + __string_len(name, name, namlen) ), TP_fast_assign( __entry->fh_hash = fhp ? 
knfsd_fh_hash(&fhp->fh_handle) : 0; __entry->ino = ino; - __entry->len = namlen; - memcpy(__get_str(name), name, namlen); + __assign_str_len(name, name, namlen) ), - TP_printk("fh_hash=0x%08x ino=%llu name=%.*s", - __entry->fh_hash, __entry->ino, - __entry->len, __get_str(name)) + TP_printk("fh_hash=0x%08x ino=%llu name=%s", + __entry->fh_hash, __entry->ino, __get_str(name) + ) ) #include "state.h" @@ -608,7 +606,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __array(unsigned char, addr, sizeof(struct sockaddr_in6)) __field(unsigned long, flavor) __array(unsigned char, verifier, NFS4_VERIFIER_SIZE) - __dynamic_array(char, name, clp->cl_name.len + 1) + __string_len(name, name, clp->cl_name.len) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; @@ -618,8 +616,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class, __entry->flavor = clp->cl_cred.cr_flavor; memcpy(__entry->verifier, (void *)&clp->cl_verifier, NFS4_VERIFIER_SIZE); - memcpy(__get_str(name), clp->cl_name.data, clp->cl_name.len); - __get_str(name)[clp->cl_name.len] = '\0'; + __assign_str_len(name, clp->cl_name.data, clp->cl_name.len); ), TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x", __entry->addr, __get_str(name), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 92e77f92268a..738d564ca4ce 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -244,7 +244,6 @@ out_nfserr: * returned. Otherwise the covered directory is returned. 
* NOTE: this mountpoint crossing is not supported properly by all * clients and is explicitly disallowed for NFSv3 - * NeilBrown <neilb@cse.unsw.edu.au> */ __be32 nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, @@ -826,26 +825,16 @@ nfsd_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct svc_rqst *rqstp = sd->u.data; struct page **pp = rqstp->rq_next_page; struct page *page = buf->page; - size_t size; - - size = sd->len; if (rqstp->rq_res.page_len == 0) { - get_page(page); - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; + svc_rqst_replace_page(rqstp, page); rqstp->rq_res.page_base = buf->offset; - rqstp->rq_res.page_len = size; } else if (page != pp[-1]) { - get_page(page); - if (*rqstp->rq_next_page) - put_page(*rqstp->rq_next_page); - *(rqstp->rq_next_page++) = page; - rqstp->rq_res.page_len += size; - } else - rqstp->rq_res.page_len += size; + svc_rqst_replace_page(rqstp, page); + } + rqstp->rq_res.page_len += sd->len; - return size; + return sd->len; } static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 68e8d61e28dd..62f8a7ac19c8 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -51,11 +51,9 @@ static const struct sysfs_ops nilfs_##name##_attr_ops = { \ #define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ static void nilfs_##name##_attr_release(struct kobject *kobj) \ { \ - struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ - struct the_nilfs *nilfs = container_of(kobj->parent, \ - struct the_nilfs, \ - ns_##parent_name##_kobj); \ - subgroups = nilfs->ns_##parent_name##_subgroups; \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups = container_of(kobj, \ + struct nilfs_sysfs_##parent_name##_subgroups, \ + sg_##name##_kobj); \ complete(&subgroups->sg_##name##_kobj_unregister); \ } \ static struct kobj_type nilfs_##name##_ktype = { \ @@ -81,12 +79,12 @@ static int 
nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \ #name); \ if (err) \ - return err; \ - return 0; \ + kobject_put(kobj); \ + return err; \ } \ static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ { \ - kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ + kobject_put(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ } /************************************************************************ @@ -197,14 +195,14 @@ int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) } if (err) - return err; + kobject_put(&root->snapshot_kobj); - return 0; + return err; } void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) { - kobject_del(&root->snapshot_kobj); + kobject_put(&root->snapshot_kobj); } /************************************************************************ @@ -986,7 +984,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb) err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL, "%s", sb->s_id); if (err) - goto free_dev_subgroups; + goto cleanup_dev_kobject; err = nilfs_sysfs_create_mounted_snapshots_group(nilfs); if (err) @@ -1023,9 +1021,7 @@ delete_mounted_snapshots_group: nilfs_sysfs_delete_mounted_snapshots_group(nilfs); cleanup_dev_kobject: - kobject_del(&nilfs->ns_dev_kobj); - -free_dev_subgroups: + kobject_put(&nilfs->ns_dev_kobj); kfree(nilfs->ns_dev_subgroups); failed_create_device_group: diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8b7b01a380ce..c8bfc01da5d7 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -792,14 +792,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) void nilfs_put_root(struct nilfs_root *root) { - if (refcount_dec_and_test(&root->count)) { - struct the_nilfs *nilfs = root->nilfs; + struct the_nilfs *nilfs = root->nilfs; - nilfs_sysfs_delete_snapshot_group(root); - - spin_lock(&nilfs->ns_cptree_lock); + if 
(refcount_dec_and_lock(&root->count, &nilfs->ns_cptree_lock)) { rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); + + nilfs_sysfs_delete_snapshot_group(root); iput(root->ifile); kfree(root); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 95006d1d29ab..fa1d99101f89 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -531,6 +531,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp, /* Someone else created list structure for us */ if (inode) fsnotify_put_inode_ref(inode); + fsnotify_put_sb_connectors(conn); kmem_cache_free(fsnotify_mark_connector_cachep, conn); } diff --git a/fs/ntfs3/Kconfig b/fs/ntfs3/Kconfig new file mode 100644 index 000000000000..6e4cbc48ab8e --- /dev/null +++ b/fs/ntfs3/Kconfig @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: GPL-2.0-only +config NTFS3_FS + tristate "NTFS Read-Write file system support" + select NLS + help + Windows OS native file system (NTFS) support up to NTFS version 3.1. + + Y or M enables the NTFS3 driver with full features enabled (read, + write, journal replaying, sparse/compressed files support). + File system type to use on mount is "ntfs3". Module name (M option) + is also "ntfs3". + + Documentation: <file:Documentation/filesystems/ntfs3.rst> + +config NTFS3_64BIT_CLUSTER + bool "64 bits per NTFS clusters" + depends on NTFS3_FS && 64BIT + help + Windows implementation of ntfs.sys uses 32 bits per clusters. + If activated 64 bits per clusters you will be able to use 4k cluster + for 16T+ volumes. Windows will not be able to mount such volumes. + + It is recommended to say N here. + +config NTFS3_LZX_XPRESS + bool "activate support of external compressions lzx/xpress" + depends on NTFS3_FS + help + In Windows 10 one can use command "compact" to compress any files. + 4 possible variants of compression are: xpress4k, xpress8k, xpress16k and lzx. + If activated you will be able to read such files correctly. + + It is recommended to say Y here. 
+ +config NTFS3_FS_POSIX_ACL + bool "NTFS POSIX Access Control Lists" + depends on NTFS3_FS + select FS_POSIX_ACL + help + POSIX Access Control Lists (ACLs) support additional access rights + for users and groups beyond the standard owner/group/world scheme, + and this option selects support for ACLs specifically for ntfs + filesystems. + NOTE: this is linux only feature. Windows will ignore these ACLs. + + If you don't know what Access Control Lists are, say N. diff --git a/fs/ntfs3/Makefile b/fs/ntfs3/Makefile new file mode 100644 index 000000000000..279701b62bbe --- /dev/null +++ b/fs/ntfs3/Makefile @@ -0,0 +1,36 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the ntfs3 filesystem support. +# + +# to check robot warnings +ccflags-y += -Wint-to-pointer-cast \ + $(call cc-option,-Wunused-but-set-variable,-Wunused-const-variable) \ + $(call cc-option,-Wold-style-declaration,-Wout-of-line-declaration) + +obj-$(CONFIG_NTFS3_FS) += ntfs3.o + +ntfs3-y := attrib.o \ + attrlist.o \ + bitfunc.o \ + bitmap.o \ + dir.o \ + fsntfs.o \ + frecord.o \ + file.o \ + fslog.o \ + inode.o \ + index.o \ + lznt.o \ + namei.o \ + record.o \ + run.o \ + super.o \ + upcase.o \ + xattr.o + +ntfs3-$(CONFIG_NTFS3_LZX_XPRESS) += $(addprefix lib/,\ + decompress_common.o \ + lzx_decompress.o \ + xpress_decompress.o \ + )
\ No newline at end of file diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c new file mode 100644 index 000000000000..34c4cbf7e29b --- /dev/null +++ b/fs/ntfs3/attrib.c @@ -0,0 +1,2093 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame? + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/hash.h> +#include <linux/nls.h> +#include <linux/ratelimit.h> +#include <linux/slab.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * You can set external NTFS_MIN_LOG2_OF_CLUMP/NTFS_MAX_LOG2_OF_CLUMP to manage + * preallocate algorithm. + */ +#ifndef NTFS_MIN_LOG2_OF_CLUMP +#define NTFS_MIN_LOG2_OF_CLUMP 16 +#endif + +#ifndef NTFS_MAX_LOG2_OF_CLUMP +#define NTFS_MAX_LOG2_OF_CLUMP 26 +#endif + +// 16M +#define NTFS_CLUMP_MIN (1 << (NTFS_MIN_LOG2_OF_CLUMP + 8)) +// 16G +#define NTFS_CLUMP_MAX (1ull << (NTFS_MAX_LOG2_OF_CLUMP + 8)) + +static inline u64 get_pre_allocated(u64 size) +{ + u32 clump; + u8 align_shift; + u64 ret; + + if (size <= NTFS_CLUMP_MIN) { + clump = 1 << NTFS_MIN_LOG2_OF_CLUMP; + align_shift = NTFS_MIN_LOG2_OF_CLUMP; + } else if (size >= NTFS_CLUMP_MAX) { + clump = 1 << NTFS_MAX_LOG2_OF_CLUMP; + align_shift = NTFS_MAX_LOG2_OF_CLUMP; + } else { + align_shift = NTFS_MIN_LOG2_OF_CLUMP - 1 + + __ffs(size >> (8 + NTFS_MIN_LOG2_OF_CLUMP)); + clump = 1u << align_shift; + } + + ret = (((size + clump - 1) >> align_shift)) << align_shift; + + return ret; +} + +/* + * attr_must_be_resident + * + * Return: True if attribute must be resident. 
+ */ +static inline bool attr_must_be_resident(struct ntfs_sb_info *sbi, + enum ATTR_TYPE type) +{ + const struct ATTR_DEF_ENTRY *de; + + switch (type) { + case ATTR_STD: + case ATTR_NAME: + case ATTR_ID: + case ATTR_LABEL: + case ATTR_VOL_INFO: + case ATTR_ROOT: + case ATTR_EA_INFO: + return true; + default: + de = ntfs_query_def(sbi, type); + if (de && (de->flags & NTFS_ATTR_MUST_BE_RESIDENT)) + return true; + return false; + } +} + +/* + * attr_load_runs - Load all runs stored in @attr. + */ +int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni, + struct runs_tree *run, const CLST *vcn) +{ + int err; + CLST svcn = le64_to_cpu(attr->nres.svcn); + CLST evcn = le64_to_cpu(attr->nres.evcn); + u32 asize; + u16 run_off; + + if (svcn >= evcn + 1 || run_is_mapped_full(run, svcn, evcn)) + return 0; + + if (vcn && (evcn < *vcn || *vcn < svcn)) + return -EINVAL; + + asize = le32_to_cpu(attr->size); + run_off = le16_to_cpu(attr->nres.run_off); + err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, + vcn ? *vcn : svcn, Add2Ptr(attr, run_off), + asize - run_off); + if (err < 0) + return err; + + return 0; +} + +/* + * run_deallocate_ex - Deallocate clusters. + */ +static int run_deallocate_ex(struct ntfs_sb_info *sbi, struct runs_tree *run, + CLST vcn, CLST len, CLST *done, bool trim) +{ + int err = 0; + CLST vcn_next, vcn0 = vcn, lcn, clen, dn = 0; + size_t idx; + + if (!len) + goto out; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { +failed: + run_truncate(run, vcn0); + err = -EINVAL; + goto out; + } + + for (;;) { + if (clen > len) + clen = len; + + if (!clen) { + err = -EINVAL; + goto out; + } + + if (lcn != SPARSE_LCN) { + mark_as_free_ex(sbi, lcn, clen, trim); + dn += clen; + } + + len -= clen; + if (!len) + break; + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn != vcn_next) { + /* Save memory - don't load entire run. 
*/ + goto failed; + } + } + +out: + if (done) + *done += dn; + + return err; +} + +/* + * attr_allocate_clusters - Find free space, mark it as used and store in @run. + */ +int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, + CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, + enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, + CLST *new_lcn) +{ + int err; + CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + size_t cnt = run->count; + + for (;;) { + err = ntfs_look_for_free_space(sbi, lcn, len + pre, &lcn, &flen, + opt); + + if (err == -ENOSPC && pre) { + pre = 0; + if (*pre_alloc) + *pre_alloc = 0; + continue; + } + + if (err) + goto out; + + if (new_lcn && vcn == vcn0) + *new_lcn = lcn; + + /* Add new fragment into run storage. */ + if (!run_add_entry(run, vcn, lcn, flen, opt == ALLOCATE_MFT)) { + /* Undo last 'ntfs_look_for_free_space' */ + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + wnd_set_free(wnd, lcn, flen); + up_write(&wnd->rw_lock); + err = -ENOMEM; + goto out; + } + + vcn += flen; + + if (flen >= len || opt == ALLOCATE_MFT || + (fr && run->count - cnt >= fr)) { + *alen = vcn - vcn0; + return 0; + } + + len -= flen; + } + +out: + /* Undo 'ntfs_look_for_free_space' */ + if (vcn - vcn0) { + run_deallocate_ex(sbi, run, vcn0, vcn - vcn0, NULL, false); + run_truncate(run, vcn0); + } + + return err; +} + +/* + * attr_make_nonresident + * + * If page is not NULL - it is already contains resident data + * and locked (called from ni_write_frame()). 
+ */ +int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, + u64 new_size, struct runs_tree *run, + struct ATTRIB **ins_attr, struct page *page) +{ + struct ntfs_sb_info *sbi; + struct ATTRIB *attr_s; + struct MFT_REC *rec; + u32 used, asize, rsize, aoff, align; + bool is_data; + CLST len, alen; + char *next; + int err; + + if (attr->non_res) { + *ins_attr = attr; + return 0; + } + + sbi = mi->sbi; + rec = mi->mrec; + attr_s = NULL; + used = le32_to_cpu(rec->used); + asize = le32_to_cpu(attr->size); + next = Add2Ptr(attr, asize); + aoff = PtrOffset(rec, attr); + rsize = le32_to_cpu(attr->res.data_size); + is_data = attr->type == ATTR_DATA && !attr->name_len; + + align = sbi->cluster_size; + if (is_attr_compressed(attr)) + align <<= COMPRESSION_UNIT; + len = (rsize + align - 1) >> sbi->cluster_bits; + + run_init(run); + + /* Make a copy of original attribute. */ + attr_s = kmemdup(attr, asize, GFP_NOFS); + if (!attr_s) { + err = -ENOMEM; + goto out; + } + + if (!len) { + /* Empty resident -> Empty nonresident. */ + alen = 0; + } else { + const char *data = resident_data(attr); + + err = attr_allocate_clusters(sbi, run, 0, 0, len, NULL, + ALLOCATE_DEF, &alen, 0, NULL); + if (err) + goto out1; + + if (!rsize) { + /* Empty resident -> Non empty nonresident. */ + } else if (!is_data) { + err = ntfs_sb_write_run(sbi, run, 0, data, rsize); + if (err) + goto out2; + } else if (!page) { + char *kaddr; + + page = grab_cache_page(ni->vfs_inode.i_mapping, 0); + if (!page) { + err = -ENOMEM; + goto out2; + } + kaddr = kmap_atomic(page); + memcpy(kaddr, data, rsize); + memset(kaddr + rsize, 0, PAGE_SIZE - rsize); + kunmap_atomic(kaddr); + flush_dcache_page(page); + SetPageUptodate(page); + set_page_dirty(page); + unlock_page(page); + put_page(page); + } + } + + /* Remove original attribute. 
*/ + used -= asize; + memmove(attr, Add2Ptr(attr, asize), used - aoff); + rec->used = cpu_to_le32(used); + mi->dirty = true; + if (le) + al_remove_le(ni, le); + + err = ni_insert_nonresident(ni, attr_s->type, attr_name(attr_s), + attr_s->name_len, run, 0, alen, + attr_s->flags, &attr, NULL); + if (err) + goto out3; + + kfree(attr_s); + attr->nres.data_size = cpu_to_le64(rsize); + attr->nres.valid_size = attr->nres.data_size; + + *ins_attr = attr; + + if (is_data) + ni->ni_flags &= ~NI_FLAG_RESIDENT; + + /* Resident attribute becomes non resident. */ + return 0; + +out3: + attr = Add2Ptr(rec, aoff); + memmove(next, attr, used - aoff); + memcpy(attr, attr_s, asize); + rec->used = cpu_to_le32(used + asize); + mi->dirty = true; +out2: + /* Undo: do not trim new allocated clusters. */ + run_deallocate(sbi, run, false); + run_close(run); +out1: + kfree(attr_s); +out: + return err; +} + +/* + * attr_set_size_res - Helper for attr_set_size(). + */ +static int attr_set_size_res(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, + u64 new_size, struct runs_tree *run, + struct ATTRIB **ins_attr) +{ + struct ntfs_sb_info *sbi = mi->sbi; + struct MFT_REC *rec = mi->mrec; + u32 used = le32_to_cpu(rec->used); + u32 asize = le32_to_cpu(attr->size); + u32 aoff = PtrOffset(rec, attr); + u32 rsize = le32_to_cpu(attr->res.data_size); + u32 tail = used - aoff - asize; + char *next = Add2Ptr(attr, asize); + s64 dsize = ALIGN(new_size, 8) - ALIGN(rsize, 8); + + if (dsize < 0) { + memmove(next + dsize, next, tail); + } else if (dsize > 0) { + if (used + dsize > sbi->max_bytes_per_attr) + return attr_make_nonresident(ni, attr, le, mi, new_size, + run, ins_attr, NULL); + + memmove(next + dsize, next, tail); + memset(next, 0, dsize); + } + + if (new_size > rsize) + memset(Add2Ptr(resident_data(attr), rsize), 0, + new_size - rsize); + + rec->used = cpu_to_le32(used + dsize); + attr->size = cpu_to_le32(asize + dsize); + attr->res.data_size = 
cpu_to_le32(new_size); + mi->dirty = true; + *ins_attr = attr; + + return 0; +} + +/* + * attr_set_size - Change the size of attribute. + * + * Extend: + * - Sparse/compressed: No allocated clusters. + * - Normal: Append allocated and preallocated new clusters. + * Shrink: + * - No deallocate if @keep_prealloc is set. + */ +int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + u64 new_size, const u64 *new_valid, bool keep_prealloc, + struct ATTRIB **ret) +{ + int err = 0; + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 cluster_bits = sbi->cluster_bits; + bool is_mft = + ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && !name_len; + u64 old_valid, old_size, old_alloc, new_alloc, new_alloc_tmp; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn; + CLST next_svcn, pre_alloc = -1, done = 0; + bool is_ext; + u32 align; + struct MFT_REC *rec; + +again: + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL, + &mi_b); + if (!attr_b) { + err = -ENOENT; + goto out; + } + + if (!attr_b->non_res) { + err = attr_set_size_res(ni, attr_b, le_b, mi_b, new_size, run, + &attr_b); + if (err || !attr_b->non_res) + goto out; + + /* Layout of records may be changed, so do a full search. 
*/ + goto again; + } + + is_ext = is_attr_ext(attr_b); + +again_1: + align = sbi->cluster_size; + + if (is_ext) { + align <<= attr_b->nres.c_unit; + if (is_attr_sparsed(attr_b)) + keep_prealloc = false; + } + + old_valid = le64_to_cpu(attr_b->nres.valid_size); + old_size = le64_to_cpu(attr_b->nres.data_size); + old_alloc = le64_to_cpu(attr_b->nres.alloc_size); + old_alen = old_alloc >> cluster_bits; + + new_alloc = (new_size + align - 1) & ~(u64)(align - 1); + new_alen = new_alloc >> cluster_bits; + + if (keep_prealloc && is_ext) + keep_prealloc = false; + + if (keep_prealloc && new_size < old_size) { + attr_b->nres.data_size = cpu_to_le64(new_size); + mi_b->dirty = true; + goto ok; + } + + vcn = old_alen - 1; + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn = le64_to_cpu(attr_b->nres.evcn); + + if (svcn <= vcn && vcn <= evcn) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto out; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + +next_le_1: + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + } + +next_le: + rec = mi->mrec; + + err = attr_load_runs(attr, ni, run, NULL); + if (err) + goto out; + + if (new_size > old_size) { + CLST to_allocate; + size_t free; + + if (new_alloc <= old_alloc) { + attr_b->nres.data_size = cpu_to_le64(new_size); + mi_b->dirty = true; + goto ok; + } + + to_allocate = new_alen - old_alen; +add_alloc_in_same_attr_seg: + lcn = 0; + if (is_mft) { + /* MFT allocates clusters from MFT zone. */ + pre_alloc = 0; + } else if (is_ext) { + /* No preallocate for sparse/compress. */ + pre_alloc = 0; + } else if (pre_alloc == -1) { + pre_alloc = 0; + if (type == ATTR_DATA && !name_len && + sbi->options.prealloc) { + CLST new_alen2 = bytes_to_cluster( + sbi, get_pre_allocated(new_size)); + pre_alloc = new_alen2 - new_alen; + } + + /* Get the last LCN to allocate from. 
*/ + if (old_alen && + !run_lookup_entry(run, vcn, &lcn, NULL, NULL)) { + lcn = SPARSE_LCN; + } + + if (lcn == SPARSE_LCN) + lcn = 0; + else if (lcn) + lcn += 1; + + free = wnd_zeroes(&sbi->used.bitmap); + if (to_allocate > free) { + err = -ENOSPC; + goto out; + } + + if (pre_alloc && to_allocate + pre_alloc > free) + pre_alloc = 0; + } + + vcn = old_alen; + + if (is_ext) { + if (!run_add_entry(run, vcn, SPARSE_LCN, to_allocate, + false)) { + err = -ENOMEM; + goto out; + } + alen = to_allocate; + } else { + /* ~3 bytes per fragment. */ + err = attr_allocate_clusters( + sbi, run, vcn, lcn, to_allocate, &pre_alloc, + is_mft ? ALLOCATE_MFT : 0, &alen, + is_mft ? 0 + : (sbi->record_size - + le32_to_cpu(rec->used) + 8) / + 3 + + 1, + NULL); + if (err) + goto out; + } + + done += alen; + vcn += alen; + if (to_allocate > alen) + to_allocate -= alen; + else + to_allocate = 0; + +pack_runs: + err = mi_pack_runs(mi, attr, run, vcn - svcn); + if (err) + goto out; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + new_alloc_tmp = (u64)next_svcn << cluster_bits; + attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp); + mi_b->dirty = true; + + if (next_svcn >= vcn && !to_allocate) { + /* Normal way. Update attribute and exit. */ + attr_b->nres.data_size = cpu_to_le64(new_size); + goto ok; + } + + /* At least two MFT to avoid recursive loop. */ + if (is_mft && next_svcn == vcn && + ((u64)done << sbi->cluster_bits) >= 2 * sbi->record_size) { + new_size = new_alloc_tmp; + attr_b->nres.data_size = attr_b->nres.alloc_size; + goto ok; + } + + if (le32_to_cpu(rec->used) < sbi->record_size) { + old_alen = next_svcn; + evcn = old_alen - 1; + goto add_alloc_in_same_attr_seg; + } + + attr_b->nres.data_size = attr_b->nres.alloc_size; + if (new_alloc_tmp < old_valid) + attr_b->nres.valid_size = attr_b->nres.data_size; + + if (type == ATTR_LIST) { + err = ni_expand_list(ni); + if (err) + goto out; + if (next_svcn < vcn) + goto pack_runs; + + /* Layout of records is changed. 
*/ + goto again; + } + + if (!ni->attr_list.size) { + err = ni_create_attr_list(ni); + if (err) + goto out; + /* Layout of records is changed. */ + } + + if (next_svcn >= vcn) { + /* This is MFT data, repeat. */ + goto again; + } + + /* Insert new attribute segment. */ + err = ni_insert_nonresident(ni, type, name, name_len, run, + next_svcn, vcn - next_svcn, + attr_b->flags, &attr, &mi); + if (err) + goto out; + + if (!is_mft) + run_truncate_head(run, evcn + 1); + + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + + le_b = NULL; + /* + * Layout of records maybe changed. + * Find base attribute to update. + */ + attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, + NULL, &mi_b); + if (!attr_b) { + err = -ENOENT; + goto out; + } + + attr_b->nres.alloc_size = cpu_to_le64((u64)vcn << cluster_bits); + attr_b->nres.data_size = attr_b->nres.alloc_size; + attr_b->nres.valid_size = attr_b->nres.alloc_size; + mi_b->dirty = true; + goto again_1; + } + + if (new_size != old_size || + (new_alloc != old_alloc && !keep_prealloc)) { + vcn = max(svcn, new_alen); + new_alloc_tmp = (u64)vcn << cluster_bits; + + alen = 0; + err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &alen, + true); + if (err) + goto out; + + run_truncate(run, vcn); + + if (vcn > svcn) { + err = mi_pack_runs(mi, attr, run, vcn - svcn); + if (err) + goto out; + } else if (le && le->vcn) { + u16 le_sz = le16_to_cpu(le->size); + + /* + * NOTE: List entries for one attribute are always + * the same size. We deal with last entry (vcn==0) + * and it is not first in entries array + * (list entry for std attribute always first). + * So it is safe to step back. 
+ */ + mi_remove_attr(NULL, mi, attr); + + if (!al_remove_le(ni, le)) { + err = -EINVAL; + goto out; + } + + le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz); + } else { + attr->nres.evcn = cpu_to_le64((u64)vcn - 1); + mi->dirty = true; + } + + attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp); + + if (vcn == new_alen) { + attr_b->nres.data_size = cpu_to_le64(new_size); + if (new_size < old_valid) + attr_b->nres.valid_size = + attr_b->nres.data_size; + } else { + if (new_alloc_tmp <= + le64_to_cpu(attr_b->nres.data_size)) + attr_b->nres.data_size = + attr_b->nres.alloc_size; + if (new_alloc_tmp < + le64_to_cpu(attr_b->nres.valid_size)) + attr_b->nres.valid_size = + attr_b->nres.alloc_size; + } + + if (is_ext) + le64_sub_cpu(&attr_b->nres.total_size, + ((u64)alen << cluster_bits)); + + mi_b->dirty = true; + + if (new_alloc_tmp <= new_alloc) + goto ok; + + old_size = new_alloc_tmp; + vcn = svcn - 1; + + if (le == le_b) { + attr = attr_b; + mi = mi_b; + evcn = svcn - 1; + svcn = 0; + goto next_le; + } + + if (le->type != type || le->name_len != name_len || + memcmp(le_name(le), name, name_len * sizeof(short))) { + err = -EINVAL; + goto out; + } + + err = ni_load_mi(ni, le, &mi); + if (err) + goto out; + + attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); + if (!attr) { + err = -EINVAL; + goto out; + } + goto next_le_1; + } + +ok: + if (new_valid) { + __le64 valid = cpu_to_le64(min(*new_valid, new_size)); + + if (attr_b->nres.valid_size != valid) { + attr_b->nres.valid_size = valid; + mi_b->dirty = true; + } + } + +out: + if (!err && attr_b && ret) + *ret = attr_b; + + /* Update inode_set_bytes. 
*/ + if (!err && ((type == ATTR_DATA && !name_len) || + (type == ATTR_ALLOC && name == I30_NAME))) { + bool dirty = false; + + if (ni->vfs_inode.i_size != new_size) { + ni->vfs_inode.i_size = new_size; + dirty = true; + } + + if (attr_b && attr_b->non_res) { + new_alloc = le64_to_cpu(attr_b->nres.alloc_size); + if (inode_get_bytes(&ni->vfs_inode) != new_alloc) { + inode_set_bytes(&ni->vfs_inode, new_alloc); + dirty = true; + } + } + + if (dirty) { + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + mark_inode_dirty(&ni->vfs_inode); + } + } + + return err; +} + +int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, + CLST *len, bool *new) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi; + u8 cluster_bits; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end; + u64 total_size; + u32 clst_per_frame; + bool ok; + + if (new) + *new = false; + + down_read(&ni->file.run_lock); + ok = run_lookup_entry(run, vcn, lcn, len, NULL); + up_read(&ni->file.run_lock); + + if (ok && (*lcn != SPARSE_LCN || !new)) { + /* Normal way. 
*/ + return 0; + } + + if (!clen) + clen = 1; + + if (ok && clen > *len) + clen = *len; + + sbi = ni->mi.sbi; + cluster_bits = sbi->cluster_bits; + + ni_lock(ni); + down_write(&ni->file.run_lock); + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) { + err = -ENOENT; + goto out; + } + + if (!attr_b->non_res) { + *lcn = RESIDENT_LCN; + *len = 1; + goto out; + } + + asize = le64_to_cpu(attr_b->nres.alloc_size) >> sbi->cluster_bits; + if (vcn >= asize) { + err = -EINVAL; + goto out; + } + + clst_per_frame = 1u << attr_b->nres.c_unit; + to_alloc = (clen + clst_per_frame - 1) & ~(clst_per_frame - 1); + + if (vcn + to_alloc > asize) + to_alloc = asize - vcn; + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + + attr = attr_b; + le = le_b; + mi = mi_b; + + if (le_b && (vcn < svcn || evcn1 <= vcn)) { + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + err = attr_load_runs(attr, ni, run, NULL); + if (err) + goto out; + + if (!ok) { + ok = run_lookup_entry(run, vcn, lcn, len, NULL); + if (ok && (*lcn != SPARSE_LCN || !new)) { + /* Normal way. */ + err = 0; + goto ok; + } + + if (!ok && !new) { + *len = 0; + err = 0; + goto ok; + } + + if (ok && clen > *len) { + clen = *len; + to_alloc = (clen + clst_per_frame - 1) & + ~(clst_per_frame - 1); + } + } + + if (!is_attr_ext(attr_b)) { + err = -EINVAL; + goto out; + } + + /* Get the last LCN to allocate from. 
*/ + hint = 0; + + if (vcn > evcn1) { + if (!run_add_entry(run, evcn1, SPARSE_LCN, vcn - evcn1, + false)) { + err = -ENOMEM; + goto out; + } + } else if (vcn && !run_lookup_entry(run, vcn - 1, &hint, NULL, NULL)) { + hint = -1; + } + + err = attr_allocate_clusters( + sbi, run, vcn, hint + 1, to_alloc, NULL, 0, len, + (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1, + lcn); + if (err) + goto out; + *new = true; + + end = vcn + *len; + + total_size = le64_to_cpu(attr_b->nres.total_size) + + ((u64)*len << cluster_bits); + +repack: + err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn); + if (err) + goto out; + + attr_b->nres.total_size = cpu_to_le64(total_size); + inode_set_bytes(&ni->vfs_inode, total_size); + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + + mi_b->dirty = true; + mark_inode_dirty(&ni->vfs_inode); + + /* Stored [vcn : next_svcn) from [vcn : end). */ + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + + if (end <= evcn1) { + if (next_svcn == evcn1) { + /* Normal way. Update attribute and exit. */ + goto ok; + } + /* Add new segment [next_svcn : evcn1 - next_svcn). */ + if (!ni->attr_list.size) { + err = ni_create_attr_list(ni); + if (err) + goto out; + /* Layout of records is changed. */ + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, + 0, NULL, &mi_b); + if (!attr_b) { + err = -ENOENT; + goto out; + } + + attr = attr_b; + le = le_b; + mi = mi_b; + goto repack; + } + } + + svcn = evcn1; + + /* Estimate next attribute. */ + attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi); + + if (attr) { + CLST alloc = bytes_to_cluster( + sbi, le64_to_cpu(attr_b->nres.alloc_size)); + CLST evcn = le64_to_cpu(attr->nres.evcn); + + if (end < next_svcn) + end = next_svcn; + while (end > evcn) { + /* Remove segment [svcn : evcn). */ + mi_remove_attr(NULL, mi, attr); + + if (!al_remove_le(ni, le)) { + err = -EINVAL; + goto out; + } + + if (evcn + 1 >= alloc) { + /* Last attribute segment. 
*/ + evcn1 = evcn + 1; + goto ins_ext; + } + + /* Propagate a load failure: err is 0 at this point, so a bare goto would report success. */ + err = ni_load_mi(ni, le, &mi); + if (err) { + attr = NULL; + goto out; + } + + attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, + &le->id); + if (!attr) { + err = -EINVAL; + goto out; + } + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + } + + if (end < svcn) + end = svcn; + + err = attr_load_runs(attr, ni, run, &end); + if (err) + goto out; + + evcn1 = evcn + 1; + attr->nres.svcn = cpu_to_le64(next_svcn); + err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); + if (err) + goto out; + + le->vcn = cpu_to_le64(next_svcn); + ni->attr_list.dirty = true; + mi->dirty = true; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + } +ins_ext: + if (evcn1 > next_svcn) { + err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, + next_svcn, evcn1 - next_svcn, + attr_b->flags, &attr, &mi); + if (err) + goto out; + } +ok: + run_truncate_around(run, vcn); +out: + up_write(&ni->file.run_lock); + ni_unlock(ni); + + return err; +} + +int attr_data_read_resident(struct ntfs_inode *ni, struct page *page) +{ + u64 vbo; + struct ATTRIB *attr; + u32 data_size; + + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, NULL); + if (!attr) + return -EINVAL; + + if (attr->non_res) + return E_NTFS_NONRESIDENT; + + vbo = page->index << PAGE_SHIFT; + data_size = le32_to_cpu(attr->res.data_size); + if (vbo < data_size) { + const char *data = resident_data(attr); + char *kaddr = kmap_atomic(page); + u32 use = data_size - vbo; + + if (use > PAGE_SIZE) + use = PAGE_SIZE; + + memcpy(kaddr, data + vbo, use); + memset(kaddr + use, 0, PAGE_SIZE - use); + kunmap_atomic(kaddr); + flush_dcache_page(page); + SetPageUptodate(page); + } else if (!PageUptodate(page)) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); + } + + return 0; +} + +int attr_data_write_resident(struct ntfs_inode *ni, struct page *page) +{ + u64 vbo; + struct mft_inode *mi; + struct ATTRIB *attr; + u32 data_size; + + attr = ni_find_attr(ni, NULL,
NULL, ATTR_DATA, NULL, 0, NULL, &mi); + if (!attr) + return -EINVAL; + + if (attr->non_res) { + /* Return special error code to check this case. */ + return E_NTFS_NONRESIDENT; + } + + vbo = page->index << PAGE_SHIFT; + data_size = le32_to_cpu(attr->res.data_size); + if (vbo < data_size) { + char *data = resident_data(attr); + char *kaddr = kmap_atomic(page); + u32 use = data_size - vbo; + + if (use > PAGE_SIZE) + use = PAGE_SIZE; + memcpy(data + vbo, kaddr, use); + kunmap_atomic(kaddr); + mi->dirty = true; + } + ni->i_valid = data_size; + + return 0; +} + +/* + * attr_load_runs_vcn - Load runs with VCN. + */ +int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + CLST vcn) +{ + struct ATTRIB *attr; + int err; + CLST svcn, evcn; + u16 ro; + + attr = ni_find_attr(ni, NULL, NULL, type, name, name_len, &vcn, NULL); + if (!attr) { + /* Is record corrupted? */ + return -ENOENT; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + + if (evcn < vcn || vcn < svcn) { + /* Is record corrupted? */ + return -EINVAL; + } + + ro = le16_to_cpu(attr->nres.run_off); + err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, svcn, + Add2Ptr(attr, ro), le32_to_cpu(attr->size) - ro); + if (err < 0) + return err; + return 0; +} + +/* + * attr_load_runs_range - Load runs for given range [from to). 
+ */ +int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + u64 from, u64 to) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 cluster_bits = sbi->cluster_bits; + CLST vcn = from >> cluster_bits; + CLST vcn_last = (to - 1) >> cluster_bits; + CLST lcn, clen; + int err; + + for (vcn = from >> cluster_bits; vcn <= vcn_last; vcn += clen) { + if (!run_lookup_entry(run, vcn, &lcn, &clen, NULL)) { + err = attr_load_runs_vcn(ni, type, name, name_len, run, + vcn); + if (err) + return err; + clen = 0; /* Next run_lookup_entry(vcn) must be success. */ + } + } + + return 0; +} + +#ifdef CONFIG_NTFS3_LZX_XPRESS +/* + * attr_wof_frame_info + * + * Read header of Xpress/LZX file to get info about frame. + */ +int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, + struct runs_tree *run, u64 frame, u64 frames, + u8 frame_bits, u32 *ondisk_size, u64 *vbo_data) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + u64 vbo[2], off[2], wof_size; + u32 voff; + u8 bytes_per_off; + char *addr; + struct page *page; + /* err must start at 0: the read loop assigns it only when a page has to be (re)loaded. */ + int i, err = 0; + __le32 *off32; + __le64 *off64; + + if (ni->vfs_inode.i_size < 0x100000000ull) { + /* File starts with array of 32 bit offsets. */ + bytes_per_off = sizeof(__le32); + vbo[1] = frame << 2; + *vbo_data = frames << 2; + } else { + /* File starts with array of 64 bit offsets. */ + bytes_per_off = sizeof(__le64); + vbo[1] = frame << 3; + *vbo_data = frames << 3; + } + + /* + * Read 4/8 bytes at [vbo - 4(8)] == offset where compressed frame starts. + * Read 4/8 bytes at [vbo] == offset where compressed frame ends. + */ + if (!attr->non_res) { + if (vbo[1] + bytes_per_off > le32_to_cpu(attr->res.data_size)) { + ntfs_inode_err(&ni->vfs_inode, "is corrupted"); + return -EINVAL; + } + addr = resident_data(attr); + + if (bytes_per_off == sizeof(__le32)) { + off32 = Add2Ptr(addr, vbo[1]); + off[0] = vbo[1] ?
le32_to_cpu(off32[-1]) : 0; + off[1] = le32_to_cpu(off32[0]); + } else { + off64 = Add2Ptr(addr, vbo[1]); + off[0] = vbo[1] ? le64_to_cpu(off64[-1]) : 0; + off[1] = le64_to_cpu(off64[0]); + } + + *vbo_data += off[0]; + *ondisk_size = off[1] - off[0]; + return 0; + } + + wof_size = le64_to_cpu(attr->nres.data_size); + down_write(&ni->file.run_lock); + page = ni->file.offs_page; + if (!page) { + page = alloc_page(GFP_KERNEL); + if (!page) { + err = -ENOMEM; + goto out; + } + page->index = -1; + ni->file.offs_page = page; + } + lock_page(page); + addr = page_address(page); + + if (vbo[1]) { + voff = vbo[1] & (PAGE_SIZE - 1); + vbo[0] = vbo[1] - bytes_per_off; + i = 0; + } else { + voff = 0; + vbo[0] = 0; + off[0] = 0; + i = 1; + } + + do { + pgoff_t index = vbo[i] >> PAGE_SHIFT; + + if (index != page->index) { + u64 from = vbo[i] & ~(u64)(PAGE_SIZE - 1); + u64 to = min(from + PAGE_SIZE, wof_size); + + err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME, + ARRAY_SIZE(WOF_NAME), run, + from, to); + if (err) + goto out1; + + err = ntfs_bio_pages(sbi, run, &page, 1, from, + to - from, REQ_OP_READ); + if (err) { + page->index = -1; + goto out1; + } + page->index = index; + } + + if (i) { + if (bytes_per_off == sizeof(__le32)) { + off32 = Add2Ptr(addr, voff); + off[1] = le32_to_cpu(*off32); + } else { + off64 = Add2Ptr(addr, voff); + off[1] = le64_to_cpu(*off64); + } + } else if (!voff) { + if (bytes_per_off == sizeof(__le32)) { + off32 = Add2Ptr(addr, PAGE_SIZE - sizeof(u32)); + off[0] = le32_to_cpu(*off32); + } else { + off64 = Add2Ptr(addr, PAGE_SIZE - sizeof(u64)); + off[0] = le64_to_cpu(*off64); + } + } else { + /* Two values in one page. 
*/ + if (bytes_per_off == sizeof(__le32)) { + off32 = Add2Ptr(addr, voff); + off[0] = le32_to_cpu(off32[-1]); + off[1] = le32_to_cpu(off32[0]); + } else { + off64 = Add2Ptr(addr, voff); + off[0] = le64_to_cpu(off64[-1]); + off[1] = le64_to_cpu(off64[0]); + } + break; + } + } while (++i < 2); + + *vbo_data += off[0]; + *ondisk_size = off[1] - off[0]; + +out1: + unlock_page(page); +out: + up_write(&ni->file.run_lock); + return err; +} +#endif + +/* + * attr_is_frame_compressed - Used to detect compressed frame. + */ +int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, + CLST frame, CLST *clst_data) +{ + int err; + u32 clst_frame; + CLST clen, lcn, vcn, alen, slen, vcn_next; + size_t idx; + struct runs_tree *run; + + *clst_data = 0; + + if (!is_attr_compressed(attr)) + return 0; + + if (!attr->non_res) + return 0; + + clst_frame = 1u << attr->nres.c_unit; + vcn = frame * clst_frame; + run = &ni->file.run; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { + err = attr_load_runs_vcn(ni, attr->type, attr_name(attr), + attr->name_len, run, vcn); + if (err) + return err; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) + return -EINVAL; + } + + if (lcn == SPARSE_LCN) { + /* Sparsed frame. */ + return 0; + } + + if (clen >= clst_frame) { + /* + * The frame is not compressed 'cause + * it does not contain any sparse clusters. + */ + *clst_data = clst_frame; + return 0; + } + + alen = bytes_to_cluster(ni->mi.sbi, le64_to_cpu(attr->nres.alloc_size)); + slen = 0; + *clst_data = clen; + + /* + * The frame is compressed if *clst_data + slen >= clst_frame. + * Check next fragments. 
+ */ + while ((vcn += clen) < alen) { + vcn_next = vcn; + + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn_next != vcn) { + err = attr_load_runs_vcn(ni, attr->type, + attr_name(attr), + attr->name_len, run, vcn_next); + if (err) + return err; + vcn = vcn_next; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) + return -EINVAL; + } + + if (lcn == SPARSE_LCN) { + slen += clen; + } else { + if (slen) { + /* + * Data_clusters + sparse_clusters = + * not enough for frame. + */ + return -EINVAL; + } + *clst_data += clen; + } + + if (*clst_data + slen >= clst_frame) { + if (!slen) { + /* + * There is no sparsed clusters in this frame + * so it is not compressed. + */ + *clst_data = clst_frame; + } else { + /* Frame is compressed. */ + } + break; + } + } + + return 0; +} + +/* + * attr_allocate_frame - Allocate/free clusters for @frame. + * + * Assumed: down_write(&ni->file.run_lock); + */ +int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, + u64 new_valid) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST svcn, evcn1, next_svcn, lcn, len; + CLST vcn, end, clst_data; + u64 total_size, valid_size, data_size; + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) + return -ENOENT; + + if (!is_attr_ext(attr_b)) + return -EINVAL; + + vcn = frame << NTFS_LZNT_CUNIT; + total_size = le64_to_cpu(attr_b->nres.total_size); + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + data_size = le64_to_cpu(attr_b->nres.data_size); + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto out; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto 
out; + } + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + err = attr_load_runs(attr, ni, run, NULL); + if (err) + goto out; + + err = attr_is_frame_compressed(ni, attr_b, frame, &clst_data); + if (err) + goto out; + + total_size -= (u64)clst_data << sbi->cluster_bits; + + len = bytes_to_cluster(sbi, compr_size); + + if (len == clst_data) + goto out; + + if (len < clst_data) { + err = run_deallocate_ex(sbi, run, vcn + len, clst_data - len, + NULL, true); + if (err) + goto out; + + if (!run_add_entry(run, vcn + len, SPARSE_LCN, clst_data - len, + false)) { + err = -ENOMEM; + goto out; + } + end = vcn + clst_data; + /* Run contains updated range [vcn + len : end). */ + } else { + CLST alen, hint = 0; + /* Get the last LCN to allocate from. */ + if (vcn + clst_data && + !run_lookup_entry(run, vcn + clst_data - 1, &hint, NULL, + NULL)) { + hint = -1; + } + + err = attr_allocate_clusters(sbi, run, vcn + clst_data, + hint + 1, len - clst_data, NULL, 0, + &alen, 0, &lcn); + if (err) + goto out; + + end = vcn + len; + /* Run contains updated range [vcn + clst_data : end). */ + } + + total_size += (u64)len << sbi->cluster_bits; + +repack: + err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn); + if (err) + goto out; + + attr_b->nres.total_size = cpu_to_le64(total_size); + inode_set_bytes(&ni->vfs_inode, total_size); + + mi_b->dirty = true; + mark_inode_dirty(&ni->vfs_inode); + + /* Stored [vcn : next_svcn) from [vcn : end). */ + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + + if (end <= evcn1) { + if (next_svcn == evcn1) { + /* Normal way. Update attribute and exit. */ + goto ok; + } + /* Add new segment [next_svcn : evcn1 - next_svcn). */ + if (!ni->attr_list.size) { + err = ni_create_attr_list(ni); + if (err) + goto out; + /* Layout of records is changed. 
*/ + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, + 0, NULL, &mi_b); + if (!attr_b) { + err = -ENOENT; + goto out; + } + + attr = attr_b; + le = le_b; + mi = mi_b; + goto repack; + } + } + + svcn = evcn1; + + /* Estimate next attribute. */ + attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi); + + if (attr) { + CLST alloc = bytes_to_cluster( + sbi, le64_to_cpu(attr_b->nres.alloc_size)); + CLST evcn = le64_to_cpu(attr->nres.evcn); + + if (end < next_svcn) + end = next_svcn; + while (end > evcn) { + /* Remove segment [svcn : evcn). */ + mi_remove_attr(NULL, mi, attr); + + if (!al_remove_le(ni, le)) { + err = -EINVAL; + goto out; + } + + if (evcn + 1 >= alloc) { + /* Last attribute segment. */ + evcn1 = evcn + 1; + goto ins_ext; + } + + /* Propagate a load failure: err is 0 at this point, so a bare goto would report success. */ + err = ni_load_mi(ni, le, &mi); + if (err) { + attr = NULL; + goto out; + } + + attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, + &le->id); + if (!attr) { + err = -EINVAL; + goto out; + } + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + } + + if (end < svcn) + end = svcn; + + err = attr_load_runs(attr, ni, run, &end); + if (err) + goto out; + + evcn1 = evcn + 1; + attr->nres.svcn = cpu_to_le64(next_svcn); + err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); + if (err) + goto out; + + le->vcn = cpu_to_le64(next_svcn); + ni->attr_list.dirty = true; + mi->dirty = true; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + } +ins_ext: + if (evcn1 > next_svcn) { + err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, + next_svcn, evcn1 - next_svcn, + attr_b->flags, &attr, &mi); + if (err) + goto out; + } +ok: + run_truncate_around(run, vcn); +out: + /* attr_b is NULL when the base attribute could not be found again after the attribute list was created. */ + if (attr_b) { + if (new_valid > data_size) + new_valid = data_size; + + valid_size = le64_to_cpu(attr_b->nres.valid_size); + if (new_valid != valid_size) { + /* Store the requested valid size, not the value that was just read back. */ + attr_b->nres.valid_size = cpu_to_le64(new_valid); + mi_b->dirty = true; + } + } + + return err; +} + +/* + * attr_collapse_range - Collapse range in file.
+ */ +int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST svcn, evcn1, len, dealloc, alen; + CLST vcn, end; + u64 valid_size, data_size, alloc_size, total_size; + u32 mask; + __le16 a_flags; + + if (!bytes) + return 0; + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) + return -ENOENT; + + if (!attr_b->non_res) { + /* Attribute is resident. Nothing to do? */ + return 0; + } + + data_size = le64_to_cpu(attr_b->nres.data_size); + alloc_size = le64_to_cpu(attr_b->nres.alloc_size); + a_flags = attr_b->flags; + + if (is_attr_ext(attr_b)) { + total_size = le64_to_cpu(attr_b->nres.total_size); + mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; + } else { + total_size = alloc_size; + mask = sbi->cluster_mask; + } + + if ((vbo & mask) || (bytes & mask)) { + /* Allow to collapse only cluster aligned ranges. */ + return -EINVAL; + } + + if (vbo > data_size) + return -EINVAL; + + down_write(&ni->file.run_lock); + + if (vbo + bytes >= data_size) { + u64 new_valid = min(ni->i_valid, vbo); + + /* Simple truncate file at 'vbo'. */ + truncate_setsize(&ni->vfs_inode, vbo); + err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, vbo, + &new_valid, true, NULL); + + if (!err && new_valid < ni->i_valid) + ni->i_valid = new_valid; + + goto out; + } + + /* + * Enumerate all attribute segments and collapse. 
+ */ + alen = alloc_size >> sbi->cluster_bits; + vcn = vbo >> sbi->cluster_bits; + len = bytes >> sbi->cluster_bits; + end = vcn + len; + dealloc = 0; + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto out; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + for (;;) { + if (svcn >= end) { + /* Shift VCN- */ + attr->nres.svcn = cpu_to_le64(svcn - len); + attr->nres.evcn = cpu_to_le64(evcn1 - 1 - len); + if (le) { + le->vcn = attr->nres.svcn; + ni->attr_list.dirty = true; + } + mi->dirty = true; + } else if (svcn < vcn || end < evcn1) { + CLST vcn1, eat, next_svcn; + + /* Collapse a part of this attribute segment. */ + err = attr_load_runs(attr, ni, run, &svcn); + if (err) + goto out; + vcn1 = max(vcn, svcn); + eat = min(end, evcn1) - vcn1; + + err = run_deallocate_ex(sbi, run, vcn1, eat, &dealloc, + true); + if (err) + goto out; + + if (!run_collapse_range(run, vcn1, eat)) { + err = -ENOMEM; + goto out; + } + + if (svcn >= vcn) { + /* Shift VCN */ + attr->nres.svcn = cpu_to_le64(vcn); + if (le) { + le->vcn = attr->nres.svcn; + ni->attr_list.dirty = true; + } + } + + err = mi_pack_runs(mi, attr, run, evcn1 - svcn - eat); + if (err) + goto out; + + next_svcn = le64_to_cpu(attr->nres.evcn) + 1; + if (next_svcn + eat < evcn1) { + err = ni_insert_nonresident( + ni, ATTR_DATA, NULL, 0, run, next_svcn, + evcn1 - eat - next_svcn, a_flags, &attr, + &mi); + if (err) + goto out; + + /* Layout of records maybe changed. */ + attr_b = NULL; + le = al_find_ex(ni, NULL, ATTR_DATA, NULL, 0, + &next_svcn); + if (!le) { + err = -EINVAL; + goto out; + } + } + + /* Free all allocated memory. 
*/ + run_truncate(run, 0); + } else { + u16 le_sz; + u16 roff = le16_to_cpu(attr->nres.run_off); + + run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, + evcn1 - 1, svcn, Add2Ptr(attr, roff), + le32_to_cpu(attr->size) - roff); + + /* Delete this attribute segment. */ + mi_remove_attr(NULL, mi, attr); + if (!le) + break; + + le_sz = le16_to_cpu(le->size); + if (!al_remove_le(ni, le)) { + err = -EINVAL; + goto out; + } + + if (evcn1 >= alen) + break; + + if (!svcn) { + /* Load next record that contains this attribute. */ + if (ni_load_mi(ni, le, &mi)) { + err = -EINVAL; + goto out; + } + + /* Look for required attribute. */ + attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, + 0, &le->id); + if (!attr) { + err = -EINVAL; + goto out; + } + goto next_attr; + } + le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz); + } + + if (evcn1 >= alen) + break; + + attr = ni_enum_attr_ex(ni, attr, &le, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + +next_attr: + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + if (!attr_b) { + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, + &mi_b); + if (!attr_b) { + err = -ENOENT; + goto out; + } + } + + data_size -= bytes; + valid_size = ni->i_valid; + if (vbo + bytes <= valid_size) + valid_size -= bytes; + else if (vbo < valid_size) + valid_size = vbo; + + attr_b->nres.alloc_size = cpu_to_le64(alloc_size - bytes); + attr_b->nres.data_size = cpu_to_le64(data_size); + attr_b->nres.valid_size = cpu_to_le64(min(valid_size, data_size)); + total_size -= (u64)dealloc << sbi->cluster_bits; + if (is_attr_ext(attr_b)) + attr_b->nres.total_size = cpu_to_le64(total_size); + mi_b->dirty = true; + + /* Update inode size. 
*/ + ni->i_valid = valid_size; + ni->vfs_inode.i_size = data_size; + inode_set_bytes(&ni->vfs_inode, total_size); + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + mark_inode_dirty(&ni->vfs_inode); + +out: + up_write(&ni->file.run_lock); + if (err) + make_bad_inode(&ni->vfs_inode); + + return err; +} + +/* + * attr_punch_hole + * + * Not for normal files. + */ +int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr = NULL, *attr_b; + struct ATTR_LIST_ENTRY *le, *le_b; + struct mft_inode *mi, *mi_b; + CLST svcn, evcn1, vcn, len, end, alen, dealloc; + u64 total_size, alloc_size; + u32 mask; + + if (!bytes) + return 0; + + le_b = NULL; + attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); + if (!attr_b) + return -ENOENT; + + if (!attr_b->non_res) { + /* Read the size through attr_b: 'attr' is still NULL at this point. */ + u32 data_size = le32_to_cpu(attr_b->res.data_size); + u32 from, to; + + if (vbo > data_size) + return 0; + + from = vbo; + to = (vbo + bytes) < data_size ? (vbo + bytes) : data_size; + memset(Add2Ptr(resident_data(attr_b), from), 0, to - from); + return 0; + } + + if (!is_attr_ext(attr_b)) + return -EOPNOTSUPP; + + alloc_size = le64_to_cpu(attr_b->nres.alloc_size); + total_size = le64_to_cpu(attr_b->nres.total_size); + + if (vbo >= alloc_size) { + /* NOTE: It is allowed. */ + return 0; + } + + mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; + + bytes += vbo; + if (bytes > alloc_size) + bytes = alloc_size; + bytes -= vbo; + + if ((vbo & mask) || (bytes & mask)) { + /* We have to zero a range(s). */ + if (frame_size == NULL) { + /* Caller insists range is aligned. */ + return -EINVAL; + } + *frame_size = mask + 1; + return E_NTFS_NOTALIGNED; + } + + down_write(&ni->file.run_lock); + /* + * Enumerate all attribute segments and punch hole where necessary.
+ */ + alen = alloc_size >> sbi->cluster_bits; + vcn = vbo >> sbi->cluster_bits; + len = bytes >> sbi->cluster_bits; + end = vcn + len; + dealloc = 0; + + svcn = le64_to_cpu(attr_b->nres.svcn); + evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; + + if (svcn <= vcn && vcn < evcn1) { + attr = attr_b; + le = le_b; + mi = mi_b; + } else if (!le_b) { + err = -EINVAL; + goto out; + } else { + le = le_b; + attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, + &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + while (svcn < end) { + CLST vcn1, zero, dealloc2; + + err = attr_load_runs(attr, ni, run, &svcn); + if (err) + goto out; + vcn1 = max(vcn, svcn); + zero = min(end, evcn1) - vcn1; + + dealloc2 = dealloc; + err = run_deallocate_ex(sbi, run, vcn1, zero, &dealloc, true); + if (err) + goto out; + + if (dealloc2 == dealloc) { + /* Looks like the required range is already sparsed. */ + } else { + if (!run_add_entry(run, vcn1, SPARSE_LCN, zero, + false)) { + err = -ENOMEM; + goto out; + } + + err = mi_pack_runs(mi, attr, run, evcn1 - svcn); + if (err) + goto out; + } + /* Free all allocated memory. */ + run_truncate(run, 0); + + if (evcn1 >= alen) + break; + + attr = ni_enum_attr_ex(ni, attr, &le, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + svcn = le64_to_cpu(attr->nres.svcn); + evcn1 = le64_to_cpu(attr->nres.evcn) + 1; + } + + total_size -= (u64)dealloc << sbi->cluster_bits; + attr_b->nres.total_size = cpu_to_le64(total_size); + mi_b->dirty = true; + + /* Update inode size. 
*/ + inode_set_bytes(&ni->vfs_inode, total_size); + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + mark_inode_dirty(&ni->vfs_inode); + +out: + up_write(&ni->file.run_lock); + if (err) + make_bad_inode(&ni->vfs_inode); + + return err; +} diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c new file mode 100644 index 000000000000..fa32399eb517 --- /dev/null +++ b/fs/ntfs3/attrlist.c @@ -0,0 +1,460 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * al_is_valid_le + * + * Return: True if @le is valid. + */ +static inline bool al_is_valid_le(const struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le) +{ + if (!le || !ni->attr_list.le || !ni->attr_list.size) + return false; + + return PtrOffset(ni->attr_list.le, le) + le16_to_cpu(le->size) <= + ni->attr_list.size; +} + +void al_destroy(struct ntfs_inode *ni) +{ + run_close(&ni->attr_list.run); + kfree(ni->attr_list.le); + ni->attr_list.le = NULL; + ni->attr_list.size = 0; + ni->attr_list.dirty = false; +} + +/* + * ntfs_load_attr_list + * + * This method makes sure that the ATTRIB list, if present, + * has been properly set up. 
+ */ +int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) +{ + int err; + size_t lsize; + void *le = NULL; + + if (ni->attr_list.size) + return 0; + + if (!attr->non_res) { + lsize = le32_to_cpu(attr->res.data_size); + le = kmalloc(al_aligned(lsize), GFP_NOFS); + if (!le) { + err = -ENOMEM; + goto out; + } + memcpy(le, resident_data(attr), lsize); + } else if (attr->nres.svcn) { + err = -EINVAL; + goto out; + } else { + u16 run_off = le16_to_cpu(attr->nres.run_off); + + lsize = le64_to_cpu(attr->nres.data_size); + + run_init(&ni->attr_list.run); + + err = run_unpack_ex(&ni->attr_list.run, ni->mi.sbi, ni->mi.rno, + 0, le64_to_cpu(attr->nres.evcn), 0, + Add2Ptr(attr, run_off), + le32_to_cpu(attr->size) - run_off); + if (err < 0) + goto out; + + le = kmalloc(al_aligned(lsize), GFP_NOFS); + if (!le) { + err = -ENOMEM; + goto out; + } + + err = ntfs_read_run_nb(ni->mi.sbi, &ni->attr_list.run, 0, le, + lsize, NULL); + if (err) + goto out; + } + + ni->attr_list.size = lsize; + ni->attr_list.le = le; + + return 0; + +out: + ni->attr_list.le = le; + al_destroy(ni); + + return err; +} + +/* + * al_enumerate + * + * Return: + * * The next list le. + * * If @le is NULL then return the first le. + */ +struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le) +{ + size_t off; + u16 sz; + + if (!le) { + le = ni->attr_list.le; + } else { + sz = le16_to_cpu(le->size); + if (sz < sizeof(struct ATTR_LIST_ENTRY)) { + /* Impossible 'cause we should not return such le. */ + return NULL; + } + le = Add2Ptr(le, sz); + } + + /* Check boundary. */ + off = PtrOffset(ni->attr_list.le, le); + if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) { + /* The regular end of list. */ + return NULL; + } + + sz = le16_to_cpu(le->size); + + /* Check le for errors. 
*/ + if (sz < sizeof(struct ATTR_LIST_ENTRY) || + off + sz > ni->attr_list.size || + sz < le->name_off + le->name_len * sizeof(short)) { + return NULL; + } + + return le; +} + +/* + * al_find_le + * + * Find the first le in the list which matches type, name and VCN. + * + * Return: NULL if not found. + */ +struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le, + const struct ATTRIB *attr) +{ + CLST svcn = attr_svcn(attr); + + return al_find_ex(ni, le, attr->type, attr_name(attr), attr->name_len, + &svcn); +} + +/* + * al_find_ex + * + * Find the first le in the list which matches type, name and VCN. + * + * Return: NULL if not found. + */ +struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, const CLST *vcn) +{ + struct ATTR_LIST_ENTRY *ret = NULL; + u32 type_in = le32_to_cpu(type); + + while ((le = al_enumerate(ni, le))) { + u64 le_vcn; + int diff = le32_to_cpu(le->type) - type_in; + + /* List entries are sorted by type, name and VCN. */ + if (diff < 0) + continue; + + if (diff > 0) + return ret; + + if (le->name_len != name_len) + continue; + + le_vcn = le64_to_cpu(le->vcn); + if (!le_vcn) { + /* + * Compare entry names only for entry with vcn == 0. + */ + diff = ntfs_cmp_names(le_name(le), name_len, name, + name_len, ni->mi.sbi->upcase, + true); + if (diff < 0) + continue; + + if (diff > 0) + return ret; + } + + if (!vcn) + return le; + + if (*vcn == le_vcn) + return le; + + if (*vcn < le_vcn) + return ret; + + ret = le; + } + + return ret; +} + +/* + * al_find_le_to_insert + * + * Find the first list entry which matches type, name and VCN. + */ +static struct ATTR_LIST_ENTRY *al_find_le_to_insert(struct ntfs_inode *ni, + enum ATTR_TYPE type, + const __le16 *name, + u8 name_len, CLST vcn) +{ + struct ATTR_LIST_ENTRY *le = NULL, *prev; + u32 type_in = le32_to_cpu(type); + + /* List entries are sorted by type, name and VCN. 
*/ + while ((le = al_enumerate(ni, prev = le))) { + int diff = le32_to_cpu(le->type) - type_in; + + if (diff < 0) + continue; + + if (diff > 0) + return le; + + if (!le->vcn) { + /* + * Compare entry names only for entry with vcn == 0. + */ + diff = ntfs_cmp_names(le_name(le), le->name_len, name, + name_len, ni->mi.sbi->upcase, + true); + if (diff < 0) + continue; + + if (diff > 0) + return le; + } + + if (le64_to_cpu(le->vcn) >= vcn) + return le; + } + + return prev ? Add2Ptr(prev, le16_to_cpu(prev->size)) : ni->attr_list.le; +} + +/* + * al_add_le + * + * Add an "attribute list entry" to the list. + */ +int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, + u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref, + struct ATTR_LIST_ENTRY **new_le) +{ + int err; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + size_t off; + u16 sz; + size_t asize, new_asize, old_size; + u64 new_size; + typeof(ni->attr_list) *al = &ni->attr_list; + + /* + * Compute the size of the new 'le' + */ + sz = le_size(name_len); + old_size = al->size; + new_size = old_size + sz; + asize = al_aligned(old_size); + new_asize = al_aligned(new_size); + + /* Scan forward to the point at which the new 'le' should be inserted. 
*/ + le = al_find_le_to_insert(ni, type, name, name_len, svcn); + off = PtrOffset(al->le, le); + + if (new_size > asize) { + void *ptr = kmalloc(new_asize, GFP_NOFS); + + if (!ptr) + return -ENOMEM; + + memcpy(ptr, al->le, off); + memcpy(Add2Ptr(ptr, off + sz), le, old_size - off); + le = Add2Ptr(ptr, off); + kfree(al->le); + al->le = ptr; + } else { + memmove(Add2Ptr(le, sz), le, old_size - off); + } + *new_le = le; + + al->size = new_size; + + le->type = type; + le->size = cpu_to_le16(sz); + le->name_len = name_len; + le->name_off = offsetof(struct ATTR_LIST_ENTRY, name); + le->vcn = cpu_to_le64(svcn); + le->ref = *ref; + le->id = id; + memcpy(le->name, name, sizeof(short) * name_len); + + err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, new_size, + &new_size, true, &attr); + if (err) { + /* Undo memmove above. */ + memmove(le, Add2Ptr(le, sz), old_size - off); + al->size = old_size; + return err; + } + + al->dirty = true; + + if (attr && attr->non_res) { + err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, + al->size); + if (err) + return err; + al->dirty = false; + } + + return 0; +} + +/* + * al_remove_le - Remove @le from attribute list. + */ +bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le) +{ + u16 size; + size_t off; + typeof(ni->attr_list) *al = &ni->attr_list; + + if (!al_is_valid_le(ni, le)) + return false; + + /* Save on stack the size of 'le' */ + size = le16_to_cpu(le->size); + off = PtrOffset(al->le, le); + + memmove(le, Add2Ptr(le, size), al->size - (off + size)); + + al->size -= size; + al->dirty = true; + + return true; +} + +/* + * al_delete_le - Delete first le from the list which matches its parameters. + */ +bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, + const __le16 *name, size_t name_len, + const struct MFT_REF *ref) +{ + u16 size; + struct ATTR_LIST_ENTRY *le; + size_t off; + typeof(ni->attr_list) *al = &ni->attr_list; + + /* Scan forward to the first le that matches the input. 
*/ + le = al_find_ex(ni, NULL, type, name, name_len, &vcn); + if (!le) + return false; + + off = PtrOffset(al->le, le); + +next: + if (off >= al->size) + return false; + if (le->type != type) + return false; + if (le->name_len != name_len) + return false; + if (name_len && ntfs_cmp_names(le_name(le), name_len, name, name_len, + ni->mi.sbi->upcase, true)) + return false; + if (le64_to_cpu(le->vcn) != vcn) + return false; + + /* + * The caller specified a segment reference, so we have to + * scan through the matching entries until we find that segment + * reference or we run of matching entries. + */ + if (ref && memcmp(ref, &le->ref, sizeof(*ref))) { + off += le16_to_cpu(le->size); + le = Add2Ptr(al->le, off); + goto next; + } + + /* Save on stack the size of 'le'. */ + size = le16_to_cpu(le->size); + /* Delete the le. */ + memmove(le, Add2Ptr(le, size), al->size - (off + size)); + + al->size -= size; + al->dirty = true; + + return true; +} + +int al_update(struct ntfs_inode *ni) +{ + int err; + struct ATTRIB *attr; + typeof(ni->attr_list) *al = &ni->attr_list; + + if (!al->dirty || !al->size) + return 0; + + /* + * Attribute list increased on demand in al_add_le. + * Attribute list decreased here. + */ + err = attr_set_size(ni, ATTR_LIST, NULL, 0, &al->run, al->size, NULL, + false, &attr); + if (err) + goto out; + + if (!attr->non_res) { + memcpy(resident_data(attr), al->le, al->size); + } else { + err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, + al->size); + if (err) + goto out; + + attr->nres.valid_size = attr->nres.data_size; + } + + ni->mi.dirty = true; + al->dirty = false; + +out: + return err; +} diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c new file mode 100644 index 000000000000..ce304d40b5e1 --- /dev/null +++ b/fs/ntfs3/bitfunc.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. 
+ * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +#define BITS_IN_SIZE_T (sizeof(size_t) * 8) + +/* + * fill_mask[i] - first i bits are '1' , i = 0,1,2,3,4,5,6,7,8 + * fill_mask[i] = 0xFF >> (8-i) + */ +static const u8 fill_mask[] = { 0x00, 0x01, 0x03, 0x07, 0x0F, + 0x1F, 0x3F, 0x7F, 0xFF }; + +/* + * zero_mask[i] - first i bits are '0' , i = 0,1,2,3,4,5,6,7,8 + * zero_mask[i] = 0xFF << i + */ +static const u8 zero_mask[] = { 0xFF, 0xFE, 0xFC, 0xF8, 0xF0, + 0xE0, 0xC0, 0x80, 0x00 }; + +/* + * are_bits_clear + * + * Return: True if all bits [bit, bit+nbits) are zeros "0". + */ +bool are_bits_clear(const ulong *lmap, size_t bit, size_t nbits) +{ + size_t pos = bit & 7; + const u8 *map = (u8 *)lmap + (bit >> 3); + + if (pos) { + if (8 - pos >= nbits) + return !nbits || !(*map & fill_mask[pos + nbits] & + zero_mask[pos]); + + if (*map++ & zero_mask[pos]) + return false; + nbits -= 8 - pos; + } + + pos = ((size_t)map) & (sizeof(size_t) - 1); + if (pos) { + pos = sizeof(size_t) - pos; + if (nbits >= pos * 8) { + for (nbits -= pos * 8; pos; pos--, map++) { + if (*map) + return false; + } + } + } + + for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) { + if (*((size_t *)map)) + return false; + } + + for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) { + if (*map) + return false; + } + + pos = nbits & 7; + if (pos && (*map & fill_mask[pos])) + return false; + + return true; +} + +/* + * are_bits_set + * + * Return: True if all bits [bit, bit+nbits) are ones "1". 
+ */ +bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits) +{ + u8 mask; + size_t pos = bit & 7; + const u8 *map = (u8 *)lmap + (bit >> 3); + + if (pos) { + if (8 - pos >= nbits) { + mask = fill_mask[pos + nbits] & zero_mask[pos]; + return !nbits || (*map & mask) == mask; + } + + mask = zero_mask[pos]; + if ((*map++ & mask) != mask) + return false; + nbits -= 8 - pos; + } + + pos = ((size_t)map) & (sizeof(size_t) - 1); + if (pos) { + pos = sizeof(size_t) - pos; + if (nbits >= pos * 8) { + for (nbits -= pos * 8; pos; pos--, map++) { + if (*map != 0xFF) + return false; + } + } + } + + for (pos = nbits / BITS_IN_SIZE_T; pos; pos--, map += sizeof(size_t)) { + if (*((size_t *)map) != MINUS_ONE_T) + return false; + } + + for (pos = (nbits % BITS_IN_SIZE_T) >> 3; pos; pos--, map++) { + if (*map != 0xFF) + return false; + } + + pos = nbits & 7; + if (pos) { + u8 mask = fill_mask[pos]; + + if ((*map & mask) != mask) + return false; + } + + return true; +} diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c new file mode 100644 index 000000000000..831501555009 --- /dev/null +++ b/fs/ntfs3/bitmap.c @@ -0,0 +1,1493 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * This code builds two trees of free clusters extents. + * Trees are sorted by start of extent and by length of extent. + * NTFS_MAX_WND_EXTENTS defines the maximum number of elements in trees. + * In extreme case code reads on-disk bitmap to find free clusters. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * Maximum number of extents in tree. + */ +#define NTFS_MAX_WND_EXTENTS (32u * 1024u) + +struct rb_node_key { + struct rb_node node; + size_t key; +}; + +struct e_node { + struct rb_node_key start; /* Tree sorted by start. */ + struct rb_node_key count; /* Tree sorted by len. 
*/ +}; + +static int wnd_rescan(struct wnd_bitmap *wnd); +static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw); +static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits); + +static struct kmem_cache *ntfs_enode_cachep; + +int __init ntfs3_init_bitmap(void) +{ + ntfs_enode_cachep = + kmem_cache_create("ntfs3_enode_cache", sizeof(struct e_node), 0, + SLAB_RECLAIM_ACCOUNT, NULL); + return ntfs_enode_cachep ? 0 : -ENOMEM; +} + +void ntfs3_exit_bitmap(void) +{ + kmem_cache_destroy(ntfs_enode_cachep); +} + +static inline u32 wnd_bits(const struct wnd_bitmap *wnd, size_t i) +{ + return i + 1 == wnd->nwnd ? wnd->bits_last : wnd->sb->s_blocksize * 8; +} + +/* + * wnd_scan + * + * b_pos + b_len - biggest fragment. + * Scan range [wpos wbits) window @buf. + * + * Return: -1 if not found. + */ +static size_t wnd_scan(const ulong *buf, size_t wbit, u32 wpos, u32 wend, + size_t to_alloc, size_t *prev_tail, size_t *b_pos, + size_t *b_len) +{ + while (wpos < wend) { + size_t free_len; + u32 free_bits, end; + u32 used = find_next_zero_bit(buf, wend, wpos); + + if (used >= wend) { + if (*b_len < *prev_tail) { + *b_pos = wbit - *prev_tail; + *b_len = *prev_tail; + } + + *prev_tail = 0; + return -1; + } + + if (used > wpos) { + wpos = used; + if (*b_len < *prev_tail) { + *b_pos = wbit - *prev_tail; + *b_len = *prev_tail; + } + + *prev_tail = 0; + } + + /* + * Now we have a fragment [wpos, wend) staring with 0. + */ + end = wpos + to_alloc - *prev_tail; + free_bits = find_next_bit(buf, min(end, wend), wpos); + + free_len = *prev_tail + free_bits - wpos; + + if (*b_len < free_len) { + *b_pos = wbit + wpos - *prev_tail; + *b_len = free_len; + } + + if (free_len >= to_alloc) + return wbit + wpos - *prev_tail; + + if (free_bits >= wend) { + *prev_tail += free_bits - wpos; + return -1; + } + + wpos = free_bits + 1; + + *prev_tail = 0; + } + + return -1; +} + +/* + * wnd_close - Frees all resources. 
+ */ +void wnd_close(struct wnd_bitmap *wnd) +{ + struct rb_node *node, *next; + + kfree(wnd->free_bits); + run_close(&wnd->run); + + node = rb_first(&wnd->start_tree); + + while (node) { + next = rb_next(node); + rb_erase(node, &wnd->start_tree); + kmem_cache_free(ntfs_enode_cachep, + rb_entry(node, struct e_node, start.node)); + node = next; + } +} + +static struct rb_node *rb_lookup(struct rb_root *root, size_t v) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *r = NULL; + + while (*p) { + struct rb_node_key *k; + + k = rb_entry(*p, struct rb_node_key, node); + if (v < k->key) { + p = &(*p)->rb_left; + } else if (v > k->key) { + r = &k->node; + p = &(*p)->rb_right; + } else { + return &k->node; + } + } + + return r; +} + +/* + * rb_insert_count - Helper function to insert special kind of 'count' tree. + */ +static inline bool rb_insert_count(struct rb_root *root, struct e_node *e) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + size_t e_ckey = e->count.key; + size_t e_skey = e->start.key; + + while (*p) { + struct e_node *k = + rb_entry(parent = *p, struct e_node, count.node); + + if (e_ckey > k->count.key) { + p = &(*p)->rb_left; + } else if (e_ckey < k->count.key) { + p = &(*p)->rb_right; + } else if (e_skey < k->start.key) { + p = &(*p)->rb_left; + } else if (e_skey > k->start.key) { + p = &(*p)->rb_right; + } else { + WARN_ON(1); + return false; + } + } + + rb_link_node(&e->count.node, parent, p); + rb_insert_color(&e->count.node, root); + return true; +} + +/* + * rb_insert_start - Helper function to insert special kind of 'count' tree. 
+ */
+static inline bool rb_insert_start(struct rb_root *root, struct e_node *e)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	size_t e_skey = e->start.key;
+
+	while (*p) {
+		struct e_node *k;
+
+		parent = *p;
+
+		k = rb_entry(parent, struct e_node, start.node);
+		if (e_skey < k->start.key) {
+			p = &(*p)->rb_left;
+		} else if (e_skey > k->start.key) {
+			p = &(*p)->rb_right;
+		} else {
+			WARN_ON(1); /* Duplicate start key: @e is not inserted. */
+			return false;
+		}
+	}
+
+	rb_link_node(&e->start.node, parent, p);
+	rb_insert_color(&e->start.node, root);
+	return true;
+}
+
+/*
+ * wnd_add_free_ext - Adds a new extent of free space.
+ * @wnd:	Bitmap whose 'start' and 'count' trees are updated.
+ * @bit:	First bit of the free range.
+ * @len:	Number of bits in the range.
+ * @build:	1 when building tree.
+ *
+ * When @build is false the range is first merged with cached extents that
+ * end at @bit or start within [bit, bit+len]. If the tree is not marked
+ * complete (uptodated != 1) the range is also extended over neighbouring
+ * free bits found in the bitmap: the backward scan stops at zone_end when
+ * the range starts at or after it, the forward scan stops at zone_bit when
+ * the range ends at or before it.
+ *
+ * When the tree already holds NTFS_MAX_WND_EXTENTS entries, uptodated is
+ * set to -1 and the new extent replaces the smallest cached one only if it
+ * is strictly longer; otherwise it is dropped.
+ */
+static void wnd_add_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len,
+			     bool build)
+{
+	struct e_node *e, *e0 = NULL; /* e0: removed node kept for reuse. */
+	size_t ib, end_in = bit + len;
+	struct rb_node *n;
+
+	if (build) {
+		/* Use extent_min to filter too short extents. */
+		if (wnd->count >= NTFS_MAX_WND_EXTENTS &&
+		    len <= wnd->extent_min) {
+			wnd->uptodated = -1;
+			return;
+		}
+	} else {
+		/* Try to find extent before 'bit'. */
+		n = rb_lookup(&wnd->start_tree, bit);
+
+		if (!n) {
+			n = rb_first(&wnd->start_tree);
+		} else {
+			e = rb_entry(n, struct e_node, start.node);
+			n = rb_next(n);
+			if (e->start.key + e->count.key == bit) {
+				/* Remove left. */
+				bit = e->start.key;
+				len += e->count.key;
+				rb_erase(&e->start.node, &wnd->start_tree);
+				rb_erase(&e->count.node, &wnd->count_tree);
+				wnd->count -= 1;
+				e0 = e;
+			}
+		}
+
+		while (n) {
+			size_t next_end;
+
+			e = rb_entry(n, struct e_node, start.node);
+			next_end = e->start.key + e->count.key;
+			if (e->start.key > end_in)
+				break;
+
+			/* Remove right. */
+			n = rb_next(n);
+			len += next_end - end_in;
+			end_in = next_end;
+			rb_erase(&e->start.node, &wnd->start_tree);
+			rb_erase(&e->count.node, &wnd->count_tree);
+			wnd->count -= 1;
+
+			if (!e0)
+				e0 = e;
+			else
+				kmem_cache_free(ntfs_enode_cachep, e);
+		}
+
+		if (wnd->uptodated != 1) {
+			/* Check bits before 'bit'. */
+			ib = wnd->zone_bit == wnd->zone_end ||
+					     bit < wnd->zone_end
+				     ? 0
+				     : wnd->zone_end;
+
+			while (bit > ib && wnd_is_free_hlp(wnd, bit - 1, 1)) {
+				bit -= 1;
+				len += 1;
+			}
+
+			/* Check bits after 'end_in'. */
+			ib = wnd->zone_bit == wnd->zone_end ||
+					     end_in > wnd->zone_bit
+				     ? wnd->nbits
+				     : wnd->zone_bit;
+
+			while (end_in < ib && wnd_is_free_hlp(wnd, end_in, 1)) {
+				end_in += 1;
+				len += 1;
+			}
+		}
+	}
+	/* Insert new fragment. */
+	if (wnd->count >= NTFS_MAX_WND_EXTENTS) {
+		if (e0)
+			kmem_cache_free(ntfs_enode_cachep, e0);
+
+		wnd->uptodated = -1;
+
+		/* Compare with smallest fragment. */
+		n = rb_last(&wnd->count_tree);
+		e = rb_entry(n, struct e_node, count.node);
+		if (len <= e->count.key)
+			goto out; /* Do not insert small fragments. */
+
+		if (build) {
+			struct e_node *e2;
+
+			n = rb_prev(n);
+			e2 = rb_entry(n, struct e_node, count.node);
+			/* Smallest fragment will be 'e2->count.key'. */
+			wnd->extent_min = e2->count.key;
+		}
+
+		/* Replace smallest fragment by new one. */
+		rb_erase(&e->start.node, &wnd->start_tree);
+		rb_erase(&e->count.node, &wnd->count_tree);
+		wnd->count -= 1;
+	} else {
+		e = e0 ? e0 : kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC);
+		if (!e) {
+			wnd->uptodated = -1; /* Extent is lost: tree is incomplete. */
+			goto out;
+		}
+
+		if (build && len <= wnd->extent_min)
+			wnd->extent_min = len;
+	}
+	e->start.key = bit;
+	e->count.key = len;
+	if (len > wnd->extent_max)
+		wnd->extent_max = len;
+
+	rb_insert_start(&wnd->start_tree, e);
+	rb_insert_count(&wnd->count_tree, e);
+	wnd->count += 1;
+
+out:;
+}
+
+/*
+ * wnd_remove_free_ext - Remove a run from the cached free space.
+ * @wnd:	Bitmap whose 'start' and 'count' trees are updated.
+ * @bit:	First bit of the range that is no longer free.
+ * @len:	Number of bits in the range.
+ *
+ * The cached extent 'e' with the largest start not above @bit is looked
+ * up. If the range lies inside 'e', the extent is shrunk to the part
+ * before @bit and the part after the range becomes a new extent (or
+ * replaces 'e' when nothing is left before @bit). If the range starts
+ * past the end of 'e', the extents after 'e' that begin before the end of
+ * the range are trimmed or removed instead. extent_max is kept in step,
+ * and the bitmap is rescanned when the tree becomes empty while not being
+ * marked complete.
+ */
+static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len)
+{
+	struct rb_node *n, *n3;
+	struct e_node *e, *e3;
+	size_t end_in = bit + len;
+	size_t end3, end, new_key, new_len, max_new_len;
+
+	/* Try to find extent before 'bit'. */
+	n = rb_lookup(&wnd->start_tree, bit);
+
+	if (!n)
+		return;
+
+	e = rb_entry(n, struct e_node, start.node);
+	end = e->start.key + e->count.key;
+
+	new_key = new_len = 0;
+	len = e->count.key; /* From here 'len' is the length 'e' will keep. */
+
+	/* Range [bit,end_in) must be inside 'e' or outside 'e' and 'n'. */
+	if (e->start.key > bit)
+		;
+	else if (end_in <= end) {
+		/* Range [bit,end_in) inside 'e'. */
+		new_key = end_in;
+		new_len = end - end_in;
+		len = bit - e->start.key;
+	} else if (bit > end) {
+		bool bmax = false; /* Set when a touched extent had the maximal length. */
+
+		n3 = rb_next(n);
+
+		while (n3) {
+			e3 = rb_entry(n3, struct e_node, start.node);
+			if (e3->start.key >= end_in)
+				break;
+
+			if (e3->count.key == wnd->extent_max)
+				bmax = true;
+
+			end3 = e3->start.key + e3->count.key;
+			if (end3 > end_in) {
+				/* Keep the part of 'e3' after the range. */
+				e3->start.key = end_in;
+				rb_erase(&e3->count.node, &wnd->count_tree);
+				e3->count.key = end3 - end_in;
+				rb_insert_count(&wnd->count_tree, e3);
+				break;
+			}
+
+			n3 = rb_next(n3);
+			rb_erase(&e3->start.node, &wnd->start_tree);
+			rb_erase(&e3->count.node, &wnd->count_tree);
+			wnd->count -= 1;
+			kmem_cache_free(ntfs_enode_cachep, e3);
+		}
+		if (!bmax)
+			return;
+		/* Re-read the maximum from the head of the 'count' tree. */
+		n3 = rb_first(&wnd->count_tree);
+		wnd->extent_max =
+			n3 ? rb_entry(n3, struct e_node, count.node)->count.key
+			   : 0;
+		return;
+	}
+
+	if (e->count.key != wnd->extent_max) {
+		;
+	} else if (rb_prev(&e->count.node)) {
+		;
+	} else {
+		/* 'e' is first in the 'count' tree: recompute extent_max. */
+		n3 = rb_next(&e->count.node);
+		max_new_len = len > new_len ? len : new_len;
+		if (!n3) {
+			wnd->extent_max = max_new_len;
+		} else {
+			e3 = rb_entry(n3, struct e_node, count.node);
+			wnd->extent_max = max(e3->count.key, max_new_len);
+		}
+	}
+
+	if (!len) {
+		/* Nothing of 'e' remains before 'bit'. */
+		if (new_len) {
+			e->start.key = new_key;
+			rb_erase(&e->count.node, &wnd->count_tree);
+			e->count.key = new_len;
+			rb_insert_count(&wnd->count_tree, e);
+		} else {
+			rb_erase(&e->start.node, &wnd->start_tree);
+			rb_erase(&e->count.node, &wnd->count_tree);
+			wnd->count -= 1;
+			kmem_cache_free(ntfs_enode_cachep, e);
+		}
+		goto out;
+	}
+	rb_erase(&e->count.node, &wnd->count_tree);
+	e->count.key = len;
+	rb_insert_count(&wnd->count_tree, e);
+
+	if (!new_len)
+		goto out;
+
+	if (wnd->count >= NTFS_MAX_WND_EXTENTS) {
+		wnd->uptodated = -1;
+
+		/* Get minimal extent. */
+		e = rb_entry(rb_last(&wnd->count_tree), struct e_node,
+			     count.node);
+		if (e->count.key > new_len)
+			goto out;
+
+		/* Replace minimum. */
+		rb_erase(&e->start.node, &wnd->start_tree);
+		rb_erase(&e->count.node, &wnd->count_tree);
+		wnd->count -= 1;
+	} else {
+		e = kmem_cache_alloc(ntfs_enode_cachep, GFP_ATOMIC);
+		if (!e)
+			wnd->uptodated = -1;
+	}
+
+	if (e) {
+		e->start.key = new_key;
+		e->count.key = new_len;
+		rb_insert_start(&wnd->start_tree, e);
+		rb_insert_count(&wnd->count_tree, e);
+		wnd->count += 1;
+	}
+
+out:
+	if (!wnd->count && 1 != wnd->uptodated)
+		wnd_rescan(wnd);
+}
+
+/*
+ * wnd_rescan - Scan all bitmap. Used while initialization.
+ */
+static int wnd_rescan(struct wnd_bitmap *wnd)
+{
+	int err = 0;
+	size_t prev_tail = 0; /* Length of the free run carried over from earlier windows. */
+	struct super_block *sb = wnd->sb;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	u64 lbo, len = 0; /* 'len': bytes left in the currently mapped run. */
+	u32 blocksize = sb->s_blocksize;
+	u8 cluster_bits = sbi->cluster_bits;
+	u32 wbits = 8 * sb->s_blocksize;
+	u32 used, frb;
+	const ulong *buf;
+	size_t wpos, wbit, iw, vbo;
+	struct buffer_head *bh = NULL;
+	CLST lcn, clen;
+
+	wnd->uptodated = 0;
+	wnd->extent_max = 0;
+	wnd->extent_min = MINUS_ONE_T;
+	wnd->total_zeroes = 0;
+
+	vbo = 0;
+
+	for (iw = 0; iw < wnd->nwnd; iw++) {
+		if (iw + 1 == wnd->nwnd)
+			wbits = wnd->bits_last;
+
+		if (wnd->inited) {
+			/* free_bits[] is already filled: skip uniform windows unread. */
+			if (!wnd->free_bits[iw]) {
+				/* All ones. */
+				if (prev_tail) {
+					wnd_add_free_ext(wnd,
+							 vbo * 8 - prev_tail,
+							 prev_tail, true);
+					prev_tail = 0;
+				}
+				goto next_wnd;
+			}
+			if (wbits == wnd->free_bits[iw]) {
+				/* All zeroes. */
+				prev_tail += wbits;
+				wnd->total_zeroes += wbits;
+				goto next_wnd;
+			}
+		}
+
+		if (!len) { /* No mapped bytes left: look up the run covering 'vbo'. */
+			u32 off = vbo & sbi->cluster_mask;
+
+			if (!run_lookup_entry(&wnd->run, vbo >> cluster_bits,
+					      &lcn, &clen, NULL)) {
+				err = -ENOENT;
+				goto out;
+			}
+
+			lbo = ((u64)lcn << cluster_bits) + off;
+			len = ((u64)clen << cluster_bits) - off;
+		}
+
+		bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits);
+		if (!bh) {
+			err = -EIO;
+			goto out;
+		}
+
+		buf = (ulong *)bh->b_data;
+
+		used = __bitmap_weight(buf, wbits);
+		if (used < wbits) {
+			frb = wbits - used;
+			wnd->free_bits[iw] = frb;
+			wnd->total_zeroes += frb;
+		}
+
+		wpos = 0;
+		wbit = vbo * 8;
+
+		if (wbit + wbits > wnd->nbits)
+			wbits = wnd->nbits - wbit;
+
+		do {
+			used = find_next_zero_bit(buf, wbits, wpos);
+
+			if (used > wpos && prev_tail) {
+				/* Used bits end the carried free run: record it. */
+				wnd_add_free_ext(wnd, wbit + wpos - prev_tail,
+						 prev_tail, true);
+				prev_tail = 0;
+			}
+
+			wpos = used;
+
+			if (wpos >= wbits) {
+				/* No free blocks. */
+				prev_tail = 0;
+				break;
+			}
+
+			frb = find_next_bit(buf, wbits, wpos);
+			if (frb >= wbits) {
+				/* Keep last free block. */
+				prev_tail += frb - wpos;
+				break;
+			}
+
+			wnd_add_free_ext(wnd, wbit + wpos - prev_tail,
+					 frb + prev_tail - wpos, true);
+
+			/* Skip free block and first '1'. */
+			wpos = frb + 1;
+			/* Reset previous tail. */
+			prev_tail = 0;
+		} while (wpos < wbits);
+
+next_wnd:
+
+		if (bh)
+			put_bh(bh);
+		bh = NULL;
+
+		vbo += blocksize;
+		if (len) {
+			len -= blocksize;
+			lbo += blocksize;
+		}
+	}
+
+	/* Add last block. */
+	if (prev_tail)
+		wnd_add_free_ext(wnd, wnd->nbits - prev_tail, prev_tail, true);
+
+	/*
+	 * Before init cycle wnd->uptodated was 0.
+	 * If any errors or limits occurs while initialization then
+	 * wnd->uptodated will be -1.
+	 * If 'uptodated' is still 0 then Tree is really updated.
+	 */
+	if (!wnd->uptodated)
+		wnd->uptodated = 1;
+
+	if (wnd->zone_bit != wnd->zone_end) {
+		/* Collapse the zone, then set it again over the same range. */
+		size_t zlen = wnd->zone_end - wnd->zone_bit;
+
+		wnd->zone_end = wnd->zone_bit;
+		wnd_zone_set(wnd, wnd->zone_bit, zlen);
+	}
+
+out:
+	return err;
+}
+/*
+ * wnd_init - Set up @wnd for a bitmap of @nbits bits on @sb.
+ *
+ * Fills in the window geometry, allocates the per-window free_bits[]
+ * counters and performs the first wnd_rescan().
+ *
+ * Return: 0 on success, -ENOMEM if free_bits[] cannot be allocated,
+ * otherwise the error from wnd_rescan(). free_bits[] is not released
+ * here when the rescan fails.
+ */
+int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits)
+{
+	int err;
+	u32 blocksize = sb->s_blocksize;
+	u32 wbits = blocksize * 8; /* Bits per window: one block. */
+
+	init_rwsem(&wnd->rw_lock);
+
+	wnd->sb = sb;
+	wnd->nbits = nbits;
+	wnd->total_zeroes = nbits;
+	wnd->extent_max = MINUS_ONE_T;
+	wnd->zone_bit = wnd->zone_end = 0;
+	wnd->nwnd = bytes_to_block(sb, bitmap_size(nbits));
+	wnd->bits_last = nbits & (wbits - 1);
+	if (!wnd->bits_last)
+		wnd->bits_last = wbits; /* Last window is completely used. */
+
+	wnd->free_bits = kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS);
+	if (!wnd->free_bits)
+		return -ENOMEM;
+
+	err = wnd_rescan(wnd);
+	if (err)
+		return err;
+
+	wnd->inited = true;
+
+	return 0;
+}
+
+/*
+ * wnd_map - Call sb_bread for requested window.
+ */
+static struct buffer_head *wnd_map(struct wnd_bitmap *wnd, size_t iw)
+{
+	struct super_block *sb = wnd->sb;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	/* Byte offset of window 'iw' inside the bitmap. */
+	size_t vbo = (u64)iw << sb->s_blocksize_bits;
+	struct buffer_head *bh;
+	CLST lcn, clen;
+	u64 lbo;
+
+	if (!run_lookup_entry(&wnd->run, vbo >> sbi->cluster_bits, &lcn, &clen,
+			      NULL))
+		return ERR_PTR(-ENOENT);
+
+	/* Translate to a byte offset on the volume. */
+	lbo = ((u64)lcn << sbi->cluster_bits) + (vbo & sbi->cluster_mask);
+	bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits);
+
+	return bh ? bh : ERR_PTR(-EIO);
+}
+
+/*
+ * wnd_set_free - Mark the bits range from bit to bit + bits as free.
+ *
+ * Clears the bits window by window, updates the free counters and adds
+ * the range to the cached free extents.
+ *
+ * Return: 0 on success or the error from wnd_map(). The extent is added
+ * to the cache even when a window could not be mapped.
+ */
+int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits)
+{
+	struct super_block *sb = wnd->sb;
+	u32 wnd_len = 8 * sb->s_blocksize;
+	size_t idx = bit >> (sb->s_blocksize_bits + 3);
+	u32 in_wnd = bit & (wnd_len - 1);
+	size_t todo = bits;
+	int err = 0;
+
+	for (; idx < wnd->nwnd && todo; idx++, in_wnd = 0) {
+		struct buffer_head *bh;
+		u32 room, chunk;
+
+		if (idx + 1 == wnd->nwnd)
+			wnd_len = wnd->bits_last;
+
+		/* Handle at most what is left of this window. */
+		room = wnd_len - in_wnd;
+		chunk = room;
+		if (todo < room)
+			chunk = todo;
+
+		bh = wnd_map(wnd, idx);
+		if (IS_ERR(bh)) {
+			err = PTR_ERR(bh);
+			break;
+		}
+
+		lock_buffer(bh);
+		__bitmap_clear((ulong *)bh->b_data, in_wnd, chunk);
+		wnd->free_bits[idx] += chunk;
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+		unlock_buffer(bh);
+		put_bh(bh);
+
+		wnd->total_zeroes += chunk;
+		todo -= chunk;
+	}
+
+	wnd_add_free_ext(wnd, bit, bits, false);
+
+	return err;
+}
+
+/*
+ * wnd_set_used - Mark the bits range from bit to bit + bits as used.
+ */ +int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) +{ + int err = 0; + struct super_block *sb = wnd->sb; + size_t bits0 = bits; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); + struct buffer_head *bh; + + while (iw < wnd->nwnd && bits) { + u32 tail, op; + ulong *buf; + + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + + tail = wbits - wbit; + op = tail < bits ? tail : bits; + + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + break; + } + buf = (ulong *)bh->b_data; + + lock_buffer(bh); + + __bitmap_set(buf, wbit, op); + wnd->free_bits[iw] -= op; + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + put_bh(bh); + + wnd->total_zeroes -= op; + bits -= op; + wbit = 0; + iw += 1; + } + + if (!RB_EMPTY_ROOT(&wnd->start_tree)) + wnd_remove_free_ext(wnd, bit, bits0); + + return err; +} + +/* + * wnd_is_free_hlp + * + * Return: True if all clusters [bit, bit+bits) are free (bitmap only). + */ +static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) +{ + struct super_block *sb = wnd->sb; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); + + while (iw < wnd->nwnd && bits) { + u32 tail, op; + + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + + tail = wbits - wbit; + op = tail < bits ? tail : bits; + + if (wbits != wnd->free_bits[iw]) { + bool ret; + struct buffer_head *bh = wnd_map(wnd, iw); + + if (IS_ERR(bh)) + return false; + + ret = are_bits_clear((ulong *)bh->b_data, wbit, op); + + put_bh(bh); + if (!ret) + return false; + } + + bits -= op; + wbit = 0; + iw += 1; + } + + return true; +} + +/* + * wnd_is_free + * + * Return: True if all clusters [bit, bit+bits) are free. 
+ */ +bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) +{ + bool ret; + struct rb_node *n; + size_t end; + struct e_node *e; + + if (RB_EMPTY_ROOT(&wnd->start_tree)) + goto use_wnd; + + n = rb_lookup(&wnd->start_tree, bit); + if (!n) + goto use_wnd; + + e = rb_entry(n, struct e_node, start.node); + + end = e->start.key + e->count.key; + + if (bit < end && bit + bits <= end) + return true; + +use_wnd: + ret = wnd_is_free_hlp(wnd, bit, bits); + + return ret; +} + +/* + * wnd_is_used + * + * Return: True if all clusters [bit, bit+bits) are used. + */ +bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) +{ + bool ret = false; + struct super_block *sb = wnd->sb; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); + size_t end; + struct rb_node *n; + struct e_node *e; + + if (RB_EMPTY_ROOT(&wnd->start_tree)) + goto use_wnd; + + end = bit + bits; + n = rb_lookup(&wnd->start_tree, end - 1); + if (!n) + goto use_wnd; + + e = rb_entry(n, struct e_node, start.node); + if (e->start.key + e->count.key > bit) + return false; + +use_wnd: + while (iw < wnd->nwnd && bits) { + u32 tail, op; + + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + + tail = wbits - wbit; + op = tail < bits ? tail : bits; + + if (wnd->free_bits[iw]) { + bool ret; + struct buffer_head *bh = wnd_map(wnd, iw); + + if (IS_ERR(bh)) + goto out; + + ret = are_bits_set((ulong *)bh->b_data, wbit, op); + put_bh(bh); + if (!ret) + goto out; + } + + bits -= op; + wbit = 0; + iw += 1; + } + ret = true; + +out: + return ret; +} + +/* + * wnd_find - Look for free space. + * + * - flags - BITMAP_FIND_XXX flags + * + * Return: 0 if not found. 
+ */ +size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint, + size_t flags, size_t *allocated) +{ + struct super_block *sb; + u32 wbits, wpos, wzbit, wzend; + size_t fnd, max_alloc, b_len, b_pos; + size_t iw, prev_tail, nwnd, wbit, ebit, zbit, zend; + size_t to_alloc0 = to_alloc; + const ulong *buf; + const struct e_node *e; + const struct rb_node *pr, *cr; + u8 log2_bits; + bool fbits_valid; + struct buffer_head *bh; + + /* Fast checking for available free space. */ + if (flags & BITMAP_FIND_FULL) { + size_t zeroes = wnd_zeroes(wnd); + + zeroes -= wnd->zone_end - wnd->zone_bit; + if (zeroes < to_alloc0) + goto no_space; + + if (to_alloc0 > wnd->extent_max) + goto no_space; + } else { + if (to_alloc > wnd->extent_max) + to_alloc = wnd->extent_max; + } + + if (wnd->zone_bit <= hint && hint < wnd->zone_end) + hint = wnd->zone_end; + + max_alloc = wnd->nbits; + b_len = b_pos = 0; + + if (hint >= max_alloc) + hint = 0; + + if (RB_EMPTY_ROOT(&wnd->start_tree)) { + if (wnd->uptodated == 1) { + /* Extents tree is updated -> No free space. */ + goto no_space; + } + goto scan_bitmap; + } + + e = NULL; + if (!hint) + goto allocate_biggest; + + /* Use hint: Enumerate extents by start >= hint. */ + pr = NULL; + cr = wnd->start_tree.rb_node; + + for (;;) { + e = rb_entry(cr, struct e_node, start.node); + + if (e->start.key == hint) + break; + + if (e->start.key < hint) { + pr = cr; + cr = cr->rb_right; + if (!cr) + break; + continue; + } + + cr = cr->rb_left; + if (!cr) { + e = pr ? rb_entry(pr, struct e_node, start.node) : NULL; + break; + } + } + + if (!e) + goto allocate_biggest; + + if (e->start.key + e->count.key > hint) { + /* We have found extension with 'hint' inside. 
*/ + size_t len = e->start.key + e->count.key - hint; + + if (len >= to_alloc && hint + to_alloc <= max_alloc) { + fnd = hint; + goto found; + } + + if (!(flags & BITMAP_FIND_FULL)) { + if (len > to_alloc) + len = to_alloc; + + if (hint + len <= max_alloc) { + fnd = hint; + to_alloc = len; + goto found; + } + } + } + +allocate_biggest: + /* Allocate from biggest free extent. */ + e = rb_entry(rb_first(&wnd->count_tree), struct e_node, count.node); + if (e->count.key != wnd->extent_max) + wnd->extent_max = e->count.key; + + if (e->count.key < max_alloc) { + if (e->count.key >= to_alloc) { + ; + } else if (flags & BITMAP_FIND_FULL) { + if (e->count.key < to_alloc0) { + /* Biggest free block is less then requested. */ + goto no_space; + } + to_alloc = e->count.key; + } else if (-1 != wnd->uptodated) { + to_alloc = e->count.key; + } else { + /* Check if we can use more bits. */ + size_t op, max_check; + struct rb_root start_tree; + + memcpy(&start_tree, &wnd->start_tree, + sizeof(struct rb_root)); + memset(&wnd->start_tree, 0, sizeof(struct rb_root)); + + max_check = e->start.key + to_alloc; + if (max_check > max_alloc) + max_check = max_alloc; + for (op = e->start.key + e->count.key; op < max_check; + op++) { + if (!wnd_is_free(wnd, op, 1)) + break; + } + memcpy(&wnd->start_tree, &start_tree, + sizeof(struct rb_root)); + to_alloc = op - e->start.key; + } + + /* Prepare to return. */ + fnd = e->start.key; + if (e->start.key + to_alloc > max_alloc) + to_alloc = max_alloc - e->start.key; + goto found; + } + + if (wnd->uptodated == 1) { + /* Extents tree is updated -> no free space. */ + goto no_space; + } + + b_len = e->count.key; + b_pos = e->start.key; + +scan_bitmap: + sb = wnd->sb; + log2_bits = sb->s_blocksize_bits + 3; + + /* At most two ranges [hint, max_alloc) + [0, hint). */ +Again: + + /* TODO: Optimize request for case nbits > wbits. 
*/ + iw = hint >> log2_bits; + wbits = sb->s_blocksize * 8; + wpos = hint & (wbits - 1); + prev_tail = 0; + fbits_valid = true; + + if (max_alloc == wnd->nbits) { + nwnd = wnd->nwnd; + } else { + size_t t = max_alloc + wbits - 1; + + nwnd = likely(t > max_alloc) ? (t >> log2_bits) : wnd->nwnd; + } + + /* Enumerate all windows. */ + for (; iw < nwnd; iw++) { + wbit = iw << log2_bits; + + if (!wnd->free_bits[iw]) { + if (prev_tail > b_len) { + b_pos = wbit - prev_tail; + b_len = prev_tail; + } + + /* Skip full used window. */ + prev_tail = 0; + wpos = 0; + continue; + } + + if (unlikely(iw + 1 == nwnd)) { + if (max_alloc == wnd->nbits) { + wbits = wnd->bits_last; + } else { + size_t t = max_alloc & (wbits - 1); + + if (t) { + wbits = t; + fbits_valid = false; + } + } + } + + if (wnd->zone_end > wnd->zone_bit) { + ebit = wbit + wbits; + zbit = max(wnd->zone_bit, wbit); + zend = min(wnd->zone_end, ebit); + + /* Here we have a window [wbit, ebit) and zone [zbit, zend). */ + if (zend <= zbit) { + /* Zone does not overlap window. */ + } else { + wzbit = zbit - wbit; + wzend = zend - wbit; + + /* Zone overlaps window. */ + if (wnd->free_bits[iw] == wzend - wzbit) { + prev_tail = 0; + wpos = 0; + continue; + } + + /* Scan two ranges window: [wbit, zbit) and [zend, ebit). */ + bh = wnd_map(wnd, iw); + + if (IS_ERR(bh)) { + /* TODO: Error */ + prev_tail = 0; + wpos = 0; + continue; + } + + buf = (ulong *)bh->b_data; + + /* Scan range [wbit, zbit). */ + if (wpos < wzbit) { + /* Scan range [wpos, zbit). */ + fnd = wnd_scan(buf, wbit, wpos, wzbit, + to_alloc, &prev_tail, + &b_pos, &b_len); + if (fnd != MINUS_ONE_T) { + put_bh(bh); + goto found; + } + } + + prev_tail = 0; + + /* Scan range [zend, ebit). */ + if (wzend < wbits) { + fnd = wnd_scan(buf, wbit, + max(wzend, wpos), wbits, + to_alloc, &prev_tail, + &b_pos, &b_len); + if (fnd != MINUS_ONE_T) { + put_bh(bh); + goto found; + } + } + + wpos = 0; + put_bh(bh); + continue; + } + } + + /* Current window does not overlap zone. 
*/ + if (!wpos && fbits_valid && wnd->free_bits[iw] == wbits) { + /* Window is empty. */ + if (prev_tail + wbits >= to_alloc) { + fnd = wbit + wpos - prev_tail; + goto found; + } + + /* Increase 'prev_tail' and process next window. */ + prev_tail += wbits; + wpos = 0; + continue; + } + + /* Read window. */ + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { + // TODO: Error. + prev_tail = 0; + wpos = 0; + continue; + } + + buf = (ulong *)bh->b_data; + + /* Scan range [wpos, eBits). */ + fnd = wnd_scan(buf, wbit, wpos, wbits, to_alloc, &prev_tail, + &b_pos, &b_len); + put_bh(bh); + if (fnd != MINUS_ONE_T) + goto found; + } + + if (b_len < prev_tail) { + /* The last fragment. */ + b_len = prev_tail; + b_pos = max_alloc - prev_tail; + } + + if (hint) { + /* + * We have scanned range [hint max_alloc). + * Prepare to scan range [0 hint + to_alloc). + */ + size_t nextmax = hint + to_alloc; + + if (likely(nextmax >= hint) && nextmax < max_alloc) + max_alloc = nextmax; + hint = 0; + goto Again; + } + + if (!b_len) + goto no_space; + + wnd->extent_max = b_len; + + if (flags & BITMAP_FIND_FULL) + goto no_space; + + fnd = b_pos; + to_alloc = b_len; + +found: + if (flags & BITMAP_FIND_MARK_AS_USED) { + /* TODO: Optimize remove extent (pass 'e'?). */ + if (wnd_set_used(wnd, fnd, to_alloc)) + goto no_space; + } else if (wnd->extent_max != MINUS_ONE_T && + to_alloc > wnd->extent_max) { + wnd->extent_max = to_alloc; + } + + *allocated = fnd; + return to_alloc; + +no_space: + return 0; +} + +/* + * wnd_extend - Extend bitmap ($MFT bitmap). + */ +int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) +{ + int err; + struct super_block *sb = wnd->sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + u32 blocksize = sb->s_blocksize; + u32 wbits = blocksize * 8; + u32 b0, new_last; + size_t bits, iw, new_wnd; + size_t old_bits = wnd->nbits; + u16 *new_free; + + if (new_bits <= old_bits) + return -EINVAL; + + /* Align to 8 byte boundary. 
+ */ + new_wnd = bytes_to_block(sb, bitmap_size(new_bits)); + new_last = new_bits & (wbits - 1); + if (!new_last) + new_last = wbits; + + if (new_wnd != wnd->nwnd) { + new_free = kmalloc(new_wnd * sizeof(u16), GFP_NOFS); + if (!new_free) + return -ENOMEM; + + if (new_free != wnd->free_bits) + memcpy(new_free, wnd->free_bits, + wnd->nwnd * sizeof(short)); + memset(new_free + wnd->nwnd, 0, + (new_wnd - wnd->nwnd) * sizeof(short)); + kfree(wnd->free_bits); + wnd->free_bits = new_free; + } + + /* Zero bits [old_bits,new_bits). */ + bits = new_bits - old_bits; + b0 = old_bits & (wbits - 1); + + /* Walk every window touched by the new range, one block per pass. */ + for (iw = old_bits >> (sb->s_blocksize_bits + 3); bits; iw += 1) { + u32 op; + size_t frb; + u64 vbo, lbo, bytes; + struct buffer_head *bh; + ulong *buf; + + if (iw + 1 == new_wnd) + wbits = new_last; + + op = b0 + bits > wbits ? wbits - b0 : bits; + vbo = (u64)iw * blocksize; + + err = ntfs_vbo_to_lbo(sbi, &wnd->run, vbo, &lbo, &bytes); + if (err) { + /* + * Report the failure instead of falling through and + * publishing the new size with windows left unzeroed. + */ + return err; + } + + bh = ntfs_bread(sb, lbo >> sb->s_blocksize_bits); + if (!bh) + return -EIO; + + lock_buffer(bh); + buf = (ulong *)bh->b_data; + + __bitmap_clear(buf, b0, blocksize * 8 - b0); + frb = wbits - __bitmap_weight(buf, wbits); + wnd->total_zeroes += frb - wnd->free_bits[iw]; + wnd->free_bits[iw] = frb; + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + /* err = sync_dirty_buffer(bh); */ + + /* Drop the reference taken by ntfs_bread(). */ + put_bh(bh); + + b0 = 0; + bits -= op; + } + + wnd->nbits = new_bits; + wnd->nwnd = new_wnd; + wnd->bits_last = new_last; + + wnd_add_free_ext(wnd, old_bits, new_bits - old_bits, false); + + return 0; +} + +void wnd_zone_set(struct wnd_bitmap *wnd, size_t lcn, size_t len) +{ + size_t zlen; + + zlen = wnd->zone_end - wnd->zone_bit; + if (zlen) + wnd_add_free_ext(wnd, wnd->zone_bit, zlen, false); + + if (!RB_EMPTY_ROOT(&wnd->start_tree) && len) + wnd_remove_free_ext(wnd, lcn, len); + + wnd->zone_bit = lcn; + wnd->zone_end = lcn + len; +} + +int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range) +{ + int err = 0; + struct 
super_block *sb = sbi->sb; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + u32 wbits = 8 * sb->s_blocksize; + CLST len = 0, lcn = 0, done = 0; + CLST minlen = bytes_to_cluster(sbi, range->minlen); + CLST lcn_from = bytes_to_cluster(sbi, range->start); + size_t iw = lcn_from >> (sb->s_blocksize_bits + 3); + u32 wbit = lcn_from & (wbits - 1); + const ulong *buf; + CLST lcn_to; + + if (!minlen) + minlen = 1; + + if (range->len == (u64)-1) + lcn_to = wnd->nbits; + else + lcn_to = bytes_to_cluster(sbi, range->start + range->len); + + down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + + /* + * 'iw' is a window index: it selects wnd->free_bits[iw] and the + * block handed to wnd_map(). Bound it by the window count + * (wnd->nwnd), not by the bit count (wnd->nbits). + */ + for (; iw < wnd->nwnd; iw++, wbit = 0) { + CLST lcn_wnd = iw * wbits; + struct buffer_head *bh; + + if (lcn_wnd > lcn_to) + break; + + if (!wnd->free_bits[iw]) + continue; + + if (iw + 1 == wnd->nwnd) + wbits = wnd->bits_last; + + if (lcn_wnd + wbits > lcn_to) + wbits = lcn_to - lcn_wnd; + + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + break; + } + + buf = (ulong *)bh->b_data; + + /* Accumulate runs of clear (free) bits; discard each finished run. */ + for (; wbit < wbits; wbit++) { + if (!test_bit(wbit, buf)) { + if (!len) + lcn = lcn_wnd + wbit; + len += 1; + continue; + } + if (len >= minlen) { + err = ntfs_discard(sbi, lcn, len); + if (err) { + /* Release the mapped window before bailing out. */ + put_bh(bh); + goto out; + } + done += len; + } + len = 0; + } + put_bh(bh); + } + + /* Process the last fragment. */ + if (len >= minlen) { + err = ntfs_discard(sbi, lcn, len); + if (err) + goto out; + done += len; + } + +out: + /* Report back how many bytes were actually discarded. */ + range->len = (u64)done << sbi->cluster_bits; + + up_read(&wnd->rw_lock); + + return err; +} diff --git a/fs/ntfs3/debug.h b/fs/ntfs3/debug.h new file mode 100644 index 000000000000..31120569a87b --- /dev/null +++ b/fs/ntfs3/debug.h @@ -0,0 +1,52 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * Useful functions for debugging. 
+ * + */ + +// clang-format off +#ifndef _LINUX_NTFS3_DEBUG_H +#define _LINUX_NTFS3_DEBUG_H + +#ifndef Add2Ptr +#define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I))) +#define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B))) +#endif + +#ifdef CONFIG_PRINTK +__printf(2, 3) +void ntfs_printk(const struct super_block *sb, const char *fmt, ...); +__printf(2, 3) +void ntfs_inode_printk(struct inode *inode, const char *fmt, ...); +#else +static inline __printf(2, 3) +void ntfs_printk(const struct super_block *sb, const char *fmt, ...) +{ +} + +static inline __printf(2, 3) +void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) +{ +} +#endif + +/* + * Logging macros. Thanks Joe Perches <joe@perches.com> for implementation. + */ + +#define ntfs_err(sb, fmt, ...) ntfs_printk(sb, KERN_ERR fmt, ##__VA_ARGS__) +#define ntfs_warn(sb, fmt, ...) ntfs_printk(sb, KERN_WARNING fmt, ##__VA_ARGS__) +#define ntfs_info(sb, fmt, ...) ntfs_printk(sb, KERN_INFO fmt, ##__VA_ARGS__) +#define ntfs_notice(sb, fmt, ...) \ + ntfs_printk(sb, KERN_NOTICE fmt, ##__VA_ARGS__) + +#define ntfs_inode_err(inode, fmt, ...) \ + ntfs_inode_printk(inode, KERN_ERR fmt, ##__VA_ARGS__) +#define ntfs_inode_warn(inode, fmt, ...) \ + ntfs_inode_printk(inode, KERN_WARNING fmt, ##__VA_ARGS__) + +#endif /* _LINUX_NTFS3_DEBUG_H */ +// clang-format on diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c new file mode 100644 index 000000000000..93f6d485564e --- /dev/null +++ b/fs/ntfs3/dir.c @@ -0,0 +1,599 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * Directory handling functions for NTFS-based filesystems. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/iversion.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* Convert little endian UTF-16 to NLS string. 
*/ +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, + u8 *buf, int buf_len) +{ + int ret, uni_len, warn; + const __le16 *ip; + u8 *op; + struct nls_table *nls = sbi->options.nls; + + static_assert(sizeof(wchar_t) == sizeof(__le16)); + + if (!nls) { + /* UTF-16 -> UTF-8 */ + ret = utf16s_to_utf8s((wchar_t *)uni->name, uni->len, + UTF16_LITTLE_ENDIAN, buf, buf_len); + buf[ret] = '\0'; + return ret; + } + + ip = uni->name; + op = buf; + uni_len = uni->len; + warn = 0; + + while (uni_len--) { + u16 ec; + int charlen; + char dump[5]; + + if (buf_len < NLS_MAX_CHARSET_SIZE) { + ntfs_warn(sbi->sb, + "filename was truncated while converting."); + break; + } + + ec = le16_to_cpu(*ip++); + charlen = nls->uni2char(ec, op, buf_len); + + if (charlen > 0) { + op += charlen; + buf_len -= charlen; + continue; + } + + *op++ = '_'; + buf_len -= 1; + if (warn) + continue; + + warn = 1; + hex_byte_pack(&dump[0], ec >> 8); + hex_byte_pack(&dump[2], ec); + dump[4] = 0; + + ntfs_err(sbi->sb, "failed to convert \"%s\" to %s", dump, + nls->charset); + } + + *op = '\0'; + return op - buf; +} + +// clang-format off +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff +// clang-format on + +/* + * put_utf16 - Modified version of put_utf16 from fs/nls/nls_base.c + * + * Function is sparse warnings free. + */ +static inline void put_utf16(wchar_t *s, unsigned int c, + enum utf16_endian endian) +{ + static_assert(sizeof(wchar_t) == sizeof(__le16)); + static_assert(sizeof(wchar_t) == sizeof(__be16)); + + switch (endian) { + default: + *s = (wchar_t)c; + break; + case UTF16_LITTLE_ENDIAN: + *(__le16 *)s = __cpu_to_le16(c); + break; + case UTF16_BIG_ENDIAN: + *(__be16 *)s = __cpu_to_be16(c); + break; + } +} + +/* + * _utf8s_to_utf16s + * + * Modified version of 'utf8s_to_utf16s' allows to + * detect -ENAMETOOLONG without writing out of expected maximum. 
+ */ +static int _utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, + wchar_t *pwcs, int maxout) +{ + u16 *op; + int size; + unicode_t u; + + op = pwcs; + while (inlen > 0 && *s) { + if (*s & 0x80) { + size = utf8_to_utf32(s, inlen, &u); + if (size < 0) + return -EINVAL; + s += size; + inlen -= size; + + if (u >= PLANE_SIZE) { + if (maxout < 2) + return -ENAMETOOLONG; + + u -= PLANE_SIZE; + put_utf16(op++, + SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS), + endian); + put_utf16(op++, + SURROGATE_PAIR | SURROGATE_LOW | + (u & SURROGATE_BITS), + endian); + maxout -= 2; + } else { + if (maxout < 1) + return -ENAMETOOLONG; + + put_utf16(op++, u, endian); + maxout--; + } + } else { + if (maxout < 1) + return -ENAMETOOLONG; + + put_utf16(op++, *s++, endian); + inlen--; + maxout--; + } + } + return op - pwcs; +} + +/* + * ntfs_nls_to_utf16 - Convert input string to UTF-16. + * @sbi: Filesystem info; sbi->options.nls selects the NLS table (NULL means the input is UTF-8). + * @name: Input name. + * @name_len: Input name length. + * @uni: Destination memory. + * @max_ulen: Capacity of @uni in UTF-16 code units; -ENAMETOOLONG is returned if it is exceeded. + * @endian: Endian of target UTF-16 string. + * + * This function is called: + * - to create NTFS name + * - to create symlink + * + * Return: UTF-16 string length or error (if negative). 
+ */ +int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, + struct cpu_str *uni, u32 max_ulen, + enum utf16_endian endian) +{ + int ret, slen; + const u8 *end; + struct nls_table *nls = sbi->options.nls; + u16 *uname = uni->name; + + static_assert(sizeof(wchar_t) == sizeof(u16)); + + if (!nls) { + /* utf8 -> utf16 */ + ret = _utf8s_to_utf16s(name, name_len, endian, uname, max_ulen); + uni->len = ret; + return ret; + } + + for (ret = 0, end = name + name_len; name < end; ret++, name += slen) { + if (ret >= max_ulen) + return -ENAMETOOLONG; + + slen = nls->char2uni(name, end - name, uname + ret); + if (!slen) + return -EINVAL; + if (slen < 0) + return slen; + } + +#ifdef __BIG_ENDIAN + if (endian == UTF16_LITTLE_ENDIAN) { + int i = ret; + + while (i--) { + __cpu_to_le16s(uname); + uname++; + } + } +#else + if (endian == UTF16_BIG_ENDIAN) { + int i = ret; + + while (i--) { + __cpu_to_be16s(uname); + uname++; + } + } +#endif + + uni->len = ret; + return ret; +} + +/* + * dir_search_u - Helper function. + */ +struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni, + struct ntfs_fnd *fnd) +{ + int err = 0; + struct super_block *sb = dir->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_inode *ni = ntfs_i(dir); + struct NTFS_DE *e; + int diff; + struct inode *inode = NULL; + struct ntfs_fnd *fnd_a = NULL; + + if (!fnd) { + fnd_a = fnd_get(); + if (!fnd_a) { + err = -ENOMEM; + goto out; + } + fnd = fnd_a; + } + + err = indx_find(&ni->dir, ni, NULL, uni, 0, sbi, &diff, &e, fnd); + + if (err) + goto out; + + if (diff) { + err = -ENOENT; + goto out; + } + + inode = ntfs_iget5(sb, &e->ref, uni); + if (!IS_ERR(inode) && is_bad_inode(inode)) { + iput(inode); + err = -EINVAL; + } +out: + fnd_put(fnd_a); + + return err == -ENOENT ? NULL : err ? 
ERR_PTR(err) : inode; +} + +static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, + const struct NTFS_DE *e, u8 *name, + struct dir_context *ctx) +{ + const struct ATTR_FILE_NAME *fname; + unsigned long ino; + int name_len; + u32 dt_type; + + fname = Add2Ptr(e, sizeof(struct NTFS_DE)); + + if (fname->type == FILE_NAME_DOS) + return 0; + + if (!mi_is_ref(&ni->mi, &fname->home)) + return 0; + + ino = ino_get(&e->ref); + + if (ino == MFT_REC_ROOT) + return 0; + + /* Skip meta files. Unless option to show metafiles is set. */ + if (!sbi->options.showmeta && ntfs_is_meta_file(sbi, ino)) + return 0; + + if (sbi->options.nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) + return 0; + + name_len = ntfs_utf16_to_nls(sbi, (struct le_str *)&fname->name_len, + name, PATH_MAX); + if (name_len <= 0) { + ntfs_warn(sbi->sb, "failed to convert name for inode %lx.", + ino); + return 0; + } + + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + + return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); +} + +/* + * ntfs_read_hdr - Helper function for ntfs_readdir(). + */ +static int ntfs_read_hdr(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, + const struct INDEX_HDR *hdr, u64 vbo, u64 pos, + u8 *name, struct dir_context *ctx) +{ + int err; + const struct NTFS_DE *e; + u32 e_size; + u32 end = le32_to_cpu(hdr->used); + u32 off = le32_to_cpu(hdr->de_off); + + for (;; off += e_size) { + if (off + sizeof(struct NTFS_DE) > end) + return -1; + + e = Add2Ptr(hdr, off); + e_size = le16_to_cpu(e->size); + if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) + return -1; + + if (de_is_last(e)) + return 0; + + /* Skip already enumerated. */ + if (vbo + off < pos) + continue; + + if (le16_to_cpu(e->key_size) < SIZEOF_ATTRIBUTE_FILENAME) + return -1; + + ctx->pos = vbo + off; + + /* Submit the name to the filldir callback. 
*/ + err = ntfs_filldir(sbi, ni, e, name, ctx); + if (err) + return err; + } +} + +/* + * ntfs_readdir - file_operations::iterate_shared + * + * Use non sorted enumeration. + * We have an example of broken volume where sorted enumeration + * counts each name twice. + */ +static int ntfs_readdir(struct file *file, struct dir_context *ctx) +{ + const struct INDEX_ROOT *root; + u64 vbo; + size_t bit; + loff_t eod; + int err = 0; + struct inode *dir = file_inode(file); + struct ntfs_inode *ni = ntfs_i(dir); + struct super_block *sb = dir->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + loff_t i_size = i_size_read(dir); + u32 pos = ctx->pos; + u8 *name = NULL; + struct indx_node *node = NULL; + u8 index_bits = ni->dir.index_bits; + + /* Name is a buffer of PATH_MAX length. */ + static_assert(NTFS_NAME_LEN * 4 < PATH_MAX); + + eod = i_size + sbi->record_size; + + if (pos >= eod) + return 0; + + if (!dir_emit_dots(file, ctx)) + return 0; + + /* Allocate PATH_MAX bytes. */ + name = __getname(); + if (!name) + return -ENOMEM; + + if (!ni->mi_loaded && ni->attr_list.size) { + /* + * Directory inode is locked for read. + * Load all subrecords to avoid 'write' access to 'ni' during + * directory reading. 
+ */ + ni_lock(ni); + if (!ni->mi_loaded && ni->attr_list.size) { + err = ni_load_all_mi(ni); + if (!err) + ni->mi_loaded = true; + } + ni_unlock(ni); + if (err) + goto out; + } + + root = indx_get_root(&ni->dir, ni, NULL, NULL); + if (!root) { + err = -EINVAL; + goto out; + } + + if (pos >= sbi->record_size) { + bit = (pos - sbi->record_size) >> index_bits; + } else { + err = ntfs_read_hdr(sbi, ni, &root->ihdr, 0, pos, name, ctx); + if (err) + goto out; + bit = 0; + } + + if (!i_size) { + ctx->pos = eod; + goto out; + } + + for (;;) { + vbo = (u64)bit << index_bits; + if (vbo >= i_size) { + ctx->pos = eod; + goto out; + } + + err = indx_used_bit(&ni->dir, ni, &bit); + if (err) + goto out; + + if (bit == MINUS_ONE_T) { + ctx->pos = eod; + goto out; + } + + vbo = (u64)bit << index_bits; + if (vbo >= i_size) { + ntfs_inode_err(dir, "Looks like your dir is corrupt"); + err = -EINVAL; + goto out; + } + + err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, + &node); + if (err) + goto out; + + err = ntfs_read_hdr(sbi, ni, &node->index->ihdr, + vbo + sbi->record_size, pos, name, ctx); + if (err) + goto out; + + bit += 1; + } + +out: + + __putname(name); + put_indx_node(node); + + if (err == -ENOENT) { + err = 0; + ctx->pos = pos; + } + + return err; +} + +static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, + size_t *files) +{ + int err = 0; + struct ntfs_inode *ni = ntfs_i(dir); + struct NTFS_DE *e = NULL; + struct INDEX_ROOT *root; + struct INDEX_HDR *hdr; + const struct ATTR_FILE_NAME *fname; + u32 e_size, off, end; + u64 vbo = 0; + size_t drs = 0, fles = 0, bit = 0; + loff_t i_size = ni->vfs_inode.i_size; + struct indx_node *node = NULL; + u8 index_bits = ni->dir.index_bits; + + if (is_empty) + *is_empty = true; + + root = indx_get_root(&ni->dir, ni, NULL, NULL); + if (!root) + return -EINVAL; + + hdr = &root->ihdr; + + for (;;) { + end = le32_to_cpu(hdr->used); + off = le32_to_cpu(hdr->de_off); + + for (; off + sizeof(struct NTFS_DE) <= 
end; off += e_size) { + e = Add2Ptr(hdr, off); + e_size = le16_to_cpu(e->size); + if (e_size < sizeof(struct NTFS_DE) || + off + e_size > end) + break; + + if (de_is_last(e)) + break; + + fname = de_get_fname(e); + if (!fname) + continue; + + if (fname->type == FILE_NAME_DOS) + continue; + + if (is_empty) { + *is_empty = false; + if (!dirs && !files) + goto out; + } + + if (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) + drs += 1; + else + fles += 1; + } + + if (vbo >= i_size) + goto out; + + err = indx_used_bit(&ni->dir, ni, &bit); + if (err) + goto out; + + if (bit == MINUS_ONE_T) + goto out; + + vbo = (u64)bit << index_bits; + if (vbo >= i_size) + goto out; + + err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, + &node); + if (err) + goto out; + + hdr = &node->index->ihdr; + bit += 1; + vbo = (u64)bit << ni->dir.idx2vbn_bits; + } + +out: + put_indx_node(node); + if (dirs) + *dirs = drs; + if (files) + *files = fles; + + return err; +} + +bool dir_is_empty(struct inode *dir) +{ + bool is_empty = false; + + ntfs_dir_count(dir, &is_empty, NULL, NULL); + + return is_empty; +} + +// clang-format off +const struct file_operations ntfs_dir_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = ntfs_readdir, + .fsync = generic_file_fsync, + .open = ntfs_file_open, +}; +// clang-format on diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c new file mode 100644 index 000000000000..424450e77ad5 --- /dev/null +++ b/fs/ntfs3/file.c @@ -0,0 +1,1251 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * Regular file handling primitives for NTFS-based filesystems. 
+ * + */ + +#include <linux/backing-dev.h> +#include <linux/buffer_head.h> +#include <linux/compat.h> +#include <linux/falloc.h> +#include <linux/fiemap.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) +{ + struct fstrim_range __user *user_range; + struct fstrim_range range; + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + user_range = (struct fstrim_range __user *)arg; + if (copy_from_user(&range, user_range, sizeof(range))) + return -EFAULT; + + range.minlen = max_t(u32, range.minlen, q->limits.discard_granularity); + + err = ntfs_trim_fs(sbi, &range); + if (err < 0) + return err; + + if (copy_to_user(user_range, &range, sizeof(range))) + return -EFAULT; + + return 0; +} + +static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + + switch (cmd) { + case FITRIM: + return ntfs_ioctl_fitrim(sbi, arg); + } + return -ENOTTY; /* Inappropriate ioctl for device. 
*/ +} + +#ifdef CONFIG_COMPAT +static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg) + +{ + return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)); +} +#endif + +/* + * ntfs_getattr - inode_operations::getattr + */ +int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, u32 flags) +{ + struct inode *inode = d_inode(path->dentry); + struct ntfs_inode *ni = ntfs_i(inode); + + if (is_compressed(ni)) + stat->attributes |= STATX_ATTR_COMPRESSED; + + if (is_encrypted(ni)) + stat->attributes |= STATX_ATTR_ENCRYPTED; + + stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED; + + generic_fillattr(mnt_userns, inode, stat); + + stat->result_mask |= STATX_BTIME; + stat->btime = ni->i_crtime; + stat->blksize = ni->mi.sbi->cluster_size; /* 512, 1K, ..., 2M */ + + return 0; +} + +static int ntfs_extend_initialized_size(struct file *file, + struct ntfs_inode *ni, + const loff_t valid, + const loff_t new_valid) +{ + struct inode *inode = &ni->vfs_inode; + struct address_space *mapping = inode->i_mapping; + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + loff_t pos = valid; + int err; + + if (is_resident(ni)) { + ni->i_valid = new_valid; + return 0; + } + + WARN_ON(is_compressed(ni)); + WARN_ON(valid >= new_valid); + + for (;;) { + u32 zerofrom, len; + struct page *page; + void *fsdata; + u8 bits; + CLST vcn, lcn, clen; + + if (is_sparsed(ni)) { + bits = sbi->cluster_bits; + vcn = pos >> bits; + + err = attr_data_get_block(ni, vcn, 0, &lcn, &clen, + NULL); + if (err) + goto out; + + if (lcn == SPARSE_LCN) { + loff_t vbo = (loff_t)vcn << bits; + loff_t to = vbo + ((loff_t)clen << bits); + + if (to <= new_valid) { + ni->i_valid = to; + pos = to; + goto next; + } + + if (vbo < pos) { + pos = vbo; + } else { + to = (new_valid >> bits) << bits; + if (pos < to) { + ni->i_valid = to; + pos = to; + goto next; + } + } + } + } + + zerofrom = pos & (PAGE_SIZE - 1); + len = PAGE_SIZE 
- zerofrom; + + if (pos + len > new_valid) + len = new_valid - pos; + + err = pagecache_write_begin(file, mapping, pos, len, 0, &page, + &fsdata); + if (err) + goto out; + + zero_user_segment(page, zerofrom, PAGE_SIZE); + + /* This function in any case puts page. */ + err = pagecache_write_end(file, mapping, pos, len, len, page, + fsdata); + if (err < 0) + goto out; + pos += len; + +next: + if (pos >= new_valid) + break; + + balance_dirty_pages_ratelimited(mapping); + cond_resched(); + } + + return 0; + +out: + ni->i_valid = valid; + ntfs_inode_warn(inode, "failed to extend initialized size to %llx.", + new_valid); + return err; +} + +/* + * ntfs_zero_range - Helper function for punch_hole. + * + * It zeroes a range [vbo, vbo_to). + */ +static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) +{ + int err = 0; + struct address_space *mapping = inode->i_mapping; + u32 blocksize = 1 << inode->i_blkbits; + pgoff_t idx = vbo >> PAGE_SHIFT; + u32 z_start = vbo & (PAGE_SIZE - 1); + pgoff_t idx_end = (vbo_to + PAGE_SIZE - 1) >> PAGE_SHIFT; + loff_t page_off; + struct buffer_head *head, *bh; + u32 bh_next, bh_off, z_end; + sector_t iblock; + struct page *page; + + for (; idx < idx_end; idx += 1, z_start = 0) { + page_off = (loff_t)idx << PAGE_SHIFT; + z_end = (page_off + PAGE_SIZE) > vbo_to ? (vbo_to - page_off) + : PAGE_SIZE; + iblock = page_off >> inode->i_blkbits; + + page = find_or_create_page(mapping, idx, + mapping_gfp_constraint(mapping, + ~__GFP_FS)); + if (!page) + return -ENOMEM; + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + + bh = head = page_buffers(page); + bh_off = 0; + do { + bh_next = bh_off + blocksize; + + if (bh_next <= z_start || bh_off >= z_end) + continue; + + if (!buffer_mapped(bh)) { + ntfs_get_block(inode, iblock, bh, 0); + /* Unmapped? It's a hole - nothing to do. */ + if (!buffer_mapped(bh)) + continue; + } + + /* Ok, it's mapped. Make sure it's up-to-date. 
*/ + if (PageUptodate(page)) + set_buffer_uptodate(bh); + + if (!buffer_uptodate(bh)) { + lock_buffer(bh); + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(REQ_OP_READ, 0, bh); + + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + unlock_page(page); + put_page(page); + err = -EIO; + goto out; + } + } + + mark_buffer_dirty(bh); + + } while (bh_off = bh_next, iblock += 1, + head != (bh = bh->b_this_page)); + + zero_user_segment(page, z_start, z_end); + + unlock_page(page); + put_page(page); + cond_resched(); + } +out: + mark_inode_dirty(inode); + return err; +} + +/* + * ntfs_sparse_cluster - Helper function to zero a new allocated clusters. + * + * NOTE: 512 <= cluster size <= 2M + */ +void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, + CLST len) +{ + struct address_space *mapping = inode->i_mapping; + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + u64 vbo = (u64)vcn << sbi->cluster_bits; + u64 bytes = (u64)len << sbi->cluster_bits; + u32 blocksize = 1 << inode->i_blkbits; + pgoff_t idx0 = page0 ? page0->index : -1; + loff_t vbo_clst = vbo & sbi->cluster_mask_inv; + loff_t end = ntfs_up_cluster(sbi, vbo + bytes); + pgoff_t idx = vbo_clst >> PAGE_SHIFT; + u32 from = vbo_clst & (PAGE_SIZE - 1); + pgoff_t idx_end = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; + loff_t page_off; + u32 to; + bool partial; + struct page *page; + + for (; idx < idx_end; idx += 1, from = 0) { + page = idx == idx0 ? page0 : grab_cache_page(mapping, idx); + + if (!page) + continue; + + page_off = (loff_t)idx << PAGE_SHIFT; + to = (page_off + PAGE_SIZE) > end ? 
(end - page_off) + : PAGE_SIZE; + partial = false; + + if ((from || PAGE_SIZE != to) && + likely(!page_has_buffers(page))) { + create_empty_buffers(page, blocksize, 0); + } + + if (page_has_buffers(page)) { + struct buffer_head *head, *bh; + u32 bh_off = 0; + + bh = head = page_buffers(page); + do { + u32 bh_next = bh_off + blocksize; + + if (from <= bh_off && bh_next <= to) { + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + } else if (!buffer_uptodate(bh)) { + partial = true; + } + bh_off = bh_next; + } while (head != (bh = bh->b_this_page)); + } + + zero_user_segment(page, from, to); + + if (!partial) { + if (!PageUptodate(page)) + SetPageUptodate(page); + set_page_dirty(page); + } + + if (idx != idx0) { + unlock_page(page); + put_page(page); + } + cond_resched(); + } + mark_inode_dirty(inode); +} + +/* + * ntfs_file_mmap - file_operations::mmap + */ +static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + u64 from = ((u64)vma->vm_pgoff << PAGE_SHIFT); + bool rw = vma->vm_flags & VM_WRITE; + int err; + + if (is_encrypted(ni)) { + ntfs_inode_warn(inode, "mmap encrypted not supported"); + return -EOPNOTSUPP; + } + + if (is_dedup(ni)) { + ntfs_inode_warn(inode, "mmap deduplicated not supported"); + return -EOPNOTSUPP; + } + + if (is_compressed(ni) && rw) { + ntfs_inode_warn(inode, "mmap(write) compressed not supported"); + return -EOPNOTSUPP; + } + + if (rw) { + u64 to = min_t(loff_t, i_size_read(inode), + from + vma->vm_end - vma->vm_start); + + if (is_sparsed(ni)) { + /* Allocate clusters for rw map. 
*/ + struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; + CLST lcn, len; + CLST vcn = from >> sbi->cluster_bits; + CLST end = bytes_to_cluster(sbi, to); + bool new; + + for (; vcn < end; vcn += len) { + err = attr_data_get_block(ni, vcn, 1, &lcn, + &len, &new); + if (err) + goto out; + + if (!new) + continue; + ntfs_sparse_cluster(inode, NULL, vcn, 1); + } + } + + if (ni->i_valid < to) { + if (!inode_trylock(inode)) { + err = -EAGAIN; + goto out; + } + err = ntfs_extend_initialized_size(file, ni, + ni->i_valid, to); + inode_unlock(inode); + if (err) + goto out; + } + } + + err = generic_file_mmap(file, vma); +out: + return err; +} + +static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, + struct file *file) +{ + struct ntfs_inode *ni = ntfs_i(inode); + struct address_space *mapping = inode->i_mapping; + loff_t end = pos + count; + bool extend_init = file && pos > ni->i_valid; + int err; + + if (end <= inode->i_size && !extend_init) + return 0; + + /* Mark rw ntfs as dirty. It will be cleared at umount. 
*/ + ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_DIRTY); + + if (end > inode->i_size) { + err = ntfs_set_size(inode, end); + if (err) + goto out; + inode->i_size = end; + } + + if (extend_init && !is_compressed(ni)) { + err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos); + if (err) + goto out; + } else { + err = 0; + } + + inode->i_ctime = inode->i_mtime = current_time(inode); + mark_inode_dirty(inode); + + if (IS_SYNC(inode)) { + int err2; + + err = filemap_fdatawrite_range(mapping, pos, end - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) + err = filemap_fdatawait_range(mapping, pos, end - 1); + } + +out: + return err; +} + +static int ntfs_truncate(struct inode *inode, loff_t new_size) +{ + struct super_block *sb = inode->i_sb; + struct ntfs_inode *ni = ntfs_i(inode); + int err, dirty = 0; + u64 new_valid; + + if (!S_ISREG(inode->i_mode)) + return 0; + + if (is_compressed(ni)) { + if (ni->i_valid > new_size) + ni->i_valid = new_size; + } else { + err = block_truncate_page(inode->i_mapping, new_size, + ntfs_get_block); + if (err) + return err; + } + + new_valid = ntfs_up_block(sb, min_t(u64, ni->i_valid, new_size)); + + ni_lock(ni); + + truncate_setsize(inode, new_size); + + down_write(&ni->file.run_lock); + err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size, + &new_valid, true, NULL); + up_write(&ni->file.run_lock); + + if (new_valid < ni->i_valid) + ni->i_valid = new_valid; + + ni_unlock(ni); + + ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; + inode->i_ctime = inode->i_mtime = current_time(inode); + if (!IS_DIRSYNC(inode)) { + dirty = 1; + } else { + err = ntfs_sync_inode(inode); + if (err) + return err; + } + + if (dirty) + mark_inode_dirty(inode); + + /*ntfs_flush_inodes(inode->i_sb, inode, NULL);*/ + + return 0; +} + +/* + * ntfs_fallocate + * + * Preallocate space for a file. 
This implements ntfs's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests 'len' bytes at 'vbo'. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. + */
+static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	struct ntfs_inode *ni = ntfs_i(inode);
+	loff_t end = vbo + len;
+	loff_t vbo_down = round_down(vbo, PAGE_SIZE);
+	loff_t i_size;
+	int err;
+
+	/* No support for dir. */
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	/* Return error if mode is not supported. */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
+		     FALLOC_FL_COLLAPSE_RANGE)) {
+		ntfs_inode_warn(inode, "fallocate(0x%x) is not supported",
+				mode);
+		return -EOPNOTSUPP;
+	}
+
+	ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+
+	inode_lock(inode);	/* held until the 'out' label */
+	i_size = inode->i_size;	/* size on entry, reused by the KEEP_SIZE path */
+
+	if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
+		/* Should never be here, see ntfs_file_open. */
+		err = -EOPNOTSUPP;
+		goto out;
+	}
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		u32 frame_size;
+		loff_t mask, vbo_a, end_a, tmp;
+
+		/* PUNCH_HOLE is only accepted together with KEEP_SIZE. */
+		if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		err = filemap_write_and_wait_range(inode->i_mapping, vbo,
+						   end - 1);
+		if (err)
+			goto out;
+
+		err = filemap_write_and_wait_range(inode->i_mapping, end,
+						   LLONG_MAX);
+		if (err)
+			goto out;
+
+		inode_dio_wait(inode);
+
+		truncate_pagecache(inode, vbo_down);
+
+		if (!is_sparsed(ni) && !is_compressed(ni)) {
+			/*
+			 * Normal file.
+			 * No attr_punch_hole() here: the byte range is only
+			 * zeroed and the request is finished.
+			 */
+			err = ntfs_zero_range(inode, vbo, end);
+			goto out;
+		}
+
+		ni_lock(ni);
+		err = attr_punch_hole(ni, vbo, len, &frame_size);
+		ni_unlock(ni);
+		if (err != E_NTFS_NOTALIGNED)
+			goto out;
+
+		/*
+		 * Process not aligned punch.
+		 * 'frame_size' was filled in by attr_punch_hole() above.
+		 * The unaligned head [vbo, vbo_a) and tail [end_a, end) are
+		 * zeroed with ntfs_zero_range(); only the frame aligned
+		 * middle [vbo_a, end_a) is punched.
+		 */
+		mask = frame_size - 1;
+		vbo_a = (vbo + mask) & ~mask;
+		end_a = end & ~mask;
+
+		tmp = min(vbo_a, end);
+		if (tmp > vbo) {
+			err = ntfs_zero_range(inode, vbo, tmp);
+			if (err)
+				goto out;
+		}
+
+		if (vbo < end_a && end_a < end) {
+			err = ntfs_zero_range(inode, end_a, end);
+			if (err)
+				goto out;
+		}
+
+		/* Aligned punch_hole */
+		if (end_a > vbo_a) {
+			ni_lock(ni);
+			err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL);
+			ni_unlock(ni);
+		}
+	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
+		/* COLLAPSE_RANGE may not be combined with any other flag. */
+		if (mode & ~FALLOC_FL_COLLAPSE_RANGE) {
+			err = -EINVAL;
+			goto out;
+		}
+
+		/*
+		 * Write tail of the last page before removed range since
+		 * it will get removed from the page cache below.
+		 */
+		err = filemap_write_and_wait_range(inode->i_mapping, vbo_down,
+						   vbo);
+		if (err)
+			goto out;
+
+		/*
+		 * Write data that will be shifted to preserve them
+		 * when discarding page cache below.
+		 */
+		err = filemap_write_and_wait_range(inode->i_mapping, end,
+						   LLONG_MAX);
+		if (err)
+			goto out;
+
+		/* Wait for existing dio to complete. */
+		inode_dio_wait(inode);
+
+		truncate_pagecache(inode, vbo_down);
+
+		ni_lock(ni);
+		err = attr_collapse_range(ni, vbo, len);
+		ni_unlock(ni);
+	} else {
+		/*
+		 * Normal file: Allocate clusters, do not change 'valid' size.
+		 */
+		err = ntfs_set_size(inode, max(end, i_size));
+		if (err)
+			goto out;
+
+		if (is_sparsed(ni) || is_compressed(ni)) {
+			CLST vcn_v = ni->i_valid >> sbi->cluster_bits;
+			CLST vcn = vbo >> sbi->cluster_bits;
+			CLST cend = bytes_to_cluster(sbi, end);
+			CLST lcn, clen;
+			bool new;
+
+			/*
+			 * Allocate but do not zero new clusters. (see below comments)
+			 * This breaks security: One can read unused on-disk areas.
+			 * Zeroing these clusters may be too long.
+			 * Maybe we should check here for root rights?
+			 */
+			for (; vcn < cend; vcn += clen) {
+				err = attr_data_get_block(ni, vcn, cend - vcn,
+							  &lcn, &clen, &new);
+				if (err)
+					goto out;
+				if (!new || vcn >= vcn_v)
+					continue;
+
+				/*
+				 * Unwritten area.
+				 * NTFS is not able to store several unwritten areas.
+				 * Activate 'ntfs_sparse_cluster' to zero new allocated clusters.
+				 *
+				 * Dangerous in case:
+				 * 1G of sparsed clusters + 1 cluster of data =>
+				 * valid_size == 1G + 1 cluster
+				 * fallocate(1G) will zero 1G and this can be very long
+				 * xfstest 016/086 will fail without 'ntfs_sparse_cluster'.
+				 */
+				ntfs_sparse_cluster(inode, NULL, vcn,
+						    min(vcn_v - vcn, clen));
+			}
+		}
+
+		if (mode & FALLOC_FL_KEEP_SIZE) {
+			ni_lock(ni);
+			/*
+			 * True - Keep preallocated.
+			 * The attribute is set back to the i_size sampled on
+			 * entry to this function.
+			 */
+			err = attr_set_size(ni, ATTR_DATA, NULL, 0,
+					    &ni->file.run, i_size, &ni->i_valid,
+					    true, NULL);
+			ni_unlock(ni);
+		}
+	}
+
+out:
+	if (err == -EFBIG)	/* handed to the caller as -ENOSPC */
+		err = -ENOSPC;
+
+	if (!err) {	/* timestamps are touched on success only */
+		inode->i_ctime = inode->i_mtime = current_time(inode);
+		mark_inode_dirty(inode);
+	}
+
+	inode_unlock(inode);
+	return err;
+}
+
+/*
+ * ntfs3_setattr - inode_operations::setattr
+ *
+ * When sbi->options.no_acs_rules is set, ATTR_FORCE is added and
+ * ATTR_UID/ATTR_GID/ATTR_MODE are masked out before setattr_prepare() runs.
+ * A smaller ia_size goes to ntfs_truncate(), a bigger one to ntfs_extend();
+ * an equal size changes nothing. A mode change is passed to
+ * ntfs_acl_chmod() and mirrored into FILE_ATTRIBUTE_READONLY.
+ *
+ * Return: 0 on success, otherwise the first error hit.
+ */
+int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+		  struct iattr *attr)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	struct inode *inode = d_inode(dentry);
+	struct ntfs_inode *ni = ntfs_i(inode);
+	u32 ia_valid = attr->ia_valid;
+	umode_t mode = inode->i_mode;	/* mode before setattr_copy() */
+	int err;
+
+	if (sbi->options.no_acs_rules) {
+		/* "No access rules" - Force any changes of time etc. */
+		attr->ia_valid |= ATTR_FORCE;
+		/* and disable for editing some attributes. */
+		attr->ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE);
+		ia_valid = attr->ia_valid;
+	}
+
+	err = setattr_prepare(mnt_userns, dentry, attr);
+	if (err)
+		goto out;
+
+	if (ia_valid & ATTR_SIZE) {
+		loff_t oldsize = inode->i_size;
+
+		if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
+			/* Should never be here, see ntfs_file_open(). */
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+		inode_dio_wait(inode);
+
+		if (attr->ia_size < oldsize)
+			err = ntfs_truncate(inode, attr->ia_size);
+		else if (attr->ia_size > oldsize)
+			err = ntfs_extend(inode, attr->ia_size, 0, NULL);
+
+		if (err)
+			goto out;
+
+		ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
+	}
+
+	setattr_copy(mnt_userns, inode, attr);
+
+	if (mode != inode->i_mode) {
+		err = ntfs_acl_chmod(mnt_userns, inode);
+		if (err)
+			goto out;
+
+		/* Linux 'w' -> Windows 'ro'. */
+		if (0222 & inode->i_mode)
+			ni->std_fa &= ~FILE_ATTRIBUTE_READONLY;
+		else
+			ni->std_fa |= FILE_ATTRIBUTE_READONLY;
+	}
+
+	if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE))
+		ntfs_save_wsl_perm(inode);
+	mark_inode_dirty(inode);
+out:
+	return err;
+}
+
+static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	struct ntfs_inode *ni = ntfs_i(inode);
+
+	if (is_encrypted(ni)) {
+		ntfs_inode_warn(inode, "encrypted i/o not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) {
+		ntfs_inode_warn(inode, "direct i/o + compressed not supported");
+		return -EOPNOTSUPP;
+	}
+
+#ifndef CONFIG_NTFS3_LZX_XPRESS
+	if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) {
+		ntfs_inode_warn(
+			inode,
+			"activate CONFIG_NTFS3_LZX_XPRESS to read external compressed files");
+		return -EOPNOTSUPP;
+	}
+#endif
+
+	if (is_dedup(ni)) {
+		ntfs_inode_warn(inode, "read deduplicated not supported");
+		return -EOPNOTSUPP;
+	}
+
+	/* Every unsupported case was rejected above with -EOPNOTSUPP. */
+	return generic_file_read_iter(iocb, iter);
+}
+
+/* + * ntfs_get_frame_pages + * + * Return: Array of locked pages. 
+ */
+static int ntfs_get_frame_pages(struct address_space *mapping, pgoff_t index,
+				struct page **pages, u32 pages_per_frame,
+				bool *frame_uptodate)
+{
+	gfp_t gfp_mask = mapping_gfp_mask(mapping);
+	u32 npages;
+
+	*frame_uptodate = true;	/* cleared below by any page that is not uptodate */
+
+	for (npages = 0; npages < pages_per_frame; npages++, index++) {
+		struct page *page;
+
+		page = find_or_create_page(mapping, index, gfp_mask);
+		if (!page) {
+			while (npages--) {	/* undo: release the pages taken so far */
+				page = pages[npages];
+				unlock_page(page);
+				put_page(page);
+			}
+
+			return -ENOMEM;
+		}
+
+		if (!PageUptodate(page))
+			*frame_uptodate = false;
+
+		pages[npages] = page;
+	}
+
+	return 0;
+}
+
+/*
+ * ntfs_compress_write - Helper for ntfs_file_write_iter() (compressed files).
+ *
+ * Works one frame at a time, a frame being
+ * 1 << (NTFS_LZNT_CUNIT + cluster_bits) bytes. First the frames between the
+ * current valid size and the write position are zero filled, then the user
+ * data is copied in; every touched frame is written with ni_write_frame().
+ * iocb->ki_pos and ni->i_valid are advanced by what was written.
+ *
+ * Return: Number of bytes written, or a negative error code.
+ */
+static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	int err;
+	struct file *file = iocb->ki_filp;
+	size_t count = iov_iter_count(from);
+	loff_t pos = iocb->ki_pos;
+	struct inode *inode = file_inode(file);
+	loff_t i_size = inode->i_size;
+	struct address_space *mapping = inode->i_mapping;
+	struct ntfs_inode *ni = ntfs_i(inode);
+	u64 valid = ni->i_valid;
+	struct ntfs_sb_info *sbi = ni->mi.sbi;
+	struct page *page, **pages = NULL;
+	size_t written = 0;
+	u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits;
+	u32 frame_size = 1u << frame_bits;
+	u32 pages_per_frame = frame_size >> PAGE_SHIFT;
+	u32 ip, off;
+	CLST frame;
+	u64 frame_vbo;
+	pgoff_t index;
+	bool frame_uptodate;
+
+	if (frame_size < PAGE_SIZE) {
+		/*
+		 * frame_size == 8K if cluster 512
+		 * frame_size == 64K if cluster 4096
+		 */
+		ntfs_inode_warn(inode, "page size is bigger than frame size");
+		return -EOPNOTSUPP;
+	}
+
+	pages = kmalloc_array(pages_per_frame, sizeof(struct page *), GFP_NOFS);
+	if (!pages)
+		return -ENOMEM;
+
+	current->backing_dev_info = inode_to_bdi(inode);	/* reset at 'out' */
+	err = file_remove_privs(file);
+	if (err)
+		goto out;
+
+	err = file_update_time(file);
+	if (err)
+		goto out;
+
+	/*
+	 * Zero range [valid : pos).
+	 * Sparse frames are skipped by moving 'valid' forward; other frames
+	 * are zeroed from 'valid' to the frame end and written out.
+	 */
+	while (valid < pos) {
+		CLST lcn, clen;
+
+		frame = valid >> frame_bits;
+		frame_vbo = valid & ~(frame_size - 1);
+		off = valid & (frame_size - 1);
+
+		err = attr_data_get_block(ni, frame << NTFS_LZNT_CUNIT, 0, &lcn,
+					  &clen, NULL);
+		if (err)
+			goto out;
+
+		if (lcn == SPARSE_LCN) {
+			ni->i_valid = valid =
+				frame_vbo + ((u64)clen << sbi->cluster_bits);
+			continue;
+		}
+
+		/* Load full frame. */
+		err = ntfs_get_frame_pages(mapping, frame_vbo >> PAGE_SHIFT,
+					   pages, pages_per_frame,
+					   &frame_uptodate);
+		if (err)
+			goto out;
+
+		/* Old data is only needed if the frame keeps bytes before 'off'. */
+		if (!frame_uptodate && off) {
+			err = ni_read_frame(ni, frame_vbo, pages,
+					    pages_per_frame);
+			if (err) {
+				for (ip = 0; ip < pages_per_frame; ip++) {
+					page = pages[ip];
+					unlock_page(page);
+					put_page(page);
+				}
+				goto out;
+			}
+		}
+
+		ip = off >> PAGE_SHIFT;
+		off = offset_in_page(valid);
+		for (; ip < pages_per_frame; ip++, off = 0) {
+			page = pages[ip];
+			zero_user_segment(page, off, PAGE_SIZE);
+			flush_dcache_page(page);
+			SetPageUptodate(page);
+		}
+
+		ni_lock(ni);
+		err = ni_write_frame(ni, pages, pages_per_frame);
+		ni_unlock(ni);
+
+		/* Pages are released before 'err' is checked. */
+		for (ip = 0; ip < pages_per_frame; ip++) {
+			page = pages[ip];
+			SetPageUptodate(page);
+			unlock_page(page);
+			put_page(page);
+		}
+
+		if (err)
+			goto out;
+
+		ni->i_valid = valid = frame_vbo + frame_size;
+	}
+
+	/*
+	 * Copy user data [pos : pos + count).
+	 * Each pass handles at most the rest of one frame.
+	 */
+	while (count) {
+		size_t copied, bytes;
+
+		off = pos & (frame_size - 1);
+		bytes = frame_size - off;
+		if (bytes > count)
+			bytes = count;
+
+		frame = pos >> frame_bits;
+		frame_vbo = pos & ~(frame_size - 1);
+		index = frame_vbo >> PAGE_SHIFT;
+
+		if (unlikely(iov_iter_fault_in_readable(from, bytes))) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		/* Load full frame. */
+		err = ntfs_get_frame_pages(mapping, index, pages,
+					   pages_per_frame, &frame_uptodate);
+		if (err)
+			goto out;
+
+		if (!frame_uptodate) {
+			loff_t to = pos + bytes;
+
+			/* Read first unless the write covers the whole frame (or its tail up to i_size). */
+			if (off || (to < i_size && (to & (frame_size - 1)))) {
+				err = ni_read_frame(ni, frame_vbo, pages,
+						    pages_per_frame);
+				if (err) {
+					for (ip = 0; ip < pages_per_frame;
+					     ip++) {
+						page = pages[ip];
+						unlock_page(page);
+						put_page(page);
+					}
+					goto out;
+				}
+			}
+		}
+
+		WARN_ON(!bytes);
+		copied = 0;
+		ip = off >> PAGE_SHIFT;
+		off = offset_in_page(pos);
+
+		/* Copy user data to pages. */
+		for (;;) {
+			size_t cp, tail = PAGE_SIZE - off;
+
+			page = pages[ip];
+			cp = copy_page_from_iter_atomic(page, off,
+							min(tail, bytes), from);
+			flush_dcache_page(page);
+
+			copied += cp;
+			bytes -= cp;
+			if (!bytes || !cp)	/* done, or nothing could be copied */
+				break;
+
+			if (cp < tail) {
+				off += cp;
+			} else {
+				ip++;
+				off = 0;
+			}
+		}
+
+		ni_lock(ni);
+		err = ni_write_frame(ni, pages, pages_per_frame);
+		ni_unlock(ni);
+
+		for (ip = 0; ip < pages_per_frame; ip++) {
+			page = pages[ip];
+			ClearPageDirty(page);
+			SetPageUptodate(page);
+			unlock_page(page);
+			put_page(page);
+		}
+
+		if (err)
+			goto out;
+
+		/*
+		 * We can loop for a long time in here. Be nice and allow
+		 * us to schedule out to avoid softlocking if preempt
+		 * is disabled.
+		 */
+		cond_resched();
+
+		pos += copied;
+		written += copied;
+
+		count = iov_iter_count(from);
+	}
+
+out:
+	kfree(pages);
+
+	current->backing_dev_info = NULL;
+
+	if (err < 0)
+		return err;
+
+	iocb->ki_pos += written;
+	if (iocb->ki_pos > ni->i_valid)
+		ni->i_valid = iocb->ki_pos;
+
+	return written;
+}
+
+/*
+ * ntfs_file_write_iter - file_operations::write_iter
+ *
+ * Rejects encrypted and deduplicated files and direct i/o on compressed
+ * files with -EOPNOTSUPP. Takes the inode lock, returning -EAGAIN for
+ * IOCB_NOWAIT when it is contended, extends the file with ntfs_extend() and
+ * hands the data to ntfs_compress_write() or __generic_file_write_iter().
+ */
+static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	ssize_t ret;
+	struct ntfs_inode *ni = ntfs_i(inode);
+
+	if (is_encrypted(ni)) {
+		ntfs_inode_warn(inode, "encrypted i/o not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) {
+		ntfs_inode_warn(inode, "direct i/o + compressed not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (is_dedup(ni)) {
+		ntfs_inode_warn(inode, "write into deduplicated not supported");
+		return -EOPNOTSUPP;
+	}
+
+	if (!inode_trylock(inode)) {
+		if (iocb->ki_flags & IOCB_NOWAIT)
+			return -EAGAIN;
+		inode_lock(inode);
+	}
+
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
+		/* Should never be here, see ntfs_file_open(). */
+		ret = -EOPNOTSUPP;
+		goto out;
+	}
+
+	/* 'ret' still holds the byte count from generic_write_checks(). */
+	ret = ntfs_extend(inode, iocb->ki_pos, ret, file);
+	if (ret)
+		goto out;
+
+	ret = is_compressed(ni) ? ntfs_compress_write(iocb, from)
+				: __generic_file_write_iter(iocb, from);
+
+out:
+	inode_unlock(inode);
+
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+
+	return ret;
+}
+
+/*
+ * ntfs_file_open - file_operations::open
+ *
+ * O_DIRECT on a compressed or encrypted file fails with -EOPNOTSUPP.
+ */
+int ntfs_file_open(struct inode *inode, struct file *file)
+{
+	struct ntfs_inode *ni = ntfs_i(inode);
+
+	if (unlikely((is_compressed(ni) || is_encrypted(ni)) &&
+		     (file->f_flags & O_DIRECT))) {
+		return -EOPNOTSUPP;
+	}
+
+	/* Decompress "external compressed" file if opened for rw. */
+	if ((ni->ni_flags & NI_FLAG_COMPRESSED_MASK) &&
+	    (file->f_flags & (O_WRONLY | O_RDWR | O_TRUNC))) {
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+		int err = ni_decompress_file(ni);
+
+		if (err)
+			return err;
+#else
+		ntfs_inode_warn(
+			inode,
+			"activate CONFIG_NTFS3_LZX_XPRESS to write external compressed files");
+		return -EOPNOTSUPP;
+#endif
+	}
+
+	return generic_file_open(inode, file);
+}
+
+/*
+ * ntfs_file_release - file_operations::release
+ *
+ * Return: 0, or the error from attr_set_size() when preallocation is dropped.
+ */
+static int ntfs_file_release(struct inode *inode, struct file *file)
+{
+	struct ntfs_inode *ni = ntfs_i(inode);
+	struct ntfs_sb_info *sbi = ni->mi.sbi;
+	int err = 0;
+
+	/* If we are last writer on the inode, drop the block reservation. */
+	if (sbi->options.prealloc && ((file->f_mode & FMODE_WRITE) &&
+				      atomic_read(&inode->i_writecount) == 1)) {
+		ni_lock(ni);
+		down_write(&ni->file.run_lock);
+
+		err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run,
+				    inode->i_size, &ni->i_valid, false, NULL);
+
+		up_write(&ni->file.run_lock);
+		ni_unlock(ni);
+	}
+	return err;
+}
+
+/*
+ * ntfs_fiemap - inode_operations::fiemap
+ *
+ * FIEMAP_FLAG_XATTR is not accepted; the mapping itself comes from
+ * ni_fiemap(), called under ni_lock.
+ */
+int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+		__u64 start, __u64 len)
+{
+	int err;
+	struct ntfs_inode *ni = ntfs_i(inode);
+
+	err = fiemap_prep(inode, fieinfo, start, &len, ~FIEMAP_FLAG_XATTR);
+	if (err)
+		return err;
+
+	ni_lock(ni);
+
+	err = ni_fiemap(ni, fieinfo, start, len);
+
+	ni_unlock(ni);
+
+	return err;
+}
+
+// clang-format off +const struct inode_operations ntfs_file_inode_operations = { + .getattr = ntfs_getattr, + .setattr = ntfs3_setattr, + .listxattr = ntfs_listxattr, + .permission = ntfs_permission, + .get_acl = ntfs_get_acl, + .set_acl = ntfs_set_acl, + .fiemap = ntfs_fiemap, +}; + +const struct file_operations ntfs_file_operations = { + .llseek = generic_file_llseek, + .read_iter = ntfs_file_read_iter, + .write_iter = ntfs_file_write_iter, + .unlocked_ioctl = ntfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ntfs_compat_ioctl, +#endif + 
.splice_read = generic_file_splice_read, + .mmap = ntfs_file_mmap, + .open = ntfs_file_open, + .fsync = generic_file_fsync, + .splice_write = iter_file_splice_write, + .fallocate = ntfs_fallocate, + .release = ntfs_file_release, +}; +// clang-format on diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c new file mode 100644 index 000000000000..938b12d56ca6 --- /dev/null +++ b/fs/ntfs3/frecord.c @@ -0,0 +1,3257 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fiemap.h> +#include <linux/fs.h> +#include <linux/nls.h> +#include <linux/vmalloc.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" +#ifdef CONFIG_NTFS3_LZX_XPRESS +#include "lib/lib.h" +#endif + +static struct mft_inode *ni_ins_mi(struct ntfs_inode *ni, struct rb_root *tree, + CLST ino, struct rb_node *ins) +{ + struct rb_node **p = &tree->rb_node; + struct rb_node *pr = NULL; + + while (*p) { + struct mft_inode *mi; + + pr = *p; + mi = rb_entry(pr, struct mft_inode, node); + if (mi->rno > ino) + p = &pr->rb_left; + else if (mi->rno < ino) + p = &pr->rb_right; + else + return mi; + } + + if (!ins) + return NULL; + + rb_link_node(ins, pr, p); + rb_insert_color(ins, tree); + return rb_entry(ins, struct mft_inode, node); +} + +/* + * ni_find_mi - Find mft_inode by record number. + */ +static struct mft_inode *ni_find_mi(struct ntfs_inode *ni, CLST rno) +{ + return ni_ins_mi(ni, &ni->mi_tree, rno, NULL); +} + +/* + * ni_add_mi - Add new mft_inode into ntfs_inode. + */ +static void ni_add_mi(struct ntfs_inode *ni, struct mft_inode *mi) +{ + ni_ins_mi(ni, &ni->mi_tree, mi->rno, &mi->node); +} + +/* + * ni_remove_mi - Remove mft_inode from ntfs_inode. + */ +void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi) +{ + rb_erase(&mi->node, &ni->mi_tree); +} + +/* + * ni_std - Return: Pointer into std_info from primary record. 
+ */ +struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni) +{ + const struct ATTRIB *attr; + + attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO)) + : NULL; +} + +/* + * ni_std5 + * + * Return: Pointer into std_info from primary record. + */ +struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni) +{ + const struct ATTRIB *attr; + + attr = mi_find_attr(&ni->mi, NULL, ATTR_STD, NULL, 0, NULL); + + return attr ? resident_data_ex(attr, sizeof(struct ATTR_STD_INFO5)) + : NULL; +} + +/* + * ni_clear - Clear resources allocated by ntfs_inode. + */ +void ni_clear(struct ntfs_inode *ni) +{ + struct rb_node *node; + + if (!ni->vfs_inode.i_nlink && is_rec_inuse(ni->mi.mrec)) + ni_delete_all(ni); + + al_destroy(ni); + + for (node = rb_first(&ni->mi_tree); node;) { + struct rb_node *next = rb_next(node); + struct mft_inode *mi = rb_entry(node, struct mft_inode, node); + + rb_erase(node, &ni->mi_tree); + mi_put(mi); + node = next; + } + + /* Bad inode always has mode == S_IFREG. */ + if (ni->ni_flags & NI_FLAG_DIR) + indx_clear(&ni->dir); + else { + run_close(&ni->file.run); +#ifdef CONFIG_NTFS3_LZX_XPRESS + if (ni->file.offs_page) { + /* On-demand allocated page for offsets. */ + put_page(ni->file.offs_page); + ni->file.offs_page = NULL; + } +#endif + } + + mi_clear(&ni->mi); +} + +/* + * ni_load_mi_ex - Find mft_inode by record number. + */ +int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi) +{ + int err; + struct mft_inode *r; + + r = ni_find_mi(ni, rno); + if (r) + goto out; + + err = mi_get(ni->mi.sbi, rno, &r); + if (err) + return err; + + ni_add_mi(ni, r); + +out: + if (mi) + *mi = r; + return 0; +} + +/* + * ni_load_mi - Load mft_inode corresponded list_entry. 
+ */
+int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le,
+	       struct mft_inode **mi)
+{
+	CLST rno;
+
+	if (!le) {	/* no list entry: answer with the primary record */
+		*mi = &ni->mi;
+		return 0;
+	}
+
+	rno = ino_get(&le->ref);
+	if (rno == ni->mi.rno) {	/* entry refers to the primary record */
+		*mi = &ni->mi;
+		return 0;
+	}
+	return ni_load_mi_ex(ni, rno, mi);
+}
+
+/*
+ * ni_find_attr
+ *
+ * Without an attribute list, and always for unnamed ATTR_LIST/ATTR_STD,
+ * only the primary record is searched. Otherwise the list entry is found
+ * first and the attribute is looked up in the record that entry refers to.
+ * An attribute that is missing there, or whose VCN range does not fit the
+ * request, marks the volume with NTFS_DIRTY_ERROR.
+ *
+ * Return: Attribute and record this attribute belongs to.
+ */
+struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr,
+			    struct ATTR_LIST_ENTRY **le_o, enum ATTR_TYPE type,
+			    const __le16 *name, u8 name_len, const CLST *vcn,
+			    struct mft_inode **mi)
+{
+	struct ATTR_LIST_ENTRY *le;
+	struct mft_inode *m;
+
+	if (!ni->attr_list.size ||
+	    (!name_len && (type == ATTR_LIST || type == ATTR_STD))) {
+		if (le_o)
+			*le_o = NULL;
+		if (mi)
+			*mi = &ni->mi;
+
+		/* Look for required attribute in primary record. */
+		return mi_find_attr(&ni->mi, attr, type, name, name_len, NULL);
+	}
+
+	/* First look for list entry of required type. */
+	le = al_find_ex(ni, le_o ? *le_o : NULL, type, name, name_len, vcn);
+	if (!le)
+		return NULL;
+
+	if (le_o)
+		*le_o = le;
+
+	/* Load record that contains this attribute. */
+	if (ni_load_mi(ni, le, &m))
+		return NULL;
+
+	/* Look for required attribute. */
+	attr = mi_find_attr(m, NULL, type, name, name_len, &le->id);
+
+	if (!attr)
+		goto out;
+
+	if (!attr->non_res) {
+		if (vcn && *vcn)	/* resident attribute asked for a non zero VCN */
+			goto out;
+	} else if (!vcn) {
+		if (attr->nres.svcn)	/* no VCN given: segment must start at 0 */
+			goto out;
+	} else if (le64_to_cpu(attr->nres.svcn) > *vcn ||
+		   *vcn > le64_to_cpu(attr->nres.evcn)) {
+		goto out;
+	}
+
+	if (mi)
+		*mi = m;
+	return attr;
+
+out:
+	ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR);
+	return NULL;
+}
+
+/*
+ * ni_enum_attr_ex - Enumerates attributes in ntfs_inode.
+ *
+ * Pass @attr == NULL to start. Without an attribute list *@le is set to
+ * NULL and the primary record is walked; otherwise *@le is advanced to the
+ * next list entry and the matching attribute is taken from its record.
+ *
+ * Return: Next attribute, or NULL when there is none or a record could not
+ * be loaded.
+ */
+struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr,
+			       struct ATTR_LIST_ENTRY **le,
+			       struct mft_inode **mi)
+{
+	struct mft_inode *mi2;
+	struct ATTR_LIST_ENTRY *le2;
+
+	/* Do we have an attribute list? */
+	if (!ni->attr_list.size) {
+		*le = NULL;
+		if (mi)
+			*mi = &ni->mi;
+		/* Enum attributes in primary record. */
+		return mi_enum_attr(&ni->mi, attr);
+	}
+
+	/* Get next list entry. */
+	le2 = *le = al_enumerate(ni, attr ? *le : NULL);
+	if (!le2)
+		return NULL;
+
+	/* Load record that contains the required attribute. */
+	if (ni_load_mi(ni, le2, &mi2))
+		return NULL;
+
+	if (mi)
+		*mi = mi2;
+
+	/* Find attribute in loaded record. */
+	return rec_find_attr_le(mi2, le2);
+}
+
+/*
+ * ni_load_attr - Load attribute that contains given VCN.
+ *
+ * Return: The attribute, or NULL if it does not exist, its record could not
+ * be loaded, or the non resident segment found does not cover @vcn.
+ */
+struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
+			    const __le16 *name, u8 name_len, CLST vcn,
+			    struct mft_inode **pmi)
+{
+	struct ATTR_LIST_ENTRY *le;
+	struct ATTRIB *attr;
+	struct mft_inode *mi;
+	struct ATTR_LIST_ENTRY *next;
+
+	if (!ni->attr_list.size) {
+		if (pmi)
+			*pmi = &ni->mi;
+		return mi_find_attr(&ni->mi, NULL, type, name, name_len, NULL);
+	}
+
+	le = al_find_ex(ni, NULL, type, name, name_len, NULL);
+	if (!le)
+		return NULL;
+
+	/*
+	 * Unfortunately ATTR_LIST_ENTRY contains only start VCN.
+	 * So to find the ATTRIB segment that contains 'vcn' we should
+	 * enumerate some entries.
+	 * The loop stops on the last entry whose start VCN is <= 'vcn'.
+	 */
+	if (vcn) {
+		for (;; le = next) {
+			next = al_find_ex(ni, le, type, name, name_len, NULL);
+			if (!next || le64_to_cpu(next->vcn) > vcn)
+				break;
+		}
+	}
+
+	if (ni_load_mi(ni, le, &mi))
+		return NULL;
+
+	if (pmi)
+		*pmi = mi;
+
+	attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id);
+	if (!attr)
+		return NULL;
+
+	if (!attr->non_res)
+		return attr;
+
+	if (le64_to_cpu(attr->nres.svcn) <= vcn &&
+	    vcn <= le64_to_cpu(attr->nres.evcn))
+		return attr;
+
+	return NULL;
+}
+
+/* + * ni_load_all_mi - Load all subrecords. 
+ */
+int ni_load_all_mi(struct ntfs_inode *ni)
+{
+	int err;
+	struct ATTR_LIST_ENTRY *le;
+
+	if (!ni->attr_list.size)
+		return 0;
+
+	le = NULL;
+
+	while ((le = al_enumerate(ni, le))) {
+		CLST rno = ino_get(&le->ref);
+
+		if (rno == ni->mi.rno)	/* the primary record needs no loading */
+			continue;
+
+		err = ni_load_mi_ex(ni, rno, NULL);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * ni_add_subrecord - Allocate + format + attach a new subrecord.
+ *
+ * The new record gets a parent reference to the primary record and is
+ * added to the inode's record tree.
+ *
+ * Return: true and the record in *@mi on success, false if the allocation
+ * or mi_format_new() failed.
+ */
+bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi)
+{
+	struct mft_inode *m;
+
+	m = kzalloc(sizeof(struct mft_inode), GFP_NOFS);
+	if (!m)
+		return false;
+
+	if (mi_format_new(m, ni->mi.sbi, rno, 0, ni->mi.rno == MFT_REC_MFT)) {
+		mi_put(m);
+		return false;
+	}
+
+	mi_get_ref(&ni->mi, &m->mrec->parent_ref);
+
+	ni_add_mi(ni, m);
+	*mi = m;
+	return true;
+}
+
+/*
+ * ni_remove_attr - Remove all attributes for the given type/name/id.
+ *
+ * With @base_only, for ATTR_LIST, or when there is no attribute list, a
+ * single attribute is removed from the primary record. Otherwise every
+ * matching list entry is removed together with its attribute.
+ *
+ * Return: 0 on success, -ENOENT if an expected attribute is missing, or the
+ * error from ni_load_mi().
+ */
+int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type,
+		   const __le16 *name, size_t name_len, bool base_only,
+		   const __le16 *id)
+{
+	int err;
+	struct ATTRIB *attr;
+	struct ATTR_LIST_ENTRY *le;
+	struct mft_inode *mi;
+	u32 type_in;
+	int diff;
+
+	if (base_only || type == ATTR_LIST || !ni->attr_list.size) {
+		attr = mi_find_attr(&ni->mi, NULL, type, name, name_len, id);
+		if (!attr)
+			return -ENOENT;
+
+		mi_remove_attr(ni, &ni->mi, attr);
+		return 0;
+	}
+
+	type_in = le32_to_cpu(type);
+	le = NULL;
+
+	for (;;) {
+		le = al_enumerate(ni, le);
+		if (!le)
+			return 0;
+
+next_le2:
+		diff = le32_to_cpu(le->type) - type_in;
+		if (diff < 0)
+			continue;
+
+		if (diff > 0)	/* NOTE(review): relies on entries being ordered by type - confirm */
+			return 0;
+
+		if (le->name_len != name_len)
+			continue;
+
+		if (name_len &&
+		    memcmp(le_name(le), name, name_len * sizeof(short)))
+			continue;
+
+		if (id && le->id != *id)
+			continue;
+		err = ni_load_mi(ni, le, &mi);
+		if (err)
+			return err;
+
+		al_remove_le(ni, le);
+
+		attr = mi_find_attr(mi, NULL, type, name, name_len, id);
+		if (!attr)
+			return -ENOENT;
+
+		mi_remove_attr(ni, mi, attr);
+
+		if (PtrOffset(ni->attr_list.le, le) >= ni->attr_list.size)
+			return 0;
+		goto next_le2;	/* presumably 'le' now holds the entry after the removed one - re-check it */
+	}
+}
+
+/*
+ * ni_ins_new_attr - Insert the attribute into record.
+ *
+ * When the inode has an attribute list and no entry was passed in @le, a
+ * list entry is added first; it is removed again if the attribute cannot
+ * be inserted into @mi.
+ *
+ * Return: Not full constructed attribute or NULL if not possible to create.
+ */
+static struct ATTRIB *
+ni_ins_new_attr(struct ntfs_inode *ni, struct mft_inode *mi,
+		struct ATTR_LIST_ENTRY *le, enum ATTR_TYPE type,
+		const __le16 *name, u8 name_len, u32 asize, u16 name_off,
+		CLST svcn, struct ATTR_LIST_ENTRY **ins_le)
+{
+	int err;
+	struct ATTRIB *attr;
+	bool le_added = false;
+	struct MFT_REF ref;
+
+	mi_get_ref(mi, &ref);
+
+	if (type != ATTR_LIST && !le && ni->attr_list.size) {
+		err = al_add_le(ni, type, name, name_len, svcn, cpu_to_le16(-1),
+				&ref, &le);
+		if (err) {
+			/* No memory or no space. */
+			return NULL;
+		}
+		le_added = true;
+
+		/*
+		 * al_add_le -> attr_set_size (list) -> ni_expand_list
+		 * which moves some attributes out of primary record
+		 * this means that name may point into moved memory
+		 * reinit 'name' from le.
+		 */
+		name = le->name;
+	}
+
+	attr = mi_insert_attr(mi, type, name, name_len, asize, name_off);
+	if (!attr) {
+		if (le_added)
+			al_remove_le(ni, le);
+		return NULL;
+	}
+
+	if (type == ATTR_LIST) {
+		/* Attr list is not in list entry array. */
+		goto out;
+	}
+
+	if (!le)
+		goto out;
+
+	/* Update ATTRIB Id and record reference. */
+	le->id = attr->id;
+	ni->attr_list.dirty = true;
+	le->ref = ref;
+
+out:
+	if (ins_le)
+		*ins_le = le;
+	return attr;
+}
+
+/* + * ni_repack + * + * Random write access to sparsed or compressed file may result to + * not optimized packed runs. + * Here is the place to optimize it. 
+ */
+static int ni_repack(struct ntfs_inode *ni)
+{
+	int err = 0;
+	struct ntfs_sb_info *sbi = ni->mi.sbi;
+	struct mft_inode *mi, *mi_p = NULL;
+	struct ATTRIB *attr = NULL, *attr_p;
+	struct ATTR_LIST_ENTRY *le = NULL, *le_p;
+	CLST alloc = 0;
+	u8 cluster_bits = sbi->cluster_bits;
+	CLST svcn, evcn = 0, svcn_p, evcn_p, next_svcn;
+	u32 roff, rs = sbi->record_size;
+	struct runs_tree run;
+
+	run_init(&run);
+
+	while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi))) {
+		if (!attr->non_res)
+			continue;
+
+		svcn = le64_to_cpu(attr->nres.svcn);
+		if (svcn != le64_to_cpu(le->vcn)) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (!svcn) {	/* first segment of an attribute: restart tracking */
+			alloc = le64_to_cpu(attr->nres.alloc_size) >>
+				cluster_bits;
+			mi_p = NULL;
+		} else if (svcn != evcn + 1) {	/* must continue the previous segment */
+			err = -EINVAL;
+			break;
+		}
+
+		evcn = le64_to_cpu(attr->nres.evcn);
+
+		if (svcn > evcn + 1) {
+			err = -EINVAL;
+			break;
+		}
+
+		if (!mi_p) {
+			/* Do not try if not enough free space. */
+			if (le32_to_cpu(mi->mrec->used) + 8 >= rs)
+				continue;
+
+			/* Do not try if last attribute segment. */
+			if (evcn + 1 == alloc)
+				continue;
+			run_close(&run);
+		}
+
+		roff = le16_to_cpu(attr->nres.run_off);
+		err = run_unpack(&run, sbi, ni->mi.rno, svcn, evcn, svcn,
+				 Add2Ptr(attr, roff),
+				 le32_to_cpu(attr->size) - roff);
+		if (err < 0)
+			break;
+
+		if (!mi_p) {
+			mi_p = mi;
+			attr_p = attr;
+			svcn_p = svcn;
+			evcn_p = evcn;
+			le_p = le;
+			err = 0;
+			continue;
+		}
+
+		/*
+		 * Run contains data from two records: mi_p and mi
+		 * Try to pack in one.
+		 */
+		err = mi_pack_runs(mi_p, attr_p, &run, evcn + 1 - svcn_p);
+		if (err)
+			break;
+
+		next_svcn = le64_to_cpu(attr_p->nres.evcn) + 1;
+
+		if (next_svcn >= evcn + 1) {
+			/* We can remove this attribute segment. */
+			al_remove_le(ni, le);
+			mi_remove_attr(NULL, mi, attr);
+			le = le_p;
+			continue;
+		}
+
+		/* Only part was absorbed: the current segment now starts at next_svcn. */
+		attr->nres.svcn = le->vcn = cpu_to_le64(next_svcn);
+		mi->dirty = true;
+		ni->attr_list.dirty = true;
+
+		if (evcn + 1 == alloc) {
+			err = mi_pack_runs(mi, attr, &run,
+					   evcn + 1 - next_svcn);
+			if (err)
+				break;
+			mi_p = NULL;
+		} else {
+			mi_p = mi;
+			attr_p = attr;
+			svcn_p = next_svcn;
+			evcn_p = evcn;
+			le_p = le;
+			run_truncate_head(&run, next_svcn);
+		}
+	}
+
+	if (err) {
+		ntfs_inode_warn(&ni->vfs_inode, "repack problem");
+		ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+
+		/* Pack loaded but not packed runs. */
+		if (mi_p)
+			mi_pack_runs(mi_p, attr_p, &run, evcn_p + 1 - svcn_p);
+	}
+
+	run_close(&run);
+	return err;
+}
+
+/*
+ * ni_try_remove_attr_list
+ *
+ * Can we remove attribute list?
+ * Check the case when primary record contains enough space for all attributes.
+ *
+ * Does nothing unless the attribute list is marked dirty. After ni_repack()
+ * a first pass checks that every attribute kept in another record starts
+ * at VCN 0, lives in an already loaded record and fits into the primary
+ * record; a second pass moves them and the list is released.
+ *
+ * Return: 0 (also when the list has to stay), or the error from ni_repack().
+ */
+static int ni_try_remove_attr_list(struct ntfs_inode *ni)
+{
+	int err = 0;
+	struct ntfs_sb_info *sbi = ni->mi.sbi;
+	struct ATTRIB *attr, *attr_list, *attr_ins;
+	struct ATTR_LIST_ENTRY *le;
+	struct mft_inode *mi;
+	u32 asize, free;
+	struct MFT_REF ref;
+	__le16 id;
+
+	if (!ni->attr_list.dirty)
+		return 0;
+
+	err = ni_repack(ni);
+	if (err)
+		return err;
+
+	attr_list = mi_find_attr(&ni->mi, NULL, ATTR_LIST, NULL, 0, NULL);
+	if (!attr_list)
+		return 0;
+
+	asize = le32_to_cpu(attr_list->size);
+
+	/* Free space in primary record without attribute list. */
+	free = sbi->record_size - le32_to_cpu(ni->mi.mrec->used) + asize;
+	mi_get_ref(&ni->mi, &ref);
+
+	le = NULL;
+	while ((le = al_enumerate(ni, le))) {
+		if (!memcmp(&le->ref, &ref, sizeof(ref)))	/* already in the primary record */
+			continue;
+
+		if (le->vcn)
+			return 0;
+
+		mi = ni_find_mi(ni, ino_get(&le->ref));
+		if (!mi)
+			return 0;
+
+		attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+				    le->name_len, &le->id);
+		if (!attr)
+			return 0;
+
+		asize = le32_to_cpu(attr->size);
+		if (asize > free)
+			return 0;
+
+		free -= asize;
+	}
+
+	/* It seems that attribute list can be removed from primary record. */
+	mi_remove_attr(NULL, &ni->mi, attr_list);
+
+	/*
+	 * Repeat the cycle above and move all attributes to primary record.
+	 * It should be success!
+	 *
+	 * NOTE(review): the results of ni_find_mi(), mi_find_attr() and
+	 * mi_insert_attr() are dereferenced unchecked below; this relies
+	 * entirely on the first pass having validated the same entries.
+	 */
+	le = NULL;
+	while ((le = al_enumerate(ni, le))) {
+		if (!memcmp(&le->ref, &ref, sizeof(ref)))
+			continue;
+
+		mi = ni_find_mi(ni, ino_get(&le->ref));
+
+		attr = mi_find_attr(mi, NULL, le->type, le_name(le),
+				    le->name_len, &le->id);
+		asize = le32_to_cpu(attr->size);
+
+		/* Insert into primary record. */
+		attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le),
+					  le->name_len, asize,
+					  le16_to_cpu(attr->name_off));
+		id = attr_ins->id;
+
+		/* Copy all except id. */
+		memcpy(attr_ins, attr, asize);
+		attr_ins->id = id;
+
+		/* Remove from original record. */
+		mi_remove_attr(NULL, mi, attr);
+	}
+
+	run_deallocate(sbi, &ni->attr_list.run, true);
+	run_close(&ni->attr_list.run);
+	ni->attr_list.size = 0;
+	kfree(ni->attr_list.le);
+	ni->attr_list.le = NULL;
+	ni->attr_list.dirty = false;
+
+	return 0;
+}
+
+/* + * ni_create_attr_list - Generates an attribute list for this primary record. 
+ */ +int ni_create_attr_list(struct ntfs_inode *ni) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + int err; + u32 lsize; + struct ATTRIB *attr; + struct ATTRIB *arr_move[7]; + struct ATTR_LIST_ENTRY *le, *le_b[7]; + struct MFT_REC *rec; + bool is_mft; + CLST rno = 0; + struct mft_inode *mi; + u32 free_b, nb, to_free, rs; + u16 sz; + + is_mft = ni->mi.rno == MFT_REC_MFT; + rec = ni->mi.mrec; + rs = sbi->record_size; + + /* + * Skip estimating exact memory requirement. + * Looks like one record_size is always enough. + */ + le = kmalloc(al_aligned(rs), GFP_NOFS); + if (!le) { + err = -ENOMEM; + goto out; + } + + mi_get_ref(&ni->mi, &le->ref); + ni->attr_list.le = le; + + attr = NULL; + nb = 0; + free_b = 0; + attr = NULL; + + for (; (attr = mi_enum_attr(&ni->mi, attr)); le = Add2Ptr(le, sz)) { + sz = le_size(attr->name_len); + le->type = attr->type; + le->size = cpu_to_le16(sz); + le->name_len = attr->name_len; + le->name_off = offsetof(struct ATTR_LIST_ENTRY, name); + le->vcn = 0; + if (le != ni->attr_list.le) + le->ref = ni->attr_list.le->ref; + le->id = attr->id; + + if (attr->name_len) + memcpy(le->name, attr_name(attr), + sizeof(short) * attr->name_len); + else if (attr->type == ATTR_STD) + continue; + else if (attr->type == ATTR_LIST) + continue; + else if (is_mft && attr->type == ATTR_DATA) + continue; + + if (!nb || nb < ARRAY_SIZE(arr_move)) { + le_b[nb] = le; + arr_move[nb++] = attr; + free_b += le32_to_cpu(attr->size); + } + } + + lsize = PtrOffset(ni->attr_list.le, le); + ni->attr_list.size = lsize; + + to_free = le32_to_cpu(rec->used) + lsize + SIZEOF_RESIDENT; + if (to_free <= rs) { + to_free = 0; + } else { + to_free -= rs; + + if (to_free > free_b) { + err = -EINVAL; + goto out1; + } + } + + /* Allocate child MFT. */ + err = ntfs_look_free_mft(sbi, &rno, is_mft, ni, &mi); + if (err) + goto out1; + + /* Call mi_remove_attr() in reverse order to keep pointers 'arr_move' valid. 
*/ + while (to_free > 0) { + struct ATTRIB *b = arr_move[--nb]; + u32 asize = le32_to_cpu(b->size); + u16 name_off = le16_to_cpu(b->name_off); + + attr = mi_insert_attr(mi, b->type, Add2Ptr(b, name_off), + b->name_len, asize, name_off); + WARN_ON(!attr); + + mi_get_ref(mi, &le_b[nb]->ref); + le_b[nb]->id = attr->id; + + /* Copy all except id. */ + memcpy(attr, b, asize); + attr->id = le_b[nb]->id; + + /* Remove from primary record. */ + WARN_ON(!mi_remove_attr(NULL, &ni->mi, b)); + + if (to_free <= asize) + break; + to_free -= asize; + WARN_ON(!nb); + } + + attr = mi_insert_attr(&ni->mi, ATTR_LIST, NULL, 0, + lsize + SIZEOF_RESIDENT, SIZEOF_RESIDENT); + WARN_ON(!attr); + + attr->non_res = 0; + attr->flags = 0; + attr->res.data_size = cpu_to_le32(lsize); + attr->res.data_off = SIZEOF_RESIDENT_LE; + attr->res.flags = 0; + attr->res.res = 0; + + memcpy(resident_data_ex(attr, lsize), ni->attr_list.le, lsize); + + ni->attr_list.dirty = false; + + mark_inode_dirty(&ni->vfs_inode); + goto out; + +out1: + kfree(ni->attr_list.le); + ni->attr_list.le = NULL; + ni->attr_list.size = 0; + +out: + return err; +} + +/* + * ni_ins_attr_ext - Add an external attribute to the ntfs_inode. + */ +static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, + enum ATTR_TYPE type, const __le16 *name, u8 name_len, + u32 asize, CLST svcn, u16 name_off, bool force_ext, + struct ATTRIB **ins_attr, struct mft_inode **ins_mi, + struct ATTR_LIST_ENTRY **ins_le) +{ + struct ATTRIB *attr; + struct mft_inode *mi; + CLST rno; + u64 vbo; + struct rb_node *node; + int err; + bool is_mft, is_mft_data; + struct ntfs_sb_info *sbi = ni->mi.sbi; + + is_mft = ni->mi.rno == MFT_REC_MFT; + is_mft_data = is_mft && type == ATTR_DATA && !name_len; + + if (asize > sbi->max_bytes_per_attr) { + err = -EINVAL; + goto out; + } + + /* + * Standard information and attr_list cannot be made external. + * The Log File cannot have any external attributes. 
+ */ + if (type == ATTR_STD || type == ATTR_LIST || + ni->mi.rno == MFT_REC_LOG) { + err = -EINVAL; + goto out; + } + + /* Create attribute list if it is not already existed. */ + if (!ni->attr_list.size) { + err = ni_create_attr_list(ni); + if (err) + goto out; + } + + vbo = is_mft_data ? ((u64)svcn << sbi->cluster_bits) : 0; + + if (force_ext) + goto insert_ext; + + /* Load all subrecords into memory. */ + err = ni_load_all_mi(ni); + if (err) + goto out; + + /* Check each of loaded subrecord. */ + for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { + mi = rb_entry(node, struct mft_inode, node); + + if (is_mft_data && + (mi_enum_attr(mi, NULL) || + vbo <= ((u64)mi->rno << sbi->record_bits))) { + /* We can't accept this record 'cause MFT's bootstrapping. */ + continue; + } + if (is_mft && + mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, NULL)) { + /* + * This child record already has a ATTR_DATA. + * So it can't accept any other records. + */ + continue; + } + + if ((type != ATTR_NAME || name_len) && + mi_find_attr(mi, NULL, type, name, name_len, NULL)) { + /* Only indexed attributes can share same record. */ + continue; + } + + /* Try to insert attribute into this subrecord. */ + attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, + name_off, svcn, ins_le); + if (!attr) + continue; + + if (ins_attr) + *ins_attr = attr; + if (ins_mi) + *ins_mi = mi; + return 0; + } + +insert_ext: + /* We have to allocate a new child subrecord. 
*/ + err = ntfs_look_free_mft(sbi, &rno, is_mft_data, ni, &mi); + if (err) + goto out; + + if (is_mft_data && vbo <= ((u64)rno << sbi->record_bits)) { + err = -EINVAL; + goto out1; + } + + attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, + name_off, svcn, ins_le); + if (!attr) + goto out2; + + if (ins_attr) + *ins_attr = attr; + if (ins_mi) + *ins_mi = mi; + + return 0; + +out2: + ni_remove_mi(ni, mi); + mi_put(mi); + err = -EINVAL; + +out1: + ntfs_mark_rec_free(sbi, rno); + +out: + return err; +} + +/* + * ni_insert_attr - Insert an attribute into the file. + * + * If the primary record has room, it will just insert the attribute. + * If not, it may make the attribute external. + * For $MFT::Data it may make room for the attribute by + * making other attributes external. + * + * NOTE: + * The ATTR_LIST and ATTR_STD cannot be made external. + * This function does not fill new attribute full. + * It only fills 'size'/'type'/'id'/'name_len' fields. + */ +static int ni_insert_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, u32 asize, + u16 name_off, CLST svcn, struct ATTRIB **ins_attr, + struct mft_inode **ins_mi, + struct ATTR_LIST_ENTRY **ins_le) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + int err; + struct ATTRIB *attr, *eattr; + struct MFT_REC *rec; + bool is_mft; + struct ATTR_LIST_ENTRY *le; + u32 list_reserve, max_free, free, used, t32; + __le16 id; + u16 t16; + + is_mft = ni->mi.rno == MFT_REC_MFT; + rec = ni->mi.mrec; + + list_reserve = SIZEOF_NONRESIDENT + 3 * (1 + 2 * sizeof(u32)); + used = le32_to_cpu(rec->used); + free = sbi->record_size - used; + + if (is_mft && type != ATTR_LIST) { + /* Reserve space for the ATTRIB list. 
*/ + if (free < list_reserve) + free = 0; + else + free -= list_reserve; + } + + if (asize <= free) { + attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len, + asize, name_off, svcn, ins_le); + if (attr) { + if (ins_attr) + *ins_attr = attr; + if (ins_mi) + *ins_mi = &ni->mi; + err = 0; + goto out; + } + } + + if (!is_mft || type != ATTR_DATA || svcn) { + /* This ATTRIB will be external. */ + err = ni_ins_attr_ext(ni, NULL, type, name, name_len, asize, + svcn, name_off, false, ins_attr, ins_mi, + ins_le); + goto out; + } + + /* + * Here we have: "is_mft && type == ATTR_DATA && !svcn" + * + * The first chunk of the $MFT::Data ATTRIB must be the base record. + * Evict as many other attributes as possible. + */ + max_free = free; + + /* Estimate the result of moving all possible attributes away. */ + attr = NULL; + + while ((attr = mi_enum_attr(&ni->mi, attr))) { + if (attr->type == ATTR_STD) + continue; + if (attr->type == ATTR_LIST) + continue; + max_free += le32_to_cpu(attr->size); + } + + if (max_free < asize + list_reserve) { + /* Impossible to insert this attribute into primary record. */ + err = -EINVAL; + goto out; + } + + /* Start real attribute moving. */ + attr = NULL; + + for (;;) { + attr = mi_enum_attr(&ni->mi, attr); + if (!attr) { + /* We should never be here 'cause we have already check this case. */ + err = -EINVAL; + goto out; + } + + /* Skip attributes that MUST be primary record. */ + if (attr->type == ATTR_STD || attr->type == ATTR_LIST) + continue; + + le = NULL; + if (ni->attr_list.size) { + le = al_find_le(ni, NULL, attr); + if (!le) { + /* Really this is a serious bug. 
*/ + err = -EINVAL; + goto out; + } + } + + t32 = le32_to_cpu(attr->size); + t16 = le16_to_cpu(attr->name_off); + err = ni_ins_attr_ext(ni, le, attr->type, Add2Ptr(attr, t16), + attr->name_len, t32, attr_svcn(attr), t16, + false, &eattr, NULL, NULL); + if (err) + return err; + + id = eattr->id; + memcpy(eattr, attr, t32); + eattr->id = id; + + /* Remove from primary record. */ + mi_remove_attr(NULL, &ni->mi, attr); + + /* attr now points to next attribute. */ + if (attr->type == ATTR_END) + goto out; + } + while (asize + list_reserve > sbi->record_size - le32_to_cpu(rec->used)) + ; + + attr = ni_ins_new_attr(ni, &ni->mi, NULL, type, name, name_len, asize, + name_off, svcn, ins_le); + if (!attr) { + err = -EINVAL; + goto out; + } + + if (ins_attr) + *ins_attr = attr; + if (ins_mi) + *ins_mi = &ni->mi; + +out: + return err; +} + +/* ni_expand_mft_list - Split ATTR_DATA of $MFT. */ +static int ni_expand_mft_list(struct ntfs_inode *ni) +{ + int err = 0; + struct runs_tree *run = &ni->file.run; + u32 asize, run_size, done = 0; + struct ATTRIB *attr; + struct rb_node *node; + CLST mft_min, mft_new, svcn, evcn, plen; + struct mft_inode *mi, *mi_min, *mi_new; + struct ntfs_sb_info *sbi = ni->mi.sbi; + + /* Find the nearest MFT. */ + mft_min = 0; + mft_new = 0; + mi_min = NULL; + + for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { + mi = rb_entry(node, struct mft_inode, node); + + attr = mi_enum_attr(mi, NULL); + + if (!attr) { + mft_min = mi->rno; + mi_min = mi; + break; + } + } + + if (ntfs_look_free_mft(sbi, &mft_new, true, ni, &mi_new)) { + mft_new = 0; + /* Really this is not critical. 
*/ + } else if (mft_min > mft_new) { + mft_min = mft_new; + mi_min = mi_new; + } else { + ntfs_mark_rec_free(sbi, mft_new); + mft_new = 0; + ni_remove_mi(ni, mi_new); + } + + attr = mi_find_attr(&ni->mi, NULL, ATTR_DATA, NULL, 0, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + asize = le32_to_cpu(attr->size); + + evcn = le64_to_cpu(attr->nres.evcn); + svcn = bytes_to_cluster(sbi, (u64)(mft_min + 1) << sbi->record_bits); + if (evcn + 1 >= svcn) { + err = -EINVAL; + goto out; + } + + /* + * Split primary attribute [0 evcn] in two parts [0 svcn) + [svcn evcn]. + * + * Update first part of ATTR_DATA in 'primary MFT. + */ + err = run_pack(run, 0, svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT), + asize - SIZEOF_NONRESIDENT, &plen); + if (err < 0) + goto out; + + run_size = ALIGN(err, 8); + err = 0; + + if (plen < svcn) { + err = -EINVAL; + goto out; + } + + attr->nres.evcn = cpu_to_le64(svcn - 1); + attr->size = cpu_to_le32(run_size + SIZEOF_NONRESIDENT); + /* 'done' - How many bytes of primary MFT becomes free. */ + done = asize - run_size - SIZEOF_NONRESIDENT; + le32_sub_cpu(&ni->mi.mrec->used, done); + + /* Estimate the size of second part: run_buf=NULL. */ + err = run_pack(run, svcn, evcn + 1 - svcn, NULL, sbi->record_size, + &plen); + if (err < 0) + goto out; + + run_size = ALIGN(err, 8); + err = 0; + + if (plen < evcn + 1 - svcn) { + err = -EINVAL; + goto out; + } + + /* + * This function may implicitly call expand attr_list. + * Insert second part of ATTR_DATA in 'mi_min'. 
+ */ + attr = ni_ins_new_attr(ni, mi_min, NULL, ATTR_DATA, NULL, 0, + SIZEOF_NONRESIDENT + run_size, + SIZEOF_NONRESIDENT, svcn, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + attr->non_res = 1; + attr->name_off = SIZEOF_NONRESIDENT_LE; + attr->flags = 0; + + run_pack(run, svcn, evcn + 1 - svcn, Add2Ptr(attr, SIZEOF_NONRESIDENT), + run_size, &plen); + + attr->nres.svcn = cpu_to_le64(svcn); + attr->nres.evcn = cpu_to_le64(evcn); + attr->nres.run_off = cpu_to_le16(SIZEOF_NONRESIDENT); + +out: + if (mft_new) { + ntfs_mark_rec_free(sbi, mft_new); + ni_remove_mi(ni, mi_new); + } + + return !err && !done ? -EOPNOTSUPP : err; +} + +/* + * ni_expand_list - Move all possible attributes out of primary record. + */ +int ni_expand_list(struct ntfs_inode *ni) +{ + int err = 0; + u32 asize, done = 0; + struct ATTRIB *attr, *ins_attr; + struct ATTR_LIST_ENTRY *le; + bool is_mft = ni->mi.rno == MFT_REC_MFT; + struct MFT_REF ref; + + mi_get_ref(&ni->mi, &ref); + le = NULL; + + while ((le = al_enumerate(ni, le))) { + if (le->type == ATTR_STD) + continue; + + if (memcmp(&ref, &le->ref, sizeof(struct MFT_REF))) + continue; + + if (is_mft && le->type == ATTR_DATA) + continue; + + /* Find attribute in primary record. */ + attr = rec_find_attr_le(&ni->mi, le); + if (!attr) { + err = -EINVAL; + goto out; + } + + asize = le32_to_cpu(attr->size); + + /* Always insert into new record to avoid collisions (deep recursive). */ + err = ni_ins_attr_ext(ni, le, attr->type, attr_name(attr), + attr->name_len, asize, attr_svcn(attr), + le16_to_cpu(attr->name_off), true, + &ins_attr, NULL, NULL); + + if (err) + goto out; + + memcpy(ins_attr, attr, asize); + ins_attr->id = le->id; + /* Remove from primary record. */ + mi_remove_attr(NULL, &ni->mi, attr); + + done += asize; + goto out; + } + + if (!is_mft) { + err = -EFBIG; /* Attr list is too big(?) */ + goto out; + } + + /* Split MFT data as much as possible. 
*/ + err = ni_expand_mft_list(ni); + if (err) + goto out; + +out: + return !err && !done ? -EOPNOTSUPP : err; +} + +/* + * ni_insert_nonresident - Insert new nonresident attribute. + */ +int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, + const struct runs_tree *run, CLST svcn, CLST len, + __le16 flags, struct ATTRIB **new_attr, + struct mft_inode **mi) +{ + int err; + CLST plen; + struct ATTRIB *attr; + bool is_ext = + (flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED)) && !svcn; + u32 name_size = ALIGN(name_len * sizeof(short), 8); + u32 name_off = is_ext ? SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT; + u32 run_off = name_off + name_size; + u32 run_size, asize; + struct ntfs_sb_info *sbi = ni->mi.sbi; + + err = run_pack(run, svcn, len, NULL, sbi->max_bytes_per_attr - run_off, + &plen); + if (err < 0) + goto out; + + run_size = ALIGN(err, 8); + + if (plen < len) { + err = -EINVAL; + goto out; + } + + asize = run_off + run_size; + + if (asize > sbi->max_bytes_per_attr) { + err = -EINVAL; + goto out; + } + + err = ni_insert_attr(ni, type, name, name_len, asize, name_off, svcn, + &attr, mi, NULL); + + if (err) + goto out; + + attr->non_res = 1; + attr->name_off = cpu_to_le16(name_off); + attr->flags = flags; + + run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size, &plen); + + attr->nres.svcn = cpu_to_le64(svcn); + attr->nres.evcn = cpu_to_le64((u64)svcn + len - 1); + + err = 0; + if (new_attr) + *new_attr = attr; + + *(__le64 *)&attr->nres.run_off = cpu_to_le64(run_off); + + attr->nres.alloc_size = + svcn ? 0 : cpu_to_le64((u64)len << ni->mi.sbi->cluster_bits); + attr->nres.data_size = attr->nres.alloc_size; + attr->nres.valid_size = attr->nres.alloc_size; + + if (is_ext) { + if (flags & ATTR_FLAG_COMPRESSED) + attr->nres.c_unit = COMPRESSION_UNIT; + attr->nres.total_size = attr->nres.alloc_size; + } + +out: + return err; +} + +/* + * ni_insert_resident - Inserts new resident attribute. 
+ */ +int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, + enum ATTR_TYPE type, const __le16 *name, u8 name_len, + struct ATTRIB **new_attr, struct mft_inode **mi, + struct ATTR_LIST_ENTRY **le) +{ + int err; + u32 name_size = ALIGN(name_len * sizeof(short), 8); + u32 asize = SIZEOF_RESIDENT + name_size + ALIGN(data_size, 8); + struct ATTRIB *attr; + + err = ni_insert_attr(ni, type, name, name_len, asize, SIZEOF_RESIDENT, + 0, &attr, mi, le); + if (err) + return err; + + attr->non_res = 0; + attr->flags = 0; + + attr->res.data_size = cpu_to_le32(data_size); + attr->res.data_off = cpu_to_le16(SIZEOF_RESIDENT + name_size); + if (type == ATTR_NAME) { + attr->res.flags = RESIDENT_FLAG_INDEXED; + + /* is_attr_indexed(attr)) == true */ + le16_add_cpu(&ni->mi.mrec->hard_links, +1); + ni->mi.dirty = true; + } + attr->res.res = 0; + + if (new_attr) + *new_attr = attr; + + return 0; +} + +/* + * ni_remove_attr_le - Remove attribute from record. + */ +void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr, + struct mft_inode *mi, struct ATTR_LIST_ENTRY *le) +{ + mi_remove_attr(ni, mi, attr); + + if (le) + al_remove_le(ni, le); +} + +/* + * ni_delete_all - Remove all attributes and frees allocates space. + * + * ntfs_evict_inode->ntfs_clear_inode->ni_delete_all (if no links). 
+ */ +int ni_delete_all(struct ntfs_inode *ni) +{ + int err; + struct ATTR_LIST_ENTRY *le = NULL; + struct ATTRIB *attr = NULL; + struct rb_node *node; + u16 roff; + u32 asize; + CLST svcn, evcn; + struct ntfs_sb_info *sbi = ni->mi.sbi; + bool nt3 = is_ntfs3(sbi); + struct MFT_REF ref; + + while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) { + if (!nt3 || attr->name_len) { + ; + } else if (attr->type == ATTR_REPARSE) { + mi_get_ref(&ni->mi, &ref); + ntfs_remove_reparse(sbi, 0, &ref); + } else if (attr->type == ATTR_ID && !attr->non_res && + le32_to_cpu(attr->res.data_size) >= + sizeof(struct GUID)) { + ntfs_objid_remove(sbi, resident_data(attr)); + } + + if (!attr->non_res) + continue; + + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + + if (evcn + 1 <= svcn) + continue; + + asize = le32_to_cpu(attr->size); + roff = le16_to_cpu(attr->nres.run_off); + + /* run==1 means unpack and deallocate. */ + run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn, + Add2Ptr(attr, roff), asize - roff); + } + + if (ni->attr_list.size) { + run_deallocate(ni->mi.sbi, &ni->attr_list.run, true); + al_destroy(ni); + } + + /* Free all subrecords. */ + for (node = rb_first(&ni->mi_tree); node;) { + struct rb_node *next = rb_next(node); + struct mft_inode *mi = rb_entry(node, struct mft_inode, node); + + clear_rec_inuse(mi->mrec); + mi->dirty = true; + mi_write(mi, 0); + + ntfs_mark_rec_free(sbi, mi->rno); + ni_remove_mi(ni, mi); + mi_put(mi); + node = next; + } + + /* Free base record. */ + clear_rec_inuse(ni->mi.mrec); + ni->mi.dirty = true; + err = mi_write(&ni->mi, 0); + + ntfs_mark_rec_free(sbi, ni->mi.rno); + + return err; +} + +/* ni_fname_name + * + * Return: File name attribute by its value. 
+ */ +struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, + const struct cpu_str *uni, + const struct MFT_REF *home_dir, + struct mft_inode **mi, + struct ATTR_LIST_ENTRY **le) +{ + struct ATTRIB *attr = NULL; + struct ATTR_FILE_NAME *fname; + + *le = NULL; + + /* Enumerate all names. */ +next: + attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL, mi); + if (!attr) + return NULL; + + fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (!fname) + goto next; + + if (home_dir && memcmp(home_dir, &fname->home, sizeof(*home_dir))) + goto next; + + if (!uni) + goto next; + + if (uni->len != fname->name_len) + goto next; + + if (ntfs_cmp_names_cpu(uni, (struct le_str *)&fname->name_len, NULL, + false)) + goto next; + + return fname; +} + +/* + * ni_fname_type + * + * Return: File name attribute with given type. + */ +struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, + struct mft_inode **mi, + struct ATTR_LIST_ENTRY **le) +{ + struct ATTRIB *attr = NULL; + struct ATTR_FILE_NAME *fname; + + *le = NULL; + + if (FILE_NAME_POSIX == name_type) + return NULL; + + /* Enumerate all names. */ + for (;;) { + attr = ni_find_attr(ni, attr, le, ATTR_NAME, NULL, 0, NULL, mi); + if (!attr) + return NULL; + + fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (fname && name_type == fname->type) + return fname; + } +} + +/* + * ni_new_attr_flags + * + * Process compressed/sparsed in special way. + * NOTE: You need to set ni->std_fa = new_fa + * after this function to keep internal structures in consistency. 
+ */ +int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa) +{ + struct ATTRIB *attr; + struct mft_inode *mi; + __le16 new_aflags; + u32 new_asize; + + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); + if (!attr) + return -EINVAL; + + new_aflags = attr->flags; + + if (new_fa & FILE_ATTRIBUTE_SPARSE_FILE) + new_aflags |= ATTR_FLAG_SPARSED; + else + new_aflags &= ~ATTR_FLAG_SPARSED; + + if (new_fa & FILE_ATTRIBUTE_COMPRESSED) + new_aflags |= ATTR_FLAG_COMPRESSED; + else + new_aflags &= ~ATTR_FLAG_COMPRESSED; + + if (new_aflags == attr->flags) + return 0; + + if ((new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) == + (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) { + ntfs_inode_warn(&ni->vfs_inode, + "file can't be sparsed and compressed"); + return -EOPNOTSUPP; + } + + if (!attr->non_res) + goto out; + + if (attr->nres.data_size) { + ntfs_inode_warn( + &ni->vfs_inode, + "one can change sparsed/compressed only for empty files"); + return -EOPNOTSUPP; + } + + /* Resize nonresident empty attribute in-place only. */ + new_asize = (new_aflags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) + ? (SIZEOF_NONRESIDENT_EX + 8) + : (SIZEOF_NONRESIDENT + 8); + + if (!mi_resize_attr(mi, attr, new_asize - le32_to_cpu(attr->size))) + return -EOPNOTSUPP; + + if (new_aflags & ATTR_FLAG_SPARSED) { + attr->name_off = SIZEOF_NONRESIDENT_EX_LE; + /* Windows uses 16 clusters per frame but supports one cluster per frame too. */ + attr->nres.c_unit = 0; + ni->vfs_inode.i_mapping->a_ops = &ntfs_aops; + } else if (new_aflags & ATTR_FLAG_COMPRESSED) { + attr->name_off = SIZEOF_NONRESIDENT_EX_LE; + /* The only allowed: 16 clusters per frame. */ + attr->nres.c_unit = NTFS_LZNT_CUNIT; + ni->vfs_inode.i_mapping->a_ops = &ntfs_aops_cmpr; + } else { + attr->name_off = SIZEOF_NONRESIDENT_LE; + /* Normal files. 
*/ + attr->nres.c_unit = 0; + ni->vfs_inode.i_mapping->a_ops = &ntfs_aops; + } + attr->nres.run_off = attr->name_off; +out: + attr->flags = new_aflags; + mi->dirty = true; + + return 0; +} + +/* + * ni_parse_reparse + * + * Buffer is at least 24 bytes. + */ +enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, + void *buffer) +{ + const struct REPARSE_DATA_BUFFER *rp = NULL; + u8 bits; + u16 len; + typeof(rp->CompressReparseBuffer) *cmpr; + + static_assert(sizeof(struct REPARSE_DATA_BUFFER) <= 24); + + /* Try to estimate reparse point. */ + if (!attr->non_res) { + rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); + } else if (le64_to_cpu(attr->nres.data_size) >= + sizeof(struct REPARSE_DATA_BUFFER)) { + struct runs_tree run; + + run_init(&run); + + if (!attr_load_runs_vcn(ni, ATTR_REPARSE, NULL, 0, &run, 0) && + !ntfs_read_run_nb(ni->mi.sbi, &run, 0, buffer, + sizeof(struct REPARSE_DATA_BUFFER), + NULL)) { + rp = buffer; + } + + run_close(&run); + } + + if (!rp) + return REPARSE_NONE; + + len = le16_to_cpu(rp->ReparseDataLength); + switch (rp->ReparseTag) { + case (IO_REPARSE_TAG_MICROSOFT | IO_REPARSE_TAG_SYMBOLIC_LINK): + break; /* Symbolic link. */ + case IO_REPARSE_TAG_MOUNT_POINT: + break; /* Mount points and junctions. */ + case IO_REPARSE_TAG_SYMLINK: + break; + case IO_REPARSE_TAG_COMPRESS: + /* + * WOF - Windows Overlay Filter - Used to compress files with + * LZX/Xpress. + * + * Unlike native NTFS file compression, the Windows + * Overlay Filter supports only read operations. This means + * that it doesn't need to sector-align each compressed chunk, + * so the compressed data can be packed more tightly together. + * If you open the file for writing, the WOF just decompresses + * the entire file, turning it back into a plain file. + * + * Ntfs3 driver decompresses the entire file only on write or + * change size requests. 
+ */ + + cmpr = &rp->CompressReparseBuffer; + if (len < sizeof(*cmpr) || + cmpr->WofVersion != WOF_CURRENT_VERSION || + cmpr->WofProvider != WOF_PROVIDER_SYSTEM || + cmpr->ProviderVer != WOF_PROVIDER_CURRENT_VERSION) { + return REPARSE_NONE; + } + + switch (cmpr->CompressionFormat) { + case WOF_COMPRESSION_XPRESS4K: + bits = 0xc; // 4k + break; + case WOF_COMPRESSION_XPRESS8K: + bits = 0xd; // 8k + break; + case WOF_COMPRESSION_XPRESS16K: + bits = 0xe; // 16k + break; + case WOF_COMPRESSION_LZX32K: + bits = 0xf; // 32k + break; + default: + bits = 0x10; // 64k + break; + } + ni_set_ext_compress_bits(ni, bits); + return REPARSE_COMPRESSED; + + case IO_REPARSE_TAG_DEDUP: + ni->ni_flags |= NI_FLAG_DEDUPLICATED; + return REPARSE_DEDUPLICATED; + + default: + if (rp->ReparseTag & IO_REPARSE_TAG_NAME_SURROGATE) + break; + + return REPARSE_NONE; + } + + /* Looks like normal symlink. */ + return REPARSE_LINK; +} + +/* + * ni_fiemap - Helper for file_fiemap(). + * + * Assumed ni_lock. + * TODO: Less aggressive locks. + */ +int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, + __u64 vbo, __u64 len) +{ + int err = 0; + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 cluster_bits = sbi->cluster_bits; + struct runs_tree *run; + struct rw_semaphore *run_lock; + struct ATTRIB *attr; + CLST vcn = vbo >> cluster_bits; + CLST lcn, clen; + u64 valid = ni->i_valid; + u64 lbo, bytes; + u64 end, alloc_size; + size_t idx = -1; + u32 flags; + bool ok; + + if (S_ISDIR(ni->vfs_inode.i_mode)) { + run = &ni->dir.alloc_run; + attr = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, I30_NAME, + ARRAY_SIZE(I30_NAME), NULL, NULL); + run_lock = &ni->dir.run_lock; + } else { + run = &ni->file.run; + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, + NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + if (is_attr_compressed(attr)) { + /* Unfortunately cp -r incorrectly treats compressed clusters. 
*/ + err = -EOPNOTSUPP; + ntfs_inode_warn( + &ni->vfs_inode, + "fiemap is not supported for compressed file (cp -r)"); + goto out; + } + run_lock = &ni->file.run_lock; + } + + if (!attr || !attr->non_res) { + err = fiemap_fill_next_extent( + fieinfo, 0, 0, + attr ? le32_to_cpu(attr->res.data_size) : 0, + FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST | + FIEMAP_EXTENT_MERGED); + goto out; + } + + end = vbo + len; + alloc_size = le64_to_cpu(attr->nres.alloc_size); + if (end > alloc_size) + end = alloc_size; + + down_read(run_lock); + + while (vbo < end) { + if (idx == -1) { + ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + } else { + CLST vcn_next = vcn; + + ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) && + vcn == vcn_next; + if (!ok) + vcn = vcn_next; + } + + if (!ok) { + up_read(run_lock); + down_write(run_lock); + + err = attr_load_runs_vcn(ni, attr->type, + attr_name(attr), + attr->name_len, run, vcn); + + up_write(run_lock); + down_read(run_lock); + + if (err) + break; + + ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + + if (!ok) { + err = -EINVAL; + break; + } + } + + if (!clen) { + err = -EINVAL; // ? 
+ break; + } + + if (lcn == SPARSE_LCN) { + vcn += clen; + vbo = (u64)vcn << cluster_bits; + continue; + } + + flags = FIEMAP_EXTENT_MERGED; + if (S_ISDIR(ni->vfs_inode.i_mode)) { + ; + } else if (is_attr_compressed(attr)) { + CLST clst_data; + + err = attr_is_frame_compressed( + ni, attr, vcn >> attr->nres.c_unit, &clst_data); + if (err) + break; + if (clst_data < NTFS_LZNT_CLUSTERS) + flags |= FIEMAP_EXTENT_ENCODED; + } else if (is_attr_encrypted(attr)) { + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + } + + vbo = (u64)vcn << cluster_bits; + bytes = (u64)clen << cluster_bits; + lbo = (u64)lcn << cluster_bits; + + vcn += clen; + + if (vbo + bytes >= end) { + bytes = end - vbo; + flags |= FIEMAP_EXTENT_LAST; + } + + if (vbo + bytes <= valid) { + ; + } else if (vbo >= valid) { + flags |= FIEMAP_EXTENT_UNWRITTEN; + } else { + /* vbo < valid && valid < vbo + bytes */ + u64 dlen = valid - vbo; + + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, dlen, + flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } + + vbo = valid; + bytes -= dlen; + if (!bytes) + continue; + + lbo += dlen; + flags |= FIEMAP_EXTENT_UNWRITTEN; + } + + err = fiemap_fill_next_extent(fieinfo, vbo, lbo, bytes, flags); + if (err < 0) + break; + if (err == 1) { + err = 0; + break; + } + + vbo += bytes; + } + + up_read(run_lock); + +out: + return err; +} + +/* + * ni_readpage_cmpr + * + * When decompressing, we typically obtain more than one page per reference. + * We inject the additional pages into the page cache. + */ +int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct address_space *mapping = page->mapping; + pgoff_t index = page->index; + u64 frame_vbo, vbo = (u64)index << PAGE_SHIFT; + struct page **pages = NULL; /* Array of at most 16 pages. stack? 
*/ + u8 frame_bits; + CLST frame; + u32 i, idx, frame_size, pages_per_frame; + gfp_t gfp_mask; + struct page *pg; + + if (vbo >= ni->vfs_inode.i_size) { + SetPageUptodate(page); + err = 0; + goto out; + } + + if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { + /* Xpress or LZX. */ + frame_bits = ni_ext_compress_bits(ni); + } else { + /* LZNT compression. */ + frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; + } + frame_size = 1u << frame_bits; + frame = vbo >> frame_bits; + frame_vbo = (u64)frame << frame_bits; + idx = (vbo - frame_vbo) >> PAGE_SHIFT; + + pages_per_frame = frame_size >> PAGE_SHIFT; + pages = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS); + if (!pages) { + err = -ENOMEM; + goto out; + } + + pages[idx] = page; + index = frame_vbo >> PAGE_SHIFT; + gfp_mask = mapping_gfp_mask(mapping); + + for (i = 0; i < pages_per_frame; i++, index++) { + if (i == idx) + continue; + + pg = find_or_create_page(mapping, index, gfp_mask); + if (!pg) { + err = -ENOMEM; + goto out1; + } + pages[i] = pg; + } + + err = ni_read_frame(ni, frame_vbo, pages, pages_per_frame); + +out1: + if (err) + SetPageError(page); + + for (i = 0; i < pages_per_frame; i++) { + pg = pages[i]; + if (i == idx) + continue; + unlock_page(pg); + put_page(pg); + } + +out: + /* At this point, err contains 0 or -EIO depending on the "critical" page. */ + kfree(pages); + unlock_page(page); + + return err; +} + +#ifdef CONFIG_NTFS3_LZX_XPRESS +/* + * ni_decompress_file - Decompress LZX/Xpress compressed file. + * + * Remove ATTR_DATA::WofCompressedData. + * Remove ATTR_REPARSE. 
+ */ +int ni_decompress_file(struct ntfs_inode *ni) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct inode *inode = &ni->vfs_inode; + loff_t i_size = inode->i_size; + struct address_space *mapping = inode->i_mapping; + gfp_t gfp_mask = mapping_gfp_mask(mapping); + struct page **pages = NULL; + struct ATTR_LIST_ENTRY *le; + struct ATTRIB *attr; + CLST vcn, cend, lcn, clen, end; + pgoff_t index; + u64 vbo; + u8 frame_bits; + u32 i, frame_size, pages_per_frame, bytes; + struct mft_inode *mi; + int err; + + /* Clusters for decompressed data. */ + cend = bytes_to_cluster(sbi, i_size); + + if (!i_size) + goto remove_wof; + + /* Check in advance. */ + if (cend > wnd_zeroes(&sbi->used.bitmap)) { + err = -ENOSPC; + goto out; + } + + frame_bits = ni_ext_compress_bits(ni); + frame_size = 1u << frame_bits; + pages_per_frame = frame_size >> PAGE_SHIFT; + pages = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS); + if (!pages) { + err = -ENOMEM; + goto out; + } + + /* + * Step 1: Decompress data and copy to new allocated clusters. 
+ */ + index = 0; + for (vbo = 0; vbo < i_size; vbo += bytes) { + u32 nr_pages; + bool new; + + if (vbo + frame_size > i_size) { + bytes = i_size - vbo; + nr_pages = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + nr_pages = pages_per_frame; + bytes = frame_size; + } + + end = bytes_to_cluster(sbi, vbo + bytes); + + for (vcn = vbo >> sbi->cluster_bits; vcn < end; vcn += clen) { + err = attr_data_get_block(ni, vcn, cend - vcn, &lcn, + &clen, &new); + if (err) + goto out; + } + + for (i = 0; i < pages_per_frame; i++, index++) { + struct page *pg; + + pg = find_or_create_page(mapping, index, gfp_mask); + if (!pg) { + while (i--) { + unlock_page(pages[i]); + put_page(pages[i]); + } + err = -ENOMEM; + goto out; + } + pages[i] = pg; + } + + err = ni_read_frame(ni, vbo, pages, pages_per_frame); + + if (!err) { + down_read(&ni->file.run_lock); + err = ntfs_bio_pages(sbi, &ni->file.run, pages, + nr_pages, vbo, bytes, + REQ_OP_WRITE); + up_read(&ni->file.run_lock); + } + + for (i = 0; i < pages_per_frame; i++) { + unlock_page(pages[i]); + put_page(pages[i]); + } + + if (err) + goto out; + + cond_resched(); + } + +remove_wof: + /* + * Step 2: Deallocate attributes ATTR_DATA::WofCompressedData + * and ATTR_REPARSE. + */ + attr = NULL; + le = NULL; + while ((attr = ni_enum_attr_ex(ni, attr, &le, NULL))) { + CLST svcn, evcn; + u32 asize, roff; + + if (attr->type == ATTR_REPARSE) { + struct MFT_REF ref; + + mi_get_ref(&ni->mi, &ref); + ntfs_remove_reparse(sbi, 0, &ref); + } + + if (!attr->non_res) + continue; + + if (attr->type != ATTR_REPARSE && + (attr->type != ATTR_DATA || + attr->name_len != ARRAY_SIZE(WOF_NAME) || + memcmp(attr_name(attr), WOF_NAME, sizeof(WOF_NAME)))) + continue; + + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + + if (evcn + 1 <= svcn) + continue; + + asize = le32_to_cpu(attr->size); + roff = le16_to_cpu(attr->nres.run_off); + + /*run==1 Means unpack and deallocate. 
*/ + run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn, svcn, + Add2Ptr(attr, roff), asize - roff); + } + + /* + * Step 3: Remove attribute ATTR_DATA::WofCompressedData. + */ + err = ni_remove_attr(ni, ATTR_DATA, WOF_NAME, ARRAY_SIZE(WOF_NAME), + false, NULL); + if (err) + goto out; + + /* + * Step 4: Remove ATTR_REPARSE. + */ + err = ni_remove_attr(ni, ATTR_REPARSE, NULL, 0, false, NULL); + if (err) + goto out; + + /* + * Step 5: Remove sparse flag from data attribute. + */ + attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + if (attr->non_res && is_attr_sparsed(attr)) { + /* Sparsed attribute header is 8 bytes bigger than normal. */ + struct MFT_REC *rec = mi->mrec; + u32 used = le32_to_cpu(rec->used); + u32 asize = le32_to_cpu(attr->size); + u16 roff = le16_to_cpu(attr->nres.run_off); + char *rbuf = Add2Ptr(attr, roff); + + memmove(rbuf - 8, rbuf, used - PtrOffset(rec, rbuf)); + attr->size = cpu_to_le32(asize - 8); + attr->flags &= ~ATTR_FLAG_SPARSED; + attr->nres.run_off = cpu_to_le16(roff - 8); + attr->nres.c_unit = 0; + rec->used = cpu_to_le32(used - 8); + mi->dirty = true; + ni->std_fa &= ~(FILE_ATTRIBUTE_SPARSE_FILE | + FILE_ATTRIBUTE_REPARSE_POINT); + + mark_inode_dirty(inode); + } + + /* Clear cached flag. */ + ni->ni_flags &= ~NI_FLAG_COMPRESSED_MASK; + if (ni->file.offs_page) { + put_page(ni->file.offs_page); + ni->file.offs_page = NULL; + } + mapping->a_ops = &ntfs_aops; + +out: + kfree(pages); + if (err) { + make_bad_inode(inode); + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + } + + return err; +} + +/* + * decompress_lzx_xpress - External compression LZX/Xpress. + */ +static int decompress_lzx_xpress(struct ntfs_sb_info *sbi, const char *cmpr, + size_t cmpr_size, void *unc, size_t unc_size, + u32 frame_size) +{ + int err; + void *ctx; + + if (cmpr_size == unc_size) { + /* Frame not compressed. 
*/ + memcpy(unc, cmpr, unc_size); + return 0; + } + + err = 0; + if (frame_size == 0x8000) { + mutex_lock(&sbi->compress.mtx_lzx); + /* LZX: Frame compressed. */ + ctx = sbi->compress.lzx; + if (!ctx) { + /* Lazy initialize LZX decompress context. */ + ctx = lzx_allocate_decompressor(); + if (!ctx) { + err = -ENOMEM; + goto out1; + } + + sbi->compress.lzx = ctx; + } + + if (lzx_decompress(ctx, cmpr, cmpr_size, unc, unc_size)) { + /* Treat all errors as "invalid argument". */ + err = -EINVAL; + } +out1: + mutex_unlock(&sbi->compress.mtx_lzx); + } else { + /* XPRESS: Frame compressed. */ + mutex_lock(&sbi->compress.mtx_xpress); + ctx = sbi->compress.xpress; + if (!ctx) { + /* Lazy initialize Xpress decompress context. */ + ctx = xpress_allocate_decompressor(); + if (!ctx) { + err = -ENOMEM; + goto out2; + } + + sbi->compress.xpress = ctx; + } + + if (xpress_decompress(ctx, cmpr, cmpr_size, unc, unc_size)) { + /* Treat all errors as "invalid argument". */ + err = -EINVAL; + } +out2: + mutex_unlock(&sbi->compress.mtx_xpress); + } + return err; +} +#endif + +/* + * ni_read_frame + * + * Pages - Array of locked pages. + */ +int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, + u32 pages_per_frame) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 cluster_bits = sbi->cluster_bits; + char *frame_ondisk = NULL; + char *frame_mem = NULL; + struct page **pages_disk = NULL; + struct ATTR_LIST_ENTRY *le = NULL; + struct runs_tree *run = &ni->file.run; + u64 valid_size = ni->i_valid; + u64 vbo_disk; + size_t unc_size; + u32 frame_size, i, npages_disk, ondisk_size; + struct page *pg; + struct ATTRIB *attr; + CLST frame, clst_data; + + /* + * To simplify decompress algorithm do vmap for source + * and target pages. 
+ */ + for (i = 0; i < pages_per_frame; i++) + kmap(pages[i]); + + frame_size = pages_per_frame << PAGE_SHIFT; + frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL); + if (!frame_mem) { + err = -ENOMEM; + goto out; + } + + attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, NULL); + if (!attr) { + err = -ENOENT; + goto out1; + } + + if (!attr->non_res) { + u32 data_size = le32_to_cpu(attr->res.data_size); + + memset(frame_mem, 0, frame_size); + if (frame_vbo < data_size) { + ondisk_size = data_size - frame_vbo; + memcpy(frame_mem, resident_data(attr) + frame_vbo, + min(ondisk_size, frame_size)); + } + err = 0; + goto out1; + } + + if (frame_vbo >= valid_size) { + memset(frame_mem, 0, frame_size); + err = 0; + goto out1; + } + + if (ni->ni_flags & NI_FLAG_COMPRESSED_MASK) { +#ifndef CONFIG_NTFS3_LZX_XPRESS + err = -EOPNOTSUPP; + goto out1; +#else + u32 frame_bits = ni_ext_compress_bits(ni); + u64 frame64 = frame_vbo >> frame_bits; + u64 frames, vbo_data; + + if (frame_size != (1u << frame_bits)) { + err = -EINVAL; + goto out1; + } + switch (frame_size) { + case 0x1000: + case 0x2000: + case 0x4000: + case 0x8000: + break; + default: + /* Unknown compression. 
*/ + err = -EOPNOTSUPP; + goto out1; + } + + attr = ni_find_attr(ni, attr, &le, ATTR_DATA, WOF_NAME, + ARRAY_SIZE(WOF_NAME), NULL, NULL); + if (!attr) { + ntfs_inode_err( + &ni->vfs_inode, + "external compressed file should contains data attribute \"WofCompressedData\""); + err = -EINVAL; + goto out1; + } + + if (!attr->non_res) { + run = NULL; + } else { + run = run_alloc(); + if (!run) { + err = -ENOMEM; + goto out1; + } + } + + frames = (ni->vfs_inode.i_size - 1) >> frame_bits; + + err = attr_wof_frame_info(ni, attr, run, frame64, frames, + frame_bits, &ondisk_size, &vbo_data); + if (err) + goto out2; + + if (frame64 == frames) { + unc_size = 1 + ((ni->vfs_inode.i_size - 1) & + (frame_size - 1)); + ondisk_size = attr_size(attr) - vbo_data; + } else { + unc_size = frame_size; + } + + if (ondisk_size > frame_size) { + err = -EINVAL; + goto out2; + } + + if (!attr->non_res) { + if (vbo_data + ondisk_size > + le32_to_cpu(attr->res.data_size)) { + err = -EINVAL; + goto out1; + } + + err = decompress_lzx_xpress( + sbi, Add2Ptr(resident_data(attr), vbo_data), + ondisk_size, frame_mem, unc_size, frame_size); + goto out1; + } + vbo_disk = vbo_data; + /* Load all runs to read [vbo_disk-vbo_to). */ + err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME, + ARRAY_SIZE(WOF_NAME), run, vbo_disk, + vbo_data + ondisk_size); + if (err) + goto out2; + npages_disk = (ondisk_size + (vbo_disk & (PAGE_SIZE - 1)) + + PAGE_SIZE - 1) >> + PAGE_SHIFT; +#endif + } else if (is_attr_compressed(attr)) { + /* LZNT compression. 
*/ + if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) { + err = -EOPNOTSUPP; + goto out1; + } + + if (attr->nres.c_unit != NTFS_LZNT_CUNIT) { + err = -EOPNOTSUPP; + goto out1; + } + + down_write(&ni->file.run_lock); + run_truncate_around(run, le64_to_cpu(attr->nres.svcn)); + frame = frame_vbo >> (cluster_bits + NTFS_LZNT_CUNIT); + err = attr_is_frame_compressed(ni, attr, frame, &clst_data); + up_write(&ni->file.run_lock); + if (err) + goto out1; + + if (!clst_data) { + memset(frame_mem, 0, frame_size); + goto out1; + } + + frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT; + ondisk_size = clst_data << cluster_bits; + + if (clst_data >= NTFS_LZNT_CLUSTERS) { + /* Frame is not compressed. */ + down_read(&ni->file.run_lock); + err = ntfs_bio_pages(sbi, run, pages, pages_per_frame, + frame_vbo, ondisk_size, + REQ_OP_READ); + up_read(&ni->file.run_lock); + goto out1; + } + vbo_disk = frame_vbo; + npages_disk = (ondisk_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + __builtin_unreachable(); + err = -EINVAL; + goto out1; + } + + pages_disk = kzalloc(npages_disk * sizeof(struct page *), GFP_NOFS); + if (!pages_disk) { + err = -ENOMEM; + goto out2; + } + + for (i = 0; i < npages_disk; i++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) { + err = -ENOMEM; + goto out3; + } + pages_disk[i] = pg; + lock_page(pg); + kmap(pg); + } + + /* Read 'ondisk_size' bytes from disk. */ + down_read(&ni->file.run_lock); + err = ntfs_bio_pages(sbi, run, pages_disk, npages_disk, vbo_disk, + ondisk_size, REQ_OP_READ); + up_read(&ni->file.run_lock); + if (err) + goto out3; + + /* + * To simplify decompress algorithm do vmap for source and target pages. + */ + frame_ondisk = vmap(pages_disk, npages_disk, VM_MAP, PAGE_KERNEL_RO); + if (!frame_ondisk) { + err = -ENOMEM; + goto out3; + } + + /* Decompress: Frame_ondisk -> frame_mem. 
*/ +#ifdef CONFIG_NTFS3_LZX_XPRESS + if (run != &ni->file.run) { + /* LZX or XPRESS */ + err = decompress_lzx_xpress( + sbi, frame_ondisk + (vbo_disk & (PAGE_SIZE - 1)), + ondisk_size, frame_mem, unc_size, frame_size); + } else +#endif + { + /* LZNT - Native NTFS compression. */ + unc_size = decompress_lznt(frame_ondisk, ondisk_size, frame_mem, + frame_size); + if ((ssize_t)unc_size < 0) + err = unc_size; + else if (!unc_size || unc_size > frame_size) + err = -EINVAL; + } + if (!err && valid_size < frame_vbo + frame_size) { + size_t ok = valid_size - frame_vbo; + + memset(frame_mem + ok, 0, frame_size - ok); + } + + vunmap(frame_ondisk); + +out3: + for (i = 0; i < npages_disk; i++) { + pg = pages_disk[i]; + if (pg) { + kunmap(pg); + unlock_page(pg); + put_page(pg); + } + } + kfree(pages_disk); + +out2: +#ifdef CONFIG_NTFS3_LZX_XPRESS + if (run != &ni->file.run) + run_free(run); +#endif +out1: + vunmap(frame_mem); +out: + for (i = 0; i < pages_per_frame; i++) { + pg = pages[i]; + kunmap(pg); + ClearPageError(pg); + SetPageUptodate(pg); + } + + return err; +} + +/* + * ni_write_frame + * + * Pages - Array of locked pages. 
+ */ +int ni_write_frame(struct ntfs_inode *ni, struct page **pages, + u32 pages_per_frame) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; + u32 frame_size = sbi->cluster_size << NTFS_LZNT_CUNIT; + u64 frame_vbo = (u64)pages[0]->index << PAGE_SHIFT; + CLST frame = frame_vbo >> frame_bits; + char *frame_ondisk = NULL; + struct page **pages_disk = NULL; + struct ATTR_LIST_ENTRY *le = NULL; + char *frame_mem; + struct ATTRIB *attr; + struct mft_inode *mi; + u32 i; + struct page *pg; + size_t compr_size, ondisk_size; + struct lznt *lznt; + + /* + * Compress the frame held in 'pages' with LZNT into temporary pages, + * allocate clusters for the result and write it to disk. If compression + * saves less than one cluster the original pages are written as is; + * a zero compressed size is stored as a sparse frame with no data written. + * + * Return: 0 on success or a negative errno. + */ + attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, &mi); + if (!attr) { + err = -ENOENT; + goto out; + } + + if (WARN_ON(!is_attr_compressed(attr))) { + err = -EINVAL; + goto out; + } + + if (sbi->cluster_size > NTFS_LZNT_MAX_CLUSTER) { + err = -EOPNOTSUPP; + goto out; + } + + if (!attr->non_res) { + down_write(&ni->file.run_lock); + err = attr_make_nonresident(ni, attr, le, mi, + le32_to_cpu(attr->res.data_size), + &ni->file.run, &attr, pages[0]); + up_write(&ni->file.run_lock); + if (err) + goto out; + } + + if (attr->nres.c_unit != NTFS_LZNT_CUNIT) { + err = -EOPNOTSUPP; + goto out; + } + + pages_disk = kcalloc(pages_per_frame, sizeof(struct page *), GFP_NOFS); + if (!pages_disk) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < pages_per_frame; i++) { + pg = alloc_page(GFP_KERNEL); + if (!pg) { + err = -ENOMEM; + goto out1; + } + pages_disk[i] = pg; + lock_page(pg); + kmap(pg); + } + + /* To simplify compress algorithm do vmap for source and target pages. */ + frame_ondisk = vmap(pages_disk, pages_per_frame, VM_MAP, PAGE_KERNEL); + if (!frame_ondisk) { + err = -ENOMEM; + goto out1; + } + + for (i = 0; i < pages_per_frame; i++) + kmap(pages[i]); + + /* Map in-memory frame for read-only. 
*/ + frame_mem = vmap(pages, pages_per_frame, VM_MAP, PAGE_KERNEL_RO); + if (!frame_mem) { + err = -ENOMEM; + goto out2; + } + + mutex_lock(&sbi->compress.mtx_lznt); + lznt = NULL; + if (!sbi->compress.lznt) { + /* + * LZNT implements two levels of compression: + * 0 - Standard compression + * 1 - Best compression, requires a lot of cpu + * use mount option? + */ + lznt = get_lznt_ctx(0); + if (!lznt) { + mutex_unlock(&sbi->compress.mtx_lznt); + err = -ENOMEM; + goto out3; + } + + sbi->compress.lznt = lznt; + lznt = NULL; + } + + /* Compress: frame_mem -> frame_ondisk */ + compr_size = compress_lznt(frame_mem, frame_size, frame_ondisk, + frame_size, sbi->compress.lznt); + mutex_unlock(&sbi->compress.mtx_lznt); + kfree(lznt); + + if (compr_size + sbi->cluster_size > frame_size) { + /* Frame is not compressed. */ + compr_size = frame_size; + ondisk_size = frame_size; + } else if (compr_size) { + /* Frame is compressed. */ + ondisk_size = ntfs_up_cluster(sbi, compr_size); + memset(frame_ondisk + compr_size, 0, ondisk_size - compr_size); + } else { + /* Frame is sparsed. */ + ondisk_size = 0; + } + + down_write(&ni->file.run_lock); + run_truncate_around(&ni->file.run, le64_to_cpu(attr->nres.svcn)); + err = attr_allocate_frame(ni, frame, compr_size, ni->i_valid); + up_write(&ni->file.run_lock); + /* + * 'frame_mem' is mapped at this point, so both early exits below must + * go through out3 to vunmap() it (jumping to out2 would leak the mapping). + */ + if (err) + goto out3; + + if (!ondisk_size) + goto out3; + + down_read(&ni->file.run_lock); + err = ntfs_bio_pages(sbi, &ni->file.run, + ondisk_size < frame_size ? pages_disk : pages, + pages_per_frame, frame_vbo, ondisk_size, + REQ_OP_WRITE); + up_read(&ni->file.run_lock); + +out3: + vunmap(frame_mem); + +out2: + for (i = 0; i < pages_per_frame; i++) + kunmap(pages[i]); + + vunmap(frame_ondisk); +out1: + for (i = 0; i < pages_per_frame; i++) { + pg = pages_disk[i]; + if (pg) { + kunmap(pg); + unlock_page(pg); + put_page(pg); + } + } + kfree(pages_disk); +out: + return err; +} + +/* + * ni_remove_name - Removes name 'de' from MFT and from directory. 
+ * 'de2' and 'undo_step' are used to restore MFT/dir, if error occurs. + */ +int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1); + struct ATTR_FILE_NAME *fname; + struct ATTR_LIST_ENTRY *le; + struct mft_inode *mi; + u16 de_key_size = le16_to_cpu(de->key_size); + u8 name_type; + + *undo_step = 0; + + /* Find name in record: store the directory reference in de_name->home, then look the name up in 'ni'. */ + mi_get_ref(&dir_ni->mi, &de_name->home); + + fname = ni_fname_name(ni, (struct cpu_str *)&de_name->name_len, + &de_name->home, &mi, &le); + if (!fname) + return -ENOENT; + + memcpy(&de_name->dup, &fname->dup, sizeof(struct NTFS_DUP_INFO)); /* 'de' now carries the dup info stored in the MFT. */ + name_type = paired_name(fname->type); + + /* Mark ntfs as dirty. It will be cleared at umount. */ + ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); + + /* Step 1: Remove name from directory. */ + err = indx_delete_entry(&dir_ni->dir, dir_ni, fname, de_key_size, sbi); + if (err) + return err; + + /* Step 2: Remove name from MFT. */ + ni_remove_attr_le(ni, attr_from_name(fname), mi, le); + + *undo_step = 2; /* Steps 1-2 are done for the name in 'de'. */ + + /* Get paired name (type computed by paired_name() above), if the inode has one. */ + fname = ni_fname_type(ni, name_type, &mi, &le); + if (fname) { + u16 de2_key_size = fname_full_size(fname); + + *de2 = Add2Ptr(de, 1024); /* NOTE(review): assumes the caller's 'de' buffer has room for a second entry at offset 1024 - confirm. */ + (*de2)->key_size = cpu_to_le16(de2_key_size); + + memcpy(*de2 + 1, fname, de2_key_size); + + /* Step 3: Remove paired name from directory. */ + err = indx_delete_entry(&dir_ni->dir, dir_ni, fname, + de2_key_size, sbi); + if (err) + return err; + + /* Step 4: Remove paired name from MFT. */ + ni_remove_attr_le(ni, attr_from_name(fname), mi, le); + + *undo_step = 4; /* Steps 3-4 are done for the paired name saved in '*de2'. */ + } + return 0; +} + +/* + * ni_remove_name_undo - Paired function for ni_remove_name. 
+ * + * Return: True if ok + */ +bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de, struct NTFS_DE *de2, int undo_step) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *attr; + u16 de_key_size = de2 ? le16_to_cpu(de2->key_size) : 0; + + switch (undo_step) { + case 4: /* Re-insert the paired name 'de2' into MFT and directory, then fall through to 'de'. */ + if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, + &attr, NULL, NULL)) { + return false; + } + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de2 + 1, de_key_size); + + mi_get_ref(&ni->mi, &de2->ref); + de2->size = cpu_to_le16(ALIGN(de_key_size, 8) + + sizeof(struct NTFS_DE)); + de2->flags = 0; + de2->res = 0; + + if (indx_insert_entry(&dir_ni->dir, dir_ni, de2, sbi, NULL, + 1)) { + return false; + } + fallthrough; + + case 2: /* Re-insert the name 'de' into MFT and directory. */ + de_key_size = le16_to_cpu(de->key_size); + + if (ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, + &attr, NULL, NULL)) { + return false; + } + + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size); + mi_get_ref(&ni->mi, &de->ref); + + if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) { + return false; + } + } + + return true; +} + +/* + * ni_add_name - Add new name in MFT and in directory. + * + * Fills de->ref with the reference of 'ni' and the name's 'home' with the + * reference of 'dir_ni' before inserting. + * + * Return: 0 on success or the error from ni_insert_resident() or + * indx_insert_entry(). If the directory insert fails, the ATTR_NAME + * attribute that was just inserted is removed again. + */ +int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de) +{ + int err; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + struct mft_inode *mi; + struct ATTR_FILE_NAME *de_name = (struct ATTR_FILE_NAME *)(de + 1); + u16 de_key_size = le16_to_cpu(de->key_size); + + mi_get_ref(&ni->mi, &de->ref); + mi_get_ref(&dir_ni->mi, &de_name->home); + + /* Insert new name in MFT. */ + err = ni_insert_resident(ni, de_key_size, ATTR_NAME, NULL, 0, &attr, + &mi, &le); + if (err) + return err; + + memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de_name, de_key_size); + + /* Insert new name in directory. 
*/ + err = indx_insert_entry(&dir_ni->dir, dir_ni, de, ni->mi.sbi, NULL, 0); + if (err) + ni_remove_attr_le(ni, attr, mi, le); + + return err; +} + +/* + * ni_rename - Remove one name and insert new name. + * + * Return: 0 on success or an error. '*is_bad' is set to true when removing + * the old name failed and the already added new name could not be removed + * again; it is left untouched otherwise. + */ +int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, + struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de, + bool *is_bad) +{ + int err; + struct NTFS_DE *de2 = NULL; + int undo = 0; + + /* + * There are two possible ways to rename: + * 1) Add new name and remove old name. + * 2) Remove old name and add new name. + * + * In most cases (not all!) adding new name in MFT and in directory can + * allocate additional cluster(s). + * The second way may result in a bad inode if we can't add the new name + * and then can't restore (add) the old name. + */ + + /* + * Way 1 - Add new + remove old. + */ + err = ni_add_name(new_dir_ni, ni, new_de); + if (!err) { + err = ni_remove_name(dir_ni, ni, de, &de2, &undo); + if (err && ni_remove_name(new_dir_ni, ni, new_de, &de2, &undo)) + *is_bad = true; + } + + /* + * Way 2 - Remove old + add new (not used; kept below for reference). + */ + /* + * err = ni_remove_name(dir_ni, ni, de, &de2, &undo); + * if (!err) { + * err = ni_add_name(new_dir_ni, ni, new_de); + * if (err && !ni_remove_name_undo(dir_ni, ni, de, de2, undo)) + * *is_bad = true; + * } + */ + + return err; +} + +/* + * ni_is_dirty - Return: True if 'ni' requires ni_write_inode. + * + * That is the case when the base record or the attribute list is dirty, + * NI_FLAG_UPDATE_PARENT is set, or any subrecord in ni->mi_tree is dirty. + */ +bool ni_is_dirty(struct inode *inode) +{ + struct ntfs_inode *ni = ntfs_i(inode); + struct rb_node *node; + + if (ni->mi.dirty || ni->attr_list.dirty || + (ni->ni_flags & NI_FLAG_UPDATE_PARENT)) + return true; + + for (node = rb_first(&ni->mi_tree); node; node = rb_next(node)) { + if (rb_entry(node, struct mft_inode, node)->dirty) + return true; + } + + return false; +} + +/* + * ni_update_parent + * + * Update duplicate info of ATTR_FILE_NAME in MFT and in parent directories. 
+ */ +static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, + int sync) +{ + struct ATTRIB *attr; + struct mft_inode *mi; + struct ATTR_LIST_ENTRY *le = NULL; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct super_block *sb = sbi->sb; + bool re_dirty = false; + + if (ni->mi.mrec->flags & RECORD_FLAG_DIR) { + dup->fa |= FILE_ATTRIBUTE_DIRECTORY; + attr = NULL; + dup->alloc_size = 0; + dup->data_size = 0; + } else { + dup->fa &= ~FILE_ATTRIBUTE_DIRECTORY; + + attr = ni_find_attr(ni, NULL, &le, ATTR_DATA, NULL, 0, NULL, + &mi); + if (!attr) { + dup->alloc_size = dup->data_size = 0; + } else if (!attr->non_res) { + u32 data_size = le32_to_cpu(attr->res.data_size); + + dup->alloc_size = cpu_to_le64(ALIGN(data_size, 8)); + dup->data_size = cpu_to_le64(data_size); + } else { + u64 new_valid = ni->i_valid; + u64 data_size = le64_to_cpu(attr->nres.data_size); + __le64 valid_le; + + dup->alloc_size = is_attr_ext(attr) + ? attr->nres.total_size + : attr->nres.alloc_size; + dup->data_size = attr->nres.data_size; + + if (new_valid > data_size) + new_valid = data_size; + + valid_le = cpu_to_le64(new_valid); + if (valid_le != attr->nres.valid_size) { + attr->nres.valid_size = valid_le; + mi->dirty = true; + } + } + } + + /* TODO: Fill reparse info. */ + dup->reparse = 0; + dup->ea_size = 0; + + if (ni->ni_flags & NI_FLAG_EA) { + attr = ni_find_attr(ni, attr, &le, ATTR_EA_INFO, NULL, 0, NULL, + NULL); + if (attr) { + const struct EA_INFO *info; + + info = resident_data_ex(attr, sizeof(struct EA_INFO)); + dup->ea_size = info->size_pack; + } + } + + attr = NULL; + le = NULL; + + while ((attr = ni_find_attr(ni, attr, &le, ATTR_NAME, NULL, 0, NULL, + &mi))) { + struct inode *dir; + struct ATTR_FILE_NAME *fname; + + fname = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); + if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup))) + continue; + + /* ntfs_iget5 may sleep. 
*/ + dir = ntfs_iget5(sb, &fname->home, NULL); + if (IS_ERR(dir)) { + ntfs_inode_warn( + &ni->vfs_inode, + "failed to open parent directory r=%lx to update", + (long)ino_get(&fname->home)); + continue; + } + + if (!is_bad_inode(dir)) { + struct ntfs_inode *dir_ni = ntfs_i(dir); + + if (!ni_trylock(dir_ni)) { + re_dirty = true; + } else { + indx_update_dup(dir_ni, sbi, fname, dup, sync); + ni_unlock(dir_ni); + memcpy(&fname->dup, dup, sizeof(fname->dup)); + mi->dirty = true; + } + } + iput(dir); + } + + return re_dirty; +} + +/* + * ni_write_inode - Write MFT base record and all subrecords to disk. + */ +int ni_write_inode(struct inode *inode, int sync, const char *hint) +{ + int err = 0, err2; + struct ntfs_inode *ni = ntfs_i(inode); + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + bool re_dirty = false; + struct ATTR_STD_INFO *std; + struct rb_node *node, *next; + struct NTFS_DUP_INFO dup; + + if (is_bad_inode(inode) || sb_rdonly(sb)) + return 0; + + if (!ni_trylock(ni)) { + /* 'ni' is under modification, skip for now. */ + mark_inode_dirty_sync(inode); + return 0; + } + + if (is_rec_inuse(ni->mi.mrec) && + !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { + bool modified = false; + + /* Update times in standard attribute. */ + std = ni_std(ni); + if (!std) { + err = -EINVAL; + goto out; + } + + /* Update the access times if they have changed. 
*/ + dup.m_time = kernel2nt(&inode->i_mtime); + if (std->m_time != dup.m_time) { + std->m_time = dup.m_time; + modified = true; + } + + dup.c_time = kernel2nt(&inode->i_ctime); + if (std->c_time != dup.c_time) { + std->c_time = dup.c_time; + modified = true; + } + + dup.a_time = kernel2nt(&inode->i_atime); + if (std->a_time != dup.a_time) { + std->a_time = dup.a_time; + modified = true; + } + + dup.fa = ni->std_fa; + if (std->fa != dup.fa) { + std->fa = dup.fa; + modified = true; + } + + if (modified) + ni->mi.dirty = true; + + if (!ntfs_is_meta_file(sbi, inode->i_ino) && + (modified || (ni->ni_flags & NI_FLAG_UPDATE_PARENT)) + /* Avoid __wait_on_freeing_inode(inode). */ + && (sb->s_flags & SB_ACTIVE)) { + dup.cr_time = std->cr_time; + /* Not critical if this function fail. */ + re_dirty = ni_update_parent(ni, &dup, sync); + + if (re_dirty) + ni->ni_flags |= NI_FLAG_UPDATE_PARENT; + else + ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT; + } + + /* Update attribute list. */ + if (ni->attr_list.size && ni->attr_list.dirty) { + if (inode->i_ino != MFT_REC_MFT || sync) { + err = ni_try_remove_attr_list(ni); + if (err) + goto out; + } + + err = al_update(ni); + if (err) + goto out; + } + } + + for (node = rb_first(&ni->mi_tree); node; node = next) { + struct mft_inode *mi = rb_entry(node, struct mft_inode, node); + bool is_empty; + + next = rb_next(node); + + if (!mi->dirty) + continue; + + is_empty = !mi_enum_attr(mi, NULL); + + if (is_empty) + clear_rec_inuse(mi->mrec); + + err2 = mi_write(mi, sync); + if (!err && err2) + err = err2; + + if (is_empty) { + ntfs_mark_rec_free(sbi, mi->rno); + rb_erase(node, &ni->mi_tree); + mi_put(mi); + } + } + + if (ni->mi.dirty) { + err2 = mi_write(&ni->mi, sync); + if (!err && err2) + err = err2; + } +out: + ni_unlock(ni); + + if (err) { + ntfs_err(sb, "%s r=%lx failed, %d.", hint, inode->i_ino, err); + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + return err; + } + + if (re_dirty) + mark_inode_dirty_sync(inode); + + return 0; +} diff --git 
a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c new file mode 100644 index 000000000000..b5853aed0e25 --- /dev/null +++ b/fs/ntfs3/fslog.c @@ -0,0 +1,5217 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/hash.h> +#include <linux/nls.h> +#include <linux/random.h> +#include <linux/ratelimit.h> +#include <linux/slab.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * LOG FILE structs + * + * The hex value at the start of a field comment is the byte offset of + * that field inside its structure. + */ + +// clang-format off + +#define MaxLogFileSize 0x100000000ull +#define DefaultLogPageSize 4096 +#define MinLogRecordPages 0x30 + +struct RESTART_HDR { + struct NTFS_RECORD_HEADER rhdr; // 'RSTR' + __le32 sys_page_size; // 0x10: Page size of the system which initialized the log. + __le32 page_size; // 0x14: Log page size used for this log file. + __le16 ra_off; // 0x18: + __le16 minor_ver; // 0x1A: + __le16 major_ver; // 0x1C: + __le16 fixups[]; // 0x1E: +}; + +#define LFS_NO_CLIENT 0xffff +#define LFS_NO_CLIENT_LE cpu_to_le16(0xffff) + +struct CLIENT_REC { + __le64 oldest_lsn; // 0x00: + __le64 restart_lsn; // 0x08: + __le16 prev_client; // 0x10: + __le16 next_client; // 0x12: + __le16 seq_num; // 0x14: + u8 align[6]; // 0x16: + __le32 name_bytes; // 0x1C: In bytes. + __le16 name[32]; // 0x20: Name of client. +}; + +static_assert(sizeof(struct CLIENT_REC) == 0x60); + +/* Two copies of these will exist at the beginning of the log file */ +struct RESTART_AREA { + __le64 current_lsn; // 0x00: Current logical end of log file. + __le16 log_clients; // 0x08: Maximum number of clients. + __le16 client_idx[2]; // 0x0A: Free/use index into the client record arrays. + __le16 flags; // 0x0E: See RESTART_SINGLE_PAGE_IO. + __le32 seq_num_bits; // 0x10: The number of bits in sequence number. + __le16 ra_len; // 0x14: + __le16 client_off; // 0x16: + __le64 l_size; // 0x18: Usable log file size. 
+ __le32 last_lsn_data_len; // 0x20: + __le16 rec_hdr_len; // 0x24: Log page data offset. NOTE(review): the descriptions at 0x24/0x26 look shifted relative to the field names - confirm. + __le16 data_off; // 0x26: Log page data length. + __le32 open_log_count; // 0x28: + __le32 align[5]; // 0x2C: + struct CLIENT_REC clients[]; // 0x40: +}; + +struct LOG_REC_HDR { + __le16 redo_op; // 0x00: NTFS_LOG_OPERATION + __le16 undo_op; // 0x02: NTFS_LOG_OPERATION + __le16 redo_off; // 0x04: Offset to Redo record. + __le16 redo_len; // 0x06: Redo length. + __le16 undo_off; // 0x08: Offset to Undo record. + __le16 undo_len; // 0x0A: Undo length. + __le16 target_attr; // 0x0C: + __le16 lcns_follow; // 0x0E: Count of page_lcns[] entries (lrh_length() uses at least 1). + __le16 record_off; // 0x10: + __le16 attr_off; // 0x12: + __le16 cluster_off; // 0x14: + __le16 reserved; // 0x16: + __le64 target_vcn; // 0x18: + __le64 page_lcns[]; // 0x20: +}; + +static_assert(sizeof(struct LOG_REC_HDR) == 0x20); + +#define RESTART_ENTRY_ALLOCATED 0xFFFFFFFF +#define RESTART_ENTRY_ALLOCATED_LE cpu_to_le32(0xFFFFFFFF) + +struct RESTART_TABLE { + __le16 size; // 0x00: In bytes (size of one entry, see bytes_per_rt()) + __le16 used; // 0x02: Entries + __le16 total; // 0x04: Entries + __le16 res[3]; // 0x06: + __le32 free_goal; // 0x0C: + __le32 first_free; // 0x10: + __le32 last_free; // 0x14: + +}; + +static_assert(sizeof(struct RESTART_TABLE) == 0x18); + +struct ATTR_NAME_ENTRY { + __le16 off; // Offset in the Open attribute Table. 
+ __le16 name_bytes; + __le16 name[]; +}; + +struct OPEN_ATTR_ENRTY { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + __le32 bytes_per_index; // 0x04: + enum ATTR_TYPE type; // 0x08: + u8 is_dirty_pages; // 0x0C: + u8 is_attr_name; // 0x0D: Faked field to manage 'ptr' + u8 name_len; // 0x0E: Faked field to manage 'ptr' + u8 res; // 0x0F: + struct MFT_REF ref; // 0x10: File Reference of file containing attribute + __le64 open_record_lsn; // 0x18: + void *ptr; // 0x20: +}; + +/* 32 bit version of 'struct OPEN_ATTR_ENRTY' */ +struct OPEN_ATTR_ENRTY_32 { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + __le32 ptr; // 0x04: + struct MFT_REF ref; // 0x08: + __le64 open_record_lsn; // 0x10: + u8 is_dirty_pages; // 0x18: + u8 is_attr_name; // 0x19: + u8 res1[2]; // 0x1A: + enum ATTR_TYPE type; // 0x1C: + u8 name_len; // 0x20: In wchar + u8 res2[3]; // 0x21: + __le32 AttributeName; // 0x24: + __le32 bytes_per_index; // 0x28: +}; + +#define SIZEOF_OPENATTRIBUTEENTRY0 0x2c +// static_assert( 0x2C == sizeof(struct OPEN_ATTR_ENRTY_32) ); +static_assert(sizeof(struct OPEN_ATTR_ENRTY) < SIZEOF_OPENATTRIBUTEENTRY0); + +/* + * One entry exists in the Dirty Pages Table for each page which is dirty at + * the time the Restart Area is written. 
+ */ +struct DIR_PAGE_ENTRY { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + __le32 target_attr; // 0x04: Index into the Open attribute Table + __le32 transfer_len; // 0x08: + __le32 lcns_follow; // 0x0C: + __le64 vcn; // 0x10: Vcn of dirty page + __le64 oldest_lsn; // 0x18: + __le64 page_lcns[]; // 0x20: +}; + +static_assert(sizeof(struct DIR_PAGE_ENTRY) == 0x20); + +/* 32 bit version of 'struct DIR_PAGE_ENTRY' */ +struct DIR_PAGE_ENTRY_32 { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + __le32 target_attr; // 0x04: Index into the Open attribute Table + __le32 transfer_len; // 0x08: + __le32 lcns_follow; // 0x0C: + __le32 reserved; // 0x10: + __le32 vcn_low; // 0x14: Vcn of dirty page + __le32 vcn_hi; // 0x18: Vcn of dirty page + __le32 oldest_lsn_low; // 0x1C: + __le32 oldest_lsn_hi; // 0x20: + __le32 page_lcns_low; // 0x24: + __le32 page_lcns_hi; // 0x28: +}; + +static_assert(offsetof(struct DIR_PAGE_ENTRY_32, vcn_low) == 0x14); +static_assert(sizeof(struct DIR_PAGE_ENTRY_32) == 0x2c); + +enum transact_state { + TransactionUninitialized = 0, + TransactionActive, + TransactionPrepared, + TransactionCommitted +}; + +struct TRANSACTION_ENTRY { + __le32 next; // 0x00: RESTART_ENTRY_ALLOCATED if allocated + u8 transact_state; // 0x04: + u8 reserved[3]; // 0x05: + __le64 first_lsn; // 0x08: + __le64 prev_lsn; // 0x10: + __le64 undo_next_lsn; // 0x18: + __le32 undo_records; // 0x20: Number of undo log records pending abort + __le32 undo_len; // 0x24: Total undo size +}; + +static_assert(sizeof(struct TRANSACTION_ENTRY) == 0x28); + +struct NTFS_RESTART { + __le32 major_ver; // 0x00: + __le32 minor_ver; // 0x04: + __le64 check_point_start; // 0x08: + __le64 open_attr_table_lsn; // 0x10: + __le64 attr_names_lsn; // 0x18: + __le64 dirty_pages_table_lsn; // 0x20: + __le64 transact_table_lsn; // 0x28: + __le32 open_attr_len; // 0x30: In bytes + __le32 attr_names_len; // 0x34: In bytes + __le32 dirty_pages_len; // 0x38: In bytes + __le32 
transact_table_len; // 0x3C: In bytes +}; + +static_assert(sizeof(struct NTFS_RESTART) == 0x40); + +struct NEW_ATTRIBUTE_SIZES { + __le64 alloc_size; + __le64 valid_size; + __le64 data_size; + __le64 total_size; +}; + +struct BITMAP_RANGE { + __le32 bitmap_off; + __le32 bits; +}; + +struct LCN_RANGE { + __le64 lcn; + __le64 len; +}; + +/* Log record types, stored in LFS_RECORD_HDR::record_type. */ +#define LfsClientRecord cpu_to_le32(1) +#define LfsClientRestart cpu_to_le32(2) + +/* This is used to uniquely identify a client for a particular log file. */ +struct CLIENT_ID { + __le16 seq_num; + __le16 client_idx; +}; + +/* This is the header that begins every Log Record in the log file. */ +struct LFS_RECORD_HDR { + __le64 this_lsn; // 0x00: + __le64 client_prev_lsn; // 0x08: + __le64 client_undo_next_lsn; // 0x10: + __le32 client_data_len; // 0x18: + struct CLIENT_ID client; // 0x1C: Owner of this log record. + __le32 record_type; // 0x20: LfsClientRecord or LfsClientRestart. + __le32 transact_id; // 0x24: + __le16 flags; // 0x28: LOG_RECORD_MULTI_PAGE + u8 align[6]; // 0x2A: +}; + +#define LOG_RECORD_MULTI_PAGE cpu_to_le16(1) + +static_assert(sizeof(struct LFS_RECORD_HDR) == 0x30); + +struct LFS_RECORD { + __le16 next_record_off; // 0x00: Offset of the free space in the page. + u8 align[6]; // 0x02: + __le64 last_end_lsn; // 0x08: lsn for the last log record which ends on the page. +}; + +static_assert(sizeof(struct LFS_RECORD) == 0x10); + +struct RECORD_PAGE_HDR { + struct NTFS_RECORD_HEADER rhdr; // 'RCRD' + __le32 rflags; // 0x10: See LOG_PAGE_LOG_RECORD_END + __le16 page_count; // 0x14: + __le16 page_pos; // 0x16: + struct LFS_RECORD record_hdr; // 0x18: + __le16 fixups[10]; // 0x28: + __le32 file_off; // 0x3c: Used when major version >= 2 +}; + +// clang-format on + +// Page contains the end of a log record. 
+#define LOG_PAGE_LOG_RECORD_END cpu_to_le32(0x00000001) + +static inline bool is_log_record_end(const struct RECORD_PAGE_HDR *hdr) +{ + return hdr->rflags & LOG_PAGE_LOG_RECORD_END; +} + +static_assert(offsetof(struct RECORD_PAGE_HDR, file_off) == 0x3c); + +/* + * END of NTFS LOG structures + */ + +/* Define some tuning parameters to keep the restart tables a reasonable size. */ +#define INITIAL_NUMBER_TRANSACTIONS 5 + +enum NTFS_LOG_OPERATION { + + Noop = 0x00, + CompensationLogRecord = 0x01, + InitializeFileRecordSegment = 0x02, + DeallocateFileRecordSegment = 0x03, + WriteEndOfFileRecordSegment = 0x04, + CreateAttribute = 0x05, + DeleteAttribute = 0x06, + UpdateResidentValue = 0x07, + UpdateNonresidentValue = 0x08, + UpdateMappingPairs = 0x09, + DeleteDirtyClusters = 0x0A, + SetNewAttributeSizes = 0x0B, + AddIndexEntryRoot = 0x0C, + DeleteIndexEntryRoot = 0x0D, + AddIndexEntryAllocation = 0x0E, + DeleteIndexEntryAllocation = 0x0F, + WriteEndOfIndexBuffer = 0x10, + SetIndexEntryVcnRoot = 0x11, + SetIndexEntryVcnAllocation = 0x12, + UpdateFileNameRoot = 0x13, + UpdateFileNameAllocation = 0x14, + SetBitsInNonresidentBitMap = 0x15, + ClearBitsInNonresidentBitMap = 0x16, + HotFix = 0x17, + EndTopLevelAction = 0x18, + PrepareTransaction = 0x19, + CommitTransaction = 0x1A, + ForgetTransaction = 0x1B, + OpenNonresidentAttribute = 0x1C, + OpenAttributeTableDump = 0x1D, + AttributeNamesDump = 0x1E, + DirtyPageTableDump = 0x1F, + TransactionTableDump = 0x20, + UpdateRecordDataRoot = 0x21, + UpdateRecordDataAllocation = 0x22, + + UpdateRelativeDataInIndex = + 0x23, // NtOfsRestartUpdateRelativeDataInIndex + UpdateRelativeDataInIndex2 = 0x24, + ZeroEndOfFileRecord = 0x25, +}; + +/* + * Array for log records which require a target attribute. + * A true indicates that the corresponding restart operation + * requires a target attribute. 
+ */ +static const u8 AttributeRequired[] = { + 0xFC, 0xFB, 0xFF, 0x10, 0x06, +}; + +static inline bool is_target_required(u16 op) +{ + bool ret = op <= UpdateRecordDataAllocation && + (AttributeRequired[op >> 3] >> (op & 7) & 1); + return ret; +} + +static inline bool can_skip_action(enum NTFS_LOG_OPERATION op) +{ + switch (op) { + case Noop: + case DeleteDirtyClusters: + case HotFix: + case EndTopLevelAction: + case PrepareTransaction: + case CommitTransaction: + case ForgetTransaction: + case CompensationLogRecord: + case OpenNonresidentAttribute: + case OpenAttributeTableDump: + case AttributeNamesDump: + case DirtyPageTableDump: + case TransactionTableDump: + return true; + default: + return false; + } +} + +enum { lcb_ctx_undo_next, lcb_ctx_prev, lcb_ctx_next }; + +/* Bytes per restart table. */ +static inline u32 bytes_per_rt(const struct RESTART_TABLE *rt) +{ + return le16_to_cpu(rt->used) * le16_to_cpu(rt->size) + + sizeof(struct RESTART_TABLE); +} + +/* Log record length. */ +static inline u32 lrh_length(const struct LOG_REC_HDR *lr) +{ + u16 t16 = le16_to_cpu(lr->lcns_follow); + + return struct_size(lr, page_lcns, max_t(u16, 1, t16)); +} + +struct lcb { + struct LFS_RECORD_HDR *lrh; // Log record header of the current lsn. + struct LOG_REC_HDR *log_rec; + u32 ctx_mode; // lcb_ctx_undo_next/lcb_ctx_prev/lcb_ctx_next + struct CLIENT_ID client; + bool alloc; // If true the we should deallocate 'log_rec'. +}; + +static void lcb_put(struct lcb *lcb) +{ + if (lcb->alloc) + kfree(lcb->log_rec); + kfree(lcb->lrh); + kfree(lcb); +} + +/* Find the oldest lsn from active clients. */ +static inline void oldest_client_lsn(const struct CLIENT_REC *ca, + __le16 next_client, u64 *oldest_lsn) +{ + while (next_client != LFS_NO_CLIENT_LE) { + const struct CLIENT_REC *cr = ca + le16_to_cpu(next_client); + u64 lsn = le64_to_cpu(cr->oldest_lsn); + + /* Ignore this block if it's oldest lsn is 0. 
*/ + if (lsn && lsn < *oldest_lsn) + *oldest_lsn = lsn; + + next_client = cr->next_client; + } +} + +static inline bool is_rst_page_hdr_valid(u32 file_off, + const struct RESTART_HDR *rhdr) +{ + u32 sys_page = le32_to_cpu(rhdr->sys_page_size); + u32 page_size = le32_to_cpu(rhdr->page_size); + u32 end_usa; + u16 ro; + + if (sys_page < SECTOR_SIZE || page_size < SECTOR_SIZE || + sys_page & (sys_page - 1) || page_size & (page_size - 1)) { + return false; + } + + /* Check that if the file offset isn't 0, it is the system page size. */ + if (file_off && file_off != sys_page) + return false; + + /* Check support version 1.1+. */ + if (le16_to_cpu(rhdr->major_ver) <= 1 && !rhdr->minor_ver) + return false; + + if (le16_to_cpu(rhdr->major_ver) > 2) + return false; + + ro = le16_to_cpu(rhdr->ra_off); + if (!IS_ALIGNED(ro, 8) || ro > sys_page) + return false; + + end_usa = ((sys_page >> SECTOR_SHIFT) + 1) * sizeof(short); + end_usa += le16_to_cpu(rhdr->rhdr.fix_off); + + if (ro < end_usa) + return false; + + return true; +} + +static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr) +{ + const struct RESTART_AREA *ra; + u16 cl, fl, ul; + u32 off, l_size, file_dat_bits, file_size_round; + u16 ro = le16_to_cpu(rhdr->ra_off); + u32 sys_page = le32_to_cpu(rhdr->sys_page_size); + + if (ro + offsetof(struct RESTART_AREA, l_size) > + SECTOR_SIZE - sizeof(short)) + return false; + + ra = Add2Ptr(rhdr, ro); + cl = le16_to_cpu(ra->log_clients); + + if (cl > 1) + return false; + + off = le16_to_cpu(ra->client_off); + + if (!IS_ALIGNED(off, 8) || ro + off > SECTOR_SIZE - sizeof(short)) + return false; + + off += cl * sizeof(struct CLIENT_REC); + + if (off > sys_page) + return false; + + /* + * Check the restart length field and whether the entire + * restart area is contained that length. 
+ */ + if (le16_to_cpu(rhdr->ra_off) + le16_to_cpu(ra->ra_len) > sys_page || + off > le16_to_cpu(ra->ra_len)) { + return false; + } + + /* + * As a final check make sure that the use list and the free list + * are either empty or point to a valid client. + */ + fl = le16_to_cpu(ra->client_idx[0]); + ul = le16_to_cpu(ra->client_idx[1]); + if ((fl != LFS_NO_CLIENT && fl >= cl) || + (ul != LFS_NO_CLIENT && ul >= cl)) + return false; + + /* Make sure the sequence number bits match the log file size. */ + l_size = le64_to_cpu(ra->l_size); + + file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits); + file_size_round = 1u << (file_dat_bits + 3); + if (file_size_round != l_size && + (file_size_round < l_size || (file_size_round / 2) > l_size)) { + return false; + } + + /* The log page data offset and record header length must be quad-aligned. */ + if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) || + !IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8)) + return false; + + return true; +} + +static inline bool is_client_area_valid(const struct RESTART_HDR *rhdr, + bool usa_error) +{ + u16 ro = le16_to_cpu(rhdr->ra_off); + const struct RESTART_AREA *ra = Add2Ptr(rhdr, ro); + u16 ra_len = le16_to_cpu(ra->ra_len); + const struct CLIENT_REC *ca; + u32 i; + + if (usa_error && ra_len + ro > SECTOR_SIZE - sizeof(short)) + return false; + + /* Find the start of the client array. */ + ca = Add2Ptr(ra, le16_to_cpu(ra->client_off)); + + /* + * Start with the free list. + * Check that all the clients are valid and that there isn't a cycle. + * Do the in-use list on the second pass. 
+ */ + for (i = 0; i < 2; i++) { + u16 client_idx = le16_to_cpu(ra->client_idx[i]); + bool first_client = true; + u16 clients = le16_to_cpu(ra->log_clients); + + while (client_idx != LFS_NO_CLIENT) { + const struct CLIENT_REC *cr; + + if (!clients || + client_idx >= le16_to_cpu(ra->log_clients)) + return false; + + clients -= 1; + cr = ca + client_idx; + + client_idx = le16_to_cpu(cr->next_client); + + if (first_client) { + first_client = false; + if (cr->prev_client != LFS_NO_CLIENT_LE) + return false; + } + } + } + + return true; +} + +/* + * remove_client + * + * Remove a client record from a client record list an restart area. + */ +static inline void remove_client(struct CLIENT_REC *ca, + const struct CLIENT_REC *cr, __le16 *head) +{ + if (cr->prev_client == LFS_NO_CLIENT_LE) + *head = cr->next_client; + else + ca[le16_to_cpu(cr->prev_client)].next_client = cr->next_client; + + if (cr->next_client != LFS_NO_CLIENT_LE) + ca[le16_to_cpu(cr->next_client)].prev_client = cr->prev_client; +} + +/* + * add_client - Add a client record to the start of a list. + */ +static inline void add_client(struct CLIENT_REC *ca, u16 index, __le16 *head) +{ + struct CLIENT_REC *cr = ca + index; + + cr->prev_client = LFS_NO_CLIENT_LE; + cr->next_client = *head; + + if (*head != LFS_NO_CLIENT_LE) + ca[le16_to_cpu(*head)].prev_client = cpu_to_le16(index); + + *head = cpu_to_le16(index); +} + +static inline void *enum_rstbl(struct RESTART_TABLE *t, void *c) +{ + __le32 *e; + u32 bprt; + u16 rsize = t ? le16_to_cpu(t->size) : 0; + + if (!c) { + if (!t || !t->total) + return NULL; + e = Add2Ptr(t, sizeof(struct RESTART_TABLE)); + } else { + e = Add2Ptr(c, rsize); + } + + /* Loop until we hit the first one allocated, or the end of the list. */ + for (bprt = bytes_per_rt(t); PtrOffset(t, e) < bprt; + e = Add2Ptr(e, rsize)) { + if (*e == RESTART_ENTRY_ALLOCATED_LE) + return e; + } + return NULL; +} + +/* + * find_dp - Search for a @vcn in Dirty Page Table. 
+ */ +static inline struct DIR_PAGE_ENTRY *find_dp(struct RESTART_TABLE *dptbl, + u32 target_attr, u64 vcn) +{ + __le32 ta = cpu_to_le32(target_attr); + struct DIR_PAGE_ENTRY *dp = NULL; + + while ((dp = enum_rstbl(dptbl, dp))) { + u64 dp_vcn = le64_to_cpu(dp->vcn); + + if (dp->target_attr == ta && vcn >= dp_vcn && + vcn < dp_vcn + le32_to_cpu(dp->lcns_follow)) { + return dp; + } + } + return NULL; +} + +static inline u32 norm_file_page(u32 page_size, u32 *l_size, bool use_default) +{ + if (use_default) + page_size = DefaultLogPageSize; + + /* Round the file size down to a system page boundary. */ + *l_size &= ~(page_size - 1); + + /* File should contain at least 2 restart pages and MinLogRecordPages pages. */ + if (*l_size < (MinLogRecordPages + 2) * page_size) + return 0; + + return page_size; +} + +static bool check_log_rec(const struct LOG_REC_HDR *lr, u32 bytes, u32 tr, + u32 bytes_per_attr_entry) +{ + u16 t16; + + if (bytes < sizeof(struct LOG_REC_HDR)) + return false; + if (!tr) + return false; + + if ((tr - sizeof(struct RESTART_TABLE)) % + sizeof(struct TRANSACTION_ENTRY)) + return false; + + if (le16_to_cpu(lr->redo_off) & 7) + return false; + + if (le16_to_cpu(lr->undo_off) & 7) + return false; + + if (lr->target_attr) + goto check_lcns; + + if (is_target_required(le16_to_cpu(lr->redo_op))) + return false; + + if (is_target_required(le16_to_cpu(lr->undo_op))) + return false; + +check_lcns: + if (!lr->lcns_follow) + goto check_length; + + t16 = le16_to_cpu(lr->target_attr); + if ((t16 - sizeof(struct RESTART_TABLE)) % bytes_per_attr_entry) + return false; + +check_length: + if (bytes < lrh_length(lr)) + return false; + + return true; +} + +static bool check_rstbl(const struct RESTART_TABLE *rt, size_t bytes) +{ + u32 ts; + u32 i, off; + u16 rsize = le16_to_cpu(rt->size); + u16 ne = le16_to_cpu(rt->used); + u32 ff = le32_to_cpu(rt->first_free); + u32 lf = le32_to_cpu(rt->last_free); + + ts = rsize * ne + sizeof(struct RESTART_TABLE); + + if (!rsize || 
rsize > bytes || + rsize + sizeof(struct RESTART_TABLE) > bytes || bytes < ts || + le16_to_cpu(rt->total) > ne || ff > ts || lf > ts || + (ff && ff < sizeof(struct RESTART_TABLE)) || + (lf && lf < sizeof(struct RESTART_TABLE))) { + return false; + } + + /* + * Verify each entry is either allocated or points + * to a valid offset the table. + */ + for (i = 0; i < ne; i++) { + off = le32_to_cpu(*(__le32 *)Add2Ptr( + rt, i * rsize + sizeof(struct RESTART_TABLE))); + + if (off != RESTART_ENTRY_ALLOCATED && off && + (off < sizeof(struct RESTART_TABLE) || + ((off - sizeof(struct RESTART_TABLE)) % rsize))) { + return false; + } + } + + /* + * Walk through the list headed by the first entry to make + * sure none of the entries are currently being used. + */ + for (off = ff; off;) { + if (off == RESTART_ENTRY_ALLOCATED) + return false; + + off = le32_to_cpu(*(__le32 *)Add2Ptr(rt, off)); + } + + return true; +} + +/* + * free_rsttbl_idx - Free a previously allocated index a Restart Table. + */ +static inline void free_rsttbl_idx(struct RESTART_TABLE *rt, u32 off) +{ + __le32 *e; + u32 lf = le32_to_cpu(rt->last_free); + __le32 off_le = cpu_to_le32(off); + + e = Add2Ptr(rt, off); + + if (off < le32_to_cpu(rt->free_goal)) { + *e = rt->first_free; + rt->first_free = off_le; + if (!lf) + rt->last_free = off_le; + } else { + if (lf) + *(__le32 *)Add2Ptr(rt, lf) = off_le; + else + rt->first_free = off_le; + + rt->last_free = off_le; + *e = 0; + } + + le16_sub_cpu(&rt->total, 1); +} + +static inline struct RESTART_TABLE *init_rsttbl(u16 esize, u16 used) +{ + __le32 *e, *last_free; + u32 off; + u32 bytes = esize * used + sizeof(struct RESTART_TABLE); + u32 lf = sizeof(struct RESTART_TABLE) + (used - 1) * esize; + struct RESTART_TABLE *t = kzalloc(bytes, GFP_NOFS); + + if (!t) + return NULL; + + t->size = cpu_to_le16(esize); + t->used = cpu_to_le16(used); + t->free_goal = cpu_to_le32(~0u); + t->first_free = cpu_to_le32(sizeof(struct RESTART_TABLE)); + t->last_free = cpu_to_le32(lf); + 
+ e = (__le32 *)(t + 1); + last_free = Add2Ptr(t, lf); + + for (off = sizeof(struct RESTART_TABLE) + esize; e < last_free; + e = Add2Ptr(e, esize), off += esize) { + *e = cpu_to_le32(off); + } + return t; +} + +static inline struct RESTART_TABLE *extend_rsttbl(struct RESTART_TABLE *tbl, + u32 add, u32 free_goal) +{ + u16 esize = le16_to_cpu(tbl->size); + __le32 osize = cpu_to_le32(bytes_per_rt(tbl)); + u32 used = le16_to_cpu(tbl->used); + struct RESTART_TABLE *rt; + + rt = init_rsttbl(esize, used + add); + if (!rt) + return NULL; + + memcpy(rt + 1, tbl + 1, esize * used); + + rt->free_goal = free_goal == ~0u + ? cpu_to_le32(~0u) + : cpu_to_le32(sizeof(struct RESTART_TABLE) + + free_goal * esize); + + if (tbl->first_free) { + rt->first_free = tbl->first_free; + *(__le32 *)Add2Ptr(rt, le32_to_cpu(tbl->last_free)) = osize; + } else { + rt->first_free = osize; + } + + rt->total = tbl->total; + + kfree(tbl); + return rt; +} + +/* + * alloc_rsttbl_idx + * + * Allocate an index from within a previously initialized Restart Table. + */ +static inline void *alloc_rsttbl_idx(struct RESTART_TABLE **tbl) +{ + u32 off; + __le32 *e; + struct RESTART_TABLE *t = *tbl; + + if (!t->first_free) { + *tbl = t = extend_rsttbl(t, 16, ~0u); + if (!t) + return NULL; + } + + off = le32_to_cpu(t->first_free); + + /* Dequeue this entry and zero it. */ + e = Add2Ptr(t, off); + + t->first_free = *e; + + memset(e, 0, le16_to_cpu(t->size)); + + *e = RESTART_ENTRY_ALLOCATED_LE; + + /* If list is going empty, then we fix the last_free as well. */ + if (!t->first_free) + t->last_free = 0; + + le16_add_cpu(&t->total, 1); + + return Add2Ptr(t, off); +} + +/* + * alloc_rsttbl_from_idx + * + * Allocate a specific index from within a previously initialized Restart Table. 
+ */ +static inline void *alloc_rsttbl_from_idx(struct RESTART_TABLE **tbl, u32 vbo) +{ + u32 off; + __le32 *e; + struct RESTART_TABLE *rt = *tbl; + u32 bytes = bytes_per_rt(rt); + u16 esize = le16_to_cpu(rt->size); + + /* If the entry is not the table, we will have to extend the table. */ + if (vbo >= bytes) { + /* + * Extend the size by computing the number of entries between + * the existing size and the desired index and adding 1 to that. + */ + u32 bytes2idx = vbo - bytes; + + /* + * There should always be an integral number of entries + * being added. Now extend the table. + */ + *tbl = rt = extend_rsttbl(rt, bytes2idx / esize + 1, bytes); + if (!rt) + return NULL; + } + + /* See if the entry is already allocated, and just return if it is. */ + e = Add2Ptr(rt, vbo); + + if (*e == RESTART_ENTRY_ALLOCATED_LE) + return e; + + /* + * Walk through the table, looking for the entry we're + * interested and the previous entry. + */ + off = le32_to_cpu(rt->first_free); + e = Add2Ptr(rt, off); + + if (off == vbo) { + /* this is a match */ + rt->first_free = *e; + goto skip_looking; + } + + /* + * Need to walk through the list looking for the predecessor + * of our entry. + */ + for (;;) { + /* Remember the entry just found */ + u32 last_off = off; + __le32 *last_e = e; + + /* Should never run of entries. */ + + /* Lookup up the next entry the list. */ + off = le32_to_cpu(*last_e); + e = Add2Ptr(rt, off); + + /* If this is our match we are done. */ + if (off == vbo) { + *last_e = *e; + + /* + * If this was the last entry, we update that + * table as well. + */ + if (le32_to_cpu(rt->last_free) == off) + rt->last_free = cpu_to_le32(last_off); + break; + } + } + +skip_looking: + /* If the list is now empty, we fix the last_free as well. */ + if (!rt->first_free) + rt->last_free = 0; + + /* Zero this entry. 
*/ + memset(e, 0, esize); + *e = RESTART_ENTRY_ALLOCATED_LE; + + le16_add_cpu(&rt->total, 1); + + return e; +} + +#define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001) + +#define NTFSLOG_WRAPPED 0x00000001 +#define NTFSLOG_MULTIPLE_PAGE_IO 0x00000002 +#define NTFSLOG_NO_LAST_LSN 0x00000004 +#define NTFSLOG_REUSE_TAIL 0x00000010 +#define NTFSLOG_NO_OLDEST_LSN 0x00000020 + +/* Helper struct to work with NTFS $LogFile. */ +struct ntfs_log { + struct ntfs_inode *ni; + + u32 l_size; + u32 sys_page_size; + u32 sys_page_mask; + u32 page_size; + u32 page_mask; // page_size - 1 + u8 page_bits; + struct RECORD_PAGE_HDR *one_page_buf; + + struct RESTART_TABLE *open_attr_tbl; + u32 transaction_id; + u32 clst_per_page; + + u32 first_page; + u32 next_page; + u32 ra_off; + u32 data_off; + u32 restart_size; + u32 data_size; + u16 record_header_len; + u64 seq_num; + u32 seq_num_bits; + u32 file_data_bits; + u32 seq_num_mask; /* (1 << file_data_bits) - 1 */ + + struct RESTART_AREA *ra; /* In-memory image of the next restart area. */ + u32 ra_size; /* The usable size of the restart area. */ + + /* + * If true, then the in-memory restart area is to be written + * to the first position on the disk. + */ + bool init_ra; + bool set_dirty; /* True if we need to set dirty flag. */ + + u64 oldest_lsn; + + u32 oldest_lsn_off; + u64 last_lsn; + + u32 total_avail; + u32 total_avail_pages; + u32 total_undo_commit; + u32 max_current_avail; + u32 current_avail; + u32 reserved; + + short major_ver; + short minor_ver; + + u32 l_flags; /* See NTFSLOG_XXX */ + u32 current_openlog_count; /* On-disk value for open_log_count. */ + + struct CLIENT_ID client_id; + u32 client_undo_commit; +}; + +static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn) +{ + u32 vbo = (lsn << log->seq_num_bits) >> (log->seq_num_bits - 3); + + return vbo; +} + +/* Compute the offset in the log file of the next log page. 
*/ +static inline u32 next_page_off(struct ntfs_log *log, u32 off) +{ + off = (off & ~log->sys_page_mask) + log->page_size; + return off >= log->l_size ? log->first_page : off; +} + +static inline u32 lsn_to_page_off(struct ntfs_log *log, u64 lsn) +{ + return (((u32)lsn) << 3) & log->page_mask; +} + +static inline u64 vbo_to_lsn(struct ntfs_log *log, u32 off, u64 Seq) +{ + return (off >> 3) + (Seq << log->file_data_bits); +} + +static inline bool is_lsn_in_file(struct ntfs_log *log, u64 lsn) +{ + return lsn >= log->oldest_lsn && + lsn <= le64_to_cpu(log->ra->current_lsn); +} + +static inline u32 hdr_file_off(struct ntfs_log *log, + struct RECORD_PAGE_HDR *hdr) +{ + if (log->major_ver < 2) + return le64_to_cpu(hdr->rhdr.lsn); + + return le32_to_cpu(hdr->file_off); +} + +static inline u64 base_lsn(struct ntfs_log *log, + const struct RECORD_PAGE_HDR *hdr, u64 lsn) +{ + u64 h_lsn = le64_to_cpu(hdr->rhdr.lsn); + u64 ret = (((h_lsn >> log->file_data_bits) + + (lsn < (lsn_to_vbo(log, h_lsn) & ~log->page_mask) ? 1 : 0)) + << log->file_data_bits) + + ((((is_log_record_end(hdr) && + h_lsn <= le64_to_cpu(hdr->record_hdr.last_end_lsn)) + ? 
le16_to_cpu(hdr->record_hdr.next_record_off) + : log->page_size) + + lsn) >> + 3); + + return ret; +} + +static inline bool verify_client_lsn(struct ntfs_log *log, + const struct CLIENT_REC *client, u64 lsn) +{ + return lsn >= le64_to_cpu(client->oldest_lsn) && + lsn <= le64_to_cpu(log->ra->current_lsn) && lsn; +} + +struct restart_info { + u64 last_lsn; + struct RESTART_HDR *r_page; + u32 vbo; + bool chkdsk_was_run; + bool valid_page; + bool initialized; + bool restart; +}; + +static int read_log_page(struct ntfs_log *log, u32 vbo, + struct RECORD_PAGE_HDR **buffer, bool *usa_error) +{ + int err = 0; + u32 page_idx = vbo >> log->page_bits; + u32 page_off = vbo & log->page_mask; + u32 bytes = log->page_size - page_off; + void *to_free = NULL; + u32 page_vbo = page_idx << log->page_bits; + struct RECORD_PAGE_HDR *page_buf; + struct ntfs_inode *ni = log->ni; + bool bBAAD; + + if (vbo >= log->l_size) + return -EINVAL; + + if (!*buffer) { + to_free = kmalloc(bytes, GFP_NOFS); + if (!to_free) + return -ENOMEM; + *buffer = to_free; + } + + page_buf = page_off ? log->one_page_buf : *buffer; + + err = ntfs_read_run_nb(ni->mi.sbi, &ni->file.run, page_vbo, page_buf, + log->page_size, NULL); + if (err) + goto out; + + if (page_buf->rhdr.sign != NTFS_FFFF_SIGNATURE) + ntfs_fix_post_read(&page_buf->rhdr, PAGE_SIZE, false); + + if (page_buf != *buffer) + memcpy(*buffer, Add2Ptr(page_buf, page_off), bytes); + + bBAAD = page_buf->rhdr.sign == NTFS_BAAD_SIGNATURE; + + if (usa_error) + *usa_error = bBAAD; + /* Check that the update sequence array for this page is valid */ + /* If we don't allow errors, raise an error status */ + else if (bBAAD) + err = -EINVAL; + +out: + if (err && to_free) { + kfree(to_free); + *buffer = NULL; + } + + return err; +} + +/* + * log_read_rst + * + * It walks through 512 blocks of the file looking for a valid + * restart page header. It will stop the first time we find a + * valid page header. 
+ */ +static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first, + struct restart_info *info) +{ + u32 skip, vbo; + struct RESTART_HDR *r_page = kmalloc(DefaultLogPageSize, GFP_NOFS); + + if (!r_page) + return -ENOMEM; + + memset(info, 0, sizeof(struct restart_info)); + + /* Determine which restart area we are looking for. */ + if (first) { + vbo = 0; + skip = 512; + } else { + vbo = 512; + skip = 0; + } + + /* Loop continuously until we succeed. */ + for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) { + bool usa_error; + u32 sys_page_size; + bool brst, bchk; + struct RESTART_AREA *ra; + + /* Read a page header at the current offset. */ + if (read_log_page(log, vbo, (struct RECORD_PAGE_HDR **)&r_page, + &usa_error)) { + /* Ignore any errors. */ + continue; + } + + /* Exit if the signature is a log record page. */ + if (r_page->rhdr.sign == NTFS_RCRD_SIGNATURE) { + info->initialized = true; + break; + } + + brst = r_page->rhdr.sign == NTFS_RSTR_SIGNATURE; + bchk = r_page->rhdr.sign == NTFS_CHKD_SIGNATURE; + + if (!bchk && !brst) { + if (r_page->rhdr.sign != NTFS_FFFF_SIGNATURE) { + /* + * Remember if the signature does not + * indicate uninitialized file. + */ + info->initialized = true; + } + continue; + } + + ra = NULL; + info->valid_page = false; + info->initialized = true; + info->vbo = vbo; + + /* Let's check the restart area if this is a valid page. */ + if (!is_rst_page_hdr_valid(vbo, r_page)) + goto check_result; + ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off)); + + if (!is_rst_area_valid(r_page)) + goto check_result; + + /* + * We have a valid restart page header and restart area. + * If chkdsk was run or we have no clients then we have + * no more checking to do. + */ + if (bchk || ra->client_idx[1] == LFS_NO_CLIENT_LE) { + info->valid_page = true; + goto check_result; + } + + /* Read the entire restart area. 
*/ + sys_page_size = le32_to_cpu(r_page->sys_page_size); + if (DefaultLogPageSize != sys_page_size) { + kfree(r_page); + r_page = kzalloc(sys_page_size, GFP_NOFS); + if (!r_page) + return -ENOMEM; + + if (read_log_page(log, vbo, + (struct RECORD_PAGE_HDR **)&r_page, + &usa_error)) { + /* Ignore any errors. */ + kfree(r_page); + r_page = NULL; + continue; + } + } + + if (is_client_area_valid(r_page, usa_error)) { + info->valid_page = true; + ra = Add2Ptr(r_page, le16_to_cpu(r_page->ra_off)); + } + +check_result: + /* + * If chkdsk was run then update the caller's + * values and return. + */ + if (r_page->rhdr.sign == NTFS_CHKD_SIGNATURE) { + info->chkdsk_was_run = true; + info->last_lsn = le64_to_cpu(r_page->rhdr.lsn); + info->restart = true; + info->r_page = r_page; + return 0; + } + + /* + * If we have a valid page then copy the values + * we need from it. + */ + if (info->valid_page) { + info->last_lsn = le64_to_cpu(ra->current_lsn); + info->restart = true; + info->r_page = r_page; + return 0; + } + } + + kfree(r_page); + + return 0; +} + +/* + * Ilog_init_pg_hdr - Init @log from restart page header. + */ +static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size, + u32 page_size, u16 major_ver, u16 minor_ver) +{ + log->sys_page_size = sys_page_size; + log->sys_page_mask = sys_page_size - 1; + log->page_size = page_size; + log->page_mask = page_size - 1; + log->page_bits = blksize_bits(page_size); + + log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits; + if (!log->clst_per_page) + log->clst_per_page = 1; + + log->first_page = major_ver >= 2 + ? 0x22 * page_size + : ((sys_page_size << 1) + (page_size << 1)); + log->major_ver = major_ver; + log->minor_ver = minor_ver; +} + +/* + * log_create - Init @log in cases when we don't have a restart area to use. 
+ */ +static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn, + u32 open_log_count, bool wrapped, bool use_multi_page) +{ + log->l_size = l_size; + /* All file offsets must be quadword aligned. */ + log->file_data_bits = blksize_bits(l_size) - 3; + log->seq_num_mask = (8 << log->file_data_bits) - 1; + log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits; + log->seq_num = (last_lsn >> log->file_data_bits) + 2; + log->next_page = log->first_page; + log->oldest_lsn = log->seq_num << log->file_data_bits; + log->oldest_lsn_off = 0; + log->last_lsn = log->oldest_lsn; + + log->l_flags |= NTFSLOG_NO_LAST_LSN | NTFSLOG_NO_OLDEST_LSN; + + /* Set the correct flags for the I/O and indicate if we have wrapped. */ + if (wrapped) + log->l_flags |= NTFSLOG_WRAPPED; + + if (use_multi_page) + log->l_flags |= NTFSLOG_MULTIPLE_PAGE_IO; + + /* Compute the log page values. */ + log->data_off = ALIGN( + offsetof(struct RECORD_PAGE_HDR, fixups) + + sizeof(short) * ((log->page_size >> SECTOR_SHIFT) + 1), + 8); + log->data_size = log->page_size - log->data_off; + log->record_header_len = sizeof(struct LFS_RECORD_HDR); + + /* Remember the different page sizes for reservation. */ + log->reserved = log->data_size - log->record_header_len; + + /* Compute the restart page values. */ + log->ra_off = ALIGN( + offsetof(struct RESTART_HDR, fixups) + + sizeof(short) * + ((log->sys_page_size >> SECTOR_SHIFT) + 1), + 8); + log->restart_size = log->sys_page_size - log->ra_off; + log->ra_size = struct_size(log->ra, clients, 1); + log->current_openlog_count = open_log_count; + + /* + * The total available log file space is the number of + * log file pages times the space available on each page. + */ + log->total_avail_pages = log->l_size - log->first_page; + log->total_avail = log->total_avail_pages >> log->page_bits; + + /* + * We assume that we can't use the end of the page less than + * the file record size. + * Then we won't need to reserve more than the caller asks for. 
+ */ + log->max_current_avail = log->total_avail * log->reserved; + log->total_avail = log->total_avail * log->data_size; + log->current_avail = log->max_current_avail; +} + +/* + * log_create_ra - Fill a restart area from the values stored in @log. + */ +static struct RESTART_AREA *log_create_ra(struct ntfs_log *log) +{ + struct CLIENT_REC *cr; + struct RESTART_AREA *ra = kzalloc(log->restart_size, GFP_NOFS); + + if (!ra) + return NULL; + + ra->current_lsn = cpu_to_le64(log->last_lsn); + ra->log_clients = cpu_to_le16(1); + ra->client_idx[1] = LFS_NO_CLIENT_LE; + if (log->l_flags & NTFSLOG_MULTIPLE_PAGE_IO) + ra->flags = RESTART_SINGLE_PAGE_IO; + ra->seq_num_bits = cpu_to_le32(log->seq_num_bits); + ra->ra_len = cpu_to_le16(log->ra_size); + ra->client_off = cpu_to_le16(offsetof(struct RESTART_AREA, clients)); + ra->l_size = cpu_to_le64(log->l_size); + ra->rec_hdr_len = cpu_to_le16(log->record_header_len); + ra->data_off = cpu_to_le16(log->data_off); + ra->open_log_count = cpu_to_le32(log->current_openlog_count + 1); + + cr = ra->clients; + + cr->prev_client = LFS_NO_CLIENT_LE; + cr->next_client = LFS_NO_CLIENT_LE; + + return ra; +} + +static u32 final_log_off(struct ntfs_log *log, u64 lsn, u32 data_len) +{ + u32 base_vbo = lsn << 3; + u32 final_log_off = (base_vbo & log->seq_num_mask) & ~log->page_mask; + u32 page_off = base_vbo & log->page_mask; + u32 tail = log->page_size - page_off; + + page_off -= 1; + + /* Add the length of the header. */ + data_len += log->record_header_len; + + /* + * If this lsn is contained this log page we are done. + * Otherwise we need to walk through several log pages. + */ + if (data_len > tail) { + data_len -= tail; + tail = log->data_size; + page_off = log->data_off - 1; + + for (;;) { + final_log_off = next_page_off(log, final_log_off); + + /* + * We are done if the remaining bytes + * fit on this page. 
+ */ + if (data_len <= tail) + break; + data_len -= tail; + } + } + + /* + * We add the remaining bytes to our starting position on this page + * and then add that value to the file offset of this log page. + */ + return final_log_off + data_len + page_off; +} + +static int next_log_lsn(struct ntfs_log *log, const struct LFS_RECORD_HDR *rh, + u64 *lsn) +{ + int err; + u64 this_lsn = le64_to_cpu(rh->this_lsn); + u32 vbo = lsn_to_vbo(log, this_lsn); + u32 end = + final_log_off(log, this_lsn, le32_to_cpu(rh->client_data_len)); + u32 hdr_off = end & ~log->sys_page_mask; + u64 seq = this_lsn >> log->file_data_bits; + struct RECORD_PAGE_HDR *page = NULL; + + /* Remember if we wrapped. */ + if (end <= vbo) + seq += 1; + + /* Log page header for this page. */ + err = read_log_page(log, hdr_off, &page, NULL); + if (err) + return err; + + /* + * If the lsn we were given was not the last lsn on this page, + * then the starting offset for the next lsn is on a quad word + * boundary following the last file offset for the current lsn. + * Otherwise the file offset is the start of the data on the next page. + */ + if (this_lsn == le64_to_cpu(page->rhdr.lsn)) { + /* If we wrapped, we need to increment the sequence number. */ + hdr_off = next_page_off(log, hdr_off); + if (hdr_off == log->first_page) + seq += 1; + + vbo = hdr_off + log->data_off; + } else { + vbo = ALIGN(end, 8); + } + + /* Compute the lsn based on the file offset and the sequence count. */ + *lsn = vbo_to_lsn(log, vbo, seq); + + /* + * If this lsn is within the legal range for the file, we return true. + * Otherwise false indicates that there are no more lsn's. + */ + if (!is_lsn_in_file(log, *lsn)) + *lsn = 0; + + kfree(page); + + return 0; +} + +/* + * current_log_avail - Calculate the number of bytes available for log records. + */ +static u32 current_log_avail(struct ntfs_log *log) +{ + u32 oldest_off, next_free_off, free_bytes; + + if (log->l_flags & NTFSLOG_NO_LAST_LSN) { + /* The entire file is available. 
*/ + return log->max_current_avail; + } + + /* + * If there is a last lsn the restart area then we know that we will + * have to compute the free range. + * If there is no oldest lsn then start at the first page of the file. + */ + oldest_off = (log->l_flags & NTFSLOG_NO_OLDEST_LSN) + ? log->first_page + : (log->oldest_lsn_off & ~log->sys_page_mask); + + /* + * We will use the next log page offset to compute the next free page. + * If we are going to reuse this page go to the next page. + * If we are at the first page then use the end of the file. + */ + next_free_off = (log->l_flags & NTFSLOG_REUSE_TAIL) + ? log->next_page + log->page_size + : log->next_page == log->first_page + ? log->l_size + : log->next_page; + + /* If the two offsets are the same then there is no available space. */ + if (oldest_off == next_free_off) + return 0; + /* + * If the free offset follows the oldest offset then subtract + * this range from the total available pages. + */ + free_bytes = + oldest_off < next_free_off + ? log->total_avail_pages - (next_free_off - oldest_off) + : oldest_off - next_free_off; + + free_bytes >>= log->page_bits; + return free_bytes * log->reserved; +} + +static bool check_subseq_log_page(struct ntfs_log *log, + const struct RECORD_PAGE_HDR *rp, u32 vbo, + u64 seq) +{ + u64 lsn_seq; + const struct NTFS_RECORD_HEADER *rhdr = &rp->rhdr; + u64 lsn = le64_to_cpu(rhdr->lsn); + + if (rhdr->sign == NTFS_FFFF_SIGNATURE || !rhdr->sign) + return false; + + /* + * If the last lsn on the page occurs was written after the page + * that caused the original error then we have a fatal error. + */ + lsn_seq = lsn >> log->file_data_bits; + + /* + * If the sequence number for the lsn the page is equal or greater + * than lsn we expect, then this is a subsequent write. 
+ */ + return lsn_seq >= seq || + (lsn_seq == seq - 1 && log->first_page == vbo && + vbo != (lsn_to_vbo(log, lsn) & ~log->page_mask)); +} + +/* + * last_log_lsn + * + * Walks through the log pages for a file, searching for the + * last log page written to the file. + */ +static int last_log_lsn(struct ntfs_log *log) +{ + int err; + bool usa_error = false; + bool replace_page = false; + bool reuse_page = log->l_flags & NTFSLOG_REUSE_TAIL; + bool wrapped_file, wrapped; + + u32 page_cnt = 1, page_pos = 1; + u32 page_off = 0, page_off1 = 0, saved_off = 0; + u32 final_off, second_off, final_off_prev = 0, second_off_prev = 0; + u32 first_file_off = 0, second_file_off = 0; + u32 part_io_count = 0; + u32 tails = 0; + u32 this_off, curpage_off, nextpage_off, remain_pages; + + u64 expected_seq, seq_base = 0, lsn_base = 0; + u64 best_lsn, best_lsn1, best_lsn2; + u64 lsn_cur, lsn1, lsn2; + u64 last_ok_lsn = reuse_page ? log->last_lsn : 0; + + u16 cur_pos, best_page_pos; + + struct RECORD_PAGE_HDR *page = NULL; + struct RECORD_PAGE_HDR *tst_page = NULL; + struct RECORD_PAGE_HDR *first_tail = NULL; + struct RECORD_PAGE_HDR *second_tail = NULL; + struct RECORD_PAGE_HDR *tail_page = NULL; + struct RECORD_PAGE_HDR *second_tail_prev = NULL; + struct RECORD_PAGE_HDR *first_tail_prev = NULL; + struct RECORD_PAGE_HDR *page_bufs = NULL; + struct RECORD_PAGE_HDR *best_page; + + if (log->major_ver >= 2) { + final_off = 0x02 * log->page_size; + second_off = 0x12 * log->page_size; + + // 0x10 == 0x12 - 0x2 + page_bufs = kmalloc(log->page_size * 0x10, GFP_NOFS); + if (!page_bufs) + return -ENOMEM; + } else { + second_off = log->first_page - log->page_size; + final_off = second_off - log->page_size; + } + +next_tail: + /* Read second tail page (at pos 3/0x12000). 
*/ + if (read_log_page(log, second_off, &second_tail, &usa_error) || + usa_error || second_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) { + kfree(second_tail); + second_tail = NULL; + second_file_off = 0; + lsn2 = 0; + } else { + second_file_off = hdr_file_off(log, second_tail); + lsn2 = le64_to_cpu(second_tail->record_hdr.last_end_lsn); + } + + /* Read first tail page (at pos 2/0x2000). */ + if (read_log_page(log, final_off, &first_tail, &usa_error) || + usa_error || first_tail->rhdr.sign != NTFS_RCRD_SIGNATURE) { + kfree(first_tail); + first_tail = NULL; + first_file_off = 0; + lsn1 = 0; + } else { + first_file_off = hdr_file_off(log, first_tail); + lsn1 = le64_to_cpu(first_tail->record_hdr.last_end_lsn); + } + + if (log->major_ver < 2) { + int best_page; + + first_tail_prev = first_tail; + final_off_prev = first_file_off; + second_tail_prev = second_tail; + second_off_prev = second_file_off; + tails = 1; + + if (!first_tail && !second_tail) + goto tail_read; + + if (first_tail && second_tail) + best_page = lsn1 < lsn2 ? 1 : 0; + else if (first_tail) + best_page = 0; + else + best_page = 1; + + page_off = best_page ? second_file_off : first_file_off; + seq_base = (best_page ? lsn2 : lsn1) >> log->file_data_bits; + goto tail_read; + } + + best_lsn1 = first_tail ? base_lsn(log, first_tail, first_file_off) : 0; + best_lsn2 = + second_tail ? 
base_lsn(log, second_tail, second_file_off) : 0; + + if (first_tail && second_tail) { + if (best_lsn1 > best_lsn2) { + best_lsn = best_lsn1; + best_page = first_tail; + this_off = first_file_off; + } else { + best_lsn = best_lsn2; + best_page = second_tail; + this_off = second_file_off; + } + } else if (first_tail) { + best_lsn = best_lsn1; + best_page = first_tail; + this_off = first_file_off; + } else if (second_tail) { + best_lsn = best_lsn2; + best_page = second_tail; + this_off = second_file_off; + } else { + goto tail_read; + } + + best_page_pos = le16_to_cpu(best_page->page_pos); + + if (!tails) { + if (best_page_pos == page_pos) { + seq_base = best_lsn >> log->file_data_bits; + saved_off = page_off = le32_to_cpu(best_page->file_off); + lsn_base = best_lsn; + + memmove(page_bufs, best_page, log->page_size); + + page_cnt = le16_to_cpu(best_page->page_count); + if (page_cnt > 1) + page_pos += 1; + + tails = 1; + } + } else if (seq_base == (best_lsn >> log->file_data_bits) && + saved_off + log->page_size == this_off && + lsn_base < best_lsn && + (page_pos != page_cnt || best_page_pos == page_pos || + best_page_pos == 1) && + (page_pos >= page_cnt || best_page_pos == page_pos)) { + u16 bppc = le16_to_cpu(best_page->page_count); + + saved_off += log->page_size; + lsn_base = best_lsn; + + memmove(Add2Ptr(page_bufs, tails * log->page_size), best_page, + log->page_size); + + tails += 1; + + if (best_page_pos != bppc) { + page_cnt = bppc; + page_pos = best_page_pos; + + if (page_cnt > 1) + page_pos += 1; + } else { + page_pos = page_cnt = 1; + } + } else { + kfree(first_tail); + kfree(second_tail); + goto tail_read; + } + + kfree(first_tail_prev); + first_tail_prev = first_tail; + final_off_prev = first_file_off; + first_tail = NULL; + + kfree(second_tail_prev); + second_tail_prev = second_tail; + second_off_prev = second_file_off; + second_tail = NULL; + + final_off += log->page_size; + second_off += log->page_size; + + if (tails < 0x10) + goto next_tail; 
+tail_read: + first_tail = first_tail_prev; + final_off = final_off_prev; + + second_tail = second_tail_prev; + second_off = second_off_prev; + + page_cnt = page_pos = 1; + + curpage_off = seq_base == log->seq_num ? min(log->next_page, page_off) + : log->next_page; + + wrapped_file = + curpage_off == log->first_page && + !(log->l_flags & (NTFSLOG_NO_LAST_LSN | NTFSLOG_REUSE_TAIL)); + + expected_seq = wrapped_file ? (log->seq_num + 1) : log->seq_num; + + nextpage_off = curpage_off; + +next_page: + tail_page = NULL; + /* Read the next log page. */ + err = read_log_page(log, curpage_off, &page, &usa_error); + + /* Compute the next log page offset the file. */ + nextpage_off = next_page_off(log, curpage_off); + wrapped = nextpage_off == log->first_page; + + if (tails > 1) { + struct RECORD_PAGE_HDR *cur_page = + Add2Ptr(page_bufs, curpage_off - page_off); + + if (curpage_off == saved_off) { + tail_page = cur_page; + goto use_tail_page; + } + + if (page_off > curpage_off || curpage_off >= saved_off) + goto use_tail_page; + + if (page_off1) + goto use_cur_page; + + if (!err && !usa_error && + page->rhdr.sign == NTFS_RCRD_SIGNATURE && + cur_page->rhdr.lsn == page->rhdr.lsn && + cur_page->record_hdr.next_record_off == + page->record_hdr.next_record_off && + ((page_pos == page_cnt && + le16_to_cpu(page->page_pos) == 1) || + (page_pos != page_cnt && + le16_to_cpu(page->page_pos) == page_pos + 1 && + le16_to_cpu(page->page_count) == page_cnt))) { + cur_page = NULL; + goto use_tail_page; + } + + page_off1 = page_off; + +use_cur_page: + + lsn_cur = le64_to_cpu(cur_page->rhdr.lsn); + + if (last_ok_lsn != + le64_to_cpu(cur_page->record_hdr.last_end_lsn) && + ((lsn_cur >> log->file_data_bits) + + ((curpage_off < + (lsn_to_vbo(log, lsn_cur) & ~log->page_mask)) + ? 
1 + : 0)) != expected_seq) { + goto check_tail; + } + + if (!is_log_record_end(cur_page)) { + tail_page = NULL; + last_ok_lsn = lsn_cur; + goto next_page_1; + } + + log->seq_num = expected_seq; + log->l_flags &= ~NTFSLOG_NO_LAST_LSN; + log->last_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn); + log->ra->current_lsn = cur_page->record_hdr.last_end_lsn; + + if (log->record_header_len <= + log->page_size - + le16_to_cpu(cur_page->record_hdr.next_record_off)) { + log->l_flags |= NTFSLOG_REUSE_TAIL; + log->next_page = curpage_off; + } else { + log->l_flags &= ~NTFSLOG_REUSE_TAIL; + log->next_page = nextpage_off; + } + + if (wrapped_file) + log->l_flags |= NTFSLOG_WRAPPED; + + last_ok_lsn = le64_to_cpu(cur_page->record_hdr.last_end_lsn); + goto next_page_1; + } + + /* + * If we are at the expected first page of a transfer check to see + * if either tail copy is at this offset. + * If this page is the last page of a transfer, check if we wrote + * a subsequent tail copy. + */ + if (page_cnt == page_pos || page_cnt == page_pos + 1) { + /* + * Check if the offset matches either the first or second + * tail copy. It is possible it will match both. + */ + if (curpage_off == final_off) + tail_page = first_tail; + + /* + * If we already matched on the first page then + * check the ending lsn's. + */ + if (curpage_off == second_off) { + if (!tail_page || + (second_tail && + le64_to_cpu(second_tail->record_hdr.last_end_lsn) > + le64_to_cpu(first_tail->record_hdr + .last_end_lsn))) { + tail_page = second_tail; + } + } + } + +use_tail_page: + if (tail_page) { + /* We have a candidate for a tail copy. */ + lsn_cur = le64_to_cpu(tail_page->record_hdr.last_end_lsn); + + if (last_ok_lsn < lsn_cur) { + /* + * If the sequence number is not expected, + * then don't use the tail copy. 
+ */ + if (expected_seq != (lsn_cur >> log->file_data_bits)) + tail_page = NULL; + } else if (last_ok_lsn > lsn_cur) { + /* + * If the last lsn is greater than the one on + * this page then forget this tail. + */ + tail_page = NULL; + } + } + + /* + * If we have an error on the current page, + * we will break out of this loop. + */ + if (err || usa_error) + goto check_tail; + + /* + * Done if the last lsn on this page doesn't match the previous known + * last lsn and its sequence number is not the expected one. + */ + lsn_cur = le64_to_cpu(page->rhdr.lsn); + if (last_ok_lsn != lsn_cur && + expected_seq != (lsn_cur >> log->file_data_bits)) { + goto check_tail; + } + + /* + * Check that the page position and page count values are correct. + * If this is the first page of a transfer the position must be 1 + * and the count will be unknown. + */ + if (page_cnt == page_pos) { + if (page->page_pos != cpu_to_le16(1) && + (!reuse_page || page->page_pos != page->page_count)) { + /* + * If the current page is the first page we are + * looking at and we are reusing this page then + * it can be either the first or last page of a + * transfer. Otherwise it can only be the first. + */ + goto check_tail; + } + } else if (le16_to_cpu(page->page_count) != page_cnt || + le16_to_cpu(page->page_pos) != page_pos + 1) { + /* + * The page position better be 1 more than the last page + * position and the page count better match. + */ + goto check_tail; + } + + /* + * We have a valid page in the file and may have a valid page + * in the tail copy area. + * If the tail page was written after the page in the file then + * break out of the loop. + */ + if (tail_page && + le64_to_cpu(tail_page->record_hdr.last_end_lsn) > lsn_cur) { + /* Remember if we will replace the page. */ + replace_page = true; + goto check_tail; + } + + tail_page = NULL; + + if (is_log_record_end(page)) { + /* + * Since we have read this page we know the sequence number + * is the same as our expected value. 
+ */ + log->seq_num = expected_seq; + log->last_lsn = le64_to_cpu(page->record_hdr.last_end_lsn); + log->ra->current_lsn = page->record_hdr.last_end_lsn; + log->l_flags &= ~NTFSLOG_NO_LAST_LSN; + + /* + * If there is room on this page for another header then + * remember we want to reuse the page. + */ + if (log->record_header_len <= + log->page_size - + le16_to_cpu(page->record_hdr.next_record_off)) { + log->l_flags |= NTFSLOG_REUSE_TAIL; + log->next_page = curpage_off; + } else { + log->l_flags &= ~NTFSLOG_REUSE_TAIL; + log->next_page = nextpage_off; + } + + /* Remember if we wrapped the log file. */ + if (wrapped_file) + log->l_flags |= NTFSLOG_WRAPPED; + } + + /* + * Remember the last page count and position. + * Also remember the last known lsn. + */ + page_cnt = le16_to_cpu(page->page_count); + page_pos = le16_to_cpu(page->page_pos); + last_ok_lsn = le64_to_cpu(page->rhdr.lsn); + +next_page_1: + + if (wrapped) { + expected_seq += 1; + wrapped_file = 1; + } + + curpage_off = nextpage_off; + kfree(page); + page = NULL; + reuse_page = 0; + goto next_page; + +check_tail: + if (tail_page) { + log->seq_num = expected_seq; + log->last_lsn = le64_to_cpu(tail_page->record_hdr.last_end_lsn); + log->ra->current_lsn = tail_page->record_hdr.last_end_lsn; + log->l_flags &= ~NTFSLOG_NO_LAST_LSN; + + if (log->page_size - + le16_to_cpu( + tail_page->record_hdr.next_record_off) >= + log->record_header_len) { + log->l_flags |= NTFSLOG_REUSE_TAIL; + log->next_page = curpage_off; + } else { + log->l_flags &= ~NTFSLOG_REUSE_TAIL; + log->next_page = nextpage_off; + } + + if (wrapped) + log->l_flags |= NTFSLOG_WRAPPED; + } + + /* Remember that the partial IO will start at the next page. */ + second_off = nextpage_off; + + /* + * If the next page is the first page of the file then update + * the sequence number for log records which begin on the next page. 
+ */ + if (wrapped) + expected_seq += 1; + + /* + * If we have a tail copy or are performing single page I/O we can + * immediately look at the next page. + */ + if (replace_page || (log->ra->flags & RESTART_SINGLE_PAGE_IO)) { + page_cnt = 2; + page_pos = 1; + goto check_valid; + } + + if (page_pos != page_cnt) + goto check_valid; + /* + * If the next page causes us to wrap to the beginning of the log + * file then we know which page to check next. + */ + if (wrapped) { + page_cnt = 2; + page_pos = 1; + goto check_valid; + } + + cur_pos = 2; + +next_test_page: + kfree(tst_page); + tst_page = NULL; + + /* Walk through the file, reading log pages. */ + err = read_log_page(log, nextpage_off, &tst_page, &usa_error); + + /* + * If we get a USA error then assume that we correctly found + * the end of the original transfer. + */ + if (usa_error) + goto file_is_valid; + + /* + * If we were able to read the page, we examine it to see if it + * is the same or different I/O block. + */ + if (err) + goto next_test_page_1; + + if (le16_to_cpu(tst_page->page_pos) == cur_pos && + check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) { + page_cnt = le16_to_cpu(tst_page->page_count) + 1; + page_pos = le16_to_cpu(tst_page->page_pos); + goto check_valid; + } else { + goto file_is_valid; + } + +next_test_page_1: + + nextpage_off = next_page_off(log, curpage_off); + wrapped = nextpage_off == log->first_page; + + if (wrapped) { + expected_seq += 1; + page_cnt = 2; + page_pos = 1; + } + + cur_pos += 1; + part_io_count += 1; + if (!wrapped) + goto next_test_page; + +check_valid: + /* + * Skip over the remaining pages in this transfer. + * NOTE(review): every iteration below derives nextpage_off from the + * unchanged curpage_off, so the offset does not advance page by page; + * confirm whether nextpage_off was the intended argument. + */ + remain_pages = page_cnt - page_pos - 1; + part_io_count += remain_pages; + + while (remain_pages--) { + nextpage_off = next_page_off(log, curpage_off); + wrapped = nextpage_off == log->first_page; + + if (wrapped) + expected_seq += 1; + } + + /* Call our routine to check this log page. 
*/ + kfree(tst_page); + tst_page = NULL; + + err = read_log_page(log, nextpage_off, &tst_page, &usa_error); + if (!err && !usa_error && + check_subseq_log_page(log, tst_page, nextpage_off, expected_seq)) { + err = -EINVAL; + goto out; + } + +file_is_valid: + + /* We have a valid file. */ + if (page_off1 || tail_page) { + struct RECORD_PAGE_HDR *tmp_page; + + if (sb_rdonly(log->ni->mi.sbi->sb)) { + err = -EROFS; + goto out; + } + + if (page_off1) { + tmp_page = Add2Ptr(page_bufs, page_off1 - page_off); + tails -= (page_off1 - page_off) / log->page_size; + if (!tail_page) + tails -= 1; + } else { + tmp_page = tail_page; + tails = 1; + } + + while (tails--) { + u64 off = hdr_file_off(log, tmp_page); + + if (!page) { + page = kmalloc(log->page_size, GFP_NOFS); + if (!page) { + /* + * Leave through 'out' so that the tail, test + * and page_bufs buffers are freed as well. + */ + err = -ENOMEM; + goto out; + } + } + + /* + * Correct page and copy the data from this page + * into it and flush it to disk. + */ + memcpy(page, tmp_page, log->page_size); + + /* Fill in the last flushed lsn value and flush the page. */ + if (log->major_ver < 2) + page->rhdr.lsn = page->record_hdr.last_end_lsn; + else + page->file_off = 0; + + page->page_pos = page->page_count = cpu_to_le16(1); + + ntfs_fix_pre_write(&page->rhdr, log->page_size); + + err = ntfs_sb_write_run(log->ni->mi.sbi, + &log->ni->file.run, off, page, + log->page_size); + + if (err) + goto out; + + if (part_io_count && second_off == off) { + second_off += log->page_size; + part_io_count -= 1; + } + + tmp_page = Add2Ptr(tmp_page, log->page_size); + } + } + + if (part_io_count) { + if (sb_rdonly(log->ni->mi.sbi->sb)) { + err = -EROFS; + goto out; + } + } + +out: + kfree(second_tail); + kfree(first_tail); + kfree(page); + kfree(tst_page); + kfree(page_bufs); + + return err; +} + +/* + * read_log_rec_buf - Copy a log record from the file to a buffer. + * + * The log record may span several log pages and may even wrap the file. 
+ */ +static int read_log_rec_buf(struct ntfs_log *log, + const struct LFS_RECORD_HDR *rh, void *buffer) +{ + int err; + struct RECORD_PAGE_HDR *ph = NULL; + u64 lsn = le64_to_cpu(rh->this_lsn); + u32 vbo = lsn_to_vbo(log, lsn) & ~log->page_mask; + u32 off = lsn_to_page_off(log, lsn) + log->record_header_len; + u32 data_len = le32_to_cpu(rh->client_data_len); + + /* + * While there are more bytes to transfer, + * we continue to attempt to perform the read. + */ + for (;;) { + bool usa_error; + u32 tail = log->page_size - off; + + if (tail >= data_len) + tail = data_len; + + data_len -= tail; + + err = read_log_page(log, vbo, &ph, &usa_error); + if (err) + goto out; + + /* + * The last lsn on this page better be greater or equal + * to the lsn we are copying. + */ + if (lsn > le64_to_cpu(ph->rhdr.lsn)) { + err = -EINVAL; + goto out; + } + + memcpy(buffer, Add2Ptr(ph, off), tail); + + /* If there are no more bytes to transfer, we exit the loop. */ + if (!data_len) { + if (!is_log_record_end(ph) || + lsn > le64_to_cpu(ph->record_hdr.last_end_lsn)) { + err = -EINVAL; + goto out; + } + break; + } + + if (ph->rhdr.lsn == ph->record_hdr.last_end_lsn || + lsn > le64_to_cpu(ph->rhdr.lsn)) { + err = -EINVAL; + goto out; + } + + vbo = next_page_off(log, vbo); + off = log->data_off; + + /* + * Adjust our pointer the user's buffer to transfer + * the next block to. + */ + buffer = Add2Ptr(buffer, tail); + } + +out: + kfree(ph); + return err; +} + +static int read_rst_area(struct ntfs_log *log, struct NTFS_RESTART **rst_, + u64 *lsn) +{ + int err; + struct LFS_RECORD_HDR *rh = NULL; + const struct CLIENT_REC *cr = + Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)); + u64 lsnr, lsnc = le64_to_cpu(cr->restart_lsn); + u32 len; + struct NTFS_RESTART *rst; + + *lsn = 0; + *rst_ = NULL; + + /* If the client doesn't have a restart area, go ahead and exit now. 
*/ + if (!lsnc) + return 0; + + err = read_log_page(log, lsn_to_vbo(log, lsnc), + (struct RECORD_PAGE_HDR **)&rh, NULL); + if (err) + return err; + + rst = NULL; + lsnr = le64_to_cpu(rh->this_lsn); + + if (lsnc != lsnr) { + /* If the lsn values don't match, then the disk is corrupt. */ + err = -EINVAL; + goto out; + } + + *lsn = lsnr; + len = le32_to_cpu(rh->client_data_len); + + if (!len) { + err = 0; + goto out; + } + + if (len < sizeof(struct NTFS_RESTART)) { + err = -EINVAL; + goto out; + } + + rst = kmalloc(len, GFP_NOFS); + if (!rst) { + err = -ENOMEM; + goto out; + } + + /* Copy the data into the 'rst' buffer. */ + err = read_log_rec_buf(log, rh, rst); + if (err) + goto out; + + *rst_ = rst; + rst = NULL; + +out: + kfree(rh); + kfree(rst); + + return err; +} + +static int find_log_rec(struct ntfs_log *log, u64 lsn, struct lcb *lcb) +{ + int err; + struct LFS_RECORD_HDR *rh = lcb->lrh; + u32 rec_len, len; + + /* Read the record header for this lsn. */ + if (!rh) { + err = read_log_page(log, lsn_to_vbo(log, lsn), + (struct RECORD_PAGE_HDR **)&rh, NULL); + + lcb->lrh = rh; + if (err) + return err; + } + + /* + * If the lsn the log record doesn't match the desired + * lsn then the disk is corrupt. + */ + if (lsn != le64_to_cpu(rh->this_lsn)) + return -EINVAL; + + len = le32_to_cpu(rh->client_data_len); + + /* + * Check that the length field isn't greater than the total + * available space the log file. + */ + rec_len = len + log->record_header_len; + if (rec_len >= log->total_avail) + return -EINVAL; + + /* + * If the entire log record is on this log page, + * put a pointer to the log record the context block. + */ + if (rh->flags & LOG_RECORD_MULTI_PAGE) { + void *lr = kmalloc(len, GFP_NOFS); + + if (!lr) + return -ENOMEM; + + lcb->log_rec = lr; + lcb->alloc = true; + + /* Copy the data into the buffer returned. */ + err = read_log_rec_buf(log, rh, lr); + if (err) + return err; + } else { + /* If beyond the end of the current page -> an error. 
*/ + u32 page_off = lsn_to_page_off(log, lsn); + + if (page_off + len + log->record_header_len > log->page_size) + return -EINVAL; + + lcb->log_rec = Add2Ptr(rh, sizeof(struct LFS_RECORD_HDR)); + lcb->alloc = false; + } + + return 0; +} + +/* + * read_log_rec_lcb - Init the query operation. + */ +static int read_log_rec_lcb(struct ntfs_log *log, u64 lsn, u32 ctx_mode, + struct lcb **lcb_) +{ + int err; + const struct CLIENT_REC *cr; + struct lcb *lcb; + + switch (ctx_mode) { + case lcb_ctx_undo_next: + case lcb_ctx_prev: + case lcb_ctx_next: + break; + default: + return -EINVAL; + } + + /* Check that the given lsn is the legal range for this client. */ + cr = Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)); + + if (!verify_client_lsn(log, cr, lsn)) + return -EINVAL; + + lcb = kzalloc(sizeof(struct lcb), GFP_NOFS); + if (!lcb) + return -ENOMEM; + lcb->client = log->client_id; + lcb->ctx_mode = ctx_mode; + + /* Find the log record indicated by the given lsn. */ + err = find_log_rec(log, lsn, lcb); + if (err) + goto out; + + *lcb_ = lcb; + return 0; + +out: + lcb_put(lcb); + *lcb_ = NULL; + return err; +} + +/* + * find_client_next_lsn + * + * Attempt to find the next lsn to return to a client based on the context mode. + */ +static int find_client_next_lsn(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) +{ + int err; + u64 next_lsn; + struct LFS_RECORD_HDR *hdr; + + hdr = lcb->lrh; + *lsn = 0; + + if (lcb_ctx_next != lcb->ctx_mode) + goto check_undo_next; + + /* Loop as long as another lsn can be found. 
*/ + for (;;) { + u64 current_lsn; + + err = next_log_lsn(log, hdr, &current_lsn); + if (err) + goto out; + + /* next_log_lsn() reports 0 when there is no further lsn. */ + if (!current_lsn) + break; + + /* Drop the header read on a previous pass; lcb->lrh itself is kept. */ + if (hdr != lcb->lrh) + kfree(hdr); + + hdr = NULL; + err = read_log_page(log, lsn_to_vbo(log, current_lsn), + (struct RECORD_PAGE_HDR **)&hdr, NULL); + if (err) + goto out; + + if (memcmp(&hdr->client, &lcb->client, + sizeof(struct CLIENT_ID))) { + /*err = -EINVAL; */ + } else if (LfsClientRecord == hdr->record_type) { + /* Found a client record: hand the new header over to lcb. */ + kfree(lcb->lrh); + lcb->lrh = hdr; + *lsn = current_lsn; + return 0; + } + } + +out: + if (hdr != lcb->lrh) + kfree(hdr); + return err; + +check_undo_next: + if (lcb_ctx_undo_next == lcb->ctx_mode) + next_lsn = le64_to_cpu(hdr->client_undo_next_lsn); + else if (lcb_ctx_prev == lcb->ctx_mode) + next_lsn = le64_to_cpu(hdr->client_prev_lsn); + else + return 0; + + if (!next_lsn) + return 0; + + if (!verify_client_lsn( + log, Add2Ptr(log->ra, le16_to_cpu(log->ra->client_off)), + next_lsn)) + return 0; + + hdr = NULL; + err = read_log_page(log, lsn_to_vbo(log, next_lsn), + (struct RECORD_PAGE_HDR **)&hdr, NULL); + if (err) + return err; + kfree(lcb->lrh); + lcb->lrh = hdr; + + *lsn = next_lsn; + + return 0; +} + +static int read_next_log_rec(struct ntfs_log *log, struct lcb *lcb, u64 *lsn) +{ + int err; + + err = find_client_next_lsn(log, lcb, lsn); + if (err) + return err; + + if (!*lsn) + return 0; + + if (lcb->alloc) + kfree(lcb->log_rec); + + lcb->log_rec = NULL; + lcb->alloc = false; + kfree(lcb->lrh); + lcb->lrh = NULL; + + return find_log_rec(log, *lsn, lcb); +} + +static inline bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes) +{ + __le16 mask; + u32 min_de, de_off, used, total; + const struct NTFS_DE *e; + + if (hdr_has_subnode(hdr)) { + min_de = sizeof(struct NTFS_DE) + sizeof(u64); + mask = NTFS_IE_HAS_SUBNODES; + } else { + min_de = sizeof(struct NTFS_DE); + mask = 0; + } + + de_off = le32_to_cpu(hdr->de_off); + used = le32_to_cpu(hdr->used); + total = le32_to_cpu(hdr->total); + + if (de_off 
> bytes - min_de || used > bytes || total > bytes || + de_off + min_de > used || used > total) { + return false; + } + + e = Add2Ptr(hdr, de_off); + for (;;) { + u16 esize = le16_to_cpu(e->size); + struct NTFS_DE *next = Add2Ptr(e, esize); + + if (esize < min_de || PtrOffset(hdr, next) > used || + (e->flags & NTFS_IE_HAS_SUBNODES) != mask) { + return false; + } + + if (de_is_last(e)) + break; + + e = next; + } + + return true; +} + +static inline bool check_index_buffer(const struct INDEX_BUFFER *ib, u32 bytes) +{ + u16 fo; + const struct NTFS_RECORD_HEADER *r = &ib->rhdr; + + if (r->sign != NTFS_INDX_SIGNATURE) + return false; + + fo = (SECTOR_SIZE - ((bytes >> SECTOR_SHIFT) + 1) * sizeof(short)); + + if (le16_to_cpu(r->fix_off) > fo) + return false; + + if ((le16_to_cpu(r->fix_num) - 1) * SECTOR_SIZE != bytes) + return false; + + return check_index_header(&ib->ihdr, + bytes - offsetof(struct INDEX_BUFFER, ihdr)); +} + +static inline bool check_index_root(const struct ATTRIB *attr, + struct ntfs_sb_info *sbi) +{ + bool ret; + const struct INDEX_ROOT *root = resident_data(attr); + u8 index_bits = le32_to_cpu(root->index_block_size) >= sbi->cluster_size + ? 
sbi->cluster_bits + : SECTOR_SHIFT; + u8 block_clst = root->index_block_clst; + + if (le32_to_cpu(attr->res.data_size) < sizeof(struct INDEX_ROOT) || + (root->type != ATTR_NAME && root->type != ATTR_ZERO) || + (root->type == ATTR_NAME && + root->rule != NTFS_COLLATION_TYPE_FILENAME) || + (le32_to_cpu(root->index_block_size) != + (block_clst << index_bits)) || + (block_clst != 1 && block_clst != 2 && block_clst != 4 && + block_clst != 8 && block_clst != 0x10 && block_clst != 0x20 && + block_clst != 0x40 && block_clst != 0x80)) { + return false; + } + + ret = check_index_header(&root->ihdr, + le32_to_cpu(attr->res.data_size) - + offsetof(struct INDEX_ROOT, ihdr)); + return ret; +} + +static inline bool check_attr(const struct MFT_REC *rec, + const struct ATTRIB *attr, + struct ntfs_sb_info *sbi) +{ + u32 asize = le32_to_cpu(attr->size); + u32 rsize = 0; + u64 dsize, svcn, evcn; + u16 run_off; + + /* Check the fixed part of the attribute record header. */ + if (asize >= sbi->record_size || + asize + PtrOffset(rec, attr) >= sbi->record_size || + (attr->name_len && + le16_to_cpu(attr->name_off) + attr->name_len * sizeof(short) > + asize)) { + return false; + } + + /* Check the attribute fields. 
*/ + switch (attr->non_res) { + case 0: + rsize = le32_to_cpu(attr->res.data_size); + if (rsize >= asize || + le16_to_cpu(attr->res.data_off) + rsize > asize) { + return false; + } + break; + + case 1: + dsize = le64_to_cpu(attr->nres.data_size); + svcn = le64_to_cpu(attr->nres.svcn); + evcn = le64_to_cpu(attr->nres.evcn); + run_off = le16_to_cpu(attr->nres.run_off); + + if (svcn > evcn + 1 || run_off >= asize || + le64_to_cpu(attr->nres.valid_size) > dsize || + dsize > le64_to_cpu(attr->nres.alloc_size)) { + return false; + } + + if (run_unpack(NULL, sbi, 0, svcn, evcn, svcn, + Add2Ptr(attr, run_off), asize - run_off) < 0) { + return false; + } + + return true; + + default: + return false; + } + + switch (attr->type) { + case ATTR_NAME: + if (fname_full_size(Add2Ptr( + attr, le16_to_cpu(attr->res.data_off))) > asize) { + return false; + } + break; + + case ATTR_ROOT: + return check_index_root(attr, sbi); + + case ATTR_STD: + if (rsize < sizeof(struct ATTR_STD_INFO5) && + rsize != sizeof(struct ATTR_STD_INFO)) { + return false; + } + break; + + case ATTR_LIST: + case ATTR_ID: + case ATTR_SECURE: + case ATTR_LABEL: + case ATTR_VOL_INFO: + case ATTR_DATA: + case ATTR_ALLOC: + case ATTR_BITMAP: + case ATTR_REPARSE: + case ATTR_EA_INFO: + case ATTR_EA: + case ATTR_PROPERTYSET: + case ATTR_LOGGED_UTILITY_STREAM: + break; + + default: + return false; + } + + return true; +} + +static inline bool check_file_record(const struct MFT_REC *rec, + const struct MFT_REC *rec2, + struct ntfs_sb_info *sbi) +{ + const struct ATTRIB *attr; + u16 fo = le16_to_cpu(rec->rhdr.fix_off); + u16 fn = le16_to_cpu(rec->rhdr.fix_num); + u16 ao = le16_to_cpu(rec->attr_off); + u32 rs = sbi->record_size; + + /* Check the file record header for consistency. 
*/ + if (rec->rhdr.sign != NTFS_FILE_SIGNATURE || + fo > (SECTOR_SIZE - ((rs >> SECTOR_SHIFT) + 1) * sizeof(short)) || + (fn - 1) * SECTOR_SIZE != rs || ao < MFTRECORD_FIXUP_OFFSET_1 || + ao > sbi->record_size - SIZEOF_RESIDENT || !is_rec_inuse(rec) || + le32_to_cpu(rec->total) != rs) { + return false; + } + + /* Loop to check all of the attributes. */ + for (attr = Add2Ptr(rec, ao); attr->type != ATTR_END; + attr = Add2Ptr(attr, le32_to_cpu(attr->size))) { + if (check_attr(rec, attr, sbi)) + continue; + return false; + } + + return true; +} + +static inline int check_lsn(const struct NTFS_RECORD_HEADER *hdr, + const u64 *rlsn) +{ + u64 lsn; + + if (!rlsn) + return true; + + lsn = le64_to_cpu(hdr->lsn); + + if (hdr->sign == NTFS_HOLE_SIGNATURE) + return false; + + if (*rlsn > lsn) + return true; + + return false; +} + +static inline bool check_if_attr(const struct MFT_REC *rec, + const struct LOG_REC_HDR *lrh) +{ + u16 ro = le16_to_cpu(lrh->record_off); + u16 o = le16_to_cpu(rec->attr_off); + const struct ATTRIB *attr = Add2Ptr(rec, o); + + while (o < ro) { + u32 asize; + + if (attr->type == ATTR_END) + break; + + asize = le32_to_cpu(attr->size); + if (!asize) + break; + + o += asize; + attr = Add2Ptr(attr, asize); + } + + return o == ro; +} + +static inline bool check_if_index_root(const struct MFT_REC *rec, + const struct LOG_REC_HDR *lrh) +{ + u16 ro = le16_to_cpu(lrh->record_off); + u16 o = le16_to_cpu(rec->attr_off); + const struct ATTRIB *attr = Add2Ptr(rec, o); + + while (o < ro) { + u32 asize; + + if (attr->type == ATTR_END) + break; + + asize = le32_to_cpu(attr->size); + if (!asize) + break; + + o += asize; + attr = Add2Ptr(attr, asize); + } + + return o == ro && attr->type == ATTR_ROOT; +} + +static inline bool check_if_root_index(const struct ATTRIB *attr, + const struct INDEX_HDR *hdr, + const struct LOG_REC_HDR *lrh) +{ + u16 ao = le16_to_cpu(lrh->attr_off); + u32 de_off = le32_to_cpu(hdr->de_off); + u32 o = PtrOffset(attr, hdr) + de_off; + const 
struct NTFS_DE *e = Add2Ptr(hdr, de_off); + u32 asize = le32_to_cpu(attr->size); + + while (o < ao) { + u16 esize; + + if (o >= asize) + break; + + esize = le16_to_cpu(e->size); + if (!esize) + break; + + o += esize; + e = Add2Ptr(e, esize); + } + + return o == ao; +} + +static inline bool check_if_alloc_index(const struct INDEX_HDR *hdr, + u32 attr_off) +{ + u32 de_off = le32_to_cpu(hdr->de_off); + u32 o = offsetof(struct INDEX_BUFFER, ihdr) + de_off; + const struct NTFS_DE *e = Add2Ptr(hdr, de_off); + u32 used = le32_to_cpu(hdr->used); + + while (o < attr_off) { + u16 esize; + + if (de_off >= used) + break; + + esize = le16_to_cpu(e->size); + if (!esize) + break; + + o += esize; + de_off += esize; + e = Add2Ptr(e, esize); + } + + return o == attr_off; +} + +static inline void change_attr_size(struct MFT_REC *rec, struct ATTRIB *attr, + u32 nsize) +{ + u32 asize = le32_to_cpu(attr->size); + int dsize = nsize - asize; + u8 *next = Add2Ptr(attr, asize); + u32 used = le32_to_cpu(rec->used); + + memmove(Add2Ptr(attr, nsize), next, used - PtrOffset(rec, next)); + + rec->used = cpu_to_le32(used + dsize); + attr->size = cpu_to_le32(nsize); +} + +struct OpenAttr { + struct ATTRIB *attr; + struct runs_tree *run1; + struct runs_tree run0; + struct ntfs_inode *ni; + // CLST rno; +}; + +/* + * cmp_type_and_name + * + * Return: 0 if 'attr' has the same type and name. 
+ */ +static inline int cmp_type_and_name(const struct ATTRIB *a1, + const struct ATTRIB *a2) +{ + return a1->type != a2->type || a1->name_len != a2->name_len || + (a1->name_len && memcmp(attr_name(a1), attr_name(a2), + a1->name_len * sizeof(short))); +} + +static struct OpenAttr *find_loaded_attr(struct ntfs_log *log, + const struct ATTRIB *attr, CLST rno) +{ + struct OPEN_ATTR_ENRTY *oe = NULL; + + while ((oe = enum_rstbl(log->open_attr_tbl, oe))) { + struct OpenAttr *op_attr; + + if (ino_get(&oe->ref) != rno) + continue; + + op_attr = (struct OpenAttr *)oe->ptr; + if (!cmp_type_and_name(op_attr->attr, attr)) + return op_attr; + } + return NULL; +} + +static struct ATTRIB *attr_create_nonres_log(struct ntfs_sb_info *sbi, + enum ATTR_TYPE type, u64 size, + const u16 *name, size_t name_len, + __le16 flags) +{ + struct ATTRIB *attr; + u32 name_size = ALIGN(name_len * sizeof(short), 8); + bool is_ext = flags & (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED); + u32 asize = name_size + + (is_ext ? 
SIZEOF_NONRESIDENT_EX : SIZEOF_NONRESIDENT); + + attr = kzalloc(asize, GFP_NOFS); + if (!attr) + return NULL; + + attr->type = type; + attr->size = cpu_to_le32(asize); + attr->flags = flags; + attr->non_res = 1; + attr->name_len = name_len; + + attr->nres.evcn = cpu_to_le64((u64)bytes_to_cluster(sbi, size) - 1); + attr->nres.alloc_size = cpu_to_le64(ntfs_up_cluster(sbi, size)); + attr->nres.data_size = cpu_to_le64(size); + attr->nres.valid_size = attr->nres.data_size; + if (is_ext) { + attr->name_off = SIZEOF_NONRESIDENT_EX_LE; + if (is_attr_compressed(attr)) + attr->nres.c_unit = COMPRESSION_UNIT; + + attr->nres.run_off = + cpu_to_le16(SIZEOF_NONRESIDENT_EX + name_size); + memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT_EX), name, + name_len * sizeof(short)); + } else { + attr->name_off = SIZEOF_NONRESIDENT_LE; + attr->nres.run_off = + cpu_to_le16(SIZEOF_NONRESIDENT + name_size); + memcpy(Add2Ptr(attr, SIZEOF_NONRESIDENT), name, + name_len * sizeof(short)); + } + + return attr; +} + +/* + * do_action - Common routine for the Redo and Undo Passes. + * @rlsn: If it is NULL then undo. 
+ */ +static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, + const struct LOG_REC_HDR *lrh, u32 op, void *data, + u32 dlen, u32 rec_len, const u64 *rlsn) +{ + int err = 0; + struct ntfs_sb_info *sbi = log->ni->mi.sbi; + struct inode *inode = NULL, *inode_parent; + struct mft_inode *mi = NULL, *mi2_child = NULL; + CLST rno = 0, rno_base = 0; + struct INDEX_BUFFER *ib = NULL; + struct MFT_REC *rec = NULL; + struct ATTRIB *attr = NULL, *attr2; + struct INDEX_HDR *hdr; + struct INDEX_ROOT *root; + struct NTFS_DE *e, *e1, *e2; + struct NEW_ATTRIBUTE_SIZES *new_sz; + struct ATTR_FILE_NAME *fname; + struct OpenAttr *oa, *oa2; + u32 nsize, t32, asize, used, esize, bmp_off, bmp_bits; + u16 id, id2; + u32 record_size = sbi->record_size; + u64 t64; + u16 roff = le16_to_cpu(lrh->record_off); + u16 aoff = le16_to_cpu(lrh->attr_off); + u64 lco = 0; + u64 cbo = (u64)le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT; + u64 tvo = le64_to_cpu(lrh->target_vcn) << sbi->cluster_bits; + u64 vbo = cbo + tvo; + void *buffer_le = NULL; + u32 bytes = 0; + bool a_dirty = false; + u16 data_off; + + oa = oe->ptr; + + /* Big switch to prepare. */ + switch (op) { + /* ============================================================ + * Process MFT records, as described by the current log record. 
+ * ============================================================ + */ + case InitializeFileRecordSegment: + case DeallocateFileRecordSegment: + case WriteEndOfFileRecordSegment: + case CreateAttribute: + case DeleteAttribute: + case UpdateResidentValue: + case UpdateMappingPairs: + case SetNewAttributeSizes: + case AddIndexEntryRoot: + case DeleteIndexEntryRoot: + case SetIndexEntryVcnRoot: + case UpdateFileNameRoot: + case UpdateRecordDataRoot: + case ZeroEndOfFileRecord: + rno = vbo >> sbi->record_bits; + inode = ilookup(sbi->sb, rno); + if (inode) { + mi = &ntfs_i(inode)->mi; + } else if (op == InitializeFileRecordSegment) { + mi = kzalloc(sizeof(struct mft_inode), GFP_NOFS); + if (!mi) + return -ENOMEM; + err = mi_format_new(mi, sbi, rno, 0, false); + if (err) + goto out; + } else { + /* Read from disk. */ + err = mi_get(sbi, rno, &mi); + if (err) + return err; + } + rec = mi->mrec; + + if (op == DeallocateFileRecordSegment) + goto skip_load_parent; + + if (InitializeFileRecordSegment != op) { + if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE) + goto dirty_vol; + if (!check_lsn(&rec->rhdr, rlsn)) + goto out; + if (!check_file_record(rec, NULL, sbi)) + goto dirty_vol; + attr = Add2Ptr(rec, roff); + } + + if (is_rec_base(rec) || InitializeFileRecordSegment == op) { + rno_base = rno; + goto skip_load_parent; + } + + rno_base = ino_get(&rec->parent_ref); + inode_parent = ntfs_iget5(sbi->sb, &rec->parent_ref, NULL); + if (IS_ERR(inode_parent)) + goto skip_load_parent; + + if (is_bad_inode(inode_parent)) { + iput(inode_parent); + goto skip_load_parent; + } + + if (ni_load_mi_ex(ntfs_i(inode_parent), rno, &mi2_child)) { + iput(inode_parent); + } else { + if (mi2_child->mrec != mi->mrec) + memcpy(mi2_child->mrec, mi->mrec, + sbi->record_size); + + if (inode) + iput(inode); + else if (mi) + mi_put(mi); + + inode = inode_parent; + mi = mi2_child; + rec = mi2_child->mrec; + attr = Add2Ptr(rec, roff); + } + +skip_load_parent: + inode_parent = NULL; + break; + + /* + * Process 
attributes, as described by the current log record. + */ + case UpdateNonresidentValue: + case AddIndexEntryAllocation: + case DeleteIndexEntryAllocation: + case WriteEndOfIndexBuffer: + case SetIndexEntryVcnAllocation: + case UpdateFileNameAllocation: + case SetBitsInNonresidentBitMap: + case ClearBitsInNonresidentBitMap: + case UpdateRecordDataAllocation: + attr = oa->attr; + bytes = UpdateNonresidentValue == op ? dlen : 0; + lco = (u64)le16_to_cpu(lrh->lcns_follow) << sbi->cluster_bits; + + if (attr->type == ATTR_ALLOC) { + t32 = le32_to_cpu(oe->bytes_per_index); + if (bytes < t32) + bytes = t32; + } + + if (!bytes) + bytes = lco - cbo; + + bytes += roff; + if (attr->type == ATTR_ALLOC) + bytes = (bytes + 511) & ~511; // align + + buffer_le = kmalloc(bytes, GFP_NOFS); + if (!buffer_le) + return -ENOMEM; + + err = ntfs_read_run_nb(sbi, oa->run1, vbo, buffer_le, bytes, + NULL); + if (err) + goto out; + + if (attr->type == ATTR_ALLOC && *(int *)buffer_le) + ntfs_fix_post_read(buffer_le, bytes, false); + break; + + default: + WARN_ON(1); + } + + /* Big switch to do operation. 
*/ + switch (op) { + case InitializeFileRecordSegment: + if (roff + dlen > record_size) + goto dirty_vol; + + memcpy(Add2Ptr(rec, roff), data, dlen); + mi->dirty = true; + break; + + case DeallocateFileRecordSegment: + clear_rec_inuse(rec); + le16_add_cpu(&rec->seq, 1); + mi->dirty = true; + break; + + case WriteEndOfFileRecordSegment: + attr2 = (struct ATTRIB *)data; + if (!check_if_attr(rec, lrh) || roff + dlen > record_size) + goto dirty_vol; + + memmove(attr, attr2, dlen); + rec->used = cpu_to_le32(ALIGN(roff + dlen, 8)); + + mi->dirty = true; + break; + + case CreateAttribute: + attr2 = (struct ATTRIB *)data; + asize = le32_to_cpu(attr2->size); + used = le32_to_cpu(rec->used); + + if (!check_if_attr(rec, lrh) || dlen < SIZEOF_RESIDENT || + !IS_ALIGNED(asize, 8) || + Add2Ptr(attr2, asize) > Add2Ptr(lrh, rec_len) || + dlen > record_size - used) { + goto dirty_vol; + } + + memmove(Add2Ptr(attr, asize), attr, used - roff); + memcpy(attr, attr2, asize); + + rec->used = cpu_to_le32(used + asize); + id = le16_to_cpu(rec->next_attr_id); + id2 = le16_to_cpu(attr2->id); + if (id <= id2) + rec->next_attr_id = cpu_to_le16(id2 + 1); + if (is_attr_indexed(attr)) + le16_add_cpu(&rec->hard_links, 1); + + oa2 = find_loaded_attr(log, attr, rno_base); + if (oa2) { + void *p2 = kmemdup(attr, le32_to_cpu(attr->size), + GFP_NOFS); + if (p2) { + // run_close(oa2->run1); + kfree(oa2->attr); + oa2->attr = p2; + } + } + + mi->dirty = true; + break; + + case DeleteAttribute: + asize = le32_to_cpu(attr->size); + used = le32_to_cpu(rec->used); + + if (!check_if_attr(rec, lrh)) + goto dirty_vol; + + rec->used = cpu_to_le32(used - asize); + if (is_attr_indexed(attr)) + le16_add_cpu(&rec->hard_links, -1); + + memmove(attr, Add2Ptr(attr, asize), used - asize - roff); + + mi->dirty = true; + break; + + case UpdateResidentValue: + nsize = aoff + dlen; + + if (!check_if_attr(rec, lrh)) + goto dirty_vol; + + asize = le32_to_cpu(attr->size); + used = le32_to_cpu(rec->used); + + if (lrh->redo_len 
== lrh->undo_len) { + if (nsize > asize) + goto dirty_vol; + goto move_data; + } + + if (nsize > asize && nsize - asize > record_size - used) + goto dirty_vol; + + nsize = ALIGN(nsize, 8); + data_off = le16_to_cpu(attr->res.data_off); + + if (nsize < asize) { + memmove(Add2Ptr(attr, aoff), data, dlen); + data = NULL; // To skip below memmove(). + } + + memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize), + used - le16_to_cpu(lrh->record_off) - asize); + + rec->used = cpu_to_le32(used + nsize - asize); + attr->size = cpu_to_le32(nsize); + attr->res.data_size = cpu_to_le32(aoff + dlen - data_off); + +move_data: + if (data) + memmove(Add2Ptr(attr, aoff), data, dlen); + + oa2 = find_loaded_attr(log, attr, rno_base); + if (oa2) { + void *p2 = kmemdup(attr, le32_to_cpu(attr->size), + GFP_NOFS); + if (p2) { + // run_close(&oa2->run0); + oa2->run1 = &oa2->run0; + kfree(oa2->attr); + oa2->attr = p2; + } + } + + mi->dirty = true; + break; + + case UpdateMappingPairs: + nsize = aoff + dlen; + asize = le32_to_cpu(attr->size); + used = le32_to_cpu(rec->used); + + if (!check_if_attr(rec, lrh) || !attr->non_res || + aoff < le16_to_cpu(attr->nres.run_off) || aoff > asize || + (nsize > asize && nsize - asize > record_size - used)) { + goto dirty_vol; + } + + nsize = ALIGN(nsize, 8); + + memmove(Add2Ptr(attr, nsize), Add2Ptr(attr, asize), + used - le16_to_cpu(lrh->record_off) - asize); + rec->used = cpu_to_le32(used + nsize - asize); + attr->size = cpu_to_le32(nsize); + memmove(Add2Ptr(attr, aoff), data, dlen); + + if (run_get_highest_vcn(le64_to_cpu(attr->nres.svcn), + attr_run(attr), &t64)) { + goto dirty_vol; + } + + attr->nres.evcn = cpu_to_le64(t64); + oa2 = find_loaded_attr(log, attr, rno_base); + if (oa2 && oa2->attr->non_res) + oa2->attr->nres.evcn = attr->nres.evcn; + + mi->dirty = true; + break; + + case SetNewAttributeSizes: + new_sz = data; + if (!check_if_attr(rec, lrh) || !attr->non_res) + goto dirty_vol; + + attr->nres.alloc_size = new_sz->alloc_size; + 
attr->nres.data_size = new_sz->data_size; + attr->nres.valid_size = new_sz->valid_size; + + if (dlen >= sizeof(struct NEW_ATTRIBUTE_SIZES)) + attr->nres.total_size = new_sz->total_size; + + oa2 = find_loaded_attr(log, attr, rno_base); + if (oa2) { + void *p2 = kmemdup(attr, le32_to_cpu(attr->size), + GFP_NOFS); + if (p2) { + kfree(oa2->attr); + oa2->attr = p2; + } + } + mi->dirty = true; + break; + + case AddIndexEntryRoot: + e = (struct NTFS_DE *)data; + esize = le16_to_cpu(e->size); + root = resident_data(attr); + hdr = &root->ihdr; + used = le32_to_cpu(hdr->used); + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh) || + Add2Ptr(data, esize) > Add2Ptr(lrh, rec_len) || + esize > le32_to_cpu(rec->total) - le32_to_cpu(rec->used)) { + goto dirty_vol; + } + + e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); + + change_attr_size(rec, attr, le32_to_cpu(attr->size) + esize); + + memmove(Add2Ptr(e1, esize), e1, + PtrOffset(e1, Add2Ptr(hdr, used))); + memmove(e1, e, esize); + + le32_add_cpu(&attr->res.data_size, esize); + hdr->used = cpu_to_le32(used + esize); + le32_add_cpu(&hdr->total, esize); + + mi->dirty = true; + break; + + case DeleteIndexEntryRoot: + root = resident_data(attr); + hdr = &root->ihdr; + used = le32_to_cpu(hdr->used); + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh)) { + goto dirty_vol; + } + + e1 = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); + esize = le16_to_cpu(e1->size); + e2 = Add2Ptr(e1, esize); + + memmove(e1, e2, PtrOffset(e2, Add2Ptr(hdr, used))); + + le32_sub_cpu(&attr->res.data_size, esize); + hdr->used = cpu_to_le32(used - esize); + le32_sub_cpu(&hdr->total, esize); + + change_attr_size(rec, attr, le32_to_cpu(attr->size) - esize); + + mi->dirty = true; + break; + + case SetIndexEntryVcnRoot: + root = resident_data(attr); + hdr = &root->ihdr; + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh)) { + goto dirty_vol; + } + + e = Add2Ptr(attr, 
le16_to_cpu(lrh->attr_off)); + + de_set_vbn_le(e, *(__le64 *)data); + mi->dirty = true; + break; + + case UpdateFileNameRoot: + root = resident_data(attr); + hdr = &root->ihdr; + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh)) { + goto dirty_vol; + } + + e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); + fname = (struct ATTR_FILE_NAME *)(e + 1); + memmove(&fname->dup, data, sizeof(fname->dup)); // + mi->dirty = true; + break; + + case UpdateRecordDataRoot: + root = resident_data(attr); + hdr = &root->ihdr; + + if (!check_if_index_root(rec, lrh) || + !check_if_root_index(attr, hdr, lrh)) { + goto dirty_vol; + } + + e = Add2Ptr(attr, le16_to_cpu(lrh->attr_off)); + + memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen); + + mi->dirty = true; + break; + + case ZeroEndOfFileRecord: + if (roff + dlen > record_size) + goto dirty_vol; + + memset(attr, 0, dlen); + mi->dirty = true; + break; + + case UpdateNonresidentValue: + if (lco < cbo + roff + dlen) + goto dirty_vol; + + memcpy(Add2Ptr(buffer_le, roff), data, dlen); + + a_dirty = true; + if (attr->type == ATTR_ALLOC) + ntfs_fix_pre_write(buffer_le, bytes); + break; + + case AddIndexEntryAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = data; + esize = le16_to_cpu(e->size); + e1 = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + + used = le32_to_cpu(hdr->used); + + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff) || + Add2Ptr(e, esize) > Add2Ptr(lrh, rec_len) || + used + esize > le32_to_cpu(hdr->total)) { + goto dirty_vol; + } + + memmove(Add2Ptr(e1, esize), e1, + PtrOffset(e1, Add2Ptr(hdr, used))); + memcpy(e1, e, esize); + + hdr->used = cpu_to_le32(used + esize); + + a_dirty = true; + + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case DeleteIndexEntryAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + esize = 
le16_to_cpu(e->size); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff)) { + goto dirty_vol; + } + + e1 = Add2Ptr(e, esize); + nsize = esize; + used = le32_to_cpu(hdr->used); + + memmove(e, e1, PtrOffset(e1, Add2Ptr(hdr, used))); + + hdr->used = cpu_to_le32(used - nsize); + + a_dirty = true; + + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case WriteEndOfIndexBuffer: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff) || + aoff + dlen > offsetof(struct INDEX_BUFFER, ihdr) + + le32_to_cpu(hdr->total)) { + goto dirty_vol; + } + + hdr->used = cpu_to_le32(dlen + PtrOffset(hdr, e)); + memmove(e, data, dlen); + + a_dirty = true; + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case SetIndexEntryVcnAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff)) { + goto dirty_vol; + } + + de_set_vbn_le(e, *(__le64 *)data); + + a_dirty = true; + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case UpdateFileNameAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff)) { + goto dirty_vol; + } + + fname = (struct ATTR_FILE_NAME *)(e + 1); + memmove(&fname->dup, data, sizeof(fname->dup)); + + a_dirty = true; + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + case SetBitsInNonresidentBitMap: + bmp_off = + le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); + 
bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); + + if (cbo + (bmp_off + 7) / 8 > lco || + cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) { + goto dirty_vol; + } + + __bitmap_set(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits); + a_dirty = true; + break; + + case ClearBitsInNonresidentBitMap: + bmp_off = + le32_to_cpu(((struct BITMAP_RANGE *)data)->bitmap_off); + bmp_bits = le32_to_cpu(((struct BITMAP_RANGE *)data)->bits); + + if (cbo + (bmp_off + 7) / 8 > lco || + cbo + ((bmp_off + bmp_bits + 7) / 8) > lco) { + goto dirty_vol; + } + + __bitmap_clear(Add2Ptr(buffer_le, roff), bmp_off, bmp_bits); + a_dirty = true; + break; + + case UpdateRecordDataAllocation: + ib = Add2Ptr(buffer_le, roff); + hdr = &ib->ihdr; + e = Add2Ptr(ib, aoff); + + if (is_baad(&ib->rhdr)) + goto dirty_vol; + + if (!check_lsn(&ib->rhdr, rlsn)) + goto out; + if (!check_index_buffer(ib, bytes) || + !check_if_alloc_index(hdr, aoff)) { + goto dirty_vol; + } + + memmove(Add2Ptr(e, le16_to_cpu(e->view.data_off)), data, dlen); + + a_dirty = true; + ntfs_fix_pre_write(&ib->rhdr, bytes); + break; + + default: + WARN_ON(1); + } + + if (rlsn) { + __le64 t64 = cpu_to_le64(*rlsn); + + if (rec) + rec->rhdr.lsn = t64; + if (ib) + ib->rhdr.lsn = t64; + } + + if (mi && mi->dirty) { + err = mi_write(mi, 0); + if (err) + goto out; + } + + if (a_dirty) { + attr = oa->attr; + err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes); + if (err) + goto out; + } + +out: + + if (inode) + iput(inode); + else if (mi != mi2_child) + mi_put(mi); + + kfree(buffer_le); + + return err; + +dirty_vol: + log->set_dirty = true; + goto out; +} + +/* + * log_replay - Replays log and empties it. + * + * This function is called during mount operation. + * It replays log and empties it. + * Initialized is set false if logfile contains '-1'. 
+ */ +int log_replay(struct ntfs_inode *ni, bool *initialized) +{ + int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ntfs_log *log; + + struct restart_info rst_info, rst_info2; + u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0; + struct ATTR_NAME_ENTRY *attr_names = NULL; + struct ATTR_NAME_ENTRY *ane; + struct RESTART_TABLE *dptbl = NULL; + struct RESTART_TABLE *trtbl = NULL; + const struct RESTART_TABLE *rt; + struct RESTART_TABLE *oatbl = NULL; + struct inode *inode; + struct OpenAttr *oa; + struct ntfs_inode *ni_oe; + struct ATTRIB *attr = NULL; + u64 size, vcn, undo_next_lsn; + CLST rno, lcn, lcn0, len0, clen; + void *data; + struct NTFS_RESTART *rst = NULL; + struct lcb *lcb = NULL; + struct OPEN_ATTR_ENRTY *oe; + struct TRANSACTION_ENTRY *tr; + struct DIR_PAGE_ENTRY *dp; + u32 i, bytes_per_attr_entry; + u32 l_size = ni->vfs_inode.i_size; + u32 orig_file_size = l_size; + u32 page_size, vbo, tail, off, dlen; + u32 saved_len, rec_len, transact_id; + bool use_second_page; + struct RESTART_AREA *ra2, *ra = NULL; + struct CLIENT_REC *ca, *cr; + __le16 client; + struct RESTART_HDR *rh; + const struct LFS_RECORD_HDR *frh; + const struct LOG_REC_HDR *lrh; + bool is_mapped; + bool is_ro = sb_rdonly(sbi->sb); + u64 t64; + u16 t16; + u32 t32; + + /* Get the size of page. NOTE: To replay we can use default page. */ +#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2 + page_size = norm_file_page(PAGE_SIZE, &l_size, true); +#else + page_size = norm_file_page(PAGE_SIZE, &l_size, false); +#endif + if (!page_size) + return -EINVAL; + + log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS); + if (!log) + return -ENOMEM; + + log->ni = ni; + log->l_size = l_size; + log->one_page_buf = kmalloc(page_size, GFP_NOFS); + + if (!log->one_page_buf) { + err = -ENOMEM; + goto out; + } + + log->page_size = page_size; + log->page_mask = page_size - 1; + log->page_bits = blksize_bits(page_size); + + /* Look for a restart area on the disk. 
*/ + err = log_read_rst(log, l_size, true, &rst_info); + if (err) + goto out; + + /* remember 'initialized' */ + *initialized = rst_info.initialized; + + if (!rst_info.restart) { + if (rst_info.initialized) { + /* No restart area but the file is not initialized. */ + err = -EINVAL; + goto out; + } + + log_init_pg_hdr(log, page_size, page_size, 1, 1); + log_create(log, l_size, 0, get_random_int(), false, false); + + log->ra = ra; + + ra = log_create_ra(log); + if (!ra) { + err = -ENOMEM; + goto out; + } + log->ra = ra; + log->init_ra = true; + + goto process_log; + } + + /* + * If the restart offset above wasn't zero then we won't + * look for a second restart. + */ + if (rst_info.vbo) + goto check_restart_area; + + err = log_read_rst(log, l_size, false, &rst_info2); + + /* Determine which restart area to use. */ + if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn) + goto use_first_page; + + use_second_page = true; + + if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) { + struct RECORD_PAGE_HDR *sp = NULL; + bool usa_error; + + if (!read_log_page(log, page_size, &sp, &usa_error) && + sp->rhdr.sign == NTFS_CHKD_SIGNATURE) { + use_second_page = false; + } + kfree(sp); + } + + if (use_second_page) { + kfree(rst_info.r_page); + memcpy(&rst_info, &rst_info2, sizeof(struct restart_info)); + rst_info2.r_page = NULL; + } + +use_first_page: + kfree(rst_info2.r_page); + +check_restart_area: + /* + * If the restart area is at offset 0, we want + * to write the second restart area first. + */ + log->init_ra = !!rst_info.vbo; + + /* If we have a valid page then grab a pointer to the restart area. */ + ra2 = rst_info.valid_page + ? Add2Ptr(rst_info.r_page, + le16_to_cpu(rst_info.r_page->ra_off)) + : NULL; + + if (rst_info.chkdsk_was_run || + (ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) { + bool wrapped = false; + bool use_multi_page = false; + u32 open_log_count; + + /* Do some checks based on whether we have a valid log page. 
*/ + if (!rst_info.valid_page) { + open_log_count = get_random_int(); + goto init_log_instance; + } + open_log_count = le32_to_cpu(ra2->open_log_count); + + /* + * If the restart page size isn't changing then we want to + * check how much work we need to do. + */ + if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size)) + goto init_log_instance; + +init_log_instance: + log_init_pg_hdr(log, page_size, page_size, 1, 1); + + log_create(log, l_size, rst_info.last_lsn, open_log_count, + wrapped, use_multi_page); + + ra = log_create_ra(log); + if (!ra) { + err = -ENOMEM; + goto out; + } + log->ra = ra; + + /* Put the restart areas and initialize + * the log file as required. + */ + goto process_log; + } + + if (!ra2) { + err = -EINVAL; + goto out; + } + + /* + * If the log page or the system page sizes have changed, we can't + * use the log file. We must use the system page size instead of the + * default size if there is not a clean shutdown. + */ + t32 = le32_to_cpu(rst_info.r_page->sys_page_size); + if (page_size != t32) { + l_size = orig_file_size; + page_size = + norm_file_page(t32, &l_size, t32 == DefaultLogPageSize); + } + + if (page_size != t32 || + page_size != le32_to_cpu(rst_info.r_page->page_size)) { + err = -EINVAL; + goto out; + } + + /* If the file size has shrunk then we won't mount it. 
*/ + if (l_size < le64_to_cpu(ra2->l_size)) { + err = -EINVAL; + goto out; + } + + log_init_pg_hdr(log, page_size, page_size, + le16_to_cpu(rst_info.r_page->major_ver), + le16_to_cpu(rst_info.r_page->minor_ver)); + + log->l_size = le64_to_cpu(ra2->l_size); + log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits); + log->file_data_bits = sizeof(u64) * 8 - log->seq_num_bits; + log->seq_num_mask = (8 << log->file_data_bits) - 1; + log->last_lsn = le64_to_cpu(ra2->current_lsn); + log->seq_num = log->last_lsn >> log->file_data_bits; + log->ra_off = le16_to_cpu(rst_info.r_page->ra_off); + log->restart_size = log->sys_page_size - log->ra_off; + log->record_header_len = le16_to_cpu(ra2->rec_hdr_len); + log->ra_size = le16_to_cpu(ra2->ra_len); + log->data_off = le16_to_cpu(ra2->data_off); + log->data_size = log->page_size - log->data_off; + log->reserved = log->data_size - log->record_header_len; + + vbo = lsn_to_vbo(log, log->last_lsn); + + if (vbo < log->first_page) { + /* This is a pseudo lsn. */ + log->l_flags |= NTFSLOG_NO_LAST_LSN; + log->next_page = log->first_page; + goto find_oldest; + } + + /* Find the end of this log record. */ + off = final_log_off(log, log->last_lsn, + le32_to_cpu(ra2->last_lsn_data_len)); + + /* If we wrapped the file then increment the sequence number. */ + if (off <= vbo) { + log->seq_num += 1; + log->l_flags |= NTFSLOG_WRAPPED; + } + + /* Now compute the next log page to use. */ + vbo &= ~log->sys_page_mask; + tail = log->page_size - (off & log->page_mask) - 1; + + /* + *If we can fit another log record on the page, + * move back a page the log file. + */ + if (tail >= log->record_header_len) { + log->l_flags |= NTFSLOG_REUSE_TAIL; + log->next_page = vbo; + } else { + log->next_page = next_page_off(log, vbo); + } + +find_oldest: + /* + * Find the oldest client lsn. Use the last + * flushed lsn as a starting point. 
+ */ + log->oldest_lsn = log->last_lsn; + oldest_client_lsn(Add2Ptr(ra2, le16_to_cpu(ra2->client_off)), + ra2->client_idx[1], &log->oldest_lsn); + log->oldest_lsn_off = lsn_to_vbo(log, log->oldest_lsn); + + if (log->oldest_lsn_off < log->first_page) + log->l_flags |= NTFSLOG_NO_OLDEST_LSN; + + if (!(ra2->flags & RESTART_SINGLE_PAGE_IO)) + log->l_flags |= NTFSLOG_WRAPPED | NTFSLOG_MULTIPLE_PAGE_IO; + + log->current_openlog_count = le32_to_cpu(ra2->open_log_count); + log->total_avail_pages = log->l_size - log->first_page; + log->total_avail = log->total_avail_pages >> log->page_bits; + log->max_current_avail = log->total_avail * log->reserved; + log->total_avail = log->total_avail * log->data_size; + + log->current_avail = current_log_avail(log); + + ra = kzalloc(log->restart_size, GFP_NOFS); + if (!ra) { + err = -ENOMEM; + goto out; + } + log->ra = ra; + + t16 = le16_to_cpu(ra2->client_off); + if (t16 == offsetof(struct RESTART_AREA, clients)) { + memcpy(ra, ra2, log->ra_size); + } else { + memcpy(ra, ra2, offsetof(struct RESTART_AREA, clients)); + memcpy(ra->clients, Add2Ptr(ra2, t16), + le16_to_cpu(ra2->ra_len) - t16); + + log->current_openlog_count = get_random_int(); + ra->open_log_count = cpu_to_le32(log->current_openlog_count); + log->ra_size = offsetof(struct RESTART_AREA, clients) + + sizeof(struct CLIENT_REC); + ra->client_off = + cpu_to_le16(offsetof(struct RESTART_AREA, clients)); + ra->ra_len = cpu_to_le16(log->ra_size); + } + + le32_add_cpu(&ra->open_log_count, 1); + + /* Now we need to walk through looking for the last lsn. */ + err = last_log_lsn(log); + if (err) + goto out; + + log->current_avail = current_log_avail(log); + + /* Remember which restart area to write first. */ + log->init_ra = rst_info.vbo; + +process_log: + /* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. 
*/ + switch ((log->major_ver << 16) + log->minor_ver) { + case 0x10000: + case 0x10001: + case 0x20000: + break; + default: + ntfs_warn(sbi->sb, "\x24LogFile version %d.%d is not supported", + log->major_ver, log->minor_ver); + err = -EOPNOTSUPP; + log->set_dirty = true; + goto out; + } + + /* One client "NTFS" per logfile. */ + ca = Add2Ptr(ra, le16_to_cpu(ra->client_off)); + + for (client = ra->client_idx[1];; client = cr->next_client) { + if (client == LFS_NO_CLIENT_LE) { + /* Insert "NTFS" client LogFile. */ + client = ra->client_idx[0]; + if (client == LFS_NO_CLIENT_LE) + return -EINVAL; + + t16 = le16_to_cpu(client); + cr = ca + t16; + + remove_client(ca, cr, &ra->client_idx[0]); + + cr->restart_lsn = 0; + cr->oldest_lsn = cpu_to_le64(log->oldest_lsn); + cr->name_bytes = cpu_to_le32(8); + cr->name[0] = cpu_to_le16('N'); + cr->name[1] = cpu_to_le16('T'); + cr->name[2] = cpu_to_le16('F'); + cr->name[3] = cpu_to_le16('S'); + + add_client(ca, t16, &ra->client_idx[1]); + break; + } + + cr = ca + le16_to_cpu(client); + + if (cpu_to_le32(8) == cr->name_bytes && + cpu_to_le16('N') == cr->name[0] && + cpu_to_le16('T') == cr->name[1] && + cpu_to_le16('F') == cr->name[2] && + cpu_to_le16('S') == cr->name[3]) + break; + } + + /* Update the client handle with the client block information. */ + log->client_id.seq_num = cr->seq_num; + log->client_id.client_idx = client; + + err = read_rst_area(log, &rst, &ra_lsn); + if (err) + goto out; + + if (!rst) + goto out; + + bytes_per_attr_entry = !rst->major_ver ? 0x2C : 0x28; + + checkpt_lsn = le64_to_cpu(rst->check_point_start); + if (!checkpt_lsn) + checkpt_lsn = ra_lsn; + + /* Allocate and Read the Transaction Table. 
*/ + if (!rst->transact_table_len) + goto check_dirty_page_table; + + t64 = le64_to_cpu(rst->transact_table_lsn); + err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); + if (err) + goto out; + + lrh = lcb->log_rec; + frh = lcb->lrh; + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), + bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + t16 = le16_to_cpu(lrh->redo_off); + + rt = Add2Ptr(lrh, t16); + t32 = rec_len - t16; + + /* Now check that this is a valid restart table. */ + if (!check_rstbl(rt, t32)) { + err = -EINVAL; + goto out; + } + + trtbl = kmemdup(rt, t32, GFP_NOFS); + if (!trtbl) { + err = -ENOMEM; + goto out; + } + + lcb_put(lcb); + lcb = NULL; + +check_dirty_page_table: + /* The next record back should be the Dirty Pages Table. */ + if (!rst->dirty_pages_len) + goto check_attribute_names; + + t64 = le64_to_cpu(rst->dirty_pages_table_lsn); + err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); + if (err) + goto out; + + lrh = lcb->log_rec; + frh = lcb->lrh; + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), + bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + t16 = le16_to_cpu(lrh->redo_off); + + rt = Add2Ptr(lrh, t16); + t32 = rec_len - t16; + + /* Now check that this is a valid restart table. */ + if (!check_rstbl(rt, t32)) { + err = -EINVAL; + goto out; + } + + dptbl = kmemdup(rt, t32, GFP_NOFS); + if (!dptbl) { + err = -ENOMEM; + goto out; + } + + /* Convert Ra version '0' into version '1'. */ + if (rst->major_ver) + goto end_conv_1; + + dp = NULL; + while ((dp = enum_rstbl(dptbl, dp))) { + struct DIR_PAGE_ENTRY_32 *dp0 = (struct DIR_PAGE_ENTRY_32 *)dp; + // NOTE: Danger. Check for of boundary. 
+ memmove(&dp->vcn, &dp0->vcn_low, + 2 * sizeof(u64) + + le32_to_cpu(dp->lcns_follow) * sizeof(u64)); + } + +end_conv_1: + lcb_put(lcb); + lcb = NULL; + + /* + * Go through the table and remove the duplicates, + * remembering the oldest lsn values. + */ + if (sbi->cluster_size <= log->page_size) + goto trace_dp_table; + + dp = NULL; + while ((dp = enum_rstbl(dptbl, dp))) { + struct DIR_PAGE_ENTRY *next = dp; + + while ((next = enum_rstbl(dptbl, next))) { + if (next->target_attr == dp->target_attr && + next->vcn == dp->vcn) { + if (le64_to_cpu(next->oldest_lsn) < + le64_to_cpu(dp->oldest_lsn)) { + dp->oldest_lsn = next->oldest_lsn; + } + + free_rsttbl_idx(dptbl, PtrOffset(dptbl, next)); + } + } + } +trace_dp_table: +check_attribute_names: + /* The next record should be the Attribute Names. */ + if (!rst->attr_names_len) + goto check_attr_table; + + t64 = le64_to_cpu(rst->attr_names_lsn); + err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); + if (err) + goto out; + + lrh = lcb->log_rec; + frh = lcb->lrh; + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), + bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + t32 = lrh_length(lrh); + rec_len -= t32; + + attr_names = kmemdup(Add2Ptr(lrh, t32), rec_len, GFP_NOFS); + + lcb_put(lcb); + lcb = NULL; + +check_attr_table: + /* The next record should be the attribute Table. 
*/ + if (!rst->open_attr_len) + goto check_attribute_names2; + + t64 = le64_to_cpu(rst->open_attr_table_lsn); + err = read_log_rec_lcb(log, t64, lcb_ctx_prev, &lcb); + if (err) + goto out; + + lrh = lcb->log_rec; + frh = lcb->lrh; + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, le32_to_cpu(frh->transact_id), + bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + t16 = le16_to_cpu(lrh->redo_off); + + rt = Add2Ptr(lrh, t16); + t32 = rec_len - t16; + + if (!check_rstbl(rt, t32)) { + err = -EINVAL; + goto out; + } + + oatbl = kmemdup(rt, t32, GFP_NOFS); + if (!oatbl) { + err = -ENOMEM; + goto out; + } + + log->open_attr_tbl = oatbl; + + /* Clear all of the Attr pointers. */ + oe = NULL; + while ((oe = enum_rstbl(oatbl, oe))) { + if (!rst->major_ver) { + struct OPEN_ATTR_ENRTY_32 oe0; + + /* Really 'oe' points to OPEN_ATTR_ENRTY_32. */ + memcpy(&oe0, oe, SIZEOF_OPENATTRIBUTEENTRY0); + + oe->bytes_per_index = oe0.bytes_per_index; + oe->type = oe0.type; + oe->is_dirty_pages = oe0.is_dirty_pages; + oe->name_len = 0; + oe->ref = oe0.ref; + oe->open_record_lsn = oe0.open_record_lsn; + } + + oe->is_attr_name = 0; + oe->ptr = NULL; + } + + lcb_put(lcb); + lcb = NULL; + +check_attribute_names2: + if (!rst->attr_names_len) + goto trace_attribute_table; + + ane = attr_names; + if (!oatbl) + goto trace_attribute_table; + while (ane->off) { + /* TODO: Clear table on exit! */ + oe = Add2Ptr(oatbl, le16_to_cpu(ane->off)); + t16 = le16_to_cpu(ane->name_bytes); + oe->name_len = t16 / sizeof(short); + oe->ptr = ane->name; + oe->is_attr_name = 2; + ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16); + } + +trace_attribute_table: + /* + * If the checkpt_lsn is zero, then this is a freshly + * formatted disk and we have no work to do. 
+ */ + if (!checkpt_lsn) { + err = 0; + goto out; + } + + if (!oatbl) { + oatbl = init_rsttbl(bytes_per_attr_entry, 8); + if (!oatbl) { + err = -ENOMEM; + goto out; + } + } + + log->open_attr_tbl = oatbl; + + /* Start the analysis pass from the Checkpoint lsn. */ + rec_lsn = checkpt_lsn; + + /* Read the first lsn. */ + err = read_log_rec_lcb(log, checkpt_lsn, lcb_ctx_next, &lcb); + if (err) + goto out; + + /* Loop to read all subsequent records to the end of the log file. */ +next_log_record_analyze: + err = read_next_log_rec(log, lcb, &rec_lsn); + if (err) + goto out; + + if (!rec_lsn) + goto end_log_records_enumerate; + + frh = lcb->lrh; + transact_id = le32_to_cpu(frh->transact_id); + rec_len = le32_to_cpu(frh->client_data_len); + lrh = lcb->log_rec; + + if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + /* + * The first lsn after the previous lsn remembered + * the checkpoint is the first candidate for the rlsn. + */ + if (!rlsn) + rlsn = rec_lsn; + + if (LfsClientRecord != frh->record_type) + goto next_log_record_analyze; + + /* + * Now update the Transaction Table for this transaction. If there + * is no entry present or it is unallocated we allocate the entry. + */ + if (!trtbl) { + trtbl = init_rsttbl(sizeof(struct TRANSACTION_ENTRY), + INITIAL_NUMBER_TRANSACTIONS); + if (!trtbl) { + err = -ENOMEM; + goto out; + } + } + + tr = Add2Ptr(trtbl, transact_id); + + if (transact_id >= bytes_per_rt(trtbl) || + tr->next != RESTART_ENTRY_ALLOCATED_LE) { + tr = alloc_rsttbl_from_idx(&trtbl, transact_id); + if (!tr) { + err = -ENOMEM; + goto out; + } + tr->transact_state = TransactionActive; + tr->first_lsn = cpu_to_le64(rec_lsn); + } + + tr->prev_lsn = tr->undo_next_lsn = cpu_to_le64(rec_lsn); + + /* + * If this is a compensation log record, then change + * the undo_next_lsn to be the undo_next_lsn of this record. 
+ */ + if (lrh->undo_op == cpu_to_le16(CompensationLogRecord)) + tr->undo_next_lsn = frh->client_undo_next_lsn; + + /* Dispatch to handle log record depending on type. */ + switch (le16_to_cpu(lrh->redo_op)) { + case InitializeFileRecordSegment: + case DeallocateFileRecordSegment: + case WriteEndOfFileRecordSegment: + case CreateAttribute: + case DeleteAttribute: + case UpdateResidentValue: + case UpdateNonresidentValue: + case UpdateMappingPairs: + case SetNewAttributeSizes: + case AddIndexEntryRoot: + case DeleteIndexEntryRoot: + case AddIndexEntryAllocation: + case DeleteIndexEntryAllocation: + case WriteEndOfIndexBuffer: + case SetIndexEntryVcnRoot: + case SetIndexEntryVcnAllocation: + case UpdateFileNameRoot: + case UpdateFileNameAllocation: + case SetBitsInNonresidentBitMap: + case ClearBitsInNonresidentBitMap: + case UpdateRecordDataRoot: + case UpdateRecordDataAllocation: + case ZeroEndOfFileRecord: + t16 = le16_to_cpu(lrh->target_attr); + t64 = le64_to_cpu(lrh->target_vcn); + dp = find_dp(dptbl, t16, t64); + + if (dp) + goto copy_lcns; + + /* + * Calculate the number of clusters per page the system + * which wrote the checkpoint, possibly creating the table. + */ + if (dptbl) { + t32 = (le16_to_cpu(dptbl->size) - + sizeof(struct DIR_PAGE_ENTRY)) / + sizeof(u64); + } else { + t32 = log->clst_per_page; + kfree(dptbl); + dptbl = init_rsttbl(struct_size(dp, page_lcns, t32), + 32); + if (!dptbl) { + err = -ENOMEM; + goto out; + } + } + + dp = alloc_rsttbl_idx(&dptbl); + if (!dp) { + err = -ENOMEM; + goto out; + } + dp->target_attr = cpu_to_le32(t16); + dp->transfer_len = cpu_to_le32(t32 << sbi->cluster_bits); + dp->lcns_follow = cpu_to_le32(t32); + dp->vcn = cpu_to_le64(t64 & ~((u64)t32 - 1)); + dp->oldest_lsn = cpu_to_le64(rec_lsn); + +copy_lcns: + /* + * Copy the Lcns from the log record into the Dirty Page Entry. + * TODO: For different page size support, must somehow make + * whole routine a loop, case Lcns do not fit below. 
+ */ + t16 = le16_to_cpu(lrh->lcns_follow); + for (i = 0; i < t16; i++) { + size_t j = (size_t)(le64_to_cpu(lrh->target_vcn) - + le64_to_cpu(dp->vcn)); + dp->page_lcns[j + i] = lrh->page_lcns[i]; + } + + goto next_log_record_analyze; + + case DeleteDirtyClusters: { + u32 range_count = + le16_to_cpu(lrh->redo_len) / sizeof(struct LCN_RANGE); + const struct LCN_RANGE *r = + Add2Ptr(lrh, le16_to_cpu(lrh->redo_off)); + + /* Loop through all of the Lcn ranges this log record. */ + for (i = 0; i < range_count; i++, r++) { + u64 lcn0 = le64_to_cpu(r->lcn); + u64 lcn_e = lcn0 + le64_to_cpu(r->len) - 1; + + dp = NULL; + while ((dp = enum_rstbl(dptbl, dp))) { + u32 j; + + t32 = le32_to_cpu(dp->lcns_follow); + for (j = 0; j < t32; j++) { + t64 = le64_to_cpu(dp->page_lcns[j]); + if (t64 >= lcn0 && t64 <= lcn_e) + dp->page_lcns[j] = 0; + } + } + } + goto next_log_record_analyze; + ; + } + + case OpenNonresidentAttribute: + t16 = le16_to_cpu(lrh->target_attr); + if (t16 >= bytes_per_rt(oatbl)) { + /* + * Compute how big the table needs to be. + * Add 10 extra entries for some cushion. + */ + u32 new_e = t16 / le16_to_cpu(oatbl->size); + + new_e += 10 - le16_to_cpu(oatbl->used); + + oatbl = extend_rsttbl(oatbl, new_e, ~0u); + log->open_attr_tbl = oatbl; + if (!oatbl) { + err = -ENOMEM; + goto out; + } + } + + /* Point to the entry being opened. */ + oe = alloc_rsttbl_from_idx(&oatbl, t16); + log->open_attr_tbl = oatbl; + if (!oe) { + err = -ENOMEM; + goto out; + } + + /* Initialize this entry from the log record. */ + t16 = le16_to_cpu(lrh->redo_off); + if (!rst->major_ver) { + /* Convert version '0' into version '1'. 
*/ + struct OPEN_ATTR_ENRTY_32 *oe0 = Add2Ptr(lrh, t16); + + oe->bytes_per_index = oe0->bytes_per_index; + oe->type = oe0->type; + oe->is_dirty_pages = oe0->is_dirty_pages; + oe->name_len = 0; //oe0.name_len; + oe->ref = oe0->ref; + oe->open_record_lsn = oe0->open_record_lsn; + } else { + memcpy(oe, Add2Ptr(lrh, t16), bytes_per_attr_entry); + } + + t16 = le16_to_cpu(lrh->undo_len); + if (t16) { + oe->ptr = kmalloc(t16, GFP_NOFS); + if (!oe->ptr) { + err = -ENOMEM; + goto out; + } + oe->name_len = t16 / sizeof(short); + memcpy(oe->ptr, + Add2Ptr(lrh, le16_to_cpu(lrh->undo_off)), t16); + oe->is_attr_name = 1; + } else { + oe->ptr = NULL; + oe->is_attr_name = 0; + } + + goto next_log_record_analyze; + + case HotFix: + t16 = le16_to_cpu(lrh->target_attr); + t64 = le64_to_cpu(lrh->target_vcn); + dp = find_dp(dptbl, t16, t64); + if (dp) { + size_t j = le64_to_cpu(lrh->target_vcn) - + le64_to_cpu(dp->vcn); + if (dp->page_lcns[j]) + dp->page_lcns[j] = lrh->page_lcns[0]; + } + goto next_log_record_analyze; + + case EndTopLevelAction: + tr = Add2Ptr(trtbl, transact_id); + tr->prev_lsn = cpu_to_le64(rec_lsn); + tr->undo_next_lsn = frh->client_undo_next_lsn; + goto next_log_record_analyze; + + case PrepareTransaction: + tr = Add2Ptr(trtbl, transact_id); + tr->transact_state = TransactionPrepared; + goto next_log_record_analyze; + + case CommitTransaction: + tr = Add2Ptr(trtbl, transact_id); + tr->transact_state = TransactionCommitted; + goto next_log_record_analyze; + + case ForgetTransaction: + free_rsttbl_idx(trtbl, transact_id); + goto next_log_record_analyze; + + case Noop: + case OpenAttributeTableDump: + case AttributeNamesDump: + case DirtyPageTableDump: + case TransactionTableDump: + /* The following cases require no action the Analysis Pass. */ + goto next_log_record_analyze; + + default: + /* + * All codes will be explicitly handled. + * If we see a code we do not expect, then we are trouble. 
+ */ + goto next_log_record_analyze; + } + +end_log_records_enumerate: + lcb_put(lcb); + lcb = NULL; + + /* + * Scan the Dirty Page Table and Transaction Table for + * the lowest lsn, and return it as the Redo lsn. + */ + dp = NULL; + while ((dp = enum_rstbl(dptbl, dp))) { + t64 = le64_to_cpu(dp->oldest_lsn); + if (t64 && t64 < rlsn) + rlsn = t64; + } + + tr = NULL; + while ((tr = enum_rstbl(trtbl, tr))) { + t64 = le64_to_cpu(tr->first_lsn); + if (t64 && t64 < rlsn) + rlsn = t64; + } + + /* + * Only proceed if the Dirty Page Table or Transaction + * table are not empty. + */ + if ((!dptbl || !dptbl->total) && (!trtbl || !trtbl->total)) + goto end_reply; + + sbi->flags |= NTFS_FLAGS_NEED_REPLAY; + if (is_ro) + goto out; + + /* Reopen all of the attributes with dirty pages. */ + oe = NULL; +next_open_attribute: + + oe = enum_rstbl(oatbl, oe); + if (!oe) { + err = 0; + dp = NULL; + goto next_dirty_page; + } + + oa = kzalloc(sizeof(struct OpenAttr), GFP_NOFS); + if (!oa) { + err = -ENOMEM; + goto out; + } + + inode = ntfs_iget5(sbi->sb, &oe->ref, NULL); + if (IS_ERR(inode)) + goto fake_attr; + + if (is_bad_inode(inode)) { + iput(inode); +fake_attr: + if (oa->ni) { + iput(&oa->ni->vfs_inode); + oa->ni = NULL; + } + + attr = attr_create_nonres_log(sbi, oe->type, 0, oe->ptr, + oe->name_len, 0); + if (!attr) { + kfree(oa); + err = -ENOMEM; + goto out; + } + oa->attr = attr; + oa->run1 = &oa->run0; + goto final_oe; + } + + ni_oe = ntfs_i(inode); + oa->ni = ni_oe; + + attr = ni_find_attr(ni_oe, NULL, NULL, oe->type, oe->ptr, oe->name_len, + NULL, NULL); + + if (!attr) + goto fake_attr; + + t32 = le32_to_cpu(attr->size); + oa->attr = kmemdup(attr, t32, GFP_NOFS); + if (!oa->attr) + goto fake_attr; + + if (!S_ISDIR(inode->i_mode)) { + if (attr->type == ATTR_DATA && !attr->name_len) { + oa->run1 = &ni_oe->file.run; + goto final_oe; + } + } else { + if (attr->type == ATTR_ALLOC && + attr->name_len == ARRAY_SIZE(I30_NAME) && + !memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) 
{ + oa->run1 = &ni_oe->dir.alloc_run; + goto final_oe; + } + } + + if (attr->non_res) { + u16 roff = le16_to_cpu(attr->nres.run_off); + CLST svcn = le64_to_cpu(attr->nres.svcn); + + err = run_unpack(&oa->run0, sbi, inode->i_ino, svcn, + le64_to_cpu(attr->nres.evcn), svcn, + Add2Ptr(attr, roff), t32 - roff); + if (err < 0) { + kfree(oa->attr); + oa->attr = NULL; + goto fake_attr; + } + err = 0; + } + oa->run1 = &oa->run0; + attr = oa->attr; + +final_oe: + if (oe->is_attr_name == 1) + kfree(oe->ptr); + oe->is_attr_name = 0; + oe->ptr = oa; + oe->name_len = attr->name_len; + + goto next_open_attribute; + + /* + * Now loop through the dirty page table to extract all of the Vcn/Lcn. + * Mapping that we have, and insert it into the appropriate run. + */ +next_dirty_page: + dp = enum_rstbl(dptbl, dp); + if (!dp) + goto do_redo_1; + + oe = Add2Ptr(oatbl, le32_to_cpu(dp->target_attr)); + + if (oe->next != RESTART_ENTRY_ALLOCATED_LE) + goto next_dirty_page; + + oa = oe->ptr; + if (!oa) + goto next_dirty_page; + + i = -1; +next_dirty_page_vcn: + i += 1; + if (i >= le32_to_cpu(dp->lcns_follow)) + goto next_dirty_page; + + vcn = le64_to_cpu(dp->vcn) + i; + size = (vcn + 1) << sbi->cluster_bits; + + if (!dp->page_lcns[i]) + goto next_dirty_page_vcn; + + rno = ino_get(&oe->ref); + if (rno <= MFT_REC_MIRR && + size < (MFT_REC_VOL + 1) * sbi->record_size && + oe->type == ATTR_DATA) { + goto next_dirty_page_vcn; + } + + lcn = le64_to_cpu(dp->page_lcns[i]); + + if ((!run_lookup_entry(oa->run1, vcn, &lcn0, &len0, NULL) || + lcn0 != lcn) && + !run_add_entry(oa->run1, vcn, lcn, 1, false)) { + err = -ENOMEM; + goto out; + } + attr = oa->attr; + t64 = le64_to_cpu(attr->nres.alloc_size); + if (size > t64) { + attr->nres.valid_size = attr->nres.data_size = + attr->nres.alloc_size = cpu_to_le64(size); + } + goto next_dirty_page_vcn; + +do_redo_1: + /* + * Perform the Redo Pass, to restore all of the dirty pages to the same + * contents that they had immediately before the crash. 
If the dirty + * page table is empty, then we can skip the entire Redo Pass. + */ + if (!dptbl || !dptbl->total) + goto do_undo_action; + + rec_lsn = rlsn; + + /* + * Read the record at the Redo lsn, before falling + * into common code to handle each record. + */ + err = read_log_rec_lcb(log, rlsn, lcb_ctx_next, &lcb); + if (err) + goto out; + + /* + * Now loop to read all of our log records forwards, until + * we hit the end of the file, cleaning up at the end. + */ +do_action_next: + frh = lcb->lrh; + + if (LfsClientRecord != frh->record_type) + goto read_next_log_do_action; + + transact_id = le32_to_cpu(frh->transact_id); + rec_len = le32_to_cpu(frh->client_data_len); + lrh = lcb->log_rec; + + if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + /* Ignore log records that do not update pages. */ + if (lrh->lcns_follow) + goto find_dirty_page; + + goto read_next_log_do_action; + +find_dirty_page: + t16 = le16_to_cpu(lrh->target_attr); + t64 = le64_to_cpu(lrh->target_vcn); + dp = find_dp(dptbl, t16, t64); + + if (!dp) + goto read_next_log_do_action; + + if (rec_lsn < le64_to_cpu(dp->oldest_lsn)) + goto read_next_log_do_action; + + t16 = le16_to_cpu(lrh->target_attr); + if (t16 >= bytes_per_rt(oatbl)) { + err = -EINVAL; + goto out; + } + + oe = Add2Ptr(oatbl, t16); + + if (oe->next != RESTART_ENTRY_ALLOCATED_LE) { + err = -EINVAL; + goto out; + } + + oa = oe->ptr; + + if (!oa) { + err = -EINVAL; + goto out; + } + attr = oa->attr; + + vcn = le64_to_cpu(lrh->target_vcn); + + if (!run_lookup_entry(oa->run1, vcn, &lcn, NULL, NULL) || + lcn == SPARSE_LCN) { + goto read_next_log_do_action; + } + + /* Point to the Redo data and get its length. */ + data = Add2Ptr(lrh, le16_to_cpu(lrh->redo_off)); + dlen = le16_to_cpu(lrh->redo_len); + + /* Shorten length by any Lcns which were deleted. 
*/ + saved_len = dlen; + + for (i = le16_to_cpu(lrh->lcns_follow); i; i--) { + size_t j; + u32 alen, voff; + + voff = le16_to_cpu(lrh->record_off) + + le16_to_cpu(lrh->attr_off); + voff += le16_to_cpu(lrh->cluster_off) << SECTOR_SHIFT; + + /* If the Vcn question is allocated, we can just get out. */ + j = le64_to_cpu(lrh->target_vcn) - le64_to_cpu(dp->vcn); + if (dp->page_lcns[j + i - 1]) + break; + + if (!saved_len) + saved_len = 1; + + /* + * Calculate the allocated space left relative to the + * log record Vcn, after removing this unallocated Vcn. + */ + alen = (i - 1) << sbi->cluster_bits; + + /* + * If the update described this log record goes beyond + * the allocated space, then we will have to reduce the length. + */ + if (voff >= alen) + dlen = 0; + else if (voff + dlen > alen) + dlen = alen - voff; + } + + /* + * If the resulting dlen from above is now zero, + * we can skip this log record. + */ + if (!dlen && saved_len) + goto read_next_log_do_action; + + t16 = le16_to_cpu(lrh->redo_op); + if (can_skip_action(t16)) + goto read_next_log_do_action; + + /* Apply the Redo operation a common routine. */ + err = do_action(log, oe, lrh, t16, data, dlen, rec_len, &rec_lsn); + if (err) + goto out; + + /* Keep reading and looping back until end of file. */ +read_next_log_do_action: + err = read_next_log_rec(log, lcb, &rec_lsn); + if (!err && rec_lsn) + goto do_action_next; + + lcb_put(lcb); + lcb = NULL; + +do_undo_action: + /* Scan Transaction Table. */ + tr = NULL; +transaction_table_next: + tr = enum_rstbl(trtbl, tr); + if (!tr) + goto undo_action_done; + + if (TransactionActive != tr->transact_state || !tr->undo_next_lsn) { + free_rsttbl_idx(trtbl, PtrOffset(trtbl, tr)); + goto transaction_table_next; + } + + log->transaction_id = PtrOffset(trtbl, tr); + undo_next_lsn = le64_to_cpu(tr->undo_next_lsn); + + /* + * We only have to do anything if the transaction has + * something its undo_next_lsn field. 
+ */ + if (!undo_next_lsn) + goto commit_undo; + + /* Read the first record to be undone by this transaction. */ + err = read_log_rec_lcb(log, undo_next_lsn, lcb_ctx_undo_next, &lcb); + if (err) + goto out; + + /* + * Now loop to read all of our log records forwards, + * until we hit the end of the file, cleaning up at the end. + */ +undo_action_next: + + lrh = lcb->log_rec; + frh = lcb->lrh; + transact_id = le32_to_cpu(frh->transact_id); + rec_len = le32_to_cpu(frh->client_data_len); + + if (!check_log_rec(lrh, rec_len, transact_id, bytes_per_attr_entry)) { + err = -EINVAL; + goto out; + } + + if (lrh->undo_op == cpu_to_le16(Noop)) + goto read_next_log_undo_action; + + oe = Add2Ptr(oatbl, le16_to_cpu(lrh->target_attr)); + oa = oe->ptr; + + t16 = le16_to_cpu(lrh->lcns_follow); + if (!t16) + goto add_allocated_vcns; + + is_mapped = run_lookup_entry(oa->run1, le64_to_cpu(lrh->target_vcn), + &lcn, &clen, NULL); + + /* + * If the mapping isn't already the table or the mapping + * corresponds to a hole the mapping, we need to make sure + * there is no partial page already memory. + */ + if (is_mapped && lcn != SPARSE_LCN && clen >= t16) + goto add_allocated_vcns; + + vcn = le64_to_cpu(lrh->target_vcn); + vcn &= ~(log->clst_per_page - 1); + +add_allocated_vcns: + for (i = 0, vcn = le64_to_cpu(lrh->target_vcn), + size = (vcn + 1) << sbi->cluster_bits; + i < t16; i++, vcn += 1, size += sbi->cluster_size) { + attr = oa->attr; + if (!attr->non_res) { + if (size > le32_to_cpu(attr->res.data_size)) + attr->res.data_size = cpu_to_le32(size); + } else { + if (size > le64_to_cpu(attr->nres.data_size)) + attr->nres.valid_size = attr->nres.data_size = + attr->nres.alloc_size = + cpu_to_le64(size); + } + } + + t16 = le16_to_cpu(lrh->undo_op); + if (can_skip_action(t16)) + goto read_next_log_undo_action; + + /* Point to the Redo data and get its length. 
*/ + data = Add2Ptr(lrh, le16_to_cpu(lrh->undo_off)); + dlen = le16_to_cpu(lrh->undo_len); + + /* It is time to apply the undo action. */ + err = do_action(log, oe, lrh, t16, data, dlen, rec_len, NULL); + +read_next_log_undo_action: + /* + * Keep reading and looping back until we have read the + * last record for this transaction. + */ + err = read_next_log_rec(log, lcb, &rec_lsn); + if (err) + goto out; + + if (rec_lsn) + goto undo_action_next; + + lcb_put(lcb); + lcb = NULL; + +commit_undo: + free_rsttbl_idx(trtbl, log->transaction_id); + + log->transaction_id = 0; + + goto transaction_table_next; + +undo_action_done: + + ntfs_update_mftmirr(sbi, 0); + + sbi->flags &= ~NTFS_FLAGS_NEED_REPLAY; + +end_reply: + + err = 0; + if (is_ro) + goto out; + + rh = kzalloc(log->page_size, GFP_NOFS); + if (!rh) { + err = -ENOMEM; + goto out; + } + + rh->rhdr.sign = NTFS_RSTR_SIGNATURE; + rh->rhdr.fix_off = cpu_to_le16(offsetof(struct RESTART_HDR, fixups)); + t16 = (log->page_size >> SECTOR_SHIFT) + 1; + rh->rhdr.fix_num = cpu_to_le16(t16); + rh->sys_page_size = cpu_to_le32(log->page_size); + rh->page_size = cpu_to_le32(log->page_size); + + t16 = ALIGN(offsetof(struct RESTART_HDR, fixups) + sizeof(short) * t16, + 8); + rh->ra_off = cpu_to_le16(t16); + rh->minor_ver = cpu_to_le16(1); // 0x1A: + rh->major_ver = cpu_to_le16(1); // 0x1C: + + ra2 = Add2Ptr(rh, t16); + memcpy(ra2, ra, sizeof(struct RESTART_AREA)); + + ra2->client_idx[0] = 0; + ra2->client_idx[1] = LFS_NO_CLIENT_LE; + ra2->flags = cpu_to_le16(2); + + le32_add_cpu(&ra2->open_log_count, 1); + + ntfs_fix_pre_write(&rh->rhdr, log->page_size); + + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size); + if (!err) + err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size, + rh, log->page_size); + + kfree(rh); + if (err) + goto out; + +out: + kfree(rst); + if (lcb) + lcb_put(lcb); + + /* + * Scan the Open Attribute Table to close all of + * the open attributes. 
+ */ + oe = NULL; + while ((oe = enum_rstbl(oatbl, oe))) { + rno = ino_get(&oe->ref); + + if (oe->is_attr_name == 1) { + kfree(oe->ptr); + oe->ptr = NULL; + continue; + } + + if (oe->is_attr_name) + continue; + + oa = oe->ptr; + if (!oa) + continue; + + run_close(&oa->run0); + kfree(oa->attr); + if (oa->ni) + iput(&oa->ni->vfs_inode); + kfree(oa); + } + + kfree(trtbl); + kfree(oatbl); + kfree(dptbl); + kfree(attr_names); + kfree(rst_info.r_page); + + kfree(ra); + kfree(log->one_page_buf); + + if (err) + sbi->flags |= NTFS_FLAGS_NEED_REPLAY; + + if (err == -EROFS) + err = 0; + else if (log->set_dirty) + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + + kfree(log); + + return err; +} diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c new file mode 100644 index 000000000000..91e3743e1442 --- /dev/null +++ b/fs/ntfs3/fsntfs.c @@ -0,0 +1,2509 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +// clang-format off +const struct cpu_str NAME_MFT = { + 4, 0, { '$', 'M', 'F', 'T' }, +}; +const struct cpu_str NAME_MIRROR = { + 8, 0, { '$', 'M', 'F', 'T', 'M', 'i', 'r', 'r' }, +}; +const struct cpu_str NAME_LOGFILE = { + 8, 0, { '$', 'L', 'o', 'g', 'F', 'i', 'l', 'e' }, +}; +const struct cpu_str NAME_VOLUME = { + 7, 0, { '$', 'V', 'o', 'l', 'u', 'm', 'e' }, +}; +const struct cpu_str NAME_ATTRDEF = { + 8, 0, { '$', 'A', 't', 't', 'r', 'D', 'e', 'f' }, +}; +const struct cpu_str NAME_ROOT = { + 1, 0, { '.' 
}, +}; +const struct cpu_str NAME_BITMAP = { + 7, 0, { '$', 'B', 'i', 't', 'm', 'a', 'p' }, +}; +const struct cpu_str NAME_BOOT = { + 5, 0, { '$', 'B', 'o', 'o', 't' }, +}; +const struct cpu_str NAME_BADCLUS = { + 8, 0, { '$', 'B', 'a', 'd', 'C', 'l', 'u', 's' }, +}; +const struct cpu_str NAME_QUOTA = { + 6, 0, { '$', 'Q', 'u', 'o', 't', 'a' }, +}; +const struct cpu_str NAME_SECURE = { + 7, 0, { '$', 'S', 'e', 'c', 'u', 'r', 'e' }, +}; +const struct cpu_str NAME_UPCASE = { + 7, 0, { '$', 'U', 'p', 'C', 'a', 's', 'e' }, +}; +const struct cpu_str NAME_EXTEND = { + 7, 0, { '$', 'E', 'x', 't', 'e', 'n', 'd' }, +}; +const struct cpu_str NAME_OBJID = { + 6, 0, { '$', 'O', 'b', 'j', 'I', 'd' }, +}; +const struct cpu_str NAME_REPARSE = { + 8, 0, { '$', 'R', 'e', 'p', 'a', 'r', 's', 'e' }, +}; +const struct cpu_str NAME_USNJRNL = { + 8, 0, { '$', 'U', 's', 'n', 'J', 'r', 'n', 'l' }, +}; +const __le16 BAD_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('B'), cpu_to_le16('a'), cpu_to_le16('d'), +}; +const __le16 I30_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('I'), cpu_to_le16('3'), cpu_to_le16('0'), +}; +const __le16 SII_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('I'), cpu_to_le16('I'), +}; +const __le16 SDH_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('H'), +}; +const __le16 SDS_NAME[4] = { + cpu_to_le16('$'), cpu_to_le16('S'), cpu_to_le16('D'), cpu_to_le16('S'), +}; +const __le16 SO_NAME[2] = { + cpu_to_le16('$'), cpu_to_le16('O'), +}; +const __le16 SQ_NAME[2] = { + cpu_to_le16('$'), cpu_to_le16('Q'), +}; +const __le16 SR_NAME[2] = { + cpu_to_le16('$'), cpu_to_le16('R'), +}; + +#ifdef CONFIG_NTFS3_LZX_XPRESS +const __le16 WOF_NAME[17] = { + cpu_to_le16('W'), cpu_to_le16('o'), cpu_to_le16('f'), cpu_to_le16('C'), + cpu_to_le16('o'), cpu_to_le16('m'), cpu_to_le16('p'), cpu_to_le16('r'), + cpu_to_le16('e'), cpu_to_le16('s'), cpu_to_le16('s'), cpu_to_le16('e'), + cpu_to_le16('d'), cpu_to_le16('D'), cpu_to_le16('a'), 
cpu_to_le16('t'), + cpu_to_le16('a'), +}; +#endif + +// clang-format on + +/* + * ntfs_fix_pre_write - Insert fixups into @rhdr before writing to disk. + */ +bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes) +{ + u16 *fixup, *ptr; + u16 sample; + u16 fo = le16_to_cpu(rhdr->fix_off); + u16 fn = le16_to_cpu(rhdr->fix_num); + + if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || + fn * SECTOR_SIZE > bytes) { + return false; + } + + /* Get fixup pointer. */ + fixup = Add2Ptr(rhdr, fo); + + if (*fixup >= 0x7FFF) + *fixup = 1; + else + *fixup += 1; + + sample = *fixup; + + ptr = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short)); + + while (fn--) { + *++fixup = *ptr; + *ptr = sample; + ptr += SECTOR_SIZE / sizeof(short); + } + return true; +} + +/* + * ntfs_fix_post_read - Remove fixups after reading from disk. + * + * Return: < 0 if error, 0 if ok, 1 if need to update fixups. + */ +int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, + bool simple) +{ + int ret; + u16 *fixup, *ptr; + u16 sample, fo, fn; + + fo = le16_to_cpu(rhdr->fix_off); + fn = simple ? ((bytes >> SECTOR_SHIFT) + 1) + : le16_to_cpu(rhdr->fix_num); + + /* Check errors. */ + if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || + fn * SECTOR_SIZE > bytes) { + return -EINVAL; /* Native chkntfs returns ok! */ + } + + /* Get fixup pointer. */ + fixup = Add2Ptr(rhdr, fo); + sample = *fixup; + ptr = Add2Ptr(rhdr, SECTOR_SIZE - sizeof(short)); + ret = 0; + + while (fn--) { + /* Test current word. */ + if (*ptr != sample) { + /* Fixup does not match! Is it serious error? */ + ret = -E_NTFS_FIXUP; + } + + /* Replace fixup. */ + *ptr = *++fixup; + ptr += SECTOR_SIZE / sizeof(short); + } + + return ret; +} + +/* + * ntfs_extend_init - Load $Extend file. 
+ */ +int ntfs_extend_init(struct ntfs_sb_info *sbi) +{ + int err; + struct super_block *sb = sbi->sb; + struct inode *inode, *inode2; + struct MFT_REF ref; + + if (sbi->volume.major_ver < 3) { + ntfs_notice(sb, "Skip $Extend 'cause NTFS version"); + return 0; + } + + ref.low = cpu_to_le32(MFT_REC_EXTEND); + ref.high = 0; + ref.seq = cpu_to_le16(MFT_REC_EXTEND); + inode = ntfs_iget5(sb, &ref, &NAME_EXTEND); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $Extend."); + inode = NULL; + goto out; + } + + /* If ntfs_iget5() reads from disk it never returns bad inode. */ + if (!S_ISDIR(inode->i_mode)) { + err = -EINVAL; + goto out; + } + + /* Try to find $ObjId */ + inode2 = dir_search_u(inode, &NAME_OBJID, NULL); + if (inode2 && !IS_ERR(inode2)) { + if (is_bad_inode(inode2)) { + iput(inode2); + } else { + sbi->objid.ni = ntfs_i(inode2); + sbi->objid_no = inode2->i_ino; + } + } + + /* Try to find $Quota */ + inode2 = dir_search_u(inode, &NAME_QUOTA, NULL); + if (inode2 && !IS_ERR(inode2)) { + sbi->quota_no = inode2->i_ino; + iput(inode2); + } + + /* Try to find $Reparse */ + inode2 = dir_search_u(inode, &NAME_REPARSE, NULL); + if (inode2 && !IS_ERR(inode2)) { + sbi->reparse.ni = ntfs_i(inode2); + sbi->reparse_no = inode2->i_ino; + } + + /* Try to find $UsnJrnl */ + inode2 = dir_search_u(inode, &NAME_USNJRNL, NULL); + if (inode2 && !IS_ERR(inode2)) { + sbi->usn_jrnl_no = inode2->i_ino; + iput(inode2); + } + + err = 0; +out: + iput(inode); + return err; +} + +int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi) +{ + int err = 0; + struct super_block *sb = sbi->sb; + bool initialized = false; + struct MFT_REF ref; + struct inode *inode; + + /* Check for 4GB. 
*/ + if (ni->vfs_inode.i_size >= 0x100000000ull) { + ntfs_err(sb, "\x24LogFile is too big"); + err = -EINVAL; + goto out; + } + + sbi->flags |= NTFS_FLAGS_LOG_REPLAYING; + + ref.low = cpu_to_le32(MFT_REC_MFT); + ref.high = 0; + ref.seq = cpu_to_le16(1); + + inode = ntfs_iget5(sb, &ref, NULL); + + if (IS_ERR(inode)) + inode = NULL; + + if (!inode) { + /* Try to use MFT copy. */ + u64 t64 = sbi->mft.lbo; + + sbi->mft.lbo = sbi->mft.lbo2; + inode = ntfs_iget5(sb, &ref, NULL); + sbi->mft.lbo = t64; + if (IS_ERR(inode)) + inode = NULL; + } + + if (!inode) { + err = -EINVAL; + ntfs_err(sb, "Failed to load $MFT."); + goto out; + } + + sbi->mft.ni = ntfs_i(inode); + + /* LogFile should not contains attribute list. */ + err = ni_load_all_mi(sbi->mft.ni); + if (!err) + err = log_replay(ni, &initialized); + + iput(inode); + sbi->mft.ni = NULL; + + sync_blockdev(sb->s_bdev); + invalidate_bdev(sb->s_bdev); + + if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) { + err = 0; + goto out; + } + + if (sb_rdonly(sb) || !initialized) + goto out; + + /* Fill LogFile by '-1' if it is initialized. */ + err = ntfs_bio_fill_1(sbi, &ni->file.run); + +out: + sbi->flags &= ~NTFS_FLAGS_LOG_REPLAYING; + + return err; +} + +/* + * ntfs_query_def + * + * Return: Current ATTR_DEF_ENTRY for given attribute type. + */ +const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi, + enum ATTR_TYPE type) +{ + int type_in = le32_to_cpu(type); + size_t min_idx = 0; + size_t max_idx = sbi->def_entries - 1; + + while (min_idx <= max_idx) { + size_t i = min_idx + ((max_idx - min_idx) >> 1); + const struct ATTR_DEF_ENTRY *entry = sbi->def_table + i; + int diff = le32_to_cpu(entry->type) - type_in; + + if (!diff) + return entry; + if (diff < 0) + min_idx = i + 1; + else if (i) + max_idx = i - 1; + else + return NULL; + } + return NULL; +} + +/* + * ntfs_look_for_free_space - Look for a free space in bitmap. 
+ */ +int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, + CLST *new_lcn, CLST *new_len, + enum ALLOCATE_OPT opt) +{ /* Holds sbi->used.bitmap.rw_lock for write while searching. Returns 0 and fills *new_lcn / *new_len when clusters were found, -ENOSPC when none were, or the wnd_set_used() error on the ALLOCATE_MFT path. */ + int err; + CLST alen = 0; + struct super_block *sb = sbi->sb; + size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + if (opt & ALLOCATE_MFT) { /* MFT requests are served only from the MFT zone. */ + zlen = wnd_zone_len(wnd); + + if (!zlen) { + err = ntfs_refresh_zone(sbi); + if (err) /* NOTE(review): alen is still 0 here, so 'out' replaces this error with -ENOSPC. */ + goto out; + zlen = wnd_zone_len(wnd); + } + + if (!zlen) { + ntfs_err(sbi->sb, "no free space to extend mft"); + goto out; + } + + lcn = wnd_zone_bit(wnd); + alen = zlen > len ? len : zlen; + + wnd_zone_set(wnd, lcn + alen, zlen - alen); /* Shrink the zone by the part handed out. */ + + err = wnd_set_used(wnd, lcn, alen); + if (err) { + up_write(&wnd->rw_lock); + return err; + } + alcn = lcn; + goto out; + } + /* + * 'Cause cluster 0 is always used this value means that we should use + * cached value of 'next_free_lcn' to improve performance. + */ + if (!lcn) + lcn = sbi->used.next_free_lcn; + + if (lcn >= wnd->nbits) + lcn = 0; + + alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn); + if (alen) + goto out; + + /* Try to use clusters from MftZone. */ + zlen = wnd_zone_len(wnd); + zeroes = wnd_zeroes(wnd); + + /* Check too big request */ + if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) + goto out; + + /* How many clusters to cut from the zone: at least half of it, at most all of it. */ + zlcn = wnd_zone_bit(wnd); + zlen2 = zlen >> 1; + ztrim = len > zlen ? zlen : (len > zlen2 ? len : zlen2); + new_zlen = zlen - ztrim; + + if (new_zlen < NTFS_MIN_MFT_ZONE) { + new_zlen = NTFS_MIN_MFT_ZONE; + if (new_zlen > zlen) + new_zlen = zlen; + } + + wnd_zone_set(wnd, zlcn, new_zlen); + + /* Allocate contiguous clusters, searching again from bit 0. */ + alen = wnd_find(wnd, len, 0, + BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn); + +out: + if (alen) { + err = 0; + *new_len = alen; + *new_lcn = alcn; + + ntfs_unmap_meta(sb, alcn, alen); + + /* Set hint for next requests.
*/ + if (!(opt & ALLOCATE_MFT)) + sbi->used.next_free_lcn = alcn + alen; + } else { + err = -ENOSPC; + } + + up_write(&wnd->rw_lock); + return err; +} + +/* + * ntfs_extend_mft - Allocate additional MFT records. + * + * sbi->mft.bitmap is locked for write. + * + * NOTE: recursive: + * ntfs_look_free_mft -> + * ntfs_extend_mft -> + * attr_set_size -> + * ni_insert_nonresident -> + * ni_insert_attr -> + * ni_ins_attr_ext -> + * ntfs_look_free_mft -> + * ntfs_extend_mft + * + * To avoid recursive always allocate space for two new MFT records + * see attrib.c: "at least two MFT to avoid recursive loop". + * + * Return: 0 on success, or the error from attr_set_size(), wnd_extend() + * or _ni_write_inode(). + */ +static int ntfs_extend_mft(struct ntfs_sb_info *sbi) +{ + int err; + struct ntfs_inode *ni = sbi->mft.ni; + size_t new_mft_total; + u64 new_mft_bytes, new_bitmap_bytes; + struct ATTRIB *attr; + struct wnd_bitmap *wnd = &sbi->mft.bitmap; + + new_mft_total = (wnd->nbits + MFT_INCREASE_CHUNK + 127) & (CLST)~127; /* Grow by one chunk, rounded up to a multiple of 128 records. */ + new_mft_bytes = (u64)new_mft_total << sbi->record_bits; + + /* Step 1: Resize $MFT::DATA. */ + down_write(&ni->file.run_lock); + err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, + new_mft_bytes, NULL, false, &attr); + + if (err) { + up_write(&ni->file.run_lock); + goto out; + } + + attr->nres.valid_size = attr->nres.data_size; + new_mft_total = le64_to_cpu(attr->nres.alloc_size) >> sbi->record_bits; /* Use what was really allocated. */ + ni->mi.dirty = true; + + /* Step 2: Resize $MFT::BITMAP. */ + new_bitmap_bytes = bitmap_size(new_mft_total); + + err = attr_set_size(ni, ATTR_BITMAP, NULL, 0, &sbi->mft.bitmap.run, + new_bitmap_bytes, &new_bitmap_bytes, true, NULL); /* err is checked only after the locks below are dropped. */ + + /* Refresh MFT Zone if necessary.
*/ + down_write_nested(&sbi->used.bitmap.rw_lock, BITMAP_MUTEX_CLUSTERS); + + ntfs_refresh_zone(sbi); /* NOTE(review): return value is not checked. */ + + up_write(&sbi->used.bitmap.rw_lock); + up_write(&ni->file.run_lock); + + if (err) + goto out; + + err = wnd_extend(wnd, new_mft_total); + + if (err) + goto out; + + ntfs_clear_mft_tail(sbi, sbi->mft.used, new_mft_total); /* NOTE(review): return value is not checked. */ + + err = _ni_write_inode(&ni->vfs_inode, 0); +out: + return err; +} + +/* + * ntfs_look_free_mft - Look for a free MFT record. + * + * @rno: receives the number of the record that was found. + * @mft: true when the record is requested for the MFT itself; in that case + * sbi->mft.bitmap.rw_lock is not taken here (presumably the caller + * already holds it - confirm). + * @ni: if not NULL, the record is attached to it with ni_add_subrecord(), + * which also receives @mi. + * + * Return: 0 on success, -ENOMEM if ni_add_subrecord() fails, or the error + * from ntfs_extend_mft(). + */ +int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, + struct ntfs_inode *ni, struct mft_inode **mi) +{ + int err = 0; + size_t zbit, zlen, from, to, fr; + size_t mft_total; + struct MFT_REF ref; + struct super_block *sb = sbi->sb; + struct wnd_bitmap *wnd = &sbi->mft.bitmap; + u32 ir; + + static_assert(sizeof(sbi->mft.reserved_bitmap) * 8 >= + MFT_REC_FREE - MFT_REC_RESERVED); + + if (!mft) + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); + + zlen = wnd_zone_len(wnd); + + /* Always reserve space for MFT. */ + if (zlen) { + if (mft) { + zbit = wnd_zone_bit(wnd); + *rno = zbit; + wnd_zone_set(wnd, zbit + 1, zlen - 1); + } + goto found; + } + + /* No MFT zone. Find the nearest to '0' free MFT. */ + if (!wnd_find(wnd, 1, MFT_REC_FREE, 0, &zbit)) { + /* Resize MFT */ + mft_total = wnd->nbits; + + err = ntfs_extend_mft(sbi); + if (!err) { + zbit = mft_total; /* First record of the newly added tail. */ + goto reserve_mft; + } + + if (!mft || MFT_REC_FREE == sbi->mft.next_reserved) + goto out; + + err = 0; + + /* + * Look for free record reserved area [11-16) == + * [MFT_REC_RESERVED, MFT_REC_FREE ) MFT bitmap always + * marks it as used. + */ + if (!sbi->mft.reserved_bitmap) { + /* Once per session create internal bitmap for 5 bits.
*/ + sbi->mft.reserved_bitmap = 0xFF; + + ref.high = 0; + for (ir = MFT_REC_RESERVED; ir < MFT_REC_FREE; ir++) { + struct inode *i; + struct ntfs_inode *ni; /* Shadows the function parameter 'ni'. */ + struct MFT_REC *mrec; + + ref.low = cpu_to_le32(ir); + ref.seq = cpu_to_le16(ir); + + i = ntfs_iget5(sb, &ref, NULL); + if (IS_ERR(i)) { +next: + ntfs_notice( + sb, + "Invalid reserved record %x", + ref.low); + continue; + } + if (is_bad_inode(i)) { + iput(i); + goto next; + } + + ni = ntfs_i(i); + + mrec = ni->mi.mrec; + + /* NOTE(review): no iput(i) is visible on the 'goto next' paths below, nor after __clear_bit() - confirm the reference is meant to be kept. */ + if (!is_rec_base(mrec)) + goto next; + + if (mrec->hard_links) + goto next; + + if (!ni_std(ni)) + goto next; + + if (ni_find_attr(ni, NULL, NULL, ATTR_NAME, + NULL, 0, NULL, NULL)) + goto next; + + __clear_bit(ir - MFT_REC_RESERVED, + &sbi->mft.reserved_bitmap); + } + } + + /* Scan 5 bits for zero. Bit 0 == MFT_REC_RESERVED */ + /* NOTE(review): the search below starts at bit MFT_REC_RESERVED while __clear_bit() above uses bit (ir - MFT_REC_RESERVED) - confirm the intended bit numbering. */ + zbit = find_next_zero_bit(&sbi->mft.reserved_bitmap, + MFT_REC_FREE, MFT_REC_RESERVED); + if (zbit >= MFT_REC_FREE) { + sbi->mft.next_reserved = MFT_REC_FREE; + goto out; /* NOTE(review): leaves with err == 0 and *rno untouched. */ + } + + zlen = 1; + sbi->mft.next_reserved = zbit; + } else { +reserve_mft: + zlen = zbit == MFT_REC_FREE ? (MFT_REC_USER - MFT_REC_FREE) : 4; + if (zbit + zlen > wnd->nbits) + zlen = wnd->nbits - zbit; + + while (zlen > 1 && !wnd_is_free(wnd, zbit, zlen)) + zlen -= 1; + + /* [zbit, zbit + zlen) will be used for MFT itself. */ + from = sbi->mft.used; + if (from < zbit) + from = zbit; + to = zbit + zlen; + if (from < to) { + ntfs_clear_mft_tail(sbi, from, to); + sbi->mft.used = to; + } + } + + if (mft) { + *rno = zbit; + zbit += 1; + zlen -= 1; + } + + wnd_zone_set(wnd, zbit, zlen); + +found: + if (!mft) { + /* The request to get record for general purpose.
*/ + if (sbi->mft.next_free < MFT_REC_USER) + sbi->mft.next_free = MFT_REC_USER; + + for (;;) { /* Search, extending the MFT each time nothing is free. */ + if (sbi->mft.next_free >= sbi->mft.bitmap.nbits) { + } else if (!wnd_find(wnd, 1, MFT_REC_USER, 0, &fr)) { + sbi->mft.next_free = sbi->mft.bitmap.nbits; + } else { + *rno = fr; + sbi->mft.next_free = *rno + 1; + break; + } + + err = ntfs_extend_mft(sbi); + if (err) + goto out; + } + } + + if (ni && !ni_add_subrecord(ni, *rno, mi)) { + err = -ENOMEM; + goto out; + } + + /* We have found a record that are not reserved for next MFT. */ + if (*rno >= MFT_REC_FREE) + wnd_set_used(wnd, *rno, 1); + else if (*rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) /* NOTE(review): reserved_bitmap_inited is not assigned anywhere in this function - confirm where it is set. */ + __set_bit(*rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + +out: + if (!mft) + up_write(&wnd->rw_lock); + + return err; +} + +/* + * ntfs_mark_rec_free - Mark record as free. + * + * Takes sbi->mft.bitmap.rw_lock for write. Records >= MFT_REC_FREE are + * cleared in the MFT bitmap (the volume is flagged NTFS_DIRTY_ERROR if the + * bit was not set). Afterwards the MFT zone, or else the next_free hint, + * is moved back to @rno when @rno precedes it. + */ +void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno) +{ + struct wnd_bitmap *wnd = &sbi->mft.bitmap; + + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_MFT); + if (rno >= wnd->nbits) + goto out; + + if (rno >= MFT_REC_FREE) { + if (!wnd_is_used(wnd, rno, 1)) + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + else + wnd_set_free(wnd, rno, 1); + } else if (rno >= MFT_REC_RESERVED && sbi->mft.reserved_bitmap_inited) { + __clear_bit(rno - MFT_REC_RESERVED, &sbi->mft.reserved_bitmap); + } + + if (rno < wnd_zone_bit(wnd)) + wnd_zone_set(wnd, rno, 1); + else if (rno < sbi->mft.next_free && rno >= MFT_REC_USER) + sbi->mft.next_free = rno; + +out: + up_write(&wnd->rw_lock); +} + +/* + * ntfs_clear_mft_tail - Format empty records [from, to). + * + * sbi->mft.bitmap is locked for write.
+ */ +int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to) +{ /* Writes the template record sbi->new_rec over every record in [from, to) while holding the $MFT run_lock for read. Unless from >= to, sbi->mft.used is left at the first record that was not written. Returns 0 or the first ntfs_get_bh()/ntfs_write_bh() error. */ + int err; + u32 rs; + u64 vbo; + struct runs_tree *run; + struct ntfs_inode *ni; + + if (from >= to) + return 0; + + rs = sbi->record_size; + ni = sbi->mft.ni; + run = &ni->file.run; + + down_read(&ni->file.run_lock); + vbo = (u64)from * rs; + for (; from < to; from++, vbo += rs) { + struct ntfs_buffers nb; + + err = ntfs_get_bh(sbi, run, vbo, rs, &nb); + if (err) + goto out; + + err = ntfs_write_bh(sbi, &sbi->new_rec->rhdr, &nb, 0); + nb_put(&nb); + if (err) + goto out; + } + +out: + sbi->mft.used = from; + up_read(&ni->file.run_lock); + return err; +} + +/* + * ntfs_refresh_zone - Refresh MFT zone. + * + * sbi->used.bitmap is locked for rw. + * sbi->mft.bitmap is locked for write. + * sbi->mft.ni->file.run_lock for write. + * + * Return: 0 (also when no zone could be found), or -EINVAL if the last + * cluster of $MFT cannot be located in its run list. + */ +int ntfs_refresh_zone(struct ntfs_sb_info *sbi) +{ + CLST zone_limit, zone_max, lcn, vcn, len; + size_t lcn_s, zlen; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + struct ntfs_inode *ni = sbi->mft.ni; + + /* Do nothing if the MFT zone is already non-empty. */ + if (wnd_zone_len(wnd)) + return 0; + + /* + * Compute the MFT zone at two steps. + * It would be nice if we are able to allocate 1/8 of + * total clusters for MFT but not more then 512 MB. + */ + zone_limit = (512 * 1024 * 1024) >> sbi->cluster_bits; + zone_max = wnd->nbits >> 3; + if (zone_max > zone_limit) + zone_max = zone_limit; + + vcn = bytes_to_cluster(sbi, + (u64)sbi->mft.bitmap.nbits << sbi->record_bits); /* Size of $MFT in clusters. */ + + if (!run_lookup_entry(&ni->file.run, vcn - 1, &lcn, &len, NULL)) + lcn = SPARSE_LCN; + + /* We should always find Last Lcn for MFT. */ + if (lcn == SPARSE_LCN) + return -EINVAL; + + lcn_s = lcn + 1; + + /* Try to allocate clusters after last MFT run. */ + zlen = wnd_find(wnd, zone_max, lcn_s, 0, &lcn_s); + if (!zlen) { + ntfs_notice(sbi->sb, "MftZone: unavailable"); + return 0; + } + + /* Truncate too large zone.
*/ + wnd_zone_set(wnd, lcn_s, zlen); + + return 0; +} + +/* + * ntfs_update_mftmirr - Update $MFTMirr data. + * + * Does nothing unless NTFS_FLAGS_MFTMIRR is set. Copies sbi->mft.recs_mirr + * records, block by block, from sbi->mft.lbo to sbi->mft.lbo2 and clears + * NTFS_FLAGS_MFTMIRR once every block was copied. + * @wait: non-zero to sync_dirty_buffer() each copied block. + * + * Return: 0, -EIO if a buffer cannot be read or obtained, or the sync error. + */ +int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait) +{ + int err; + struct super_block *sb = sbi->sb; + u32 blocksize = sb->s_blocksize; + sector_t block1, block2; + u32 bytes; + + if (!(sbi->flags & NTFS_FLAGS_MFTMIRR)) + return 0; + + err = 0; + bytes = sbi->mft.recs_mirr << sbi->record_bits; + block1 = sbi->mft.lbo >> sb->s_blocksize_bits; + block2 = sbi->mft.lbo2 >> sb->s_blocksize_bits; + + for (; bytes >= blocksize; bytes -= blocksize) { + struct buffer_head *bh1, *bh2; + + bh1 = sb_bread(sb, block1++); + if (!bh1) { + err = -EIO; + goto out; + } + + bh2 = sb_getblk(sb, block2++); /* The destination is fully overwritten, so it is not read first. */ + if (!bh2) { + put_bh(bh1); + err = -EIO; + goto out; + } + + if (buffer_locked(bh2)) + __wait_on_buffer(bh2); + + lock_buffer(bh2); + memcpy(bh2->b_data, bh1->b_data, blocksize); + set_buffer_uptodate(bh2); + mark_buffer_dirty(bh2); + unlock_buffer(bh2); + + put_bh(bh1); + bh1 = NULL; + + if (wait) + err = sync_dirty_buffer(bh2); + + put_bh(bh2); + if (err) + goto out; + } + + sbi->flags &= ~NTFS_FLAGS_MFTMIRR; + +out: + return err; +} + +/* + * ntfs_set_state + * + * Mount: ntfs_set_state(NTFS_DIRTY_DIRTY) + * Umount: ntfs_set_state(NTFS_DIRTY_CLEAR) + * NTFS error: ntfs_set_state(NTFS_DIRTY_ERROR) + * + * Return: 0 if nothing had to change or the flag was updated, -EINVAL if + * sbi->volume.ni or its ATTR_VOL_INFO attribute is missing, or the error + * from sync_inode_metadata() / filemap_fdatawrite(). + */ +int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty) +{ + int err; + struct ATTRIB *attr; + struct VOLUME_INFO *info; + struct mft_inode *mi; + struct ntfs_inode *ni; + + /* + * Do not change state if fs was real_dirty. + * Do not change state if fs already dirty(clear). + * Do not change any thing if mounted read only. + */ + if (sbi->volume.real_dirty || sb_rdonly(sbi->sb)) + return 0; + + /* Check cached value. */ + if ((dirty == NTFS_DIRTY_CLEAR ?
0 : VOLUME_FLAG_DIRTY) == + (sbi->volume.flags & VOLUME_FLAG_DIRTY)) + return 0; + + ni = sbi->volume.ni; + if (!ni) + return -EINVAL; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_DIRTY); + + attr = ni_find_attr(ni, NULL, NULL, ATTR_VOL_INFO, NULL, 0, NULL, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); + if (!info) { + err = -EINVAL; + goto out; + } + + switch (dirty) { + case NTFS_DIRTY_ERROR: + ntfs_notice(sbi->sb, "Mark volume as dirty due to NTFS errors"); + sbi->volume.real_dirty = true; /* From now on the early return at the top keeps the flag set. */ + fallthrough; + case NTFS_DIRTY_DIRTY: + info->flags |= VOLUME_FLAG_DIRTY; + break; + case NTFS_DIRTY_CLEAR: + info->flags &= ~VOLUME_FLAG_DIRTY; + break; + } + /* Cache current volume flags. */ + sbi->volume.flags = info->flags; + mi->dirty = true; + err = 0; + +out: + ni_unlock(ni); + if (err) + return err; + + mark_inode_dirty(&ni->vfs_inode); + /* verify(!ntfs_update_mftmirr()); */ + + /* + * If we used wait=1, sync_inode_metadata waits for the io for the + * inode to finish. It hangs when media is removed. + * So wait=0 is sent down to sync_inode_metadata + * and filemap_fdatawrite is used for the data blocks. + */ + err = sync_inode_metadata(&ni->vfs_inode, 0); + if (!err) + err = filemap_fdatawrite(ni->vfs_inode.i_mapping); + + return err; +} + +/* + * security_hash - Calculates a hash of security descriptor.
+ */ +static inline __le32 security_hash(const void *sd, size_t bytes) +{ + u32 hash = 0; + const __le32 *ptr = sd; + + bytes >>= 2; + while (bytes--) + hash = ((hash >> 0x1D) | (hash << 3)) + le32_to_cpu(*ptr++); + return cpu_to_le32(hash); +} + +int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer) +{ + struct block_device *bdev = sb->s_bdev; + u32 blocksize = sb->s_blocksize; + u64 block = lbo >> sb->s_blocksize_bits; + u32 off = lbo & (blocksize - 1); + u32 op = blocksize - off; + + for (; bytes; block += 1, off = 0, op = blocksize) { + struct buffer_head *bh = __bread(bdev, block, blocksize); + + if (!bh) + return -EIO; + + if (op > bytes) + op = bytes; + + memcpy(buffer, bh->b_data + off, op); + + put_bh(bh); + + bytes -= op; + buffer = Add2Ptr(buffer, op); + } + + return 0; +} + +int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, + const void *buf, int wait) +{ + u32 blocksize = sb->s_blocksize; + struct block_device *bdev = sb->s_bdev; + sector_t block = lbo >> sb->s_blocksize_bits; + u32 off = lbo & (blocksize - 1); + u32 op = blocksize - off; + struct buffer_head *bh; + + if (!wait && (sb->s_flags & SB_SYNCHRONOUS)) + wait = 1; + + for (; bytes; block += 1, off = 0, op = blocksize) { + if (op > bytes) + op = bytes; + + if (op < blocksize) { + bh = __bread(bdev, block, blocksize); + if (!bh) { + ntfs_err(sb, "failed to read block %llx", + (u64)block); + return -EIO; + } + } else { + bh = __getblk(bdev, block, blocksize); + if (!bh) + return -ENOMEM; + } + + if (buffer_locked(bh)) + __wait_on_buffer(bh); + + lock_buffer(bh); + if (buf) { + memcpy(bh->b_data + off, buf, op); + buf = Add2Ptr(buf, op); + } else { + memset(bh->b_data + off, -1, op); + } + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + + if (wait) { + int err = sync_dirty_buffer(bh); + + if (err) { + ntfs_err( + sb, + "failed to sync buffer at block %llx, error %d", + (u64)block, err); + put_bh(bh); + return err; + } + } + + 
put_bh(bh); + + bytes -= op; + } + return 0; +} + +int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, const void *buf, size_t bytes) +{ + struct super_block *sb = sbi->sb; + u8 cluster_bits = sbi->cluster_bits; + u32 off = vbo & sbi->cluster_mask; + CLST lcn, clen, vcn = vbo >> cluster_bits, vcn_next; + u64 lbo, len; + size_t idx; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) + return -ENOENT; + + if (lcn == SPARSE_LCN) + return -EINVAL; + + lbo = ((u64)lcn << cluster_bits) + off; + len = ((u64)clen << cluster_bits) - off; + + for (;;) { + u32 op = len < bytes ? len : bytes; + int err = ntfs_sb_write(sb, lbo, op, buf, 0); + + if (err) + return err; + + bytes -= op; + if (!bytes) + break; + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn != vcn_next) + return -ENOENT; + + if (lcn == SPARSE_LCN) + return -EINVAL; + + if (buf) + buf = Add2Ptr(buf, op); + + lbo = ((u64)lcn << cluster_bits); + len = ((u64)clen << cluster_bits); + } + + return 0; +} + +struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, + const struct runs_tree *run, u64 vbo) +{ + struct super_block *sb = sbi->sb; + u8 cluster_bits = sbi->cluster_bits; + CLST lcn; + u64 lbo; + + if (!run_lookup_entry(run, vbo >> cluster_bits, &lcn, NULL, NULL)) + return ERR_PTR(-ENOENT); + + lbo = ((u64)lcn << cluster_bits) + (vbo & sbi->cluster_mask); + + return ntfs_bread(sb, lbo >> sb->s_blocksize_bits); +} + +int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb) +{ + int err; + struct super_block *sb = sbi->sb; + u32 blocksize = sb->s_blocksize; + u8 cluster_bits = sbi->cluster_bits; + u32 off = vbo & sbi->cluster_mask; + u32 nbh = 0; + CLST vcn_next, vcn = vbo >> cluster_bits; + CLST lcn, clen; + u64 lbo, len; + size_t idx; + struct buffer_head *bh; + + if (!run) { + /* First reading of $Volume + $MFTMirr + $LogFile goes here. 
*/ + if (vbo > MFT_REC_VOL * sbi->record_size) { + err = -ENOENT; + goto out; + } + + /* Use absolute boot's 'MFTCluster' to read record. */ + lbo = vbo + sbi->mft.lbo; + len = sbi->record_size; + } else if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { + err = -ENOENT; + goto out; + } else { + if (lcn == SPARSE_LCN) { + err = -EINVAL; + goto out; + } + + lbo = ((u64)lcn << cluster_bits) + off; + len = ((u64)clen << cluster_bits) - off; + } + + off = lbo & (blocksize - 1); + if (nb) { + nb->off = off; + nb->bytes = bytes; + } + + for (;;) { + u32 len32 = len >= bytes ? bytes : len; + sector_t block = lbo >> sb->s_blocksize_bits; + + do { + u32 op = blocksize - off; + + if (op > len32) + op = len32; + + bh = ntfs_bread(sb, block); + if (!bh) { + err = -EIO; + goto out; + } + + if (buf) { + memcpy(buf, bh->b_data + off, op); + buf = Add2Ptr(buf, op); + } + + if (!nb) { + put_bh(bh); + } else if (nbh >= ARRAY_SIZE(nb->bh)) { + err = -EINVAL; + goto out; + } else { + nb->bh[nbh++] = bh; + nb->nbufs = nbh; + } + + bytes -= op; + if (!bytes) + return 0; + len32 -= op; + block += 1; + off = 0; + + } while (len32); + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn != vcn_next) { + err = -ENOENT; + goto out; + } + + if (lcn == SPARSE_LCN) { + err = -EINVAL; + goto out; + } + + lbo = ((u64)lcn << cluster_bits); + len = ((u64)clen << cluster_bits); + } + +out: + if (!nbh) + return err; + + while (nbh) { + put_bh(nb->bh[--nbh]); + nb->bh[nbh] = NULL; + } + + nb->nbufs = 0; + return err; +} + +/* + * ntfs_read_bh + * + * Return: < 0 if error, 0 if ok, -E_NTFS_FIXUP if need to update fixups. 
+ */ +int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, + struct NTFS_RECORD_HEADER *rhdr, u32 bytes, + struct ntfs_buffers *nb) +{ /* Reads @bytes at @vbo into @rhdr through ntfs_read_run_nb() and then runs ntfs_fix_post_read() on it. @nb must not be NULL because nb->bytes is read below. */ + int err = ntfs_read_run_nb(sbi, run, vbo, rhdr, bytes, nb); + + if (err) + return err; + return ntfs_fix_post_read(rhdr, nb->bytes, true); +} +/* + * ntfs_get_bh - Collect the buffer heads covering [vbo, vbo + bytes) in @nb. + * + * Blocks that are covered completely are obtained with sb_getblk() and + * marked uptodate without being read; partially covered blocks are read + * with ntfs_bread(). + * + * Return: 0, -ENOENT if the range is not contiguously mapped by @run, + * -EINVAL if more than ARRAY_SIZE(nb->bh) buffers are needed, -ENOMEM or + * -EIO. On error all collected buffers are released and nb->nbufs is 0. + */ +int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, + u32 bytes, struct ntfs_buffers *nb) +{ + int err = 0; + struct super_block *sb = sbi->sb; + u32 blocksize = sb->s_blocksize; + u8 cluster_bits = sbi->cluster_bits; + CLST vcn_next, vcn = vbo >> cluster_bits; + u32 off; + u32 nbh = 0; + CLST lcn, clen; + u64 lbo, len; + size_t idx; + + nb->bytes = bytes; + + if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { + err = -ENOENT; + goto out; + } + + off = vbo & sbi->cluster_mask; + lbo = ((u64)lcn << cluster_bits) + off; + len = ((u64)clen << cluster_bits) - off; + + nb->off = off = lbo & (blocksize - 1); /* From here on 'off' is the offset inside the first block. */ + + for (;;) { + u32 len32 = len < bytes ? len : bytes; + sector_t block = lbo >> sb->s_blocksize_bits; + + do { + u32 op; + struct buffer_head *bh; + + if (nbh >= ARRAY_SIZE(nb->bh)) { + err = -EINVAL; + goto out; + } + + op = blocksize - off; + if (op > len32) + op = len32; + + if (op == blocksize) { + bh = sb_getblk(sb, block); + if (!bh) { + err = -ENOMEM; + goto out; + } + if (buffer_locked(bh)) + __wait_on_buffer(bh); + set_buffer_uptodate(bh); + } else { + bh = ntfs_bread(sb, block); + if (!bh) { + err = -EIO; + goto out; + } + } + + nb->bh[nbh++] = bh; + bytes -= op; + if (!bytes) { + nb->nbufs = nbh; + return 0; + } + + block += 1; + len32 -= op; + off = 0; + } while (len32); + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || + vcn != vcn_next) { + err = -ENOENT; + goto out; + } + + lbo = ((u64)lcn << cluster_bits); + len = ((u64)clen << cluster_bits); + } + +out: + while (nbh) { + put_bh(nb->bh[--nbh]); + nb->bh[nbh] = NULL; + } + + nb->nbufs = 0; + + return err; +} +/* + * ntfs_write_bh - Copy the record @rhdr into the buffers of @nb with fixups. + * + * The sequence value stored at rhdr->fix_off is incremented (restarting at 1 + * once it reaches 0x7FFF) and written over the last two bytes of every + * SECTOR_SIZE chunk; the bytes it replaces are saved in the fixup array that + * follows it in the first buffer. + * @sync: non-zero to sync_dirty_buffer() each buffer. + * + * Return: 0, -EINVAL for an inconsistent fixup header, or the first sync + * error. + */ +int
ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, + struct ntfs_buffers *nb, int sync) +{ + int err = 0; + struct super_block *sb = sbi->sb; + u32 block_size = sb->s_blocksize; + u32 bytes = nb->bytes; + u32 off = nb->off; + u16 fo = le16_to_cpu(rhdr->fix_off); + u16 fn = le16_to_cpu(rhdr->fix_num); + u32 idx; + __le16 *fixup; + __le16 sample; + + /* After '!fn--' below, fn counts the protected sectors only. */ + if ((fo & 1) || fo + fn * sizeof(short) > SECTOR_SIZE || !fn-- || + fn * SECTOR_SIZE > bytes) { + return -EINVAL; + } + + for (idx = 0; bytes && idx < nb->nbufs; idx += 1, off = 0) { + u32 op = block_size - off; + char *bh_data; + struct buffer_head *bh = nb->bh[idx]; + __le16 *ptr, *end_data; + + if (op > bytes) + op = bytes; + + if (buffer_locked(bh)) + __wait_on_buffer(bh); + + lock_buffer(nb->bh[idx]); + + bh_data = bh->b_data + off; + end_data = Add2Ptr(bh_data, op); + memcpy(bh_data, rhdr, op); + + if (!idx) { /* The fixup header lives in the first buffer only. */ + u16 t16; + + fixup = Add2Ptr(bh_data, fo); + sample = *fixup; + t16 = le16_to_cpu(sample); + if (t16 >= 0x7FFF) { + sample = *fixup = cpu_to_le16(1); + } else { + sample = cpu_to_le16(t16 + 1); + *fixup = sample; + } + + *(__le16 *)Add2Ptr(rhdr, fo) = sample; /* Keep the in-memory record in step with what is written. */ + } + + ptr = Add2Ptr(bh_data, SECTOR_SIZE - sizeof(short)); + + do { + *++fixup = *ptr; + *ptr = sample; + ptr += SECTOR_SIZE / sizeof(short); + } while (ptr < end_data); + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + + if (sync) { + int err2 = sync_dirty_buffer(bh); + + if (!err && err2) + err = err2; + } + + bytes -= op; + rhdr = Add2Ptr(rhdr, op); + } + + return err; +} +/* + * ntfs_alloc_bio - bio_alloc() wrapper. + * + * When the allocation fails and the task has PF_MEMALLOC set, it is retried + * with half the vector count until one succeeds or the count reaches 0. + */ +static inline struct bio *ntfs_alloc_bio(u32 nr_vecs) +{ + struct bio *bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs); + + if (!bio && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(GFP_NOFS | __GFP_HIGH, nr_vecs); + } + return bio; +} + +/* + * ntfs_bio_pages - Read/write pages from/to disk.
+ */ +int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, + struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, + u32 op) +{ /* @op is stored in bio->bi_opf. The range [vbo, vbo + bytes) is widened to 512-byte alignment, mapped through @run and submitted as a chain of bios; only the last bio is waited for. Returns 0, -ENOENT (range not contiguously mapped), -ENOMEM, -EINVAL (more pages needed than @nr_pages) or the submit_bio_wait() error. */ + int err = 0; + struct bio *new, *bio = NULL; + struct super_block *sb = sbi->sb; + struct block_device *bdev = sb->s_bdev; + struct page *page; + u8 cluster_bits = sbi->cluster_bits; + CLST lcn, clen, vcn, vcn_next; + u32 add, off, page_idx; + u64 lbo, len; + size_t run_idx; + struct blk_plug plug; + + if (!bytes) + return 0; + + blk_start_plug(&plug); + + /* Align vbo and bytes to be 512 bytes aligned. */ + lbo = (vbo + bytes + 511) & ~511ull; /* 'lbo' is only a scratch variable for the aligned end here. */ + vbo = vbo & ~511ull; + bytes = lbo - vbo; + + vcn = vbo >> cluster_bits; + if (!run_lookup_entry(run, vcn, &lcn, &clen, &run_idx)) { + err = -ENOENT; + goto out; + } + off = vbo & sbi->cluster_mask; + page_idx = 0; + page = pages[0]; + + for (;;) { /* One iteration per fragment of @run. */ + lbo = ((u64)lcn << cluster_bits) + off; + len = ((u64)clen << cluster_bits) - off; +new_bio: + new = ntfs_alloc_bio(nr_pages - page_idx); + if (!new) { + err = -ENOMEM; + goto out; + } + if (bio) { + bio_chain(bio, new); + submit_bio(bio); + } + bio = new; + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = lbo >> 9; + bio->bi_opf = op; + + while (len) { + off = vbo & (PAGE_SIZE - 1); + add = off + len > PAGE_SIZE ? (PAGE_SIZE - off) : len; + + if (bio_add_page(bio, page, add, off) < add) /* Page was not added: continue in a fresh bio starting at the current lbo. */ + goto new_bio; + + if (bytes <= add) + goto out; + bytes -= add; + vbo += add; + + if (add + off == PAGE_SIZE) { + page_idx += 1; + if (WARN_ON(page_idx >= nr_pages)) { + err = -EINVAL; + goto out; + } + page = pages[page_idx]; + } + + if (len <= add) + break; + len -= add; + lbo += add; + } + + vcn_next = vcn + clen; + if (!run_get_entry(run, ++run_idx, &vcn, &lcn, &clen) || + vcn != vcn_next) { + err = -ENOENT; + goto out; + } + off = 0; + } +out: + if (bio) { + if (!err) + err = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + + return err; +} + +/* + * ntfs_bio_fill_1 - Helper for ntfs_loadlog_and_replay().
+ * + * Fill on-disk logfile range by (-1) + * this means empty logfile. + */ +int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run) +{ + int err = 0; + struct super_block *sb = sbi->sb; + struct block_device *bdev = sb->s_bdev; + u8 cluster_bits = sbi->cluster_bits; + struct bio *new, *bio = NULL; + CLST lcn, clen; + u64 lbo, len; + size_t run_idx; + struct page *fill; + void *kaddr; + struct blk_plug plug; + + fill = alloc_page(GFP_KERNEL); + if (!fill) + return -ENOMEM; + + kaddr = kmap_atomic(fill); + memset(kaddr, -1, PAGE_SIZE); + kunmap_atomic(kaddr); + flush_dcache_page(fill); + lock_page(fill); + + if (!run_lookup_entry(run, 0, &lcn, &clen, &run_idx)) { + err = -ENOENT; + goto out; + } + + /* + * TODO: Try blkdev_issue_write_same. + */ + blk_start_plug(&plug); + do { + lbo = (u64)lcn << cluster_bits; + len = (u64)clen << cluster_bits; +new_bio: + new = ntfs_alloc_bio(BIO_MAX_VECS); + if (!new) { + err = -ENOMEM; + break; + } + if (bio) { + bio_chain(bio, new); + submit_bio(bio); + } + bio = new; + bio_set_dev(bio, bdev); + bio->bi_opf = REQ_OP_WRITE; + bio->bi_iter.bi_sector = lbo >> 9; + + for (;;) { + u32 add = len > PAGE_SIZE ? PAGE_SIZE : len; + + if (bio_add_page(bio, fill, add, 0) < add) + goto new_bio; + + lbo += add; + if (len <= add) + break; + len -= add; + } + } while (run_get_entry(run, ++run_idx, NULL, &lcn, &clen)); + + if (bio) { + if (!err) + err = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); +out: + unlock_page(fill); + put_page(fill); + + return err; +} + +int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, u64 *lbo, u64 *bytes) +{ + u32 off; + CLST lcn, len; + u8 cluster_bits = sbi->cluster_bits; + + if (!run_lookup_entry(run, vbo >> cluster_bits, &lcn, &len, NULL)) + return -ENOENT; + + off = vbo & sbi->cluster_mask; + *lbo = lcn == SPARSE_LCN ? 
-1 : (((u64)lcn << cluster_bits) + off); + *bytes = ((u64)len << cluster_bits) - off; + + return 0; +} + +struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST rno, bool dir) +{ + int err = 0; + struct super_block *sb = sbi->sb; + struct inode *inode = new_inode(sb); + struct ntfs_inode *ni; + + if (!inode) + return ERR_PTR(-ENOMEM); + + ni = ntfs_i(inode); + + err = mi_format_new(&ni->mi, sbi, rno, dir ? RECORD_FLAG_DIR : 0, + false); + if (err) + goto out; + + inode->i_ino = rno; + if (insert_inode_locked(inode) < 0) { + err = -EIO; + goto out; + } + +out: + if (err) { + iput(inode); + ni = ERR_PTR(err); + } + return ni; +} + +/* + * O:BAG:BAD:(A;OICI;FA;;;WD) + * Owner S-1-5-32-544 (Administrators) + * Group S-1-5-32-544 (Administrators) + * ACE: allow S-1-1-0 (Everyone) with FILE_ALL_ACCESS + */ +const u8 s_default_security[] __aligned(8) = { + 0x01, 0x00, 0x04, 0x80, 0x30, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x1C, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x03, 0x14, 0x00, 0xFF, 0x01, 0x1F, 0x00, + 0x01, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x20, 0x00, 0x00, 0x00, + 0x20, 0x02, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, + 0x20, 0x00, 0x00, 0x00, 0x20, 0x02, 0x00, 0x00, +}; + +static_assert(sizeof(s_default_security) == 0x50); + +static inline u32 sid_length(const struct SID *sid) +{ + return struct_size(sid, SubAuthority, sid->SubAuthorityCount); +} + +/* + * is_acl_valid + * + * Thanks Mark Harmstone for idea. + */ +static bool is_acl_valid(const struct ACL *acl, u32 len) +{ + const struct ACE_HEADER *ace; + u32 i; + u16 ace_count, ace_size; + + if (acl->AclRevision != ACL_REVISION && + acl->AclRevision != ACL_REVISION_DS) { + /* + * This value should be ACL_REVISION, unless the ACL contains an + * object-specific ACE, in which case this value must be ACL_REVISION_DS. 
+ * All ACEs in an ACL must be at the same revision level. + */ + return false; + } + + if (acl->Sbz1) + return false; + + if (le16_to_cpu(acl->AclSize) > len) + return false; + + if (acl->Sbz2) + return false; + + len -= sizeof(struct ACL); + ace = (struct ACE_HEADER *)&acl[1]; + ace_count = le16_to_cpu(acl->AceCount); + + for (i = 0; i < ace_count; i++) { + if (len < sizeof(struct ACE_HEADER)) + return false; + + ace_size = le16_to_cpu(ace->AceSize); + if (len < ace_size) + return false; + + len -= ace_size; + ace = Add2Ptr(ace, ace_size); + } + + return true; +} + +bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len) +{ + u32 sd_owner, sd_group, sd_sacl, sd_dacl; + + if (len < sizeof(struct SECURITY_DESCRIPTOR_RELATIVE)) + return false; + + if (sd->Revision != 1) + return false; + + if (sd->Sbz1) + return false; + + if (!(sd->Control & SE_SELF_RELATIVE)) + return false; + + sd_owner = le32_to_cpu(sd->Owner); + if (sd_owner) { + const struct SID *owner = Add2Ptr(sd, sd_owner); + + if (sd_owner + offsetof(struct SID, SubAuthority) > len) + return false; + + if (owner->Revision != 1) + return false; + + if (sd_owner + sid_length(owner) > len) + return false; + } + + sd_group = le32_to_cpu(sd->Group); + if (sd_group) { + const struct SID *group = Add2Ptr(sd, sd_group); + + if (sd_group + offsetof(struct SID, SubAuthority) > len) + return false; + + if (group->Revision != 1) + return false; + + if (sd_group + sid_length(group) > len) + return false; + } + + sd_sacl = le32_to_cpu(sd->Sacl); + if (sd_sacl) { + const struct ACL *sacl = Add2Ptr(sd, sd_sacl); + + if (sd_sacl + sizeof(struct ACL) > len) + return false; + + if (!is_acl_valid(sacl, len - sd_sacl)) + return false; + } + + sd_dacl = le32_to_cpu(sd->Dacl); + if (sd_dacl) { + const struct ACL *dacl = Add2Ptr(sd, sd_dacl); + + if (sd_dacl + sizeof(struct ACL) > len) + return false; + + if (!is_acl_valid(dacl, len - sd_dacl)) + return false; + } + + return true; +} + +/* + * ntfs_security_init 
- Load and parse $Secure. + */ +int ntfs_security_init(struct ntfs_sb_info *sbi) +{ /* Loads inode MFT_REC_SECURE, initialises the SDH and SII indexes from its ATTR_ROOT attributes and scans SII for the largest security id. When the scan is reached the inode is stored in sbi->security.ni; on the earlier error paths it is released with iput(). */ + int err; + struct super_block *sb = sbi->sb; + struct inode *inode; + struct ntfs_inode *ni; + struct MFT_REF ref; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + u64 sds_size; + size_t off; + struct NTFS_DE *ne; + struct NTFS_DE_SII *sii_e; + struct ntfs_fnd *fnd_sii = NULL; + const struct INDEX_ROOT *root_sii; + const struct INDEX_ROOT *root_sdh; + struct ntfs_index *indx_sdh = &sbi->security.index_sdh; + struct ntfs_index *indx_sii = &sbi->security.index_sii; + + ref.low = cpu_to_le32(MFT_REC_SECURE); + ref.high = 0; + ref.seq = cpu_to_le16(MFT_REC_SECURE); + + inode = ntfs_iget5(sb, &ref, &NAME_SECURE); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ntfs_err(sb, "Failed to load $Secure."); + inode = NULL; + goto out; + } + + ni = ntfs_i(inode); + + le = NULL; + + attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SDH_NAME, + ARRAY_SIZE(SDH_NAME), NULL, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + root_sdh = resident_data(attr); /* NOTE(review): no size check on the resident data is visible here - confirm resident_data() or ni_find_attr() validates it. */ + if (root_sdh->type != ATTR_ZERO || + root_sdh->rule != NTFS_COLLATION_TYPE_SECURITY_HASH) { + err = -EINVAL; + goto out; + } + + err = indx_init(indx_sdh, sbi, attr, INDEX_MUTEX_SDH); + if (err) + goto out; + + attr = ni_find_attr(ni, attr, &le, ATTR_ROOT, SII_NAME, + ARRAY_SIZE(SII_NAME), NULL, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + root_sii = resident_data(attr); + if (root_sii->type != ATTR_ZERO || + root_sii->rule != NTFS_COLLATION_TYPE_UINT) { + err = -EINVAL; + goto out; + } + + err = indx_init(indx_sii, sbi, attr, INDEX_MUTEX_SII); + if (err) + goto out; + + fnd_sii = fnd_get(); + if (!fnd_sii) { + err = -ENOMEM; + goto out; + } + + sds_size = inode->i_size; + + /* Find the last valid Id. */ + sbi->security.next_id = SECURITY_ID_FIRST; + /* Always write new security at the end of bucket.
*/ + sbi->security.next_off = + ALIGN(sds_size - SecurityDescriptorsBlockSize, 16); + + off = 0; + ne = NULL; + + for (;;) { /* Walk every SII entry and remember the highest id + 1. */ + u32 next_id; + + err = indx_find_raw(indx_sii, ni, root_sii, &ne, &off, fnd_sii); + if (err || !ne) /* NOTE(review): on error the inode is still stored in sbi->security.ni below and err is returned. */ + break; + + sii_e = (struct NTFS_DE_SII *)ne; + if (le16_to_cpu(ne->view.data_size) < SIZEOF_SECURITY_HDR) + continue; + + next_id = le32_to_cpu(sii_e->sec_id) + 1; + if (next_id >= sbi->security.next_id) + sbi->security.next_id = next_id; + } + + sbi->security.ni = ni; + inode = NULL; /* The reference now belongs to sbi->security.ni; iput(inode) below is passed NULL. */ +out: + iput(inode); + fnd_put(fnd_sii); + + return err; +} + +/* + * ntfs_get_security_by_id - Read security descriptor by id. + * + * Looks @security_id up in the SII index, reads the SECURITY_HDR stored at + * the recorded offset, checks it against the index copy and then reads the + * descriptor that follows it. + * + * On success *sd points to a kmalloc()ed buffer of *size bytes (presumably + * freed by the caller - confirm). + * NOTE(review): when the id is not in the index (diff != 0) this returns 0 + * with *sd == NULL and *size untouched. + * + * Return: 0, -ENOMEM, -EINVAL (no index root, header too small, or the two + * headers differ), -EFBIG, or the error from indx_find()/ntfs_read_run_nb(). + */ +int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id, + struct SECURITY_DESCRIPTOR_RELATIVE **sd, + size_t *size) +{ + int err; + int diff; + struct ntfs_inode *ni = sbi->security.ni; + struct ntfs_index *indx = &sbi->security.index_sii; + void *p = NULL; + struct NTFS_DE_SII *sii_e; + struct ntfs_fnd *fnd_sii; + struct SECURITY_HDR d_security; + const struct INDEX_ROOT *root_sii; + u32 t32; + + *sd = NULL; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY); + + fnd_sii = fnd_get(); + if (!fnd_sii) { + err = -ENOMEM; + goto out; + } + + root_sii = indx_get_root(indx, ni, NULL, NULL); + if (!root_sii) { + err = -EINVAL; + goto out; + } + + /* Try to find this SECURITY descriptor in SII indexes. */ + err = indx_find(indx, ni, root_sii, &security_id, sizeof(security_id), + NULL, &diff, (struct NTFS_DE **)&sii_e, fnd_sii); + if (err) + goto out; + + if (diff) + goto out; + + t32 = le32_to_cpu(sii_e->sec_hdr.size); + if (t32 < SIZEOF_SECURITY_HDR) { + err = -EINVAL; + goto out; + } + + if (t32 > SIZEOF_SECURITY_HDR + 0x10000) { + /* Looks like too big security. 0x10000 - is arbitrary big number.
*/ + err = -EFBIG; + goto out; + } + + *size = t32 - SIZEOF_SECURITY_HDR; + + p = kmalloc(*size, GFP_NOFS); + if (!p) { + err = -ENOMEM; + goto out; + } + + err = ntfs_read_run_nb(sbi, &ni->file.run, + le64_to_cpu(sii_e->sec_hdr.off), &d_security, + sizeof(d_security), NULL); + if (err) + goto out; + + if (memcmp(&d_security, &sii_e->sec_hdr, SIZEOF_SECURITY_HDR)) { /* The header read from the stream must match the copy kept in the index. */ + err = -EINVAL; + goto out; + } + + err = ntfs_read_run_nb(sbi, &ni->file.run, + le64_to_cpu(sii_e->sec_hdr.off) + + SIZEOF_SECURITY_HDR, + p, *size, NULL); + if (err) + goto out; + + *sd = p; + p = NULL; /* Ownership moved to *sd; kfree(p) below is passed NULL. */ + +out: + kfree(p); + fnd_put(fnd_sii); + ni_unlock(ni); + + return err; +} + +/* + * ntfs_insert_security - Insert security descriptor into $Secure::SDS. + * + * SECURITY Descriptor Stream data is organized into chunks of 256K bytes + * and it contains a mirror copy of each security descriptor. When writing + * to a security descriptor at location X, another copy will be written at + * location (X+256K). + * When writing a security descriptor that will cross the 256K boundary, + * the pointer will be advanced by 256K to skip + * over the mirror portion.
+ */ +int ntfs_insert_security(struct ntfs_sb_info *sbi, + const struct SECURITY_DESCRIPTOR_RELATIVE *sd, + u32 size_sd, __le32 *security_id, bool *inserted) +{ + int err, diff; + struct ntfs_inode *ni = sbi->security.ni; + struct ntfs_index *indx_sdh = &sbi->security.index_sdh; + struct ntfs_index *indx_sii = &sbi->security.index_sii; + struct NTFS_DE_SDH *e; + struct NTFS_DE_SDH sdh_e; + struct NTFS_DE_SII sii_e; + struct SECURITY_HDR *d_security; + u32 new_sec_size = size_sd + SIZEOF_SECURITY_HDR; + u32 aligned_sec_size = ALIGN(new_sec_size, 16); + struct SECURITY_KEY hash_key; + struct ntfs_fnd *fnd_sdh = NULL; + const struct INDEX_ROOT *root_sdh; + const struct INDEX_ROOT *root_sii; + u64 mirr_off, new_sds_size; + u32 next, left; + + static_assert((1 << Log2OfSecurityDescriptorsBlockSize) == + SecurityDescriptorsBlockSize); + + hash_key.hash = security_hash(sd, size_sd); + hash_key.sec_id = SECURITY_ID_INVALID; + + if (inserted) + *inserted = false; + *security_id = SECURITY_ID_INVALID; + + /* Allocate a temporal buffer. */ + d_security = kzalloc(aligned_sec_size, GFP_NOFS); + if (!d_security) + return -ENOMEM; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_SECURITY); + + fnd_sdh = fnd_get(); + if (!fnd_sdh) { + err = -ENOMEM; + goto out; + } + + root_sdh = indx_get_root(indx_sdh, ni, NULL, NULL); + if (!root_sdh) { + err = -EINVAL; + goto out; + } + + root_sii = indx_get_root(indx_sii, ni, NULL, NULL); + if (!root_sii) { + err = -EINVAL; + goto out; + } + + /* + * Check if such security already exists. + * Use "SDH" and hash -> to get the offset in "SDS". 
+ */ + err = indx_find(indx_sdh, ni, root_sdh, &hash_key, sizeof(hash_key), + &d_security->key.sec_id, &diff, (struct NTFS_DE **)&e, + fnd_sdh); + if (err) + goto out; + + while (e) { + if (le32_to_cpu(e->sec_hdr.size) == new_sec_size) { + err = ntfs_read_run_nb(sbi, &ni->file.run, + le64_to_cpu(e->sec_hdr.off), + d_security, new_sec_size, NULL); + if (err) + goto out; + + if (le32_to_cpu(d_security->size) == new_sec_size && + d_security->key.hash == hash_key.hash && + !memcmp(d_security + 1, sd, size_sd)) { + *security_id = d_security->key.sec_id; + /* Such security already exists. */ + err = 0; + goto out; + } + } + + err = indx_find_sort(indx_sdh, ni, root_sdh, + (struct NTFS_DE **)&e, fnd_sdh); + if (err) + goto out; + + if (!e || e->key.hash != hash_key.hash) + break; + } + + /* Zero unused space. */ + next = sbi->security.next_off & (SecurityDescriptorsBlockSize - 1); + left = SecurityDescriptorsBlockSize - next; + + /* Zero gap until SecurityDescriptorsBlockSize. */ + if (left < new_sec_size) { + /* Zero "left" bytes from sbi->security.next_off. */ + sbi->security.next_off += SecurityDescriptorsBlockSize + left; + } + + /* Zero tail of previous security. */ + //used = ni->vfs_inode.i_size & (SecurityDescriptorsBlockSize - 1); + + /* + * Example: + * 0x40438 == ni->vfs_inode.i_size + * 0x00440 == sbi->security.next_off + * need to zero [0x438-0x440) + * if (next > used) { + * u32 tozero = next - used; + * zero "tozero" bytes from sbi->security.next_off - tozero + */ + + /* Format new security descriptor. */ + d_security->key.hash = hash_key.hash; + d_security->key.sec_id = cpu_to_le32(sbi->security.next_id); + d_security->off = cpu_to_le64(sbi->security.next_off); + d_security->size = cpu_to_le32(new_sec_size); + memcpy(d_security + 1, sd, size_sd); + + /* Write main SDS bucket. 
*/ + err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off, + d_security, aligned_sec_size); + + if (err) + goto out; + + mirr_off = sbi->security.next_off + SecurityDescriptorsBlockSize; + new_sds_size = mirr_off + aligned_sec_size; + + if (new_sds_size > ni->vfs_inode.i_size) { + err = attr_set_size(ni, ATTR_DATA, SDS_NAME, + ARRAY_SIZE(SDS_NAME), &ni->file.run, + new_sds_size, &new_sds_size, false, NULL); + if (err) + goto out; + } + + /* Write copy SDS bucket. */ + err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security, + aligned_sec_size); + if (err) + goto out; + + /* Fill SII entry. */ + sii_e.de.view.data_off = + cpu_to_le16(offsetof(struct NTFS_DE_SII, sec_hdr)); + sii_e.de.view.data_size = cpu_to_le16(SIZEOF_SECURITY_HDR); + sii_e.de.view.res = 0; + sii_e.de.size = cpu_to_le16(SIZEOF_SII_DIRENTRY); + sii_e.de.key_size = cpu_to_le16(sizeof(d_security->key.sec_id)); + sii_e.de.flags = 0; + sii_e.de.res = 0; + sii_e.sec_id = d_security->key.sec_id; + memcpy(&sii_e.sec_hdr, d_security, SIZEOF_SECURITY_HDR); + + err = indx_insert_entry(indx_sii, ni, &sii_e.de, NULL, NULL, 0); + if (err) + goto out; + + /* Fill SDH entry. */ + sdh_e.de.view.data_off = + cpu_to_le16(offsetof(struct NTFS_DE_SDH, sec_hdr)); + sdh_e.de.view.data_size = cpu_to_le16(SIZEOF_SECURITY_HDR); + sdh_e.de.view.res = 0; + sdh_e.de.size = cpu_to_le16(SIZEOF_SDH_DIRENTRY); + sdh_e.de.key_size = cpu_to_le16(sizeof(sdh_e.key)); + sdh_e.de.flags = 0; + sdh_e.de.res = 0; + sdh_e.key.hash = d_security->key.hash; + sdh_e.key.sec_id = d_security->key.sec_id; + memcpy(&sdh_e.sec_hdr, d_security, SIZEOF_SECURITY_HDR); + sdh_e.magic[0] = cpu_to_le16('I'); + sdh_e.magic[1] = cpu_to_le16('I'); + + fnd_clear(fnd_sdh); + err = indx_insert_entry(indx_sdh, ni, &sdh_e.de, (void *)(size_t)1, + fnd_sdh, 0); + if (err) + goto out; + + *security_id = d_security->key.sec_id; + if (inserted) + *inserted = true; + + /* Update Id and offset for next descriptor. 
*/ + sbi->security.next_id += 1; + sbi->security.next_off += aligned_sec_size; + +out: + fnd_put(fnd_sdh); + mark_inode_dirty(&ni->vfs_inode); + ni_unlock(ni); + kfree(d_security); + + return err; +} + +/* + * ntfs_reparse_init - Load and parse $Extend/$Reparse. + */ +int ntfs_reparse_init(struct ntfs_sb_info *sbi) +{ + int err; + struct ntfs_inode *ni = sbi->reparse.ni; + struct ntfs_index *indx = &sbi->reparse.index_r; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + const struct INDEX_ROOT *root_r; + + if (!ni) + return 0; + + le = NULL; + attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SR_NAME, + ARRAY_SIZE(SR_NAME), NULL, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + root_r = resident_data(attr); + if (root_r->type != ATTR_ZERO || + root_r->rule != NTFS_COLLATION_TYPE_UINTS) { + err = -EINVAL; + goto out; + } + + err = indx_init(indx, sbi, attr, INDEX_MUTEX_SR); + if (err) + goto out; + +out: + return err; +} + +/* + * ntfs_objid_init - Load and parse $Extend/$ObjId. 
+ */ +int ntfs_objid_init(struct ntfs_sb_info *sbi) +{ + int err; + struct ntfs_inode *ni = sbi->objid.ni; + struct ntfs_index *indx = &sbi->objid.index_o; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + const struct INDEX_ROOT *root; + + if (!ni) + return 0; + + le = NULL; + attr = ni_find_attr(ni, NULL, &le, ATTR_ROOT, SO_NAME, + ARRAY_SIZE(SO_NAME), NULL, NULL); + if (!attr) { + err = -EINVAL; + goto out; + } + + root = resident_data(attr); + if (root->type != ATTR_ZERO || + root->rule != NTFS_COLLATION_TYPE_UINTS) { + err = -EINVAL; + goto out; + } + + err = indx_init(indx, sbi, attr, INDEX_MUTEX_SO); + if (err) + goto out; + +out: + return err; +} + +int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid) +{ + int err; + struct ntfs_inode *ni = sbi->objid.ni; + struct ntfs_index *indx = &sbi->objid.index_o; + + if (!ni) + return -EINVAL; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_OBJID); + + err = indx_delete_entry(indx, ni, guid, sizeof(*guid), NULL); + + mark_inode_dirty(&ni->vfs_inode); + ni_unlock(ni); + + return err; +} + +int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag, + const struct MFT_REF *ref) +{ + int err; + struct ntfs_inode *ni = sbi->reparse.ni; + struct ntfs_index *indx = &sbi->reparse.index_r; + struct NTFS_DE_R re; + + if (!ni) + return -EINVAL; + + memset(&re, 0, sizeof(re)); + + re.de.view.data_off = cpu_to_le16(offsetof(struct NTFS_DE_R, zero)); + re.de.size = cpu_to_le16(sizeof(struct NTFS_DE_R)); + re.de.key_size = cpu_to_le16(sizeof(re.key)); + + re.key.ReparseTag = rtag; + memcpy(&re.key.ref, ref, sizeof(*ref)); + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE); + + err = indx_insert_entry(indx, ni, &re.de, NULL, NULL, 0); + + mark_inode_dirty(&ni->vfs_inode); + ni_unlock(ni); + + return err; +} + +int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag, + const struct MFT_REF *ref) +{ + int err, diff; + struct ntfs_inode *ni = sbi->reparse.ni; + struct ntfs_index *indx = 
&sbi->reparse.index_r; + struct ntfs_fnd *fnd = NULL; + struct REPARSE_KEY rkey; + struct NTFS_DE_R *re; + struct INDEX_ROOT *root_r; + + if (!ni) + return -EINVAL; + + rkey.ReparseTag = rtag; + rkey.ref = *ref; + + mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_REPARSE); + + if (rtag) { + err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL); + goto out1; + } + + fnd = fnd_get(); + if (!fnd) { + err = -ENOMEM; + goto out1; + } + + root_r = indx_get_root(indx, ni, NULL, NULL); + if (!root_r) { + err = -EINVAL; + goto out; + } + + /* 1 - forces to ignore rkey.ReparseTag when comparing keys. */ + err = indx_find(indx, ni, root_r, &rkey, sizeof(rkey), (void *)1, &diff, + (struct NTFS_DE **)&re, fnd); + if (err) + goto out; + + if (memcmp(&re->key.ref, ref, sizeof(*ref))) { + /* Impossible. Looks like volume corrupt? */ + goto out; + } + + memcpy(&rkey, &re->key, sizeof(rkey)); + + fnd_put(fnd); + fnd = NULL; + + err = indx_delete_entry(indx, ni, &rkey, sizeof(rkey), NULL); + if (err) + goto out; + +out: + fnd_put(fnd); + +out1: + mark_inode_dirty(&ni->vfs_inode); + ni_unlock(ni); + + return err; +} + +static inline void ntfs_unmap_and_discard(struct ntfs_sb_info *sbi, CLST lcn, + CLST len) +{ + ntfs_unmap_meta(sbi->sb, lcn, len); + ntfs_discard(sbi, lcn, len); +} + +void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) +{ + CLST end, i; + struct wnd_bitmap *wnd = &sbi->used.bitmap; + + down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); + if (!wnd_is_used(wnd, lcn, len)) { + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + + end = lcn + len; + len = 0; + for (i = lcn; i < end; i++) { + if (wnd_is_used(wnd, i, 1)) { + if (!len) + lcn = i; + len += 1; + continue; + } + + if (!len) + continue; + + if (trim) + ntfs_unmap_and_discard(sbi, lcn, len); + + wnd_set_free(wnd, lcn, len); + len = 0; + } + + if (!len) + goto out; + } + + if (trim) + ntfs_unmap_and_discard(sbi, lcn, len); + wnd_set_free(wnd, lcn, len); + +out: + 
up_write(&wnd->rw_lock); +} + +/* + * run_deallocate - Deallocate clusters. + */ +int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim) +{ + CLST lcn, len; + size_t idx = 0; + + while (run_get_entry(run, idx++, NULL, &lcn, &len)) { + if (lcn == SPARSE_LCN) + continue; + + mark_as_free_ex(sbi, lcn, len, trim); + } + + return 0; +} diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c new file mode 100644 index 000000000000..0daca9adc54c --- /dev/null +++ b/fs/ntfs3/index.c @@ -0,0 +1,2650 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +static const struct INDEX_NAMES { + const __le16 *name; + u8 name_len; +} s_index_names[INDEX_MUTEX_TOTAL] = { + { I30_NAME, ARRAY_SIZE(I30_NAME) }, { SII_NAME, ARRAY_SIZE(SII_NAME) }, + { SDH_NAME, ARRAY_SIZE(SDH_NAME) }, { SO_NAME, ARRAY_SIZE(SO_NAME) }, + { SQ_NAME, ARRAY_SIZE(SQ_NAME) }, { SR_NAME, ARRAY_SIZE(SR_NAME) }, +}; + +/* + * cmp_fnames - Compare two names in index. + * + * if l1 != 0 + * Both names are little endian on-disk ATTR_FILE_NAME structs. + * else + * key1 - cpu_str, key2 - ATTR_FILE_NAME + */ +static int cmp_fnames(const void *key1, size_t l1, const void *key2, size_t l2, + const void *data) +{ + const struct ATTR_FILE_NAME *f2 = key2; + const struct ntfs_sb_info *sbi = data; + const struct ATTR_FILE_NAME *f1; + u16 fsize2; + bool both_case; + + if (l2 <= offsetof(struct ATTR_FILE_NAME, name)) + return -1; + + fsize2 = fname_full_size(f2); + if (l2 < fsize2) + return -1; + + both_case = f2->type != FILE_NAME_DOS /*&& !sbi->options.nocase*/; + if (!l1) { + const struct le_str *s2 = (struct le_str *)&f2->name_len; + + /* + * If names are equal (case insensitive) + * try to compare it case sensitive. 
+ */ + return ntfs_cmp_names_cpu(key1, s2, sbi->upcase, both_case); + } + + f1 = key1; + return ntfs_cmp_names(f1->name, f1->name_len, f2->name, f2->name_len, + sbi->upcase, both_case); +} + +/* + * cmp_uint - $SII of $Secure and $Q of Quota + */ +static int cmp_uint(const void *key1, size_t l1, const void *key2, size_t l2, + const void *data) +{ + const u32 *k1 = key1; + const u32 *k2 = key2; + + if (l2 < sizeof(u32)) + return -1; + + if (*k1 < *k2) + return -1; + if (*k1 > *k2) + return 1; + return 0; +} + +/* + * cmp_sdh - $SDH of $Secure + */ +static int cmp_sdh(const void *key1, size_t l1, const void *key2, size_t l2, + const void *data) +{ + const struct SECURITY_KEY *k1 = key1; + const struct SECURITY_KEY *k2 = key2; + u32 t1, t2; + + if (l2 < sizeof(struct SECURITY_KEY)) + return -1; + + t1 = le32_to_cpu(k1->hash); + t2 = le32_to_cpu(k2->hash); + + /* First value is a hash value itself. */ + if (t1 < t2) + return -1; + if (t1 > t2) + return 1; + + /* Second value is security Id. */ + if (data) { + t1 = le32_to_cpu(k1->sec_id); + t2 = le32_to_cpu(k2->sec_id); + if (t1 < t2) + return -1; + if (t1 > t2) + return 1; + } + + return 0; +} + +/* + * cmp_uints - $O of ObjId and "$R" for Reparse. + */ +static int cmp_uints(const void *key1, size_t l1, const void *key2, size_t l2, + const void *data) +{ + const __le32 *k1 = key1; + const __le32 *k2 = key2; + size_t count; + + if ((size_t)data == 1) { + /* + * ni_delete_all -> ntfs_remove_reparse -> + * delete all with this reference. 
+ * k1, k2 - pointers to REPARSE_KEY + */ + + k1 += 1; // Skip REPARSE_KEY.ReparseTag + k2 += 1; // Skip REPARSE_KEY.ReparseTag + if (l2 <= sizeof(int)) + return -1; + l2 -= sizeof(int); + if (l1 <= sizeof(int)) + return 1; + l1 -= sizeof(int); + } + + if (l2 < sizeof(int)) + return -1; + + for (count = min(l1, l2) >> 2; count > 0; --count, ++k1, ++k2) { + u32 t1 = le32_to_cpu(*k1); + u32 t2 = le32_to_cpu(*k2); + + if (t1 > t2) + return 1; + if (t1 < t2) + return -1; + } + + if (l1 > l2) + return 1; + if (l1 < l2) + return -1; + + return 0; +} + +static inline NTFS_CMP_FUNC get_cmp_func(const struct INDEX_ROOT *root) +{ + switch (root->type) { + case ATTR_NAME: + if (root->rule == NTFS_COLLATION_TYPE_FILENAME) + return &cmp_fnames; + break; + case ATTR_ZERO: + switch (root->rule) { + case NTFS_COLLATION_TYPE_UINT: + return &cmp_uint; + case NTFS_COLLATION_TYPE_SECURITY_HASH: + return &cmp_sdh; + case NTFS_COLLATION_TYPE_UINTS: + return &cmp_uints; + default: + break; + } + break; + default: + break; + } + + return NULL; +} + +struct bmp_buf { + struct ATTRIB *b; + struct mft_inode *mi; + struct buffer_head *bh; + ulong *buf; + size_t bit; + u32 nbits; + u64 new_valid; +}; + +static int bmp_buf_get(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t bit, struct bmp_buf *bbuf) +{ + struct ATTRIB *b; + size_t data_size, valid_size, vbo, off = bit >> 3; + struct ntfs_sb_info *sbi = ni->mi.sbi; + CLST vcn = off >> sbi->cluster_bits; + struct ATTR_LIST_ENTRY *le = NULL; + struct buffer_head *bh; + struct super_block *sb; + u32 blocksize; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + + bbuf->bh = NULL; + + b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, + &vcn, &bbuf->mi); + bbuf->b = b; + if (!b) + return -EINVAL; + + if (!b->non_res) { + data_size = le32_to_cpu(b->res.data_size); + + if (off >= data_size) + return -EINVAL; + + bbuf->buf = (ulong *)resident_data(b); + bbuf->bit = 0; + bbuf->nbits = data_size * 8; + + return 0; + 
} + + data_size = le64_to_cpu(b->nres.data_size); + if (WARN_ON(off >= data_size)) { + /* Looks like filesystem error. */ + return -EINVAL; + } + + valid_size = le64_to_cpu(b->nres.valid_size); + + bh = ntfs_bread_run(sbi, &indx->bitmap_run, off); + if (!bh) + return -EIO; + + if (IS_ERR(bh)) + return PTR_ERR(bh); + + bbuf->bh = bh; + + if (buffer_locked(bh)) + __wait_on_buffer(bh); + + lock_buffer(bh); + + sb = sbi->sb; + blocksize = sb->s_blocksize; + + vbo = off & ~(size_t)sbi->block_mask; + + bbuf->new_valid = vbo + blocksize; + if (bbuf->new_valid <= valid_size) + bbuf->new_valid = 0; + else if (bbuf->new_valid > data_size) + bbuf->new_valid = data_size; + + if (vbo >= valid_size) { + memset(bh->b_data, 0, blocksize); + } else if (vbo + blocksize > valid_size) { + u32 voff = valid_size & sbi->block_mask; + + memset(bh->b_data + voff, 0, blocksize - voff); + } + + bbuf->buf = (ulong *)bh->b_data; + bbuf->bit = 8 * (off & ~(size_t)sbi->block_mask); + bbuf->nbits = 8 * blocksize; + + return 0; +} + +static void bmp_buf_put(struct bmp_buf *bbuf, bool dirty) +{ + struct buffer_head *bh = bbuf->bh; + struct ATTRIB *b = bbuf->b; + + if (!bh) { + if (b && !b->non_res && dirty) + bbuf->mi->dirty = true; + return; + } + + if (!dirty) + goto out; + + if (bbuf->new_valid) { + b->nres.valid_size = cpu_to_le64(bbuf->new_valid); + bbuf->mi->dirty = true; + } + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + +out: + unlock_buffer(bh); + put_bh(bh); +} + +/* + * indx_mark_used - Mark the bit @bit as used. + */ +static int indx_mark_used(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t bit) +{ + int err; + struct bmp_buf bbuf; + + err = bmp_buf_get(indx, ni, bit, &bbuf); + if (err) + return err; + + __set_bit(bit - bbuf.bit, bbuf.buf); + + bmp_buf_put(&bbuf, true); + + return 0; +} + +/* + * indx_mark_free - Mark the bit @bit as free. 
+ */ +static int indx_mark_free(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t bit) +{ + int err; + struct bmp_buf bbuf; + + err = bmp_buf_get(indx, ni, bit, &bbuf); + if (err) + return err; + + __clear_bit(bit - bbuf.bit, bbuf.buf); + + bmp_buf_put(&bbuf, true); + + return 0; +} + +/* + * scan_nres_bitmap + * + * If ntfs_readdir calls this function (indx_used_bit -> scan_nres_bitmap), + * inode is shared locked and no ni_lock. + * Use rw_semaphore for read/write access to bitmap_run. + */ +static int scan_nres_bitmap(struct ntfs_inode *ni, struct ATTRIB *bitmap, + struct ntfs_index *indx, size_t from, + bool (*fn)(const ulong *buf, u32 bit, u32 bits, + size_t *ret), + size_t *ret) +{ + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct super_block *sb = sbi->sb; + struct runs_tree *run = &indx->bitmap_run; + struct rw_semaphore *lock = &indx->run_lock; + u32 nbits = sb->s_blocksize * 8; + u32 blocksize = sb->s_blocksize; + u64 valid_size = le64_to_cpu(bitmap->nres.valid_size); + u64 data_size = le64_to_cpu(bitmap->nres.data_size); + sector_t eblock = bytes_to_block(sb, data_size); + size_t vbo = from >> 3; + sector_t blk = (vbo & sbi->cluster_mask) >> sb->s_blocksize_bits; + sector_t vblock = vbo >> sb->s_blocksize_bits; + sector_t blen, block; + CLST lcn, clen, vcn, vcn_next; + size_t idx; + struct buffer_head *bh; + bool ok; + + *ret = MINUS_ONE_T; + + if (vblock >= eblock) + return 0; + + from &= nbits - 1; + vcn = vbo >> sbi->cluster_bits; + + down_read(lock); + ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + up_read(lock); + +next_run: + if (!ok) { + int err; + const struct INDEX_NAMES *name = &s_index_names[indx->type]; + + down_write(lock); + err = attr_load_runs_vcn(ni, ATTR_BITMAP, name->name, + name->name_len, run, vcn); + up_write(lock); + if (err) + return err; + down_read(lock); + ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); + up_read(lock); + if (!ok) + return -EINVAL; + } + + blen = (sector_t)clen * sbi->blocks_per_cluster; + 
block = (sector_t)lcn * sbi->blocks_per_cluster; + + for (; blk < blen; blk++, from = 0) { + bh = ntfs_bread(sb, block + blk); + if (!bh) + return -EIO; + + vbo = (u64)vblock << sb->s_blocksize_bits; + if (vbo >= valid_size) { + memset(bh->b_data, 0, blocksize); + } else if (vbo + blocksize > valid_size) { + u32 voff = valid_size & sbi->block_mask; + + memset(bh->b_data + voff, 0, blocksize - voff); + } + + if (vbo + blocksize > data_size) + nbits = 8 * (data_size - vbo); + + ok = nbits > from ? (*fn)((ulong *)bh->b_data, from, nbits, ret) + : false; + put_bh(bh); + + if (ok) { + *ret += 8 * vbo; + return 0; + } + + if (++vblock >= eblock) { + *ret = MINUS_ONE_T; + return 0; + } + } + blk = 0; + vcn_next = vcn + clen; + down_read(lock); + ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) && vcn == vcn_next; + if (!ok) + vcn = vcn_next; + up_read(lock); + goto next_run; +} + +static bool scan_for_free(const ulong *buf, u32 bit, u32 bits, size_t *ret) +{ + size_t pos = find_next_zero_bit(buf, bits, bit); + + if (pos >= bits) + return false; + *ret = pos; + return true; +} + +/* + * indx_find_free - Look for free bit. + * + * Return: -1 if no free bits. 
+ */ +static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t *bit, struct ATTRIB **bitmap) +{ + struct ATTRIB *b; + struct ATTR_LIST_ENTRY *le = NULL; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + int err; + + b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, + NULL, NULL); + + if (!b) + return -ENOENT; + + *bitmap = b; + *bit = MINUS_ONE_T; + + if (!b->non_res) { + u32 nbits = 8 * le32_to_cpu(b->res.data_size); + size_t pos = find_next_zero_bit(resident_data(b), nbits, 0); + + if (pos < nbits) + *bit = pos; + } else { + err = scan_nres_bitmap(ni, b, indx, 0, &scan_for_free, bit); + + if (err) + return err; + } + + return 0; +} + +static bool scan_for_used(const ulong *buf, u32 bit, u32 bits, size_t *ret) +{ + size_t pos = find_next_bit(buf, bits, bit); + + if (pos >= bits) + return false; + *ret = pos; + return true; +} + +/* + * indx_used_bit - Look for used bit. + * + * Return: MINUS_ONE_T if no used bits. + */ +int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit) +{ + struct ATTRIB *b; + struct ATTR_LIST_ENTRY *le = NULL; + size_t from = *bit; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + int err; + + b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, + NULL, NULL); + + if (!b) + return -ENOENT; + + *bit = MINUS_ONE_T; + + if (!b->non_res) { + u32 nbits = le32_to_cpu(b->res.data_size) * 8; + size_t pos = find_next_bit(resident_data(b), nbits, from); + + if (pos < nbits) + *bit = pos; + } else { + err = scan_nres_bitmap(ni, b, indx, from, &scan_for_used, bit); + if (err) + return err; + } + + return 0; +} + +/* + * hdr_find_split + * + * Find a point at which the index allocation buffer would like to be split. + * NOTE: This function should never return 'END' entry NULL returns on error. 
+ */ +static const struct NTFS_DE *hdr_find_split(const struct INDEX_HDR *hdr) +{ + size_t o; + const struct NTFS_DE *e = hdr_first_de(hdr); + u32 used_2 = le32_to_cpu(hdr->used) >> 1; + u16 esize; + + if (!e || de_is_last(e)) + return NULL; + + esize = le16_to_cpu(e->size); + for (o = le32_to_cpu(hdr->de_off) + esize; o < used_2; o += esize) { + const struct NTFS_DE *p = e; + + e = Add2Ptr(hdr, o); + + /* We must not return END entry. */ + if (de_is_last(e)) + return p; + + esize = le16_to_cpu(e->size); + } + + return e; +} + +/* + * hdr_insert_head - Insert some entries at the beginning of the buffer. + * + * It is used to insert entries into a newly-created buffer. + */ +static const struct NTFS_DE *hdr_insert_head(struct INDEX_HDR *hdr, + const void *ins, u32 ins_bytes) +{ + u32 to_move; + struct NTFS_DE *e = hdr_first_de(hdr); + u32 used = le32_to_cpu(hdr->used); + + if (!e) + return NULL; + + /* Now we just make room for the inserted entries and jam it in. */ + to_move = used - le32_to_cpu(hdr->de_off); + memmove(Add2Ptr(e, ins_bytes), e, to_move); + memcpy(e, ins, ins_bytes); + hdr->used = cpu_to_le32(used + ins_bytes); + + return e; +} + +void fnd_clear(struct ntfs_fnd *fnd) +{ + int i; + + for (i = 0; i < fnd->level; i++) { + struct indx_node *n = fnd->nodes[i]; + + if (!n) + continue; + + put_indx_node(n); + fnd->nodes[i] = NULL; + } + fnd->level = 0; + fnd->root_de = NULL; +} + +static int fnd_push(struct ntfs_fnd *fnd, struct indx_node *n, + struct NTFS_DE *e) +{ + int i; + + i = fnd->level; + if (i < 0 || i >= ARRAY_SIZE(fnd->nodes)) + return -EINVAL; + fnd->nodes[i] = n; + fnd->de[i] = e; + fnd->level += 1; + return 0; +} + +static struct indx_node *fnd_pop(struct ntfs_fnd *fnd) +{ + struct indx_node *n; + int i = fnd->level; + + i -= 1; + n = fnd->nodes[i]; + fnd->nodes[i] = NULL; + fnd->level = i; + + return n; +} + +static bool fnd_is_empty(struct ntfs_fnd *fnd) +{ + if (!fnd->level) + return !fnd->root_de; + + return !fnd->de[fnd->level - 1]; +} + 
+/* + * hdr_find_e - Locate an entry the index buffer. + * + * If no matching entry is found, it returns the first entry which is greater + * than the desired entry If the search key is greater than all the entries the + * buffer, it returns the 'end' entry. This function does a binary search of the + * current index buffer, for the first entry that is <= to the search value. + * + * Return: NULL if error. + */ +static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, + const struct INDEX_HDR *hdr, const void *key, + size_t key_len, const void *ctx, int *diff) +{ + struct NTFS_DE *e; + NTFS_CMP_FUNC cmp = indx->cmp; + u32 e_size, e_key_len; + u32 end = le32_to_cpu(hdr->used); + u32 off = le32_to_cpu(hdr->de_off); + +#ifdef NTFS3_INDEX_BINARY_SEARCH + int max_idx = 0, fnd, min_idx; + int nslots = 64; + u16 *offs; + + if (end > 0x10000) + goto next; + + offs = kmalloc(sizeof(u16) * nslots, GFP_NOFS); + if (!offs) + goto next; + + /* Use binary search algorithm. */ +next1: + if (off + sizeof(struct NTFS_DE) > end) { + e = NULL; + goto out1; + } + e = Add2Ptr(hdr, off); + e_size = le16_to_cpu(e->size); + + if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) { + e = NULL; + goto out1; + } + + if (max_idx >= nslots) { + u16 *ptr; + int new_slots = ALIGN(2 * nslots, 8); + + ptr = kmalloc(sizeof(u16) * new_slots, GFP_NOFS); + if (ptr) + memcpy(ptr, offs, sizeof(u16) * max_idx); + kfree(offs); + offs = ptr; + nslots = new_slots; + if (!ptr) + goto next; + } + + /* Store entry table. */ + offs[max_idx] = off; + + if (!de_is_last(e)) { + off += e_size; + max_idx += 1; + goto next1; + } + + /* + * Table of pointers is created. + * Use binary search to find entry that is <= to the search value. 
+ */ + fnd = -1; + min_idx = 0; + + while (min_idx <= max_idx) { + int mid_idx = min_idx + ((max_idx - min_idx) >> 1); + int diff2; + + e = Add2Ptr(hdr, offs[mid_idx]); + + e_key_len = le16_to_cpu(e->key_size); + + diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + + if (!diff2) { + *diff = 0; + goto out1; + } + + if (diff2 < 0) { + max_idx = mid_idx - 1; + fnd = mid_idx; + if (!fnd) + break; + } else { + min_idx = mid_idx + 1; + } + } + + if (fnd == -1) { + e = NULL; + goto out1; + } + + *diff = -1; + e = Add2Ptr(hdr, offs[fnd]); + +out1: + kfree(offs); + + return e; +#endif + +next: + /* + * Entries index are sorted. + * Enumerate all entries until we find entry + * that is <= to the search value. + */ + if (off + sizeof(struct NTFS_DE) > end) + return NULL; + + e = Add2Ptr(hdr, off); + e_size = le16_to_cpu(e->size); + + if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) + return NULL; + + off += e_size; + + e_key_len = le16_to_cpu(e->key_size); + + *diff = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + if (!*diff) + return e; + + if (*diff <= 0) + return e; + + if (de_is_last(e)) { + *diff = 1; + return e; + } + goto next; +} + +/* + * hdr_insert_de - Insert an index entry into the buffer. + * + * 'before' should be a pointer previously returned from hdr_find_e. + */ +static struct NTFS_DE *hdr_insert_de(const struct ntfs_index *indx, + struct INDEX_HDR *hdr, + const struct NTFS_DE *de, + struct NTFS_DE *before, const void *ctx) +{ + int diff; + size_t off = PtrOffset(hdr, before); + u32 used = le32_to_cpu(hdr->used); + u32 total = le32_to_cpu(hdr->total); + u16 de_size = le16_to_cpu(de->size); + + /* First, check to see if there's enough room. */ + if (used + de_size > total) + return NULL; + + /* We know there's enough space, so we know we'll succeed. */ + if (before) { + /* Check that before is inside Index. 
*/ + if (off >= used || off < le32_to_cpu(hdr->de_off) || + off + le16_to_cpu(before->size) > total) { + return NULL; + } + goto ok; + } + /* No insert point is applied. Get it manually. */ + before = hdr_find_e(indx, hdr, de + 1, le16_to_cpu(de->key_size), ctx, + &diff); + if (!before) + return NULL; + off = PtrOffset(hdr, before); + +ok: + /* Now we just make room for the entry and jam it in. */ + memmove(Add2Ptr(before, de_size), before, used - off); + + hdr->used = cpu_to_le32(used + de_size); + memcpy(before, de, de_size); + + return before; +} + +/* + * hdr_delete_de - Remove an entry from the index buffer. + */ +static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr, + struct NTFS_DE *re) +{ + u32 used = le32_to_cpu(hdr->used); + u16 esize = le16_to_cpu(re->size); + u32 off = PtrOffset(hdr, re); + int bytes = used - (off + esize); + + if (off >= used || esize < sizeof(struct NTFS_DE) || + bytes < sizeof(struct NTFS_DE)) + return NULL; + + hdr->used = cpu_to_le32(used - esize); + memmove(re, Add2Ptr(re, esize), bytes); + + return re; +} + +void indx_clear(struct ntfs_index *indx) +{ + run_close(&indx->alloc_run); + run_close(&indx->bitmap_run); +} + +int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, + const struct ATTRIB *attr, enum index_mutex_classed type) +{ + u32 t32; + const struct INDEX_ROOT *root = resident_data(attr); + + /* Check root fields. */ + if (!root->index_block_clst) + return -EINVAL; + + indx->type = type; + indx->idx2vbn_bits = __ffs(root->index_block_clst); + + t32 = le32_to_cpu(root->index_block_size); + indx->index_bits = blksize_bits(t32); + + /* Check index record size. */ + if (t32 < sbi->cluster_size) { + /* Index record is smaller than a cluster, use 512 blocks. */ + if (t32 != root->index_block_clst * SECTOR_SIZE) + return -EINVAL; + + /* Check alignment to a cluster. 
*/ + if ((sbi->cluster_size >> SECTOR_SHIFT) & + (root->index_block_clst - 1)) { + return -EINVAL; + } + + indx->vbn2vbo_bits = SECTOR_SHIFT; + } else { + /* Index record must be a multiple of cluster size. */ + if (t32 != root->index_block_clst << sbi->cluster_bits) + return -EINVAL; + + indx->vbn2vbo_bits = sbi->cluster_bits; + } + + init_rwsem(&indx->run_lock); + + indx->cmp = get_cmp_func(root); + return indx->cmp ? 0 : -EINVAL; +} + +static struct indx_node *indx_new(struct ntfs_index *indx, + struct ntfs_inode *ni, CLST vbn, + const __le64 *sub_vbn) +{ + int err; + struct NTFS_DE *e; + struct indx_node *r; + struct INDEX_HDR *hdr; + struct INDEX_BUFFER *index; + u64 vbo = (u64)vbn << indx->vbn2vbo_bits; + u32 bytes = 1u << indx->index_bits; + u16 fn; + u32 eo; + + r = kzalloc(sizeof(struct indx_node), GFP_NOFS); + if (!r) + return ERR_PTR(-ENOMEM); + + index = kzalloc(bytes, GFP_NOFS); + if (!index) { + kfree(r); + return ERR_PTR(-ENOMEM); + } + + err = ntfs_get_bh(ni->mi.sbi, &indx->alloc_run, vbo, bytes, &r->nb); + + if (err) { + kfree(index); + kfree(r); + return ERR_PTR(err); + } + + /* Create header. 
*/ + index->rhdr.sign = NTFS_INDX_SIGNATURE; + index->rhdr.fix_off = cpu_to_le16(sizeof(struct INDEX_BUFFER)); // 0x28 + fn = (bytes >> SECTOR_SHIFT) + 1; // 9 + index->rhdr.fix_num = cpu_to_le16(fn); + index->vbn = cpu_to_le64(vbn); + hdr = &index->ihdr; + eo = ALIGN(sizeof(struct INDEX_BUFFER) + fn * sizeof(short), 8); + hdr->de_off = cpu_to_le32(eo); + + e = Add2Ptr(hdr, eo); + + if (sub_vbn) { + e->flags = NTFS_IE_LAST | NTFS_IE_HAS_SUBNODES; + e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64)); + hdr->used = + cpu_to_le32(eo + sizeof(struct NTFS_DE) + sizeof(u64)); + de_set_vbn_le(e, *sub_vbn); + hdr->flags = 1; + } else { + e->size = cpu_to_le16(sizeof(struct NTFS_DE)); + hdr->used = cpu_to_le32(eo + sizeof(struct NTFS_DE)); + e->flags = NTFS_IE_LAST; + } + + hdr->total = cpu_to_le32(bytes - offsetof(struct INDEX_BUFFER, ihdr)); + + r->index = index; + return r; +} + +struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, + struct ATTRIB **attr, struct mft_inode **mi) +{ + struct ATTR_LIST_ENTRY *le = NULL; + struct ATTRIB *a; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + + a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL, + mi); + if (!a) + return NULL; + + if (attr) + *attr = a; + + return resident_data_ex(a, sizeof(struct INDEX_ROOT)); +} + +static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni, + struct indx_node *node, int sync) +{ + struct INDEX_BUFFER *ib = node->index; + + return ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &node->nb, sync); +} + +/* + * indx_read + * + * If ntfs_readdir calls this function + * inode is shared locked and no ni_lock. + * Use rw_semaphore for read/write access to alloc_run. 
+ */ +int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, + struct indx_node **node) +{ + int err; + struct INDEX_BUFFER *ib; + struct runs_tree *run = &indx->alloc_run; + struct rw_semaphore *lock = &indx->run_lock; + u64 vbo = (u64)vbn << indx->vbn2vbo_bits; + u32 bytes = 1u << indx->index_bits; + struct indx_node *in = *node; + const struct INDEX_NAMES *name; + + if (!in) { + in = kzalloc(sizeof(struct indx_node), GFP_NOFS); + if (!in) + return -ENOMEM; + } else { + nb_put(&in->nb); + } + + ib = in->index; + if (!ib) { + ib = kmalloc(bytes, GFP_NOFS); + if (!ib) { + err = -ENOMEM; + goto out; + } + } + + down_read(lock); + err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb); + up_read(lock); + if (!err) + goto ok; + + if (err == -E_NTFS_FIXUP) + goto ok; + + if (err != -ENOENT) + goto out; + + name = &s_index_names[indx->type]; + down_write(lock); + err = attr_load_runs_range(ni, ATTR_ALLOC, name->name, name->name_len, + run, vbo, vbo + bytes); + up_write(lock); + if (err) + goto out; + + down_read(lock); + err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb); + up_read(lock); + if (err == -E_NTFS_FIXUP) + goto ok; + + if (err) + goto out; + +ok: + if (err == -E_NTFS_FIXUP) { + ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &in->nb, 0); + err = 0; + } + + in->index = ib; + *node = in; + +out: + if (ib != in->index) + kfree(ib); + + if (*node != in) { + nb_put(&in->nb); + kfree(in); + } + + return err; +} + +/* + * indx_find - Scan NTFS directory for given entry. + */ +int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, const void *key, size_t key_len, + const void *ctx, int *diff, struct NTFS_DE **entry, + struct ntfs_fnd *fnd) +{ + int err; + struct NTFS_DE *e; + const struct INDEX_HDR *hdr; + struct indx_node *node; + + if (!root) + root = indx_get_root(&ni->dir, ni, NULL, NULL); + + if (!root) { + err = -EINVAL; + goto out; + } + + hdr = &root->ihdr; + + /* Check cache. 
*/ + e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de; + if (e && !de_is_last(e) && + !(*indx->cmp)(key, key_len, e + 1, le16_to_cpu(e->key_size), ctx)) { + *entry = e; + *diff = 0; + return 0; + } + + /* Soft finder reset. */ + fnd_clear(fnd); + + /* Lookup entry that is <= to the search value. */ + e = hdr_find_e(indx, hdr, key, key_len, ctx, diff); + if (!e) + return -EINVAL; + + if (fnd) + fnd->root_de = e; + + err = 0; + + for (;;) { + node = NULL; + if (*diff >= 0 || !de_has_vcn_ex(e)) { + *entry = e; + goto out; + } + + /* Read next level. */ + err = indx_read(indx, ni, de_get_vbn(e), &node); + if (err) + goto out; + + /* Lookup entry that is <= to the search value. */ + e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx, + diff); + if (!e) { + err = -EINVAL; + put_indx_node(node); + goto out; + } + + fnd_push(fnd, node, e); + } + +out: + return err; +} + +int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, struct NTFS_DE **entry, + struct ntfs_fnd *fnd) +{ + int err; + struct indx_node *n = NULL; + struct NTFS_DE *e; + size_t iter = 0; + int level = fnd->level; + + if (!*entry) { + /* Start find. */ + e = hdr_first_de(&root->ihdr); + if (!e) + return 0; + fnd_clear(fnd); + fnd->root_de = e; + } else if (!level) { + if (de_is_last(fnd->root_de)) { + *entry = NULL; + return 0; + } + + e = hdr_next_de(&root->ihdr, fnd->root_de); + if (!e) + return -EINVAL; + fnd->root_de = e; + } else { + n = fnd->nodes[level - 1]; + e = fnd->de[level - 1]; + + if (de_is_last(e)) + goto pop_level; + + e = hdr_next_de(&n->index->ihdr, e); + if (!e) + return -EINVAL; + + fnd->de[level - 1] = e; + } + + /* Just to avoid tree cycle. */ +next_iter: + if (iter++ >= 1000) + return -EINVAL; + + while (de_has_vcn_ex(e)) { + if (le16_to_cpu(e->size) < + sizeof(struct NTFS_DE) + sizeof(u64)) { + if (n) { + fnd_pop(fnd); + kfree(n); + } + return -EINVAL; + } + + /* Read next level. 
*/ + err = indx_read(indx, ni, de_get_vbn(e), &n); + if (err) + return err; + + /* Try next level. */ + e = hdr_first_de(&n->index->ihdr); + if (!e) { + kfree(n); + return -EINVAL; + } + + fnd_push(fnd, n, e); + } + + if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) { + *entry = e; + return 0; + } + +pop_level: + for (;;) { + if (!de_is_last(e)) + goto next_iter; + + /* Pop one level. */ + if (n) { + fnd_pop(fnd); + kfree(n); + } + + level = fnd->level; + + if (level) { + n = fnd->nodes[level - 1]; + e = fnd->de[level - 1]; + } else if (fnd->root_de) { + n = NULL; + e = fnd->root_de; + fnd->root_de = NULL; + } else { + *entry = NULL; + return 0; + } + + if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) { + *entry = e; + if (!fnd->root_de) + fnd->root_de = e; + return 0; + } + } +} + +int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, struct NTFS_DE **entry, + size_t *off, struct ntfs_fnd *fnd) +{ + int err; + struct indx_node *n = NULL; + struct NTFS_DE *e = NULL; + struct NTFS_DE *e2; + size_t bit; + CLST next_used_vbn; + CLST next_vbn; + u32 record_size = ni->mi.sbi->record_size; + + /* Use non sorted algorithm. */ + if (!*entry) { + /* This is the first call. */ + e = hdr_first_de(&root->ihdr); + if (!e) + return 0; + fnd_clear(fnd); + fnd->root_de = e; + + /* The first call with setup of initial element. */ + if (*off >= record_size) { + next_vbn = (((*off - record_size) >> indx->index_bits)) + << indx->idx2vbn_bits; + /* Jump inside cycle 'for'. */ + goto next; + } + + /* Start enumeration from root. */ + *off = 0; + } else if (!fnd->root_de) + return -EINVAL; + + for (;;) { + /* Check if current entry can be used. */ + if (e && le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) + goto ok; + + if (!fnd->level) { + /* Continue to enumerate root. 
*/ + if (!de_is_last(fnd->root_de)) { + e = hdr_next_de(&root->ihdr, fnd->root_de); + if (!e) + return -EINVAL; + fnd->root_de = e; + continue; + } + + /* Start to enumerate indexes from 0. */ + next_vbn = 0; + } else { + /* Continue to enumerate indexes. */ + e2 = fnd->de[fnd->level - 1]; + + n = fnd->nodes[fnd->level - 1]; + + if (!de_is_last(e2)) { + e = hdr_next_de(&n->index->ihdr, e2); + if (!e) + return -EINVAL; + fnd->de[fnd->level - 1] = e; + continue; + } + + /* Continue with next index. */ + next_vbn = le64_to_cpu(n->index->vbn) + + root->index_block_clst; + } + +next: + /* Release current index. */ + if (n) { + fnd_pop(fnd); + put_indx_node(n); + n = NULL; + } + + /* Skip all free indexes. */ + bit = next_vbn >> indx->idx2vbn_bits; + err = indx_used_bit(indx, ni, &bit); + if (err == -ENOENT || bit == MINUS_ONE_T) { + /* No used indexes. */ + *entry = NULL; + return 0; + } + + next_used_vbn = bit << indx->idx2vbn_bits; + + /* Read buffer into memory. */ + err = indx_read(indx, ni, next_used_vbn, &n); + if (err) + return err; + + e = hdr_first_de(&n->index->ihdr); + fnd_push(fnd, n, e); + if (!e) + return -EINVAL; + } + +ok: + /* Return offset to restore enumerator if necessary. */ + if (!n) { + /* 'e' points in root, */ + *off = PtrOffset(&root->ihdr, e); + } else { + /* 'e' points in index, */ + *off = (le64_to_cpu(n->index->vbn) << indx->vbn2vbo_bits) + + record_size + PtrOffset(&n->index->ihdr, e); + } + + *entry = e; + return 0; +} + +/* + * indx_create_allocate - Create "Allocation + Bitmap" attributes. 
+ */ +static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, + CLST *vbn) +{ + int err = -ENOMEM; + struct ntfs_sb_info *sbi = ni->mi.sbi; + struct ATTRIB *bitmap; + struct ATTRIB *alloc; + u32 data_size = 1u << indx->index_bits; + u32 alloc_size = ntfs_up_cluster(sbi, data_size); + CLST len = alloc_size >> sbi->cluster_bits; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + CLST alen; + struct runs_tree run; + + run_init(&run); + + err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, 0, &alen, 0, + NULL); + if (err) + goto out; + + err = ni_insert_nonresident(ni, ATTR_ALLOC, in->name, in->name_len, + &run, 0, len, 0, &alloc, NULL); + if (err) + goto out1; + + alloc->nres.valid_size = alloc->nres.data_size = cpu_to_le64(data_size); + + err = ni_insert_resident(ni, bitmap_size(1), ATTR_BITMAP, in->name, + in->name_len, &bitmap, NULL, NULL); + if (err) + goto out2; + + if (in->name == I30_NAME) { + ni->vfs_inode.i_size = data_size; + inode_set_bytes(&ni->vfs_inode, alloc_size); + } + + memcpy(&indx->alloc_run, &run, sizeof(run)); + + *vbn = 0; + + return 0; + +out2: + mi_remove_attr(NULL, &ni->mi, alloc); + +out1: + run_deallocate(sbi, &run, false); + +out: + return err; +} + +/* + * indx_add_allocate - Add clusters to index. 
+ */ +static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, + CLST *vbn) +{ + int err; + size_t bit; + u64 data_size; + u64 bmp_size, bmp_size_v; + struct ATTRIB *bmp, *alloc; + struct mft_inode *mi; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + + err = indx_find_free(indx, ni, &bit, &bmp); + if (err) + goto out1; + + if (bit != MINUS_ONE_T) { + bmp = NULL; + } else { + if (bmp->non_res) { + bmp_size = le64_to_cpu(bmp->nres.data_size); + bmp_size_v = le64_to_cpu(bmp->nres.valid_size); + } else { + bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size); + } + + bit = bmp_size << 3; + } + + data_size = (u64)(bit + 1) << indx->index_bits; + + if (bmp) { + /* Increase bitmap. */ + err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, + &indx->bitmap_run, bitmap_size(bit + 1), + NULL, true, NULL); + if (err) + goto out1; + } + + alloc = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, in->name, in->name_len, + NULL, &mi); + if (!alloc) { + err = -EINVAL; + if (bmp) + goto out2; + goto out1; + } + + /* Increase allocation. */ + err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, + &indx->alloc_run, data_size, &data_size, true, + NULL); + if (err) { + if (bmp) + goto out2; + goto out1; + } + + *vbn = bit << indx->idx2vbn_bits; + + return 0; + +out2: + /* Ops. No space? */ + attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, + &indx->bitmap_run, bmp_size, &bmp_size_v, false, NULL); + +out1: + return err; +} + +/* + * indx_insert_into_root - Attempt to insert an entry into the index root. + * + * @undo - True if we undoing previous remove. + * If necessary, it will twiddle the index b-tree. 
+ *
+ * Two paths: if the resident root attribute can be resized to hold
+ * @new_de, the entry is inserted directly into the root. Otherwise all
+ * non-terminating root entries are moved into a newly allocated index
+ * buffer, the root is reduced to a single terminating entry that points
+ * to that buffer, and @new_de is inserted into the new buffer.
+ *
+ * Return: 0 on success, -EINVAL, -ENOMEM or -EOPNOTSUPP, or the error of
+ * the allocation helpers / indx_new() / indx_insert_entry().
+ */
+static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni,
+				 const struct NTFS_DE *new_de,
+				 struct NTFS_DE *root_de, const void *ctx,
+				 struct ntfs_fnd *fnd, bool undo)
+{
+	int err = 0;
+	struct NTFS_DE *e, *e0, *re;
+	struct mft_inode *mi;
+	struct ATTRIB *attr;
+	struct INDEX_HDR *hdr;
+	struct indx_node *n;
+	CLST new_vbn;
+	__le64 *sub_vbn, t_vbn;
+	u16 new_de_size;
+	u32 hdr_used, hdr_total, asize, to_move;
+	u32 root_size, new_root_size;
+	struct ntfs_sb_info *sbi;
+	int ds_root;
+	struct INDEX_ROOT *root, *a_root;
+
+	/* Get the record this root placed in. */
+	root = indx_get_root(indx, ni, &attr, &mi);
+	if (!root)
+		return -EINVAL;
+
+	/*
+	 * Try easy case:
+	 * hdr_insert_de will succeed if there's
+	 * room in the root for the new entry.
+	 */
+	hdr = &root->ihdr;
+	sbi = ni->mi.sbi;
+	new_de_size = le16_to_cpu(new_de->size);
+	hdr_used = le32_to_cpu(hdr->used);
+	hdr_total = le32_to_cpu(hdr->total);
+	asize = le32_to_cpu(attr->size);
+	root_size = le32_to_cpu(attr->res.data_size);
+
+	ds_root = new_de_size + hdr_used - hdr_total; /* bytes the root must grow by */
+
+	/* If 'undo' is set then reduce requirements (skip the size limit check). */
+	if ((undo || asize + ds_root < sbi->max_bytes_per_attr) &&
+	    mi_resize_attr(mi, attr, ds_root)) {
+		hdr->total = cpu_to_le32(hdr_total + ds_root);
+		e = hdr_insert_de(indx, hdr, new_de, root_de, ctx);
+		WARN_ON(!e);
+		fnd_clear(fnd);
+		fnd->root_de = e;
+
+		return 0;
+	}
+
+	/*
+	 * Make a copy of root attribute to restore if error.
+	 * Note: the copy is 'asize' bytes of the whole attribute ('attr'),
+	 * although it is kept in a 'struct INDEX_ROOT *' variable.
+	 */
+	a_root = kmemdup(attr, asize, GFP_NOFS);
+	if (!a_root)
+		return -ENOMEM;
+
+	/*
+	 * Copy all the non-end entries from
+	 * the index root to the new buffer.
+	 */
+	to_move = 0;
+	e0 = hdr_first_de(hdr);
+
+	/* Calculate the size to copy; the loop leaves 'e' at the last entry. */
+	for (e = e0;; e = hdr_next_de(hdr, e)) {
+		if (!e) {
+			err = -EINVAL;
+			goto out_free_root;
+		}
+
+		if (de_is_last(e))
+			break;
+		to_move += le16_to_cpu(e->size);
+	}
+
+	if (!to_move) {
+		re = NULL;
+	} else {
+		re = kmemdup(e0, to_move, GFP_NOFS);
+		if (!re) {
+			err = -ENOMEM;
+			goto out_free_root;
+		}
+	}
+
+	sub_vbn = NULL;
+	if (de_has_vcn(e)) {
+		t_vbn = de_get_vbn_le(e);
+		sub_vbn = &t_vbn;
+	}
+
+	new_root_size = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE) +
+			sizeof(u64);
+	ds_root = new_root_size - root_size;
+
+	if (ds_root > 0 && asize + ds_root > sbi->max_bytes_per_attr) {
+		/* Make root external. */
+		err = -EOPNOTSUPP;
+		goto out_free_re;
+	}
+
+	if (ds_root)
+		mi_resize_attr(mi, attr, ds_root);
+
+	/* Fill first entry (vcn will be set later). */
+	e = (struct NTFS_DE *)(root + 1);
+	memset(e, 0, sizeof(struct NTFS_DE));
+	e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64));
+	e->flags = NTFS_IE_HAS_SUBNODES | NTFS_IE_LAST;
+
+	hdr->flags = 1;
+	hdr->used = hdr->total =
+		cpu_to_le32(new_root_size - offsetof(struct INDEX_ROOT, ihdr));
+
+	fnd->root_de = hdr_first_de(hdr);
+	mi->dirty = true;
+
+	/* Create alloc and bitmap attributes (if not). */
+	err = run_is_empty(&indx->alloc_run)
+		      ? indx_create_allocate(indx, ni, &new_vbn)
+		      : indx_add_allocate(indx, ni, &new_vbn);
+
+	/* Layout of record may be changed, so rescan root (even if 'err' is set). */
+	root = indx_get_root(indx, ni, &attr, &mi);
+	if (!root) {
+		/* Bug? */
+		ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+		err = -EINVAL;
+		goto out_free_re;
+	}
+
+	if (err) {
+		/* Restore root from the copy taken above. */
+		if (mi_resize_attr(mi, attr, -ds_root))
+			memcpy(attr, a_root, asize);
+		else {
+			/* Bug? */
+			ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+		}
+		goto out_free_re;
+	}
+
+	/* Store the new block number right behind the single root entry. */
+	e = (struct NTFS_DE *)(root + 1);
+	*(__le64 *)(e + 1) = cpu_to_le64(new_vbn);
+	mi->dirty = true;
+
+	/* Now we can create/format the new buffer and copy the entries into. */
+	n = indx_new(indx, ni, new_vbn, sub_vbn);
+	if (IS_ERR(n)) {
+		err = PTR_ERR(n);
+		goto out_free_re;
+	}
+
+	hdr = &n->index->ihdr;
+	hdr_used = le32_to_cpu(hdr->used);
+	hdr_total = le32_to_cpu(hdr->total);
+
+	/* Copy root entries into new buffer. */
+	hdr_insert_head(hdr, re, to_move);
+
+	/* Update bitmap attribute. */
+	indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits);
+
+	/*
+	 * Check if we can insert new entry new index buffer.
+	 * NOTE(review): 'hdr_used' was read before hdr_insert_head() above;
+	 * confirm whether the moved entries are meant to be counted here.
+	 */
+	if (hdr_used + new_de_size > hdr_total) {
+		/*
+		 * This occurs if MFT record is the same or bigger than index
+		 * buffer. Move all root new index and have no space to add
+		 * new entry classic case when MFT record is 1K and index
+		 * buffer 4K the problem should not occurs.
+		 */
+		kfree(re);
+		indx_write(indx, ni, n, 0);
+
+		put_indx_node(n);
+		fnd_clear(fnd);
+		err = indx_insert_entry(indx, ni, new_de, ctx, fnd, undo);
+		goto out_free_root;
+	}
+
+	/*
+	 * Now root is a parent for new index buffer.
+	 * Insert NewEntry a new buffer.
+	 */
+	e = hdr_insert_de(indx, hdr, new_de, NULL, ctx);
+	if (!e) {
+		err = -EINVAL;
+		goto out_put_n;
+	}
+	fnd_push(fnd, n, e);
+
+	/* Just write updates index into disk. */
+	indx_write(indx, ni, n, 0);
+
+	n = NULL; /* pushed into 'fnd' above, so it is not put below */
+
+out_put_n:
+	put_indx_node(n);
+out_free_re:
+	kfree(re);
+out_free_root:
+	kfree(a_root);
+	return err;
+}
+
+/*
+ * indx_insert_into_buffer
+ *
+ * Attempt to insert an entry into an Index Allocation Buffer.
+ * If necessary, it will split the buffer.
+ */ +static int +indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, + struct INDEX_ROOT *root, const struct NTFS_DE *new_de, + const void *ctx, int level, struct ntfs_fnd *fnd) +{ + int err; + const struct NTFS_DE *sp; + struct NTFS_DE *e, *de_t, *up_e = NULL; + struct indx_node *n2 = NULL; + struct indx_node *n1 = fnd->nodes[level]; + struct INDEX_HDR *hdr1 = &n1->index->ihdr; + struct INDEX_HDR *hdr2; + u32 to_copy, used; + CLST new_vbn; + __le64 t_vbn, *sub_vbn; + u16 sp_size; + + /* Try the most easy case. */ + e = fnd->level - 1 == level ? fnd->de[level] : NULL; + e = hdr_insert_de(indx, hdr1, new_de, e, ctx); + fnd->de[level] = e; + if (e) { + /* Just write updated index into disk. */ + indx_write(indx, ni, n1, 0); + return 0; + } + + /* + * No space to insert into buffer. Split it. + * To split we: + * - Save split point ('cause index buffers will be changed) + * - Allocate NewBuffer and copy all entries <= sp into new buffer + * - Remove all entries (sp including) from TargetBuffer + * - Insert NewEntry into left or right buffer (depending on sp <=> + * NewEntry) + * - Insert sp into parent buffer (or root) + * - Make sp a parent for new buffer + */ + sp = hdr_find_split(hdr1); + if (!sp) + return -EINVAL; + + sp_size = le16_to_cpu(sp->size); + up_e = kmalloc(sp_size + sizeof(u64), GFP_NOFS); + if (!up_e) + return -ENOMEM; + memcpy(up_e, sp, sp_size); + + if (!hdr1->flags) { + up_e->flags |= NTFS_IE_HAS_SUBNODES; + up_e->size = cpu_to_le16(sp_size + sizeof(u64)); + sub_vbn = NULL; + } else { + t_vbn = de_get_vbn_le(up_e); + sub_vbn = &t_vbn; + } + + /* Allocate on disk a new index allocation buffer. */ + err = indx_add_allocate(indx, ni, &new_vbn); + if (err) + goto out; + + /* Allocate and format memory a new index buffer. */ + n2 = indx_new(indx, ni, new_vbn, sub_vbn); + if (IS_ERR(n2)) { + err = PTR_ERR(n2); + goto out; + } + + hdr2 = &n2->index->ihdr; + + /* Make sp a parent for new buffer. 
*/ + de_set_vbn(up_e, new_vbn); + + /* Copy all the entries <= sp into the new buffer. */ + de_t = hdr_first_de(hdr1); + to_copy = PtrOffset(de_t, sp); + hdr_insert_head(hdr2, de_t, to_copy); + + /* Remove all entries (sp including) from hdr1. */ + used = le32_to_cpu(hdr1->used) - to_copy - sp_size; + memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off)); + hdr1->used = cpu_to_le32(used); + + /* + * Insert new entry into left or right buffer + * (depending on sp <=> new_de). + */ + hdr_insert_de(indx, + (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size), + up_e + 1, le16_to_cpu(up_e->key_size), + ctx) < 0 + ? hdr2 + : hdr1, + new_de, NULL, ctx); + + indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); + + indx_write(indx, ni, n1, 0); + indx_write(indx, ni, n2, 0); + + put_indx_node(n2); + + /* + * We've finished splitting everybody, so we are ready to + * insert the promoted entry into the parent. + */ + if (!level) { + /* Insert in root. */ + err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0); + if (err) + goto out; + } else { + /* + * The target buffer's parent is another index buffer. + * TODO: Remove recursion. + */ + err = indx_insert_into_buffer(indx, ni, root, up_e, ctx, + level - 1, fnd); + if (err) + goto out; + } + +out: + kfree(up_e); + + return err; +} + +/* + * indx_insert_entry - Insert new entry into index. + * + * @undo - True if we undoing previous remove. 
+ */ +int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct NTFS_DE *new_de, const void *ctx, + struct ntfs_fnd *fnd, bool undo) +{ + int err; + int diff; + struct NTFS_DE *e; + struct ntfs_fnd *fnd_a = NULL; + struct INDEX_ROOT *root; + + if (!fnd) { + fnd_a = fnd_get(); + if (!fnd_a) { + err = -ENOMEM; + goto out1; + } + fnd = fnd_a; + } + + root = indx_get_root(indx, ni, NULL, NULL); + if (!root) { + err = -EINVAL; + goto out; + } + + if (fnd_is_empty(fnd)) { + /* + * Find the spot the tree where we want to + * insert the new entry. + */ + err = indx_find(indx, ni, root, new_de + 1, + le16_to_cpu(new_de->key_size), ctx, &diff, &e, + fnd); + if (err) + goto out; + + if (!diff) { + err = -EEXIST; + goto out; + } + } + + if (!fnd->level) { + /* + * The root is also a leaf, so we'll insert the + * new entry into it. + */ + err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx, + fnd, undo); + if (err) + goto out; + } else { + /* + * Found a leaf buffer, so we'll insert the new entry into it. + */ + err = indx_insert_into_buffer(indx, ni, root, new_de, ctx, + fnd->level - 1, fnd); + if (err) + goto out; + } + +out: + fnd_put(fnd_a); +out1: + return err; +} + +/* + * indx_find_buffer - Locate a buffer from the tree. + */ +static struct indx_node *indx_find_buffer(struct ntfs_index *indx, + struct ntfs_inode *ni, + const struct INDEX_ROOT *root, + __le64 vbn, struct indx_node *n) +{ + int err; + const struct NTFS_DE *e; + struct indx_node *r; + const struct INDEX_HDR *hdr = n ? &n->index->ihdr : &root->ihdr; + + /* Step 1: Scan one level. */ + for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) { + if (!e) + return ERR_PTR(-EINVAL); + + if (de_has_vcn(e) && vbn == de_get_vbn_le(e)) + return n; + + if (de_is_last(e)) + break; + } + + /* Step2: Do recursion. 
*/ + e = Add2Ptr(hdr, le32_to_cpu(hdr->de_off)); + for (;;) { + if (de_has_vcn_ex(e)) { + err = indx_read(indx, ni, de_get_vbn(e), &n); + if (err) + return ERR_PTR(err); + + r = indx_find_buffer(indx, ni, root, vbn, n); + if (r) + return r; + } + + if (de_is_last(e)) + break; + + e = Add2Ptr(e, le16_to_cpu(e->size)); + } + + return NULL; +} + +/* + * indx_shrink - Deallocate unused tail indexes. + */ +static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, + size_t bit) +{ + int err = 0; + u64 bpb, new_data; + size_t nbits; + struct ATTRIB *b; + struct ATTR_LIST_ENTRY *le = NULL; + const struct INDEX_NAMES *in = &s_index_names[indx->type]; + + b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, + NULL, NULL); + + if (!b) + return -ENOENT; + + if (!b->non_res) { + unsigned long pos; + const unsigned long *bm = resident_data(b); + + nbits = (size_t)le32_to_cpu(b->res.data_size) * 8; + + if (bit >= nbits) + return 0; + + pos = find_next_bit(bm, nbits, bit); + if (pos < nbits) + return 0; + } else { + size_t used = MINUS_ONE_T; + + nbits = le64_to_cpu(b->nres.data_size) * 8; + + if (bit >= nbits) + return 0; + + err = scan_nres_bitmap(ni, b, indx, bit, &scan_for_used, &used); + if (err) + return err; + + if (used != MINUS_ONE_T) + return 0; + } + + new_data = (u64)bit << indx->index_bits; + + err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, + &indx->alloc_run, new_data, &new_data, false, NULL); + if (err) + return err; + + bpb = bitmap_size(bit); + if (bpb * 8 == nbits) + return 0; + + err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, + &indx->bitmap_run, bpb, &bpb, false, NULL); + + return err; +} + +static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct NTFS_DE *e, bool trim) +{ + int err; + struct indx_node *n; + struct INDEX_HDR *hdr; + CLST vbn = de_get_vbn(e); + size_t i; + + err = indx_read(indx, ni, vbn, &n); + if (err) + return err; + + hdr = &n->index->ihdr; + /* 
First, recurse into the children, if any. */ + if (hdr_has_subnode(hdr)) { + for (e = hdr_first_de(hdr); e; e = hdr_next_de(hdr, e)) { + indx_free_children(indx, ni, e, false); + if (de_is_last(e)) + break; + } + } + + put_indx_node(n); + + i = vbn >> indx->idx2vbn_bits; + /* + * We've gotten rid of the children; add this buffer to the free list. + */ + indx_mark_free(indx, ni, i); + + if (!trim) + return 0; + + /* + * If there are no used indexes after current free index + * then we can truncate allocation and bitmap. + * Use bitmap to estimate the case. + */ + indx_shrink(indx, ni, i + 1); + return 0; +} + +/* + * indx_get_entry_to_replace + * + * Find a replacement entry for a deleted entry. + * Always returns a node entry: + * NTFS_IE_HAS_SUBNODES is set the flags and the size includes the sub_vcn. + */ +static int indx_get_entry_to_replace(struct ntfs_index *indx, + struct ntfs_inode *ni, + const struct NTFS_DE *de_next, + struct NTFS_DE **de_to_replace, + struct ntfs_fnd *fnd) +{ + int err; + int level = -1; + CLST vbn; + struct NTFS_DE *e, *te, *re; + struct indx_node *n; + struct INDEX_BUFFER *ib; + + *de_to_replace = NULL; + + /* Find first leaf entry down from de_next. */ + vbn = de_get_vbn(de_next); + for (;;) { + n = NULL; + err = indx_read(indx, ni, vbn, &n); + if (err) + goto out; + + e = hdr_first_de(&n->index->ihdr); + fnd_push(fnd, n, e); + + if (!de_is_last(e)) { + /* + * This buffer is non-empty, so its first entry + * could be used as the replacement entry. + */ + level = fnd->level - 1; + } + + if (!de_has_vcn(e)) + break; + + /* This buffer is a node. Continue to go down. */ + vbn = de_get_vbn(e); + } + + if (level == -1) + goto out; + + n = fnd->nodes[level]; + te = hdr_first_de(&n->index->ihdr); + /* Copy the candidate entry into the replacement entry buffer. 
*/ + re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS); + if (!re) { + err = -ENOMEM; + goto out; + } + + *de_to_replace = re; + memcpy(re, te, le16_to_cpu(te->size)); + + if (!de_has_vcn(re)) { + /* + * The replacement entry we found doesn't have a sub_vcn. + * increase its size to hold one. + */ + le16_add_cpu(&re->size, sizeof(u64)); + re->flags |= NTFS_IE_HAS_SUBNODES; + } else { + /* + * The replacement entry we found was a node entry, which + * means that all its child buffers are empty. Return them + * to the free pool. + */ + indx_free_children(indx, ni, te, true); + } + + /* + * Expunge the replacement entry from its former location, + * and then write that buffer. + */ + ib = n->index; + e = hdr_delete_de(&ib->ihdr, te); + + fnd->de[level] = e; + indx_write(indx, ni, n, 0); + + /* Check to see if this action created an empty leaf. */ + if (ib_is_leaf(ib) && ib_is_empty(ib)) + return 0; + +out: + fnd_clear(fnd); + return err; +} + +/* + * indx_delete_entry - Delete an entry from the index. + */ +int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, + const void *key, u32 key_len, const void *ctx) +{ + int err, diff; + struct INDEX_ROOT *root; + struct INDEX_HDR *hdr; + struct ntfs_fnd *fnd, *fnd2; + struct INDEX_BUFFER *ib; + struct NTFS_DE *e, *re, *next, *prev, *me; + struct indx_node *n, *n2d = NULL; + __le64 sub_vbn; + int level, level2; + struct ATTRIB *attr; + struct mft_inode *mi; + u32 e_size, root_size, new_root_size; + size_t trim_bit; + const struct INDEX_NAMES *in; + + fnd = fnd_get(); + if (!fnd) { + err = -ENOMEM; + goto out2; + } + + fnd2 = fnd_get(); + if (!fnd2) { + err = -ENOMEM; + goto out1; + } + + root = indx_get_root(indx, ni, &attr, &mi); + if (!root) { + err = -EINVAL; + goto out; + } + + /* Locate the entry to remove. 
*/ + err = indx_find(indx, ni, root, key, key_len, ctx, &diff, &e, fnd); + if (err) + goto out; + + if (!e || diff) { + err = -ENOENT; + goto out; + } + + level = fnd->level; + + if (level) { + n = fnd->nodes[level - 1]; + e = fnd->de[level - 1]; + ib = n->index; + hdr = &ib->ihdr; + } else { + hdr = &root->ihdr; + e = fnd->root_de; + n = NULL; + } + + e_size = le16_to_cpu(e->size); + + if (!de_has_vcn_ex(e)) { + /* The entry to delete is a leaf, so we can just rip it out. */ + hdr_delete_de(hdr, e); + + if (!level) { + hdr->total = hdr->used; + + /* Shrink resident root attribute. */ + mi_resize_attr(mi, attr, 0 - e_size); + goto out; + } + + indx_write(indx, ni, n, 0); + + /* + * Check to see if removing that entry made + * the leaf empty. + */ + if (ib_is_leaf(ib) && ib_is_empty(ib)) { + fnd_pop(fnd); + fnd_push(fnd2, n, e); + } + } else { + /* + * The entry we wish to delete is a node buffer, so we + * have to find a replacement for it. + */ + next = de_get_next(e); + + err = indx_get_entry_to_replace(indx, ni, next, &re, fnd2); + if (err) + goto out; + + if (re) { + de_set_vbn_le(re, de_get_vbn_le(e)); + hdr_delete_de(hdr, e); + + err = level ? indx_insert_into_buffer(indx, ni, root, + re, ctx, + fnd->level - 1, + fnd) + : indx_insert_into_root(indx, ni, re, e, + ctx, fnd, 0); + kfree(re); + + if (err) + goto out; + } else { + /* + * There is no replacement for the current entry. + * This means that the subtree rooted at its node + * is empty, and can be deleted, which turn means + * that the node can just inherit the deleted + * entry sub_vcn. + */ + indx_free_children(indx, ni, next, true); + + de_set_vbn_le(next, de_get_vbn_le(e)); + hdr_delete_de(hdr, e); + if (level) { + indx_write(indx, ni, n, 0); + } else { + hdr->total = hdr->used; + + /* Shrink resident root attribute. */ + mi_resize_attr(mi, attr, 0 - e_size); + } + } + } + + /* Delete a branch of tree. */ + if (!fnd2 || !fnd2->level) + goto out; + + /* Reinit root 'cause it can be changed. 
*/ + root = indx_get_root(indx, ni, &attr, &mi); + if (!root) { + err = -EINVAL; + goto out; + } + + n2d = NULL; + sub_vbn = fnd2->nodes[0]->index->vbn; + level2 = 0; + level = fnd->level; + + hdr = level ? &fnd->nodes[level - 1]->index->ihdr : &root->ihdr; + + /* Scan current level. */ + for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) { + if (!e) { + err = -EINVAL; + goto out; + } + + if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e)) + break; + + if (de_is_last(e)) { + e = NULL; + break; + } + } + + if (!e) { + /* Do slow search from root. */ + struct indx_node *in; + + fnd_clear(fnd); + + in = indx_find_buffer(indx, ni, root, sub_vbn, NULL); + if (IS_ERR(in)) { + err = PTR_ERR(in); + goto out; + } + + if (in) + fnd_push(fnd, in, NULL); + } + + /* Merge fnd2 -> fnd. */ + for (level = 0; level < fnd2->level; level++) { + fnd_push(fnd, fnd2->nodes[level], fnd2->de[level]); + fnd2->nodes[level] = NULL; + } + fnd2->level = 0; + + hdr = NULL; + for (level = fnd->level; level; level--) { + struct indx_node *in = fnd->nodes[level - 1]; + + ib = in->index; + if (ib_is_empty(ib)) { + sub_vbn = ib->vbn; + } else { + hdr = &ib->ihdr; + n2d = in; + level2 = level; + break; + } + } + + if (!hdr) + hdr = &root->ihdr; + + e = hdr_first_de(hdr); + if (!e) { + err = -EINVAL; + goto out; + } + + if (hdr != &root->ihdr || !de_is_last(e)) { + prev = NULL; + while (!de_is_last(e)) { + if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e)) + break; + prev = e; + e = hdr_next_de(hdr, e); + if (!e) { + err = -EINVAL; + goto out; + } + } + + if (sub_vbn != de_get_vbn_le(e)) { + /* + * Didn't find the parent entry, although this buffer + * is the parent trail. Something is corrupt. + */ + err = -EINVAL; + goto out; + } + + if (de_is_last(e)) { + /* + * Since we can't remove the end entry, we'll remove + * its predecessor instead. This means we have to + * transfer the predecessor's sub_vcn to the end entry. + * Note: This index block is not empty, so the + * predecessor must exist. 
+ */ + if (!prev) { + err = -EINVAL; + goto out; + } + + if (de_has_vcn(prev)) { + de_set_vbn_le(e, de_get_vbn_le(prev)); + } else if (de_has_vcn(e)) { + le16_sub_cpu(&e->size, sizeof(u64)); + e->flags &= ~NTFS_IE_HAS_SUBNODES; + le32_sub_cpu(&hdr->used, sizeof(u64)); + } + e = prev; + } + + /* + * Copy the current entry into a temporary buffer (stripping + * off its down-pointer, if any) and delete it from the current + * buffer or root, as appropriate. + */ + e_size = le16_to_cpu(e->size); + me = kmemdup(e, e_size, GFP_NOFS); + if (!me) { + err = -ENOMEM; + goto out; + } + + if (de_has_vcn(me)) { + me->flags &= ~NTFS_IE_HAS_SUBNODES; + le16_sub_cpu(&me->size, sizeof(u64)); + } + + hdr_delete_de(hdr, e); + + if (hdr == &root->ihdr) { + level = 0; + hdr->total = hdr->used; + + /* Shrink resident root attribute. */ + mi_resize_attr(mi, attr, 0 - e_size); + } else { + indx_write(indx, ni, n2d, 0); + level = level2; + } + + /* Mark unused buffers as free. */ + trim_bit = -1; + for (; level < fnd->level; level++) { + ib = fnd->nodes[level]->index; + if (ib_is_empty(ib)) { + size_t k = le64_to_cpu(ib->vbn) >> + indx->idx2vbn_bits; + + indx_mark_free(indx, ni, k); + if (k < trim_bit) + trim_bit = k; + } + } + + fnd_clear(fnd); + /*fnd->root_de = NULL;*/ + + /* + * Re-insert the entry into the tree. + * Find the spot the tree where we want to insert the new entry. + */ + err = indx_insert_entry(indx, ni, me, ctx, fnd, 0); + kfree(me); + if (err) + goto out; + + if (trim_bit != -1) + indx_shrink(indx, ni, trim_bit); + } else { + /* + * This tree needs to be collapsed down to an empty root. + * Recreate the index root as an empty leaf and free all + * the bits the index allocation bitmap. 
+ */ + fnd_clear(fnd); + fnd_clear(fnd2); + + in = &s_index_names[indx->type]; + + err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, + &indx->alloc_run, 0, NULL, false, NULL); + err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len, + false, NULL); + run_close(&indx->alloc_run); + + err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, + &indx->bitmap_run, 0, NULL, false, NULL); + err = ni_remove_attr(ni, ATTR_BITMAP, in->name, in->name_len, + false, NULL); + run_close(&indx->bitmap_run); + + root = indx_get_root(indx, ni, &attr, &mi); + if (!root) { + err = -EINVAL; + goto out; + } + + root_size = le32_to_cpu(attr->res.data_size); + new_root_size = + sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE); + + if (new_root_size != root_size && + !mi_resize_attr(mi, attr, new_root_size - root_size)) { + err = -EINVAL; + goto out; + } + + /* Fill first entry. */ + e = (struct NTFS_DE *)(root + 1); + e->ref.low = 0; + e->ref.high = 0; + e->ref.seq = 0; + e->size = cpu_to_le16(sizeof(struct NTFS_DE)); + e->flags = NTFS_IE_LAST; // 0x02 + e->key_size = 0; + e->res = 0; + + hdr = &root->ihdr; + hdr->flags = 0; + hdr->used = hdr->total = cpu_to_le32( + new_root_size - offsetof(struct INDEX_ROOT, ihdr)); + mi->dirty = true; + } + +out: + fnd_put(fnd2); +out1: + fnd_put(fnd); +out2: + return err; +} + +/* + * Update duplicated information in directory entry + * 'dup' - info from MFT record + */ +int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, + const struct ATTR_FILE_NAME *fname, + const struct NTFS_DUP_INFO *dup, int sync) +{ + int err, diff; + struct NTFS_DE *e = NULL; + struct ATTR_FILE_NAME *e_fname; + struct ntfs_fnd *fnd; + struct INDEX_ROOT *root; + struct mft_inode *mi; + struct ntfs_index *indx = &ni->dir; + + fnd = fnd_get(); + if (!fnd) + return -ENOMEM; + + root = indx_get_root(indx, ni, NULL, &mi); + if (!root) { + err = -EINVAL; + goto out; + } + + /* Find entry in directory. 
*/ + err = indx_find(indx, ni, root, fname, fname_full_size(fname), sbi, + &diff, &e, fnd); + if (err) + goto out; + + if (!e) { + err = -EINVAL; + goto out; + } + + if (diff) { + err = -EINVAL; + goto out; + } + + e_fname = (struct ATTR_FILE_NAME *)(e + 1); + + if (!memcmp(&e_fname->dup, dup, sizeof(*dup))) { + /* + * Nothing to update in index! Try to avoid this call. + */ + goto out; + } + + memcpy(&e_fname->dup, dup, sizeof(*dup)); + + if (fnd->level) { + /* Directory entry in index. */ + err = indx_write(indx, ni, fnd->nodes[fnd->level - 1], sync); + } else { + /* Directory entry in directory MFT record. */ + mi->dirty = true; + if (sync) + err = mi_write(mi, 1); + else + mark_inode_dirty(&ni->vfs_inode); + } + +out: + fnd_put(fnd); + return err; +} diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c new file mode 100644 index 000000000000..db2a5a4c38e4 --- /dev/null +++ b/fs/ntfs3/inode.c @@ -0,0 +1,1957 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/iversion.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/nls.h> +#include <linux/uio.h> +#include <linux/writeback.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* + * ntfs_read_mft - Read record and parses MFT. 
+ */ +static struct inode *ntfs_read_mft(struct inode *inode, + const struct cpu_str *name, + const struct MFT_REF *ref) +{ + int err = 0; + struct ntfs_inode *ni = ntfs_i(inode); + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + mode_t mode = 0; + struct ATTR_STD_INFO5 *std5 = NULL; + struct ATTR_LIST_ENTRY *le; + struct ATTRIB *attr; + bool is_match = false; + bool is_root = false; + bool is_dir; + unsigned long ino = inode->i_ino; + u32 rp_fa = 0, asize, t32; + u16 roff, rsize, names = 0; + const struct ATTR_FILE_NAME *fname = NULL; + const struct INDEX_ROOT *root; + struct REPARSE_DATA_BUFFER rp; // 0x18 bytes + u64 t64; + struct MFT_REC *rec; + struct runs_tree *run; + + inode->i_op = NULL; + /* Setup 'uid' and 'gid' */ + inode->i_uid = sbi->options.fs_uid; + inode->i_gid = sbi->options.fs_gid; + + err = mi_init(&ni->mi, sbi, ino); + if (err) + goto out; + + if (!sbi->mft.ni && ino == MFT_REC_MFT && !sb->s_root) { + t64 = sbi->mft.lbo >> sbi->cluster_bits; + t32 = bytes_to_cluster(sbi, MFT_REC_VOL * sbi->record_size); + sbi->mft.ni = ni; + init_rwsem(&ni->file.run_lock); + + if (!run_add_entry(&ni->file.run, 0, t64, t32, true)) { + err = -ENOMEM; + goto out; + } + } + + err = mi_read(&ni->mi, ino == MFT_REC_MFT); + + if (err) + goto out; + + rec = ni->mi.mrec; + + if (sbi->flags & NTFS_FLAGS_LOG_REPLAYING) { + ; + } else if (ref->seq != rec->seq) { + err = -EINVAL; + ntfs_err(sb, "MFT: r=%lx, expect seq=%x instead of %x!", ino, + le16_to_cpu(ref->seq), le16_to_cpu(rec->seq)); + goto out; + } else if (!is_rec_inuse(rec)) { + err = -EINVAL; + ntfs_err(sb, "Inode r=%x is not in use!", (u32)ino); + goto out; + } + + if (le32_to_cpu(rec->total) != sbi->record_size) { + /* Bad inode? */ + err = -EINVAL; + goto out; + } + + if (!is_rec_base(rec)) + goto Ok; + + /* Record should contain $I30 root. */ + is_dir = rec->flags & RECORD_FLAG_DIR; + + inode->i_generation = le16_to_cpu(rec->seq); + + /* Enumerate all struct Attributes MFT. 
*/ + le = NULL; + attr = NULL; + + /* + * To reduce tab pressure use goto instead of + * while( (attr = ni_enum_attr_ex(ni, attr, &le, NULL) )) + */ +next_attr: + run = NULL; + err = -EINVAL; + attr = ni_enum_attr_ex(ni, attr, &le, NULL); + if (!attr) + goto end_enum; + + if (le && le->vcn) { + /* This is non primary attribute segment. Ignore if not MFT. */ + if (ino != MFT_REC_MFT || attr->type != ATTR_DATA) + goto next_attr; + + run = &ni->file.run; + asize = le32_to_cpu(attr->size); + goto attr_unpack_run; + } + + roff = attr->non_res ? 0 : le16_to_cpu(attr->res.data_off); + rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size); + asize = le32_to_cpu(attr->size); + + switch (attr->type) { + case ATTR_STD: + if (attr->non_res || + asize < sizeof(struct ATTR_STD_INFO) + roff || + rsize < sizeof(struct ATTR_STD_INFO)) + goto out; + + if (std5) + goto next_attr; + + std5 = Add2Ptr(attr, roff); + +#ifdef STATX_BTIME + nt2kernel(std5->cr_time, &ni->i_crtime); +#endif + nt2kernel(std5->a_time, &inode->i_atime); + nt2kernel(std5->c_time, &inode->i_ctime); + nt2kernel(std5->m_time, &inode->i_mtime); + + ni->std_fa = std5->fa; + + if (asize >= sizeof(struct ATTR_STD_INFO5) + roff && + rsize >= sizeof(struct ATTR_STD_INFO5)) + ni->std_security_id = std5->security_id; + goto next_attr; + + case ATTR_LIST: + if (attr->name_len || le || ino == MFT_REC_LOG) + goto out; + + err = ntfs_load_attr_list(ni, attr); + if (err) + goto out; + + le = NULL; + attr = NULL; + goto next_attr; + + case ATTR_NAME: + if (attr->non_res || asize < SIZEOF_ATTRIBUTE_FILENAME + roff || + rsize < SIZEOF_ATTRIBUTE_FILENAME) + goto out; + + fname = Add2Ptr(attr, roff); + if (fname->type == FILE_NAME_DOS) + goto next_attr; + + names += 1; + if (name && name->len == fname->name_len && + !ntfs_cmp_names_cpu(name, (struct le_str *)&fname->name_len, + NULL, false)) + is_match = true; + + goto next_attr; + + case ATTR_DATA: + if (is_dir) { + /* Ignore data attribute in dir record. 
*/ + goto next_attr; + } + + if (ino == MFT_REC_BADCLUST && !attr->non_res) + goto next_attr; + + if (attr->name_len && + ((ino != MFT_REC_BADCLUST || !attr->non_res || + attr->name_len != ARRAY_SIZE(BAD_NAME) || + memcmp(attr_name(attr), BAD_NAME, sizeof(BAD_NAME))) && + (ino != MFT_REC_SECURE || !attr->non_res || + attr->name_len != ARRAY_SIZE(SDS_NAME) || + memcmp(attr_name(attr), SDS_NAME, sizeof(SDS_NAME))))) { + /* File contains stream attribute. Ignore it. */ + goto next_attr; + } + + if (is_attr_sparsed(attr)) + ni->std_fa |= FILE_ATTRIBUTE_SPARSE_FILE; + else + ni->std_fa &= ~FILE_ATTRIBUTE_SPARSE_FILE; + + if (is_attr_compressed(attr)) + ni->std_fa |= FILE_ATTRIBUTE_COMPRESSED; + else + ni->std_fa &= ~FILE_ATTRIBUTE_COMPRESSED; + + if (is_attr_encrypted(attr)) + ni->std_fa |= FILE_ATTRIBUTE_ENCRYPTED; + else + ni->std_fa &= ~FILE_ATTRIBUTE_ENCRYPTED; + + if (!attr->non_res) { + ni->i_valid = inode->i_size = rsize; + inode_set_bytes(inode, rsize); + t32 = asize; + } else { + t32 = le16_to_cpu(attr->nres.run_off); + } + + mode = S_IFREG | (0777 & sbi->options.fs_fmask_inv); + + if (!attr->non_res) { + ni->ni_flags |= NI_FLAG_RESIDENT; + goto next_attr; + } + + inode_set_bytes(inode, attr_ondisk_size(attr)); + + ni->i_valid = le64_to_cpu(attr->nres.valid_size); + inode->i_size = le64_to_cpu(attr->nres.data_size); + if (!attr->nres.alloc_size) + goto next_attr; + + run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run + : &ni->file.run; + break; + + case ATTR_ROOT: + if (attr->non_res) + goto out; + + root = Add2Ptr(attr, roff); + is_root = true; + + if (attr->name_len != ARRAY_SIZE(I30_NAME) || + memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) + goto next_attr; + + if (root->type != ATTR_NAME || + root->rule != NTFS_COLLATION_TYPE_FILENAME) + goto out; + + if (!is_dir) + goto next_attr; + + ni->ni_flags |= NI_FLAG_DIR; + + err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30); + if (err) + goto out; + + mode = sb->s_root + ? 
(S_IFDIR | (0777 & sbi->options.fs_dmask_inv)) + : (S_IFDIR | 0777); + goto next_attr; + + case ATTR_ALLOC: + if (!is_root || attr->name_len != ARRAY_SIZE(I30_NAME) || + memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) + goto next_attr; + + inode->i_size = le64_to_cpu(attr->nres.data_size); + ni->i_valid = le64_to_cpu(attr->nres.valid_size); + inode_set_bytes(inode, le64_to_cpu(attr->nres.alloc_size)); + + run = &ni->dir.alloc_run; + break; + + case ATTR_BITMAP: + if (ino == MFT_REC_MFT) { + if (!attr->non_res) + goto out; +#ifndef CONFIG_NTFS3_64BIT_CLUSTER + /* 0x20000000 = 2^32 / 8 */ + if (le64_to_cpu(attr->nres.alloc_size) >= 0x20000000) + goto out; +#endif + run = &sbi->mft.bitmap.run; + break; + } else if (is_dir && attr->name_len == ARRAY_SIZE(I30_NAME) && + !memcmp(attr_name(attr), I30_NAME, + sizeof(I30_NAME)) && + attr->non_res) { + run = &ni->dir.bitmap_run; + break; + } + goto next_attr; + + case ATTR_REPARSE: + if (attr->name_len) + goto next_attr; + + rp_fa = ni_parse_reparse(ni, attr, &rp); + switch (rp_fa) { + case REPARSE_LINK: + if (!attr->non_res) { + inode->i_size = rsize; + inode_set_bytes(inode, rsize); + t32 = asize; + } else { + inode->i_size = + le64_to_cpu(attr->nres.data_size); + t32 = le16_to_cpu(attr->nres.run_off); + } + + /* Looks like normal symlink. */ + ni->i_valid = inode->i_size; + + /* Clear directory bit. */ + if (ni->ni_flags & NI_FLAG_DIR) { + indx_clear(&ni->dir); + memset(&ni->dir, 0, sizeof(ni->dir)); + ni->ni_flags &= ~NI_FLAG_DIR; + } else { + run_close(&ni->file.run); + } + mode = S_IFLNK | 0777; + is_dir = false; + if (attr->non_res) { + run = &ni->file.run; + goto attr_unpack_run; // Double break. 
+ } + break; + + case REPARSE_COMPRESSED: + break; + + case REPARSE_DEDUPLICATED: + break; + } + goto next_attr; + + case ATTR_EA_INFO: + if (!attr->name_len && + resident_data_ex(attr, sizeof(struct EA_INFO))) { + ni->ni_flags |= NI_FLAG_EA; + /* + * ntfs_get_wsl_perm updates inode->i_uid, inode->i_gid, inode->i_mode + */ + inode->i_mode = mode; + ntfs_get_wsl_perm(inode); + mode = inode->i_mode; + } + goto next_attr; + + default: + goto next_attr; + } + +attr_unpack_run: + roff = le16_to_cpu(attr->nres.run_off); + + t64 = le64_to_cpu(attr->nres.svcn); + err = run_unpack_ex(run, sbi, ino, t64, le64_to_cpu(attr->nres.evcn), + t64, Add2Ptr(attr, roff), asize - roff); + if (err < 0) + goto out; + err = 0; + goto next_attr; + +end_enum: + + if (!std5) + goto out; + + if (!is_match && name) { + /* Reuse rec as buffer for ascii name. */ + err = -ENOENT; + goto out; + } + + if (std5->fa & FILE_ATTRIBUTE_READONLY) + mode &= ~0222; + + if (!names) { + err = -EINVAL; + goto out; + } + + if (names != le16_to_cpu(rec->hard_links)) { + /* Correct minor error on the fly. Do not mark inode as dirty. */ + rec->hard_links = cpu_to_le16(names); + ni->mi.dirty = true; + } + + set_nlink(inode, names); + + if (S_ISDIR(mode)) { + ni->std_fa |= FILE_ATTRIBUTE_DIRECTORY; + + /* + * Dot and dot-dot should be included in count but was not + * included in enumeration. + * Usually a hard links to directories are disabled. + */ + inode->i_op = &ntfs_dir_inode_operations; + inode->i_fop = &ntfs_dir_operations; + ni->i_valid = 0; + } else if (S_ISLNK(mode)) { + ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; + inode->i_op = &ntfs_link_inode_operations; + inode->i_fop = NULL; + inode_nohighmem(inode); // ?? + } else if (S_ISREG(mode)) { + ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; + inode->i_op = &ntfs_file_inode_operations; + inode->i_fop = &ntfs_file_operations; + inode->i_mapping->a_ops = + is_compressed(ni) ? 
&ntfs_aops_cmpr : &ntfs_aops; + if (ino != MFT_REC_MFT) + init_rwsem(&ni->file.run_lock); + } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || + S_ISSOCK(mode)) { + inode->i_op = &ntfs_special_inode_operations; + init_special_inode(inode, mode, inode->i_rdev); + } else if (fname && fname->home.low == cpu_to_le32(MFT_REC_EXTEND) && + fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) { + /* Records in $Extend are not a files or general directories. */ + } else { + err = -EINVAL; + goto out; + } + + if ((sbi->options.sys_immutable && + (std5->fa & FILE_ATTRIBUTE_SYSTEM)) && + !S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) { + inode->i_flags |= S_IMMUTABLE; + } else { + inode->i_flags &= ~S_IMMUTABLE; + } + + inode->i_mode = mode; + if (!(ni->ni_flags & NI_FLAG_EA)) { + /* If no xattr then no security (stored in xattr). */ + inode->i_flags |= S_NOSEC; + } + +Ok: + if (ino == MFT_REC_MFT && !sb->s_root) + sbi->mft.ni = NULL; + + unlock_new_inode(inode); + + return inode; + +out: + if (ino == MFT_REC_MFT && !sb->s_root) + sbi->mft.ni = NULL; + + iget_failed(inode); + return ERR_PTR(err); +} + +/* + * ntfs_test_inode + * + * Return: 1 if match. + */ +static int ntfs_test_inode(struct inode *inode, void *data) +{ + struct MFT_REF *ref = data; + + return ino_get(ref) == inode->i_ino; +} + +static int ntfs_set_inode(struct inode *inode, void *data) +{ + const struct MFT_REF *ref = data; + + inode->i_ino = ino_get(ref); + return 0; +} + +struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, + const struct cpu_str *name) +{ + struct inode *inode; + + inode = iget5_locked(sb, ino_get(ref), ntfs_test_inode, ntfs_set_inode, + (void *)ref); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + + /* If this is a freshly allocated inode, need to read it now. */ + if (inode->i_state & I_NEW) + inode = ntfs_read_mft(inode, name, ref); + else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) { + /* Inode overlaps? 
*/ + make_bad_inode(inode); + } + + return inode; +} + +enum get_block_ctx { + GET_BLOCK_GENERAL = 0, + GET_BLOCK_WRITE_BEGIN = 1, + GET_BLOCK_DIRECT_IO_R = 2, + GET_BLOCK_DIRECT_IO_W = 3, + GET_BLOCK_BMAP = 4, +}; + +static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, + struct buffer_head *bh, int create, + enum get_block_ctx ctx) +{ + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_inode *ni = ntfs_i(inode); + struct page *page = bh->b_page; + u8 cluster_bits = sbi->cluster_bits; + u32 block_size = sb->s_blocksize; + u64 bytes, lbo, valid; + u32 off; + int err; + CLST vcn, lcn, len; + bool new; + + /* Clear previous state. */ + clear_buffer_new(bh); + clear_buffer_uptodate(bh); + + /* Direct write uses 'create=0'. */ + if (!create && vbo >= ni->i_valid) { + /* Out of valid. */ + return 0; + } + + if (vbo >= inode->i_size) { + /* Out of size. */ + return 0; + } + + if (is_resident(ni)) { + ni_lock(ni); + err = attr_data_read_resident(ni, page); + ni_unlock(ni); + + if (!err) + set_buffer_uptodate(bh); + bh->b_size = block_size; + return err; + } + + vcn = vbo >> cluster_bits; + off = vbo & sbi->cluster_mask; + new = false; + + err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL); + if (err) + goto out; + + if (!len) + return 0; + + bytes = ((u64)len << cluster_bits) - off; + + if (lcn == SPARSE_LCN) { + if (!create) { + if (bh->b_size > bytes) + bh->b_size = bytes; + return 0; + } + WARN_ON(1); + } + + if (new) { + set_buffer_new(bh); + if ((len << cluster_bits) > block_size) + ntfs_sparse_cluster(inode, page, vcn, len); + } + + lbo = ((u64)lcn << cluster_bits) + off; + + set_buffer_mapped(bh); + bh->b_bdev = sb->s_bdev; + bh->b_blocknr = lbo >> sb->s_blocksize_bits; + + valid = ni->i_valid; + + if (ctx == GET_BLOCK_DIRECT_IO_W) { + /* ntfs_direct_IO will update ni->i_valid. */ + if (vbo >= valid) + set_buffer_new(bh); + } else if (create) { + /* Normal write. 
*/ + if (bytes > bh->b_size) + bytes = bh->b_size; + + if (vbo >= valid) + set_buffer_new(bh); + + if (vbo + bytes > valid) { + ni->i_valid = vbo + bytes; + mark_inode_dirty(inode); + } + } else if (vbo >= valid) { + /* Read out of valid data. */ + /* Should never be here 'cause already checked. */ + clear_buffer_mapped(bh); + } else if (vbo + bytes <= valid) { + /* Normal read. */ + } else if (vbo + block_size <= valid) { + /* Normal short read. */ + bytes = block_size; + } else { + /* + * Read across valid size: vbo < valid && valid < vbo + block_size + */ + bytes = block_size; + + if (page) { + u32 voff = valid - vbo; + + bh->b_size = block_size; + off = vbo & (PAGE_SIZE - 1); + set_bh_page(bh, page, off); + ll_rw_block(REQ_OP_READ, 0, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + err = -EIO; + goto out; + } + zero_user_segment(page, off + voff, off + block_size); + } + } + + if (bh->b_size > bytes) + bh->b_size = bytes; + +#ifndef __LP64__ + if (ctx == GET_BLOCK_DIRECT_IO_W || ctx == GET_BLOCK_DIRECT_IO_R) { + static_assert(sizeof(size_t) < sizeof(loff_t)); + if (bytes > 0x40000000u) + bh->b_size = 0x40000000u; + } +#endif + + return 0; + +out: + return err; +} + +int ntfs_get_block(struct inode *inode, sector_t vbn, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, (u64)vbn << inode->i_blkbits, + bh_result, create, GET_BLOCK_GENERAL); +} + +static int ntfs_get_block_bmap(struct inode *inode, sector_t vsn, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, + (u64)vsn << inode->i_sb->s_blocksize_bits, + bh_result, create, GET_BLOCK_BMAP); +} + +static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, ntfs_get_block_bmap); +} + +static int ntfs_readpage(struct file *file, struct page *page) +{ + int err; + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = 
ntfs_i(inode); + + if (is_resident(ni)) { + ni_lock(ni); + err = attr_data_read_resident(ni, page); + ni_unlock(ni); + if (err != E_NTFS_NONRESIDENT) { + unlock_page(page); + return err; + } + } + + if (is_compressed(ni)) { + ni_lock(ni); + err = ni_readpage_cmpr(ni, page); + ni_unlock(ni); + return err; + } + + /* Normal + sparse files. */ + return mpage_readpage(page, ntfs_get_block); +} + +static void ntfs_readahead(struct readahead_control *rac) +{ + struct address_space *mapping = rac->mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + u64 valid; + loff_t pos; + + if (is_resident(ni)) { + /* No readahead for resident. */ + return; + } + + if (is_compressed(ni)) { + /* No readahead for compressed. */ + return; + } + + valid = ni->i_valid; + pos = readahead_pos(rac); + + if (valid < i_size_read(inode) && pos <= valid && + valid < pos + readahead_length(rac)) { + /* Range cross 'valid'. Read it page by page. */ + return; + } + + mpage_readahead(rac, ntfs_get_block); +} + +static int ntfs_get_block_direct_IO_R(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, (u64)iblock << inode->i_blkbits, + bh_result, create, GET_BLOCK_DIRECT_IO_R); +} + +static int ntfs_get_block_direct_IO_W(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, (u64)iblock << inode->i_blkbits, + bh_result, create, GET_BLOCK_DIRECT_IO_W); +} + +static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + loff_t vbo = iocb->ki_pos; + loff_t end; + int wr = iov_iter_rw(iter) & WRITE; + loff_t valid; + ssize_t ret; + + if (is_resident(ni)) { + /* Switch to buffered write. 
*/ + ret = 0; + goto out; + } + + ret = blockdev_direct_IO(iocb, inode, iter, + wr ? ntfs_get_block_direct_IO_W + : ntfs_get_block_direct_IO_R); + + if (ret <= 0) + goto out; + + end = vbo + ret; + valid = ni->i_valid; + if (wr) { + if (end > valid && !S_ISBLK(inode->i_mode)) { + ni->i_valid = end; + mark_inode_dirty(inode); + } + } else if (vbo < valid && valid < end) { + /* Fix page. */ + iov_iter_revert(iter, end - valid); + iov_iter_zero(end - valid, iter); + } + +out: + return ret; +} + +int ntfs_set_size(struct inode *inode, u64 new_size) +{ + struct super_block *sb = inode->i_sb; + struct ntfs_sb_info *sbi = sb->s_fs_info; + struct ntfs_inode *ni = ntfs_i(inode); + int err; + + /* Check for maximum file size. */ + if (is_sparsed(ni) || is_compressed(ni)) { + if (new_size > sbi->maxbytes_sparse) { + err = -EFBIG; + goto out; + } + } else if (new_size > sbi->maxbytes) { + err = -EFBIG; + goto out; + } + + ni_lock(ni); + down_write(&ni->file.run_lock); + + err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size, + &ni->i_valid, true, NULL); + + up_write(&ni->file.run_lock); + ni_unlock(ni); + + mark_inode_dirty(inode); + +out: + return err; +} + +static int ntfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + int err; + + if (is_resident(ni)) { + ni_lock(ni); + err = attr_data_write_resident(ni, page); + ni_unlock(ni); + if (err != E_NTFS_NONRESIDENT) { + unlock_page(page); + return err; + } + } + + return block_write_full_page(page, ntfs_get_block, wbc); +} + +static int ntfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + /* Redirect call to 'ntfs_writepage' for resident files. */ + get_block_t *get_block = is_resident(ni) ? 
NULL : &ntfs_get_block; + + return mpage_writepages(mapping, wbc, get_block); +} + +static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, + struct buffer_head *bh_result, int create) +{ + return ntfs_get_block_vbo(inode, (u64)vbn << inode->i_blkbits, + bh_result, create, GET_BLOCK_WRITE_BEGIN); +} + +static int ntfs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, u32 len, u32 flags, struct page **pagep, + void **fsdata) +{ + int err; + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + + *pagep = NULL; + if (is_resident(ni)) { + struct page *page = grab_cache_page_write_begin( + mapping, pos >> PAGE_SHIFT, flags); + + if (!page) { + err = -ENOMEM; + goto out; + } + + ni_lock(ni); + err = attr_data_read_resident(ni, page); + ni_unlock(ni); + + if (!err) { + *pagep = page; + goto out; + } + unlock_page(page); + put_page(page); + + if (err != E_NTFS_NONRESIDENT) + goto out; + } + + err = block_write_begin(mapping, pos, len, flags, pagep, + ntfs_get_block_write_begin); + +out: + return err; +} + +/* + * ntfs_write_end - Address_space_operations::write_end. + */ +static int ntfs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, u32 len, u32 copied, struct page *page, + void *fsdata) + +{ + struct inode *inode = mapping->host; + struct ntfs_inode *ni = ntfs_i(inode); + u64 valid = ni->i_valid; + bool dirty = false; + int err; + + if (is_resident(ni)) { + ni_lock(ni); + err = attr_data_write_resident(ni, page); + ni_unlock(ni); + if (!err) { + dirty = true; + /* Clear any buffers in page. 
*/ + if (page_has_buffers(page)) { + struct buffer_head *head, *bh; + + bh = head = page_buffers(page); + do { + clear_buffer_dirty(bh); + clear_buffer_mapped(bh); + set_buffer_uptodate(bh); + } while (head != (bh = bh->b_this_page)); + } + SetPageUptodate(page); + err = copied; + } + unlock_page(page); + put_page(page); + } else { + err = generic_write_end(file, mapping, pos, len, copied, page, + fsdata); + } + + if (err >= 0) { + if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) { + inode->i_ctime = inode->i_mtime = current_time(inode); + ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; + dirty = true; + } + + if (valid != ni->i_valid) { + /* ni->i_valid is changed in ntfs_get_block_vbo. */ + dirty = true; + } + + if (dirty) + mark_inode_dirty(inode); + } + + return err; +} + +int reset_log_file(struct inode *inode) +{ + int err; + loff_t pos = 0; + u32 log_size = inode->i_size; + struct address_space *mapping = inode->i_mapping; + + for (;;) { + u32 len; + void *kaddr; + struct page *page; + + len = pos + PAGE_SIZE > log_size ? (log_size - pos) : PAGE_SIZE; + + err = block_write_begin(mapping, pos, len, 0, &page, + ntfs_get_block_write_begin); + if (err) + goto out; + + kaddr = kmap_atomic(page); + memset(kaddr, -1, len); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + err = block_write_end(NULL, mapping, pos, len, len, page, NULL); + if (err < 0) + goto out; + pos += len; + + if (pos >= log_size) + break; + balance_dirty_pages_ratelimited(mapping); + } +out: + mark_inode_dirty_sync(inode); + + return err; +} + +int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + return _ni_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); +} + +int ntfs_sync_inode(struct inode *inode) +{ + return _ni_write_inode(inode, 1); +} + +/* + * writeback_inode - Helper function for ntfs_flush_inodes(). + * + * This writes both the inode and the file data blocks, waiting + * for in flight data blocks before the start of the call. 
It + * does not wait for any io started during the call. + */ +static int writeback_inode(struct inode *inode) +{ + int ret = sync_inode_metadata(inode, 0); + + if (!ret) + ret = filemap_fdatawrite(inode->i_mapping); + return ret; +} + +/* + * ntfs_flush_inodes + * + * Write data and metadata corresponding to i1 and i2. The io is + * started but we do not wait for any of it to finish. + * + * filemap_flush() is used for the block device, so if there is a dirty + * page for a block already in flight, we will not wait and start the + * io over again. + */ +int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, + struct inode *i2) +{ + int ret = 0; + + if (i1) + ret = writeback_inode(i1); + if (!ret && i2) + ret = writeback_inode(i2); + if (!ret) + ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping); + return ret; +} + +int inode_write_data(struct inode *inode, const void *data, size_t bytes) +{ + pgoff_t idx; + + /* Write non resident data. */ + for (idx = 0; bytes; idx++) { + size_t op = bytes > PAGE_SIZE ? PAGE_SIZE : bytes; + struct page *page = ntfs_map_page(inode->i_mapping, idx); + + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + WARN_ON(!PageUptodate(page)); + ClearPageUptodate(page); + + memcpy(page_address(page), data, op); + + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + + ntfs_unmap_page(page); + + bytes -= op; + data = Add2Ptr(data, PAGE_SIZE); + } + return 0; +} + +/* + * ntfs_reparse_bytes + * + * Number of bytes for REPARSE_DATA_BUFFER(IO_REPARSE_TAG_SYMLINK) + * for unicode string of @uni_len length. + */ +static inline u32 ntfs_reparse_bytes(u32 uni_len) +{ + /* Header + unicode string + decorated unicode string. 
*/ + return sizeof(short) * (2 * uni_len + 4) + + offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer); +} + +static struct REPARSE_DATA_BUFFER * +ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname, + u32 size, u16 *nsize) +{ + int i, err; + struct REPARSE_DATA_BUFFER *rp; + __le16 *rp_name; + typeof(rp->SymbolicLinkReparseBuffer) *rs; + + rp = kzalloc(ntfs_reparse_bytes(2 * size + 2), GFP_NOFS); + if (!rp) + return ERR_PTR(-ENOMEM); + + rs = &rp->SymbolicLinkReparseBuffer; + rp_name = rs->PathBuffer; + + /* Convert link name to UTF-16. */ + err = ntfs_nls_to_utf16(sbi, symname, size, + (struct cpu_str *)(rp_name - 1), 2 * size, + UTF16_LITTLE_ENDIAN); + if (err < 0) + goto out; + + /* err = the length of unicode name of symlink. */ + *nsize = ntfs_reparse_bytes(err); + + if (*nsize > sbi->reparse.max_size) { + err = -EFBIG; + goto out; + } + + /* Translate Linux '/' into Windows '\'. */ + for (i = 0; i < err; i++) { + if (rp_name[i] == cpu_to_le16('/')) + rp_name[i] = cpu_to_le16('\\'); + } + + rp->ReparseTag = IO_REPARSE_TAG_SYMLINK; + rp->ReparseDataLength = + cpu_to_le16(*nsize - offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer)); + + /* PrintName + SubstituteName. */ + rs->SubstituteNameOffset = cpu_to_le16(sizeof(short) * err); + rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + 8); + rs->PrintNameLength = rs->SubstituteNameOffset; + + /* + * TODO: Use relative path if possible to allow Windows to + * parse this path. + * 0-absolute path 1- relative path (SYMLINK_FLAG_RELATIVE). + */ + rs->Flags = 0; + + memmove(rp_name + err + 4, rp_name, sizeof(short) * err); + + /* Decorate SubstituteName. 
*/
+	rp_name += err;
+	rp_name[0] = cpu_to_le16('\\');
+	rp_name[1] = cpu_to_le16('?');
+	rp_name[2] = cpu_to_le16('?');
+	rp_name[3] = cpu_to_le16('\\');
+
+	return rp;
+out:
+	kfree(rp);
+	return ERR_PTR(err);
+}
+
+/*
+ * ntfs_create_inode - Create a new directory, symlink, regular file or node.
+ *
+ * Step 1 allocates an MFT record and fills it with ATTR_STD, ATTR_NAME,
+ * optionally ATTR_SECURE, and a type specific attribute (ATTR_ROOT for
+ * directories, ATTR_DATA otherwise, plus ATTR_REPARSE for symlinks).
+ * Step 2 inserts the new name into the index of @dir and instantiates
+ * @dentry.
+ *
+ * Error unwinding is done through the labels out7..out1; each label undoes
+ * one more step than the label that follows it.
+ *
+ * Return: the new inode (already passed to unlock_new_inode()), or
+ * ERR_PTR(-errno) on failure.
+ */
+struct inode *ntfs_create_inode(struct user_namespace *mnt_userns,
+				struct inode *dir, struct dentry *dentry,
+				const struct cpu_str *uni, umode_t mode,
+				dev_t dev, const char *symname, u32 size,
+				struct ntfs_fnd *fnd)
+{
+	int err;
+	struct super_block *sb = dir->i_sb;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	const struct qstr *name = &dentry->d_name;
+	CLST ino = 0;
+	struct ntfs_inode *dir_ni = ntfs_i(dir);
+	struct ntfs_inode *ni = NULL;
+	struct inode *inode = NULL;
+	struct ATTRIB *attr;
+	struct ATTR_STD_INFO5 *std5;
+	struct ATTR_FILE_NAME *fname;
+	struct MFT_REC *rec;
+	u32 asize, dsize, sd_size;
+	enum FILE_ATTRIBUTE fa;
+	__le32 security_id = SECURITY_ID_INVALID;
+	CLST vcn;
+	const void *sd;
+	u16 t16, nsize = 0, aid = 0;
+	struct INDEX_ROOT *root, *dir_root;
+	struct NTFS_DE *e, *new_de = NULL;
+	struct REPARSE_DATA_BUFFER *rp = NULL;
+	bool rp_inserted = false;
+
+	dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL);
+	if (!dir_root)
+		return ERR_PTR(-EINVAL);
+
+	if (S_ISDIR(mode)) {
+		/* Use parent's directory attributes. */
+		fa = dir_ni->std_fa | FILE_ATTRIBUTE_DIRECTORY |
+		     FILE_ATTRIBUTE_ARCHIVE;
+		/*
+		 * By default child directory inherits parent attributes.
+		 * Root directory is hidden + system.
+		 * Make an exception for children in root.
+		 */
+		if (dir->i_ino == MFT_REC_ROOT)
+			fa &= ~(FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_SYSTEM);
+	} else if (S_ISLNK(mode)) {
+		/* It is good idea that link should be the same type (file/dir) as target */
+		fa = FILE_ATTRIBUTE_REPARSE_POINT;
+
+		/*
+		 * Linux: there are dir/file/symlink and so on.
+		 * NTFS: symlinks are "dir + reparse" or "file + reparse"
+		 * It is good idea to create:
+		 * dir + reparse if 'symname' points to directory
+		 * or
+		 * file + reparse if 'symname' points to file
+		 * Unfortunately kern_path hangs if symname contains 'dir'.
+		 */
+
+		/*
+		 *	struct path path;
+		 *
+		 *	if (!kern_path(symname, LOOKUP_FOLLOW, &path)){
+		 *		struct inode *target = d_inode(path.dentry);
+		 *
+		 *		if (S_ISDIR(target->i_mode))
+		 *			fa |= FILE_ATTRIBUTE_DIRECTORY;
+		 *		// if ( target->i_sb == sb ){
+		 *		//	use relative path?
+		 *		// }
+		 *		path_put(&path);
+		 *	}
+		 */
+	} else if (S_ISREG(mode)) {
+		if (sbi->options.sparse) {
+			/* Sparsed regular file, cause option 'sparse'. */
+			fa = FILE_ATTRIBUTE_SPARSE_FILE |
+			     FILE_ATTRIBUTE_ARCHIVE;
+		} else if (dir_ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) {
+			/* Compressed regular file, if parent is compressed. */
+			fa = FILE_ATTRIBUTE_COMPRESSED | FILE_ATTRIBUTE_ARCHIVE;
+		} else {
+			/* Regular file, default attributes. */
+			fa = FILE_ATTRIBUTE_ARCHIVE;
+		}
+	} else {
+		fa = FILE_ATTRIBUTE_ARCHIVE;
+	}
+
+	if (!(mode & 0222))
+		fa |= FILE_ATTRIBUTE_READONLY;
+
+	/* Allocate PATH_MAX bytes. */
+	new_de = __getname();
+	if (!new_de) {
+		err = -ENOMEM;
+		goto out1;
+	}
+
+	/* Mark rw ntfs as dirty. it will be cleared at umount. */
+	ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+
+	/* Step 1: allocate and fill new mft record. */
+	err = ntfs_look_free_mft(sbi, &ino, false, NULL, NULL);
+	if (err)
+		goto out2;
+
+	ni = ntfs_new_inode(sbi, ino, fa & FILE_ATTRIBUTE_DIRECTORY);
+	if (IS_ERR(ni)) {
+		err = PTR_ERR(ni);
+		ni = NULL;
+		goto out3;
+	}
+	inode = &ni->vfs_inode;
+	inode_init_owner(mnt_userns, inode, dir, mode);
+	mode = inode->i_mode;
+
+	inode->i_atime = inode->i_mtime = inode->i_ctime = ni->i_crtime =
+		current_time(inode);
+
+	rec = ni->mi.mrec;
+	rec->hard_links = cpu_to_le16(1);
+	attr = Add2Ptr(rec, le16_to_cpu(rec->attr_off));
+
+	/* Get default security id. */
+	sd = s_default_security;
+	sd_size = sizeof(s_default_security);
+
+	if (is_ntfs3(sbi)) {
+		security_id = dir_ni->std_security_id;
+		if (le32_to_cpu(security_id) < SECURITY_ID_FIRST) {
+			security_id = sbi->security.def_security_id;
+
+			if (security_id == SECURITY_ID_INVALID &&
+			    !ntfs_insert_security(sbi, sd, sd_size,
+						  &security_id, NULL))
+				sbi->security.def_security_id = security_id;
+		}
+	}
+
+	/* Insert standard info. */
+	std5 = Add2Ptr(attr, SIZEOF_RESIDENT);
+
+	if (security_id == SECURITY_ID_INVALID) {
+		dsize = sizeof(struct ATTR_STD_INFO);
+	} else {
+		dsize = sizeof(struct ATTR_STD_INFO5);
+		std5->security_id = security_id;
+		ni->std_security_id = security_id;
+	}
+	asize = SIZEOF_RESIDENT + dsize;
+
+	attr->type = ATTR_STD;
+	attr->size = cpu_to_le32(asize);
+	attr->id = cpu_to_le16(aid++);
+	attr->res.data_off = SIZEOF_RESIDENT_LE;
+	attr->res.data_size = cpu_to_le32(dsize);
+
+	std5->cr_time = std5->m_time = std5->c_time = std5->a_time =
+		kernel2nt(&inode->i_atime);
+
+	ni->std_fa = fa;
+	std5->fa = fa;
+
+	attr = Add2Ptr(attr, asize);
+
+	/* Insert file name. */
+	err = fill_name_de(sbi, new_de, name, uni);
+	if (err)
+		goto out4;
+
+	mi_get_ref(&ni->mi, &new_de->ref);
+
+	fname = (struct ATTR_FILE_NAME *)(new_de + 1);
+	mi_get_ref(&dir_ni->mi, &fname->home);
+	fname->dup.cr_time = fname->dup.m_time = fname->dup.c_time =
+		fname->dup.a_time = std5->cr_time;
+	fname->dup.alloc_size = fname->dup.data_size = 0;
+	fname->dup.fa = std5->fa;
+	fname->dup.ea_size = fname->dup.reparse = 0;
+
+	dsize = le16_to_cpu(new_de->key_size);
+	asize = ALIGN(SIZEOF_RESIDENT + dsize, 8);
+
+	attr->type = ATTR_NAME;
+	attr->size = cpu_to_le32(asize);
+	attr->res.data_off = SIZEOF_RESIDENT_LE;
+	attr->res.flags = RESIDENT_FLAG_INDEXED;
+	attr->id = cpu_to_le16(aid++);
+	attr->res.data_size = cpu_to_le32(dsize);
+	memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), fname, dsize);
+
+	attr = Add2Ptr(attr, asize);
+
+	if (security_id == SECURITY_ID_INVALID) {
+		/* Insert security attribute. */
+		asize = SIZEOF_RESIDENT + ALIGN(sd_size, 8);
+
+		attr->type = ATTR_SECURE;
+		attr->size = cpu_to_le32(asize);
+		attr->id = cpu_to_le16(aid++);
+		attr->res.data_off = SIZEOF_RESIDENT_LE;
+		attr->res.data_size = cpu_to_le32(sd_size);
+		memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), sd, sd_size);
+
+		attr = Add2Ptr(attr, asize);
+	}
+
+	attr->id = cpu_to_le16(aid++);
+	if (fa & FILE_ATTRIBUTE_DIRECTORY) {
+		/*
+		 * Regular directory or symlink to directory.
+		 * Create root attribute.
+		 */
+		dsize = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE);
+		asize = sizeof(I30_NAME) + SIZEOF_RESIDENT + dsize;
+
+		attr->type = ATTR_ROOT;
+		attr->size = cpu_to_le32(asize);
+
+		attr->name_len = ARRAY_SIZE(I30_NAME);
+		attr->name_off = SIZEOF_RESIDENT_LE;
+		attr->res.data_off =
+			cpu_to_le16(sizeof(I30_NAME) + SIZEOF_RESIDENT);
+		attr->res.data_size = cpu_to_le32(dsize);
+		memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), I30_NAME,
+		       sizeof(I30_NAME));
+
+		root = Add2Ptr(attr, sizeof(I30_NAME) + SIZEOF_RESIDENT);
+		memcpy(root, dir_root, offsetof(struct INDEX_ROOT, ihdr));
+		root->ihdr.de_off =
+			cpu_to_le32(sizeof(struct INDEX_HDR)); // 0x10
+		root->ihdr.used = cpu_to_le32(sizeof(struct INDEX_HDR) +
+					      sizeof(struct NTFS_DE));
+		root->ihdr.total = root->ihdr.used;
+
+		e = Add2Ptr(root, sizeof(struct INDEX_ROOT));
+		e->size = cpu_to_le16(sizeof(struct NTFS_DE));
+		e->flags = NTFS_IE_LAST;
+	} else if (S_ISLNK(mode)) {
+		/*
+		 * Symlink to file.
+		 * Create empty resident data attribute.
+		 */
+		asize = SIZEOF_RESIDENT;
+
+		/* Insert empty ATTR_DATA */
+		attr->type = ATTR_DATA;
+		attr->size = cpu_to_le32(SIZEOF_RESIDENT);
+		attr->name_off = SIZEOF_RESIDENT_LE;
+		attr->res.data_off = SIZEOF_RESIDENT_LE;
+	} else if (S_ISREG(mode)) {
+		/*
+		 * Regular file. Create empty non resident data attribute.
+		 */
+		attr->type = ATTR_DATA;
+		attr->non_res = 1;
+		attr->nres.evcn = cpu_to_le64(-1ll);
+		if (fa & FILE_ATTRIBUTE_SPARSE_FILE) {
+			attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8);
+			attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
+			attr->flags = ATTR_FLAG_SPARSED;
+			asize = SIZEOF_NONRESIDENT_EX + 8;
+		} else if (fa & FILE_ATTRIBUTE_COMPRESSED) {
+			attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8);
+			attr->name_off = SIZEOF_NONRESIDENT_EX_LE;
+			attr->flags = ATTR_FLAG_COMPRESSED;
+			attr->nres.c_unit = COMPRESSION_UNIT;
+			asize = SIZEOF_NONRESIDENT_EX + 8;
+		} else {
+			attr->size = cpu_to_le32(SIZEOF_NONRESIDENT + 8);
+			attr->name_off = SIZEOF_NONRESIDENT_LE;
+			asize = SIZEOF_NONRESIDENT + 8;
+		}
+		attr->nres.run_off = attr->name_off;
+	} else {
+		/*
+		 * Node. Create empty resident data attribute.
+		 */
+		attr->type = ATTR_DATA;
+		attr->size = cpu_to_le32(SIZEOF_RESIDENT);
+		attr->name_off = SIZEOF_RESIDENT_LE;
+		if (fa & FILE_ATTRIBUTE_SPARSE_FILE)
+			attr->flags = ATTR_FLAG_SPARSED;
+		else if (fa & FILE_ATTRIBUTE_COMPRESSED)
+			attr->flags = ATTR_FLAG_COMPRESSED;
+		attr->res.data_off = SIZEOF_RESIDENT_LE;
+		asize = SIZEOF_RESIDENT;
+		ni->ni_flags |= NI_FLAG_RESIDENT;
+	}
+
+	if (S_ISDIR(mode)) {
+		ni->ni_flags |= NI_FLAG_DIR;
+		err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30);
+		if (err)
+			goto out4;
+	} else if (S_ISLNK(mode)) {
+		rp = ntfs_create_reparse_buffer(sbi, symname, size, &nsize);
+
+		if (IS_ERR(rp)) {
+			err = PTR_ERR(rp);
+			rp = NULL;
+			goto out4;
+		}
+
+		/*
+		 * Insert ATTR_REPARSE.
+		 */
+		attr = Add2Ptr(attr, asize);
+		attr->type = ATTR_REPARSE;
+		attr->id = cpu_to_le16(aid++);
+
+		/* Resident or non resident? */
+		asize = ALIGN(SIZEOF_RESIDENT + nsize, 8);
+		t16 = PtrOffset(rec, attr);
+
+		/* 0x78 - the size of EA + EAINFO to store WSL */
+		if (asize + t16 + 0x78 + 8 > sbi->record_size) {
+			CLST alen;
+			CLST clst = bytes_to_cluster(sbi, nsize);
+
+			/* Bytes per runs. */
+			t16 = sbi->record_size - t16 - SIZEOF_NONRESIDENT;
+
+			attr->non_res = 1;
+			attr->nres.evcn = cpu_to_le64(clst - 1);
+			attr->name_off = SIZEOF_NONRESIDENT_LE;
+			attr->nres.run_off = attr->name_off;
+			attr->nres.data_size = cpu_to_le64(nsize);
+			attr->nres.valid_size = attr->nres.data_size;
+			attr->nres.alloc_size =
+				cpu_to_le64(ntfs_up_cluster(sbi, nsize));
+
+			err = attr_allocate_clusters(sbi, &ni->file.run, 0, 0,
+						     clst, NULL, 0, &alen, 0,
+						     NULL);
+			if (err)
+				goto out5;
+
+			err = run_pack(&ni->file.run, 0, clst,
+				       Add2Ptr(attr, SIZEOF_NONRESIDENT), t16,
+				       &vcn);
+			if (err < 0)
+				goto out5;
+
+			if (vcn != clst) {
+				err = -EINVAL;
+				goto out5;
+			}
+
+			asize = SIZEOF_NONRESIDENT + ALIGN(err, 8);
+			inode->i_size = nsize;
+		} else {
+			attr->res.data_off = SIZEOF_RESIDENT_LE;
+			attr->res.data_size = cpu_to_le32(nsize);
+			memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize);
+			inode->i_size = nsize;
+			/* Data is stored in the record; nothing to write later. */
+			nsize = 0;
+		}
+
+		attr->size = cpu_to_le32(asize);
+
+		err = ntfs_insert_reparse(sbi, IO_REPARSE_TAG_SYMLINK,
+					  &new_de->ref);
+		if (err)
+			goto out5;
+
+		rp_inserted = true;
+	}
+
+	attr = Add2Ptr(attr, asize);
+	attr->type = ATTR_END;
+
+	rec->used = cpu_to_le32(PtrOffset(rec, attr) + 8);
+	rec->next_attr_id = cpu_to_le16(aid);
+
+	/* Step 2: Add new name in index. */
+	err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0);
+	if (err)
+		goto out6;
+
+	inode->i_generation = le16_to_cpu(rec->seq);
+
+	dir->i_mtime = dir->i_ctime = inode->i_atime;
+
+	if (S_ISDIR(mode)) {
+		inode->i_op = &ntfs_dir_inode_operations;
+		inode->i_fop = &ntfs_dir_operations;
+	} else if (S_ISLNK(mode)) {
+		inode->i_op = &ntfs_link_inode_operations;
+		inode->i_fop = NULL;
+		inode->i_mapping->a_ops = &ntfs_aops;
+	} else if (S_ISREG(mode)) {
+		inode->i_op = &ntfs_file_inode_operations;
+		inode->i_fop = &ntfs_file_operations;
+		inode->i_mapping->a_ops =
+			is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops;
+		init_rwsem(&ni->file.run_lock);
+	} else {
+		inode->i_op = &ntfs_special_inode_operations;
+		init_special_inode(inode, mode, dev);
+	}
+
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+	if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) {
+		err = ntfs_init_acl(mnt_userns, inode, dir);
+		/*
+		 * The name is already in the parent index at this point,
+		 * so the failure path must start at out7 to remove it again.
+		 */
+		if (err)
+			goto out7;
+	} else
+#endif
+	{
+		inode->i_flags |= S_NOSEC;
+	}
+
+	/* Write non resident data. */
+	if (nsize) {
+		err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize);
+		if (err)
+			goto out7;
+	}
+
+	/*
+	 * Call 'd_instantiate' after inode->i_op is set
+	 * but before finish_open.
+	 */
+	d_instantiate(dentry, inode);
+
+	ntfs_save_wsl_perm(inode);
+	mark_inode_dirty(dir);
+	mark_inode_dirty(inode);
+
+	/* Normal exit. */
+	goto out2;
+
+out7:
+
+	/* Undo 'indx_insert_entry'. */
+	indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1,
+			  le16_to_cpu(new_de->key_size), sbi);
+out6:
+	if (rp_inserted)
+		ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref);
+
+out5:
+	if (S_ISDIR(mode) || run_is_empty(&ni->file.run))
+		goto out4;
+
+	run_deallocate(sbi, &ni->file.run, false);
+
+out4:
+	clear_rec_inuse(rec);
+	clear_nlink(inode);
+	ni->mi.dirty = false;
+	discard_new_inode(inode);
+out3:
+	ntfs_mark_rec_free(sbi, ino);
+
+out2:
+	__putname(new_de);
+	kfree(rp);
+
+out1:
+	if (err)
+		return ERR_PTR(err);
+
+	unlock_new_inode(inode);
+
+	return inode;
+}
+
+int ntfs_link_inode(struct inode *inode, struct dentry *dentry)
+{
+	int err;
+	struct ntfs_inode *ni = ntfs_i(inode);
+	struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct NTFS_DE *de;
+	struct ATTR_FILE_NAME *de_name;
+
+	/* Allocate PATH_MAX bytes. */
+	de = __getname();
+	if (!de)
+		return -ENOMEM;
+
+	/* Mark rw ntfs as dirty. It will be cleared at umount. */
+	ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
+
+	/* Construct 'de'. */
+	err = fill_name_de(sbi, de, &dentry->d_name, NULL);
+	if (err)
+		goto out;
+
+	de_name = (struct ATTR_FILE_NAME *)(de + 1);
+	/* Fill duplicate info. 
*/
+	de_name->dup.cr_time = de_name->dup.m_time = de_name->dup.c_time =
+		de_name->dup.a_time = kernel2nt(&inode->i_ctime);
+	de_name->dup.alloc_size = de_name->dup.data_size =
+		cpu_to_le64(inode->i_size);
+	de_name->dup.fa = ni->std_fa;
+	de_name->dup.ea_size = de_name->dup.reparse = 0;
+
+	err = ni_add_name(ntfs_i(d_inode(dentry->d_parent)), ni, de);
+out:
+	__putname(de);
+	return err;
+}
+
+/*
+ * ntfs_unlink_inode - Remove the name @dentry from directory @dir.
+ *
+ * Serves both inode_operations::unlink and inode_operations::rmdir.
+ * Meta files are refused with -EINVAL and non-empty directories with
+ * -ENOTEMPTY.
+ *
+ * Return: 0 on success, otherwise the error of the step that failed.
+ */
+int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct ntfs_inode *ni = ntfs_i(inode);
+	struct ntfs_inode *dir_ni = ntfs_i(dir);
+	struct ntfs_sb_info *sbi = dir->i_sb->s_fs_info;
+	struct NTFS_DE *name_de, *removed_de = NULL;
+	int undo_remove = 0;
+	int err;
+
+	if (ntfs_is_meta_file(sbi, ni->mi.rno))
+		return -EINVAL;
+
+	/* Scratch buffer of PATH_MAX bytes for the directory entry. */
+	name_de = __getname();
+	if (!name_de)
+		return -ENOMEM;
+
+	ni_lock(ni);
+
+	if (S_ISDIR(inode->i_mode) && !dir_is_empty(inode)) {
+		err = -ENOTEMPTY;
+		goto out;
+	}
+
+	err = fill_name_de(sbi, name_de, &dentry->d_name, NULL);
+	if (err < 0)
+		goto out;
+
+	err = ni_remove_name(dir_ni, ni, name_de, &removed_de, &undo_remove);
+	if (err) {
+		/* Removal failed: try to put everything back as it was. */
+		if (ni_remove_name_undo(dir_ni, ni, name_de, removed_de,
+					undo_remove)) {
+			if (ni_is_dirty(dir))
+				mark_inode_dirty(dir);
+			if (ni_is_dirty(inode))
+				mark_inode_dirty(inode);
+		} else {
+			make_bad_inode(inode);
+			ntfs_inode_err(inode, "failed to undo unlink");
+			ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+		}
+		goto out;
+	}
+
+	drop_nlink(inode);
+	dir->i_mtime = dir->i_ctime = current_time(dir);
+	mark_inode_dirty(dir);
+	inode->i_ctime = dir->i_ctime;
+	/* An inode that still has links must be written back. */
+	if (inode->i_nlink)
+		mark_inode_dirty(inode);
+
+out:
+	ni_unlock(ni);
+	__putname(name_de);
+	return err;
+}
+
+void ntfs_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages_final(&inode->i_data);
+
+	if (inode->i_nlink)
+		_ni_write_inode(inode, inode_needs_sync(inode));
+
+	invalidate_inode_buffers(inode);
+	
clear_inode(inode);
+
+	ni_clear(ntfs_i(inode));
+}
+
+/*
+ * ntfs_readlink_hlp - Convert the reparse data of @inode into a path string.
+ *
+ * @buffer receives the zero terminated target; @buflen is its capacity and
+ * must be at least 1 (the first byte is always written).
+ *
+ * The reparse data is always parsed from a private copy, because parsing
+ * stores a length prefix in front of the name (uni->len) and that store
+ * must not land in the resident attribute data of the inode.
+ *
+ * Return: number of bytes stored in @buffer (terminator excluded), or a
+ * negative errno.
+ */
+static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer,
+				      int buflen)
+{
+	int i, err = 0;
+	struct ntfs_inode *ni = ntfs_i(inode);
+	struct super_block *sb = inode->i_sb;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	u64 i_size = inode->i_size;
+	u16 nlen = 0;
+	void *to_free = NULL;
+	const void *res = NULL;
+	struct REPARSE_DATA_BUFFER *rp;
+	struct le_str *uni;
+	struct ATTRIB *attr;
+
+	/* Reparse data present. Try to parse it. */
+	static_assert(!offsetof(struct REPARSE_DATA_BUFFER, ReparseTag));
+	static_assert(sizeof(u32) == sizeof(rp->ReparseTag));
+
+	*buffer = 0;
+
+	/* Read into temporary buffer. */
+	if (i_size > sbi->reparse.max_size || i_size <= sizeof(u32)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL);
+	if (!attr) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (!attr->non_res) {
+		res = resident_data_ex(attr, i_size);
+		if (!res) {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	rp = kmalloc(i_size, GFP_NOFS);
+	if (!rp) {
+		err = -ENOMEM;
+		goto out;
+	}
+	to_free = rp;
+
+	if (res) {
+		/* Resident: copy, so the attribute itself stays untouched. */
+		memcpy(rp, res, i_size);
+	} else {
+		err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, i_size, NULL);
+		if (err)
+			goto out;
+	}
+
+	err = -EINVAL;
+
+	/* Microsoft Tag. */
+	switch (rp->ReparseTag) {
+	case IO_REPARSE_TAG_MOUNT_POINT:
+		/* Mount points and junctions. */
+		/* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */
+		if (i_size <= offsetof(struct REPARSE_DATA_BUFFER,
+				       MountPointReparseBuffer.PathBuffer))
+			goto out;
+		uni = Add2Ptr(rp,
+			      offsetof(struct REPARSE_DATA_BUFFER,
+				       MountPointReparseBuffer.PathBuffer) +
+				      le16_to_cpu(rp->MountPointReparseBuffer
+							  .PrintNameOffset) -
+				      2);
+		nlen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength);
+		break;
+
+	case IO_REPARSE_TAG_SYMLINK:
+		/* FolderSymbolicLink */
+		/* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */
+		if (i_size <= offsetof(struct REPARSE_DATA_BUFFER,
+				       SymbolicLinkReparseBuffer.PathBuffer))
+			goto out;
+		uni = Add2Ptr(rp,
+			      offsetof(struct REPARSE_DATA_BUFFER,
+				       SymbolicLinkReparseBuffer.PathBuffer) +
+				      le16_to_cpu(rp->SymbolicLinkReparseBuffer
+							  .PrintNameOffset) -
+				      2);
+		nlen = le16_to_cpu(
+			rp->SymbolicLinkReparseBuffer.PrintNameLength);
+		break;
+
+	case IO_REPARSE_TAG_CLOUD:
+	case IO_REPARSE_TAG_CLOUD_1:
+	case IO_REPARSE_TAG_CLOUD_2:
+	case IO_REPARSE_TAG_CLOUD_3:
+	case IO_REPARSE_TAG_CLOUD_4:
+	case IO_REPARSE_TAG_CLOUD_5:
+	case IO_REPARSE_TAG_CLOUD_6:
+	case IO_REPARSE_TAG_CLOUD_7:
+	case IO_REPARSE_TAG_CLOUD_8:
+	case IO_REPARSE_TAG_CLOUD_9:
+	case IO_REPARSE_TAG_CLOUD_A:
+	case IO_REPARSE_TAG_CLOUD_B:
+	case IO_REPARSE_TAG_CLOUD_C:
+	case IO_REPARSE_TAG_CLOUD_D:
+	case IO_REPARSE_TAG_CLOUD_E:
+	case IO_REPARSE_TAG_CLOUD_F:
+		/*
+		 * The caller uses @buffer as a C string and does not zero
+		 * it beforehand, so leave room for the terminator and
+		 * store it explicitly.
+		 */
+		err = sizeof("OneDrive") - 1;
+		if (err >= buflen)
+			err = buflen > 0 ? buflen - 1 : 0;
+		memcpy(buffer, "OneDrive", err);
+		buffer[err] = 0;
+		goto out;
+
+	default:
+		if (IsReparseTagMicrosoft(rp->ReparseTag)) {
+			/* Unknown Microsoft Tag. */
+			goto out;
+		}
+		if (!IsReparseTagNameSurrogate(rp->ReparseTag) ||
+		    i_size <= sizeof(struct REPARSE_POINT)) {
+			goto out;
+		}
+
+		/* Users tag. */
+		uni = Add2Ptr(rp, sizeof(struct REPARSE_POINT) - 2);
+		nlen = le16_to_cpu(rp->ReparseDataLength) -
+		       sizeof(struct REPARSE_POINT);
+	}
+
+	/* Convert nlen from bytes to UNICODE chars. */
+	nlen >>= 1;
+
+	/* Check that name is available. */
+	if (!nlen || &uni->name[nlen] > (__le16 *)Add2Ptr(rp, i_size))
+		goto out;
+
+	/* If name is already zero terminated then truncate it now. */
+	if (!uni->name[nlen - 1])
+		nlen -= 1;
+	/* Writes into the private copy only (see the function comment). */
+	uni->len = nlen;
+
+	err = ntfs_utf16_to_nls(sbi, uni, buffer, buflen);
+
+	if (err < 0)
+		goto out;
+
+	/* Translate Windows '\' into Linux '/'. */
+	for (i = 0; i < err; i++) {
+		if (buffer[i] == '\\')
+			buffer[i] = '/';
+	}
+
+	/* Always set last zero. */
+	buffer[err] = 0;
+out:
+	kfree(to_free);
+	return err;
+}
+
+static const char *ntfs_get_link(struct dentry *de, struct inode *inode,
+				 struct delayed_call *done)
+{
+	int err;
+	char *ret;
+
+	if (!de)
+		return ERR_PTR(-ECHILD);
+
+	ret = kmalloc(PAGE_SIZE, GFP_NOFS);
+	if (!ret)
+		return ERR_PTR(-ENOMEM);
+
+	err = ntfs_readlink_hlp(inode, ret, PAGE_SIZE);
+	if (err < 0) {
+		kfree(ret);
+		return ERR_PTR(err);
+	}
+
+	set_delayed_call(done, kfree_link, ret);
+
+	return ret;
+}
+
+// clang-format off
+const struct inode_operations ntfs_link_inode_operations = {
+	.get_link	= ntfs_get_link,
+	.setattr	= ntfs3_setattr,
+	.listxattr	= ntfs_listxattr,
+	.permission	= ntfs_permission,
+	.get_acl	= ntfs_get_acl,
+	.set_acl	= ntfs_set_acl,
+};
+
+const struct address_space_operations ntfs_aops = {
+	.readpage	= ntfs_readpage,
+	.readahead	= ntfs_readahead,
+	.writepage	= ntfs_writepage,
+	.writepages	= ntfs_writepages,
+	.write_begin	= ntfs_write_begin,
+	.write_end	= ntfs_write_end,
+	.direct_IO	= ntfs_direct_IO,
+	.bmap		= ntfs_bmap,
+	.set_page_dirty = __set_page_dirty_buffers,
+};
+
+const struct address_space_operations ntfs_aops_cmpr = {
+	.readpage	= ntfs_readpage,
+	.readahead	= ntfs_readahead,
+};
+// clang-format on
diff --git a/fs/ntfs3/lib/decompress_common.c b/fs/ntfs3/lib/decompress_common.c
new file mode 100644
index 000000000000..e96652240859
--- /dev/null
+++ b/fs/ntfs3/lib/decompress_common.c
@@ -0,0 +1,319 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * decompress_common.c - Code shared by the XPRESS and LZX decompressors
+ *
+ * Copyright (C) 2015 Eric Biggers
+ */
+
+#include "decompress_common.h"
+
+/*
+ * make_huffman_decode_table() -
+ *
+ * Build a decoding table for a canonical prefix code, or "Huffman code".
+ *
+ * This is an internal function, not part of the library API! 
+ *
+ * This takes as input the length of the codeword for each symbol in the
+ * alphabet and produces as output a table that can be used for fast
+ * decoding of prefix-encoded symbols using read_huffsym().
+ *
+ * Strictly speaking, a canonical prefix code might not be a Huffman
+ * code. But this algorithm will work either way; and in fact, since
+ * Huffman codes are defined in terms of symbol frequencies, there is no
+ * way for the decompressor to know whether the code is a true Huffman
+ * code or not until all symbols have been decoded.
+ *
+ * Because the prefix code is assumed to be "canonical", it can be
+ * reconstructed directly from the codeword lengths. A prefix code is
+ * canonical if and only if a longer codeword never lexicographically
+ * precedes a shorter codeword, and the lexicographic ordering of
+ * codewords of the same length is the same as the lexicographic ordering
+ * of the corresponding symbols. Consequently, we can sort the symbols
+ * primarily by codeword length and secondarily by symbol value, then
+ * reconstruct the prefix code by generating codewords lexicographically
+ * in that order.
+ *
+ * This function does not, however, generate the prefix code explicitly.
+ * Instead, it directly builds a table for decoding symbols using the
+ * code. The basic idea is this: given the next 'max_codeword_len' bits
+ * in the input, we can look up the decoded symbol by indexing a table
+ * containing 2**max_codeword_len entries. A codeword with length
+ * 'max_codeword_len' will have exactly one entry in this table, whereas
+ * a codeword shorter than 'max_codeword_len' will have multiple entries
+ * in this table. Precisely, a codeword of length n will be represented
+ * by 2**(max_codeword_len - n) entries in this table. The 0-based index
+ * of each such entry will contain the corresponding codeword as a prefix
+ * when zero-padded on the left to 'max_codeword_len' binary digits.
+ *
+ * That's the basic idea, but we implement two optimizations regarding
+ * the format of the decode table itself:
+ *
+ * - For many compression formats, the maximum codeword length is too
+ *   long for it to be efficient to build the full decoding table
+ *   whenever a new prefix code is used. Instead, we can build the table
+ *   using only 2**table_bits entries, where 'table_bits' is some number
+ *   less than or equal to 'max_codeword_len'. Then, only codewords of
+ *   length 'table_bits' and shorter can be directly looked up. For
+ *   longer codewords, the direct lookup instead produces the root of a
+ *   binary tree. Using this tree, the decoder can do traditional
+ *   bit-by-bit decoding of the remainder of the codeword. Child nodes
+ *   are allocated in extra entries at the end of the table; leaf nodes
+ *   contain symbols. Note that the long-codeword case is, in general,
+ *   not performance critical, since in Huffman codes the most frequently
+ *   used symbols are assigned the shortest codeword lengths.
+ *
+ * - When we decode a symbol using a direct lookup of the table, we still
+ *   need to know its length so that the bitstream can be advanced by the
+ *   appropriate number of bits. The simple solution is to simply retain
+ *   the 'lens' array and use the decoded symbol as an index into it.
+ *   However, this requires two separate array accesses in the fast path.
+ *   The optimization is to store the length directly in the decode
+ *   table. We use the bottom 11 bits for the symbol and the top 5 bits
+ *   for the length. In addition, to combine this optimization with the
+ *   previous one, we introduce a special case where the top 2 bits of
+ *   the length are both set if the entry is actually the root of a
+ *   binary tree.
+ *
+ * @decode_table:
+ *	The array in which to create the decoding table. This must have
+ *	a length of at least ((2**table_bits) + 2 * num_syms) entries.
+ *
+ * @num_syms:
+ *	The number of symbols in the alphabet; also, the length of the
+ *	'lens' array. Must be less than or equal to 2048.
+ *
+ * @table_bits:
+ *	The order of the decode table size, as explained above. Must be
+ *	less than or equal to 13.
+ *
+ * @lens:
+ *	An array of length @num_syms, indexable by symbol, that gives the
+ *	length of the codeword, in bits, for that symbol. The length can
+ *	be 0, which means that the symbol does not have a codeword
+ *	assigned. Each value is used as an index into the per-length
+ *	counters without a range check, so the bound stated for
+ *	@max_codeword_len is the caller's responsibility.
+ *
+ * @max_codeword_len:
+ *	The longest codeword length allowed in the compression format.
+ *	All entries in 'lens' must be less than or equal to this value.
+ *	This must be less than or equal to 23.
+ *
+ * @working_space
+ *	A temporary array of length '2 * (max_codeword_len + 1) +
+ *	num_syms'.
+ *
+ * Returns 0 on success, or -1 if the lengths do not form a valid prefix
+ * code.
+ */
+int make_huffman_decode_table(u16 decode_table[], const u32 num_syms,
+			      const u32 table_bits, const u8 lens[],
+			      const u32 max_codeword_len,
+			      u16 working_space[])
+{
+	const u32 table_num_entries = 1 << table_bits;
+	/*
+	 * 'working_space' is carved into three consecutive arrays:
+	 * len_counts and offsets (max_codeword_len + 1 entries each),
+	 * followed by sorted_syms.
+	 */
+	u16 * const len_counts = &working_space[0];
+	u16 * const offsets = &working_space[1 * (max_codeword_len + 1)];
+	u16 * const sorted_syms = &working_space[2 * (max_codeword_len + 1)];
+	int left;
+	void *decode_table_ptr;
+	u32 sym_idx;
+	u32 codeword_len;
+	u32 stores_per_loop;
+	u32 decode_table_pos;
+	u32 len;
+	u32 sym;
+
+	/* Count how many symbols have each possible codeword length.
+	 * Note that a length of 0 indicates the corresponding symbol is not
+	 * used in the code and therefore does not have a codeword.
+	 */
+	for (len = 0; len <= max_codeword_len; len++)
+		len_counts[len] = 0;
+	for (sym = 0; sym < num_syms; sym++)
+		len_counts[lens[sym]]++;
+
+	/* We can assume all lengths are <= max_codeword_len, but we
+	 * cannot assume they form a valid prefix code. A codeword of
+	 * length n should require a proportion of the codespace equaling
+	 * (1/2)^n. The code is valid if and only if the codespace is
+	 * exactly filled by the lengths, by this measure.
+	 */
+	left = 1;
+	for (len = 1; len <= max_codeword_len; len++) {
+		left <<= 1;
+		left -= len_counts[len];
+		if (left < 0) {
+			/* The lengths overflow the codespace; that is, the code
+			 * is over-subscribed.
+			 */
+			return -1;
+		}
+	}
+
+	if (left) {
+		/* The lengths do not fill the codespace; that is, they form an
+		 * incomplete set.
+		 */
+		if (left == (1 << max_codeword_len)) {
+			/* The code is completely empty. This is arguably
+			 * invalid, but in fact it is valid in LZX and XPRESS,
+			 * so we must allow it. By definition, no symbols can
+			 * be decoded with an empty code. Consequently, we
+			 * technically don't even need to fill in the decode
+			 * table. However, to avoid accessing uninitialized
+			 * memory if the algorithm nevertheless attempts to
+			 * decode symbols using such a code, we zero out the
+			 * decode table.
+			 */
+			/* Only the direct-lookup part (table_num_entries) is cleared. */
+			memset(decode_table, 0,
+			       table_num_entries * sizeof(decode_table[0]));
+			return 0;
+		}
+		return -1;
+	}
+
+	/* Sort the symbols primarily by length and secondarily by symbol order.
+	 */
+
+	/* Initialize 'offsets' so that offsets[len] for 1 <= len <=
+	 * max_codeword_len is the number of codewords shorter than 'len' bits.
+	 */
+	offsets[1] = 0;
+	for (len = 1; len < max_codeword_len; len++)
+		offsets[len + 1] = offsets[len] + len_counts[len];
+
+	/* Use the 'offsets' array to sort the symbols. Note that we do not
+	 * include symbols that are not used in the code. Consequently, fewer
+	 * than 'num_syms' entries in 'sorted_syms' may be filled.
+	 */
+	for (sym = 0; sym < num_syms; sym++)
+		if (lens[sym])
+			sorted_syms[offsets[lens[sym]]++] = sym;
+
+	/* Fill entries for codewords with length <= table_bits
+	 * --- that is, those short enough for a direct mapping.
+	 *
+	 * The table will start with entries for the shortest codeword(s), which
+	 * have the most entries. From there, the number of entries per
+	 * codeword will decrease.
+	 */
+	decode_table_ptr = decode_table;
+	sym_idx = 0;
+	codeword_len = 1;
+	stores_per_loop = (1 << (table_bits - codeword_len));
+	for (; stores_per_loop != 0; codeword_len++, stores_per_loop >>= 1) {
+		u32 end_sym_idx = sym_idx + len_counts[codeword_len];
+
+		for (; sym_idx < end_sym_idx; sym_idx++) {
+			u16 entry;
+			u16 *p;
+			u32 n;
+
+			/* Length in the top 5 bits, symbol in the low 11 bits. */
+			entry = ((u32)codeword_len << 11) | sorted_syms[sym_idx];
+			p = (u16 *)decode_table_ptr;
+			n = stores_per_loop;
+
+			do {
+				*p++ = entry;
+			} while (--n);
+
+			decode_table_ptr = p;
+		}
+	}
+
+	/* If we've filled in the entire table, we are done. Otherwise,
+	 * there are codewords longer than table_bits for which we must
+	 * generate binary trees.
+	 */
+	decode_table_pos = (u16 *)decode_table_ptr - decode_table;
+	if (decode_table_pos != table_num_entries) {
+		u32 j;
+		u32 next_free_tree_slot;
+		u32 cur_codeword;
+
+		/* First, zero out the remaining entries. This is
+		 * necessary so that these entries appear as
+		 * "unallocated" in the next part. Each of these entries
+		 * will eventually be filled with the representation of
+		 * the root node of a binary tree.
+		 */
+		j = decode_table_pos;
+		do {
+			decode_table[j] = 0;
+		} while (++j != table_num_entries);
+
+		/* We allocate child nodes starting at the end of the
+		 * direct lookup table. Note that there should be
+		 * 2*num_syms extra entries for this purpose, although
+		 * fewer than this may actually be needed.
+		 */
+		next_free_tree_slot = table_num_entries;
+
+		/* Iterate through each codeword with length greater than
+		 * 'table_bits', primarily in order of codeword length
+		 * and secondarily in order of symbol.
+		 */
+		for (cur_codeword = decode_table_pos << 1;
+		     codeword_len <= max_codeword_len;
+		     codeword_len++, cur_codeword <<= 1) {
+			u32 end_sym_idx = sym_idx + len_counts[codeword_len];
+
+			for (; sym_idx < end_sym_idx; sym_idx++, cur_codeword++) {
+				/* 'sorted_sym' is the symbol represented by the
+				 * codeword.
+				 */
+				u32 sorted_sym = sorted_syms[sym_idx];
+				u32 extra_bits = codeword_len - table_bits;
+				u32 node_idx = cur_codeword >> extra_bits;
+
+				/* Go through each bit of the current codeword
+				 * beyond the prefix of length @table_bits and
+				 * walk the appropriate binary tree, allocating
+				 * any slots that have not yet been allocated.
+				 *
+				 * Note that the 'pointer' entry to the binary
+				 * tree, which is stored in the direct lookup
+				 * portion of the table, is represented
+				 * identically to other internal (non-leaf)
+				 * nodes of the binary tree; it can be thought
+				 * of as simply the root of the tree. The
+				 * representation of these internal nodes is
+				 * simply the index of the left child combined
+				 * with the special bits 0xC000 to distinguish
+				 * the entry from direct mapping and leaf node
+				 * entries.
+				 */
+				do {
+					/* At least one bit remains in the
+					 * codeword, but the current node is an
+					 * unallocated leaf. Change it to an
+					 * internal node.
+					 */
+					if (decode_table[node_idx] == 0) {
+						decode_table[node_idx] =
+							next_free_tree_slot | 0xC000;
+						decode_table[next_free_tree_slot++] = 0;
+						decode_table[next_free_tree_slot++] = 0;
+					}
+
+					/* Go to the left child if the next bit
+					 * in the codeword is 0; otherwise go to
+					 * the right child.
+					 */
+					node_idx = decode_table[node_idx] & 0x3FFF;
+					--extra_bits;
+					node_idx += (cur_codeword >> extra_bits) & 1;
+				} while (extra_bits != 0);
+
+				/* We've traversed the tree using the entire
+				 * codeword, and we're now at the entry where
+				 * the actual symbol will be stored. This is
+				 * distinguished from internal nodes by not
+				 * having its high two bits set.
+				 */
+				decode_table[node_idx] = sorted_sym;
+			}
+		}
+	}
+	return 0;
+}
diff --git a/fs/ntfs3/lib/decompress_common.h b/fs/ntfs3/lib/decompress_common.h
new file mode 100644
index 000000000000..2d70ae42f1b5
--- /dev/null
+++ b/fs/ntfs3/lib/decompress_common.h
@@ -0,0 +1,338 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * decompress_common.h - Code shared by the XPRESS and LZX decompressors
+ *
+ * Copyright (C) 2015 Eric Biggers
+ */
+
+#include <linux/string.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <asm/unaligned.h>
+
+
+/* "Force inline" macro (not required, but helpful for performance) */
+#define forceinline __always_inline
+
+/* Enable whole-word match copying on selected architectures */
+#if defined(__i386__) || defined(__x86_64__) || defined(__ARM_FEATURE_UNALIGNED)
+# define FAST_UNALIGNED_ACCESS
+#endif
+
+/* Size of a machine word */
+#define WORDBYTES (sizeof(size_t))
+
+/* Copy one size_t-sized word from @src to @dst using unaligned accessors. */
+static forceinline void
+copy_unaligned_word(const void *src, void *dst)
+{
+	put_unaligned(get_unaligned((const size_t *)src), (size_t *)dst);
+}
+
+
+/* Generate a "word" with platform-dependent size whose bytes all contain the
+ * value 'b'.
+ */
+static forceinline size_t repeat_byte(u8 b)
+{
+	size_t v;
+
+	v = b;
+	v |= v << 8;
+	v |= v << 16;
+	/* A shift by 0 on 32-bit words makes this step a no-op there. */
+	v |= v << ((WORDBYTES == 8) ? 32 : 0);
+	return v;
+}
+
+/* Structure that encapsulates a block of in-memory data being interpreted as a
+ * stream of bits, optionally with interwoven literal bytes. Bits are assumed
+ * to be stored in little endian 16-bit coding units, with the bits ordered high
+ * to low.
+ */
+struct input_bitstream {
+
+	/* Bits that have been read from the input buffer. The bits are
+	 * left-justified; the next bit is always bit 31.
+	 */
+	u32 bitbuf;
+
+	/* Number of bits currently held in @bitbuf. */
+	u32 bitsleft;
+
+	/* Pointer to the next byte to be retrieved from the input buffer. 
*/ + const u8 *next; + + /* Pointer to just past the end of the input buffer. */ + const u8 *end; +}; + +/* Initialize a bitstream to read from the specified input buffer. */ +static forceinline void init_input_bitstream(struct input_bitstream *is, + const void *buffer, u32 size) +{ + is->bitbuf = 0; + is->bitsleft = 0; + is->next = buffer; + is->end = is->next + size; +} + +/* Ensure the bit buffer variable for the bitstream contains at least @num_bits + * bits. Following this, bitstream_peek_bits() and/or bitstream_remove_bits() + * may be called on the bitstream to peek or remove up to @num_bits bits. Note + * that @num_bits must be <= 16. + */ +static forceinline void bitstream_ensure_bits(struct input_bitstream *is, + u32 num_bits) +{ + if (is->bitsleft < num_bits) { + if (is->end - is->next >= 2) { + is->bitbuf |= (u32)get_unaligned_le16(is->next) + << (16 - is->bitsleft); + is->next += 2; + } + is->bitsleft += 16; + } +} + +/* Return the next @num_bits bits from the bitstream, without removing them. + * There must be at least @num_bits remaining in the buffer variable, from a + * previous call to bitstream_ensure_bits(). + */ +static forceinline u32 +bitstream_peek_bits(const struct input_bitstream *is, const u32 num_bits) +{ + return (is->bitbuf >> 1) >> (sizeof(is->bitbuf) * 8 - num_bits - 1); +} + +/* Remove @num_bits from the bitstream. There must be at least @num_bits + * remaining in the buffer variable, from a previous call to + * bitstream_ensure_bits(). + */ +static forceinline void +bitstream_remove_bits(struct input_bitstream *is, u32 num_bits) +{ + is->bitbuf <<= num_bits; + is->bitsleft -= num_bits; +} + +/* Remove and return @num_bits bits from the bitstream. There must be at least + * @num_bits remaining in the buffer variable, from a previous call to + * bitstream_ensure_bits(). 
+ */ +static forceinline u32 +bitstream_pop_bits(struct input_bitstream *is, u32 num_bits) +{ + u32 bits = bitstream_peek_bits(is, num_bits); + + bitstream_remove_bits(is, num_bits); + return bits; +} + +/* Read and return the next @num_bits bits from the bitstream. The bit buffer + * is refilled first with bitstream_ensure_bits(), so the same limit on + * @num_bits applies. + */ +static forceinline u32 +bitstream_read_bits(struct input_bitstream *is, u32 num_bits) +{ + bitstream_ensure_bits(is, num_bits); + return bitstream_pop_bits(is, num_bits); +} + +/* Read and return the next literal byte embedded in the bitstream. The byte + * is taken directly from the input buffer, not from the bit buffer. Returns 0 + * if the input is exhausted. + */ +static forceinline u8 +bitstream_read_byte(struct input_bitstream *is) +{ + if (unlikely(is->end == is->next)) + return 0; + return *is->next++; +} + +/* Read and return the next little endian 16-bit integer embedded in the + * bitstream. Returns 0, without consuming anything, if fewer than 2 input + * bytes remain. + */ +static forceinline u16 +bitstream_read_u16(struct input_bitstream *is) +{ + u16 v; + + if (unlikely(is->end - is->next < 2)) + return 0; + v = get_unaligned_le16(is->next); + is->next += 2; + return v; +} + +/* Read and return the next little endian 32-bit integer embedded in the + * bitstream. Returns 0, without consuming anything, if fewer than 4 input + * bytes remain. + */ +static forceinline u32 +bitstream_read_u32(struct input_bitstream *is) +{ + u32 v; + + if (unlikely(is->end - is->next < 4)) + return 0; + v = get_unaligned_le32(is->next); + is->next += 4; + return v; +} + +/* Read into @dst_buffer an array of @count literal bytes embedded in the + * bitstream. Return either a pointer to the byte past the last written, or + * NULL if the read overflows the input buffer (in which case nothing is copied + * or consumed). + */ +static forceinline void *bitstream_read_bytes(struct input_bitstream *is, + void *dst_buffer, size_t count) +{ + if ((size_t)(is->end - is->next) < count) + return NULL; + memcpy(dst_buffer, is->next, count); + is->next += count; + return (u8 *)dst_buffer + count; +} + +/* Align the input bitstream on a coding-unit boundary. 
*/ +static forceinline void bitstream_align(struct input_bitstream *is) +{ + is->bitsleft = 0; + is->bitbuf = 0; +} +/* Build the Huffman @decode_table consumed by read_huffsym() from the + * per-symbol codeword lengths in @lens. The LZX and XPRESS callers treat a + * non-zero return value as invalid input. + */ +extern int make_huffman_decode_table(u16 decode_table[], const u32 num_syms, + const u32 num_bits, const u8 lens[], + const u32 max_codeword_len, + u16 working_space[]); + + +/* Reads and returns the next Huffman-encoded symbol from a bitstream. If the + * input data is exhausted, the Huffman symbol is decoded as if the missing bits + * are all zeroes. + * + * @table_bits is the number of input bits used to index the directly-mapped + * part of @decode_table. @max_codeword_len is the number of bits made + * available in the bit buffer before decoding starts. + */ +static forceinline u32 read_huffsym(struct input_bitstream *istream, + const u16 decode_table[], + u32 table_bits, + u32 max_codeword_len) +{ + u32 entry; + u32 key_bits; + + bitstream_ensure_bits(istream, max_codeword_len); + + /* Index the decode table by the next table_bits bits of the input. */ + key_bits = bitstream_peek_bits(istream, table_bits); + entry = decode_table[key_bits]; + if (entry < 0xC000) { + /* Fast case: The decode table directly provided the + * symbol and codeword length. The low 11 bits are the + * symbol, and the high 5 bits are the codeword length. + */ + bitstream_remove_bits(istream, entry >> 11); + return entry & 0x7FF; + } + /* Slow case: The codeword for the symbol is longer than + * table_bits, so the symbol does not have an entry + * directly in the first (1 << table_bits) entries of the + * decode table. Traverse the appropriate binary tree + * bit-by-bit to decode the symbol. + * + * Entries >= 0xC000 are tree nodes: their low 14 bits are the + * table index of a pair of child entries, and the next input + * bit selects the child. The first entry below 0xC000 that is + * reached this way is returned as the symbol. + */ + bitstream_remove_bits(istream, table_bits); + do { + key_bits = (entry & 0x3FFF) + bitstream_pop_bits(istream, 1); + } while ((entry = decode_table[key_bits]) >= 0xC000); + return entry; +} + +/* + * Copy an LZ77 match at (dst - offset) to dst. + * + * The length and offset must be already validated --- that is, (dst - offset) + * can't underrun the output buffer, and (dst + length) can't overrun the output + * buffer. Also, the length cannot be 0. + * + * @bufend points to the byte past the end of the output buffer. This function + * won't write any data beyond this position. 
+ * + * @min_length is the smallest match length the caller's format can produce; + * the bytewise fallback uses it to copy the first bytes without a loop test. + * + * Returns dst + length. + */ +static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend, + u32 min_length) +{ + const u8 *src = dst - offset; + + /* + * Try to copy one machine word at a time. On i386 and x86_64 this is + * faster than copying one byte at a time, unless the data is + * near-random and all the matches have very short lengths. Note that + * since this requires unaligned memory accesses, it won't necessarily + * be faster on every architecture. + * + * Also note that we might copy more than the length of the match. For + * example, if a word is 8 bytes and the match is of length 5, then + * we'll simply copy 8 bytes. This is okay as long as we don't write + * beyond the end of the output buffer, hence the check for (bufend - + * end >= WORDBYTES - 1). + */ +#ifdef FAST_UNALIGNED_ACCESS + u8 * const end = dst + length; + + if (bufend - end >= (ptrdiff_t)(WORDBYTES - 1)) { + + if (offset >= WORDBYTES) { + /* The source and destination words don't overlap. */ + + /* To improve branch prediction, one iteration of this + * loop is unrolled. Most matches are short and will + * fail the first check. But if that check passes, then + * it becomes increasingly likely that the match is long + * and we'll need to continue copying. + */ + + copy_unaligned_word(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + + if (dst < end) { + do { + copy_unaligned_word(src, dst); + src += WORDBYTES; + dst += WORDBYTES; + } while (dst < end); + } + return end; + } else if (offset == 1) { + + /* Offset 1 matches are equivalent to run-length + * encoding of the previous byte. This case is common + * if the data contains many repeated bytes. The + * previous byte is replicated into a whole word once + * and that word is stored repeatedly. + */ + size_t v = repeat_byte(*(dst - 1)); + + do { + put_unaligned(v, (size_t *)dst); + src += WORDBYTES; + dst += WORDBYTES; + } while (dst < end); + return end; + } + /* + * We don't bother with special cases for other 'offset < + * WORDBYTES', which are usually rarer than 'offset == 1'. 
Extra + * checks will just slow things down. Actually, it's possible + * to handle all the 'offset < WORDBYTES' cases using the same + * code, but it becomes more complicated and doesn't seem any + * faster overall; it definitely slows down the more common + * 'offset == 1' case. + */ + } +#endif /* FAST_UNALIGNED_ACCESS */ + + /* Fall back to a bytewise copy. The first min_length - 1 bytes (at + * most two) are copied without a loop test; this relies on the caller + * passing length >= min_length, so that at least one byte is left for + * the do/while loop below. + */ + + if (min_length >= 2) { + *dst++ = *src++; + length--; + } + if (min_length >= 3) { + *dst++ = *src++; + length--; + } + do { + *dst++ = *src++; + } while (--length); + + return dst; +} diff --git a/fs/ntfs3/lib/lib.h b/fs/ntfs3/lib/lib.h new file mode 100644 index 000000000000..f508fbad2e71 --- /dev/null +++ b/fs/ntfs3/lib/lib.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Adapted for linux kernel by Alexander Mamaev: + * - remove implementations of get_unaligned_ + * - assume GCC is always defined + * - ISO C90 + * - linux kernel code style + */ + + +/* globals from xpress_decompress.c; xpress_decompress() returns 0 on success + * and -1 on invalid input + */ +struct xpress_decompressor *xpress_allocate_decompressor(void); +void xpress_free_decompressor(struct xpress_decompressor *d); +int xpress_decompress(struct xpress_decompressor *__restrict d, + const void *__restrict compressed_data, + size_t compressed_size, + void *__restrict uncompressed_data, + size_t uncompressed_size); + +/* globals from lzx_decompress.c; lzx_decompress() returns 0 on success and -1 + * on invalid input + */ +struct lzx_decompressor *lzx_allocate_decompressor(void); +void lzx_free_decompressor(struct lzx_decompressor *d); +int lzx_decompress(struct lzx_decompressor *__restrict d, + const void *__restrict compressed_data, + size_t compressed_size, void *__restrict uncompressed_data, + size_t uncompressed_size); diff --git a/fs/ntfs3/lib/lzx_decompress.c b/fs/ntfs3/lib/lzx_decompress.c new file mode 100644 index 000000000000..6b16f07073c1 --- /dev/null +++ b/fs/ntfs3/lib/lzx_decompress.c @@ -0,0 +1,670 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * lzx_decompress.c - A decompressor for the LZX compression format, which can 
+ * be used in "System Compressed" files. This is based on the code from wimlib. + * This code only supports a window size (dictionary size) of 32768 bytes, since + * this is the only size used in System Compression. + * + * Copyright (C) 2015 Eric Biggers + */ + +#include "decompress_common.h" +#include "lib.h" + +/* Number of literal byte values */ +#define LZX_NUM_CHARS 256 + +/* The smallest and largest allowed match lengths */ +#define LZX_MIN_MATCH_LEN 2 +#define LZX_MAX_MATCH_LEN 257 + +/* Number of distinct match lengths that can be represented */ +#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) + +/* Number of match lengths for which no length symbol is required. + * LZX_NUM_LEN_HEADERS additionally counts the length header value + * (LZX_NUM_PRIMARY_LENS itself) that tells the decoder to read a length + * symbol. + */ +#define LZX_NUM_PRIMARY_LENS 7 +#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1) + +/* Valid values of the 3-bit block type field */ +#define LZX_BLOCKTYPE_VERBATIM 1 +#define LZX_BLOCKTYPE_ALIGNED 2 +#define LZX_BLOCKTYPE_UNCOMPRESSED 3 + +/* Number of offset slots for a window size of 32768 */ +#define LZX_NUM_OFFSET_SLOTS 30 + +/* Number of symbols in the main code for a window size of 32768 */ +#define LZX_MAINCODE_NUM_SYMBOLS \ + (LZX_NUM_CHARS + (LZX_NUM_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS)) + +/* Number of symbols in the length code */ +#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS) + +/* Number of symbols in the precode */ +#define LZX_PRECODE_NUM_SYMBOLS 20 + +/* Number of bits in which each precode codeword length is represented */ +#define LZX_PRECODE_ELEMENT_SIZE 4 + +/* Number of low-order bits of each match offset that are entropy-encoded in + * aligned offset blocks + */ +#define LZX_NUM_ALIGNED_OFFSET_BITS 3 + +/* Number of symbols in the aligned offset code */ +#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS) + +/* Mask for the match offset bits that are entropy-encoded in aligned offset + * blocks + */ +#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1) + +/* Number of bits in which 
each aligned offset codeword length is represented */ +#define LZX_ALIGNEDCODE_ELEMENT_SIZE 3 + +/* Maximum lengths (in bits) of the codewords in each Huffman code */ +#define LZX_MAX_MAIN_CODEWORD_LEN 16 +#define LZX_MAX_LEN_CODEWORD_LEN 16 +#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1) +#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1) + +/* The default "filesize" value used in pre/post-processing. In the LZX format + * used in cabinet files this value must be given to the decompressor, whereas + * in the LZX format used in WIM files and system-compressed files this value is + * fixed at 12000000. + */ +#define LZX_DEFAULT_FILESIZE 12000000 + +/* Assumed block size when the encoded block size begins with a 1 bit (see + * lzx_read_block_header(); a 0 bit is followed by an explicit 16-bit size). + */ +#define LZX_DEFAULT_BLOCK_SIZE 32768 + +/* Number of offsets in the recent (or "repeat") offsets queue. */ +#define LZX_NUM_RECENT_OFFSETS 3 + +/* Number of input bits used to index the directly-mapped part of each Huffman + * decode table. These values are chosen for fast decompression. + */ +#define LZX_MAINCODE_TABLEBITS 11 +#define LZX_LENCODE_TABLEBITS 10 +#define LZX_PRECODE_TABLEBITS 6 +#define LZX_ALIGNEDCODE_TABLEBITS 7 +/* Number of spare entries kept after the main code and length code length + * arrays, so that an over-long run written by lzx_read_codeword_lens() stays + * inside the array. + */ +#define LZX_READ_LENS_MAX_OVERRUN 50 + +/* Mapping: offset slot => first match offset that uses that offset slot. + */ +static const u32 lzx_offset_slot_base[LZX_NUM_OFFSET_SLOTS + 1] = { + 0, 1, 2, 3, 4, /* 0 --- 4 */ + 6, 8, 12, 16, 24, /* 5 --- 9 */ + 32, 48, 64, 96, 128, /* 10 --- 14 */ + 192, 256, 384, 512, 768, /* 15 --- 19 */ + 1024, 1536, 2048, 3072, 4096, /* 20 --- 24 */ + 6144, 8192, 12288, 16384, 24576, /* 25 --- 29 */ + 32768, /* extra */ +}; + +/* Mapping: offset slot => how many extra bits must be read and added to the + * corresponding offset slot base to decode the match offset. 
+ */ +static const u8 lzx_extra_offset_bits[LZX_NUM_OFFSET_SLOTS] = { + 0, 0, 0, 0, 1, + 1, 2, 2, 3, 3, + 4, 4, 5, 5, 6, + 6, 7, 7, 8, 8, + 9, 9, 10, 10, 11, + 11, 12, 12, 13, 13, +}; + +/* Reusable heap-allocated memory for LZX decompression */ +struct lzx_decompressor { + + /* Huffman decoding tables, and arrays that map symbols to codeword + * lengths. Each decode table has (1 << TABLEBITS) directly-indexed + * entries plus 2 entries per symbol. The main code and length code + * length arrays carry LZX_READ_LENS_MAX_OVERRUN spare entries because + * lzx_read_codeword_lens() may write past the requested count. + */ + + u16 maincode_decode_table[(1 << LZX_MAINCODE_TABLEBITS) + + (LZX_MAINCODE_NUM_SYMBOLS * 2)]; + u8 maincode_lens[LZX_MAINCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN]; + + + u16 lencode_decode_table[(1 << LZX_LENCODE_TABLEBITS) + + (LZX_LENCODE_NUM_SYMBOLS * 2)]; + u8 lencode_lens[LZX_LENCODE_NUM_SYMBOLS + LZX_READ_LENS_MAX_OVERRUN]; + + + u16 alignedcode_decode_table[(1 << LZX_ALIGNEDCODE_TABLEBITS) + + (LZX_ALIGNEDCODE_NUM_SYMBOLS * 2)]; + u8 alignedcode_lens[LZX_ALIGNEDCODE_NUM_SYMBOLS]; + + u16 precode_decode_table[(1 << LZX_PRECODE_TABLEBITS) + + (LZX_PRECODE_NUM_SYMBOLS * 2)]; + u8 precode_lens[LZX_PRECODE_NUM_SYMBOLS]; + + /* Temporary space for make_huffman_decode_table(), shared by all four + * codes and sized for the main code. + */ + u16 working_space[2 * (1 + LZX_MAX_MAIN_CODEWORD_LEN) + + LZX_MAINCODE_NUM_SYMBOLS]; +}; +/* + * Undo the translation of one 32-bit little endian CALL target stored at + * @target. @input_pos is the position, within the uncompressed data, of the + * E8 byte that precedes it. Values in [0, LZX_DEFAULT_FILESIZE) have + * @input_pos subtracted; values in [-@input_pos, 0) have LZX_DEFAULT_FILESIZE + * added; anything else is left unchanged. + */ +static void undo_e8_translation(void *target, s32 input_pos) +{ + s32 abs_offset, rel_offset; + + abs_offset = get_unaligned_le32(target); + if (abs_offset >= 0) { + if (abs_offset < LZX_DEFAULT_FILESIZE) { + /* "good translation" */ + rel_offset = abs_offset - input_pos; + put_unaligned_le32(rel_offset, target); + } + } else { + if (abs_offset >= -input_pos) { + /* "compensating translation" */ + rel_offset = abs_offset + LZX_DEFAULT_FILESIZE; + put_unaligned_le32(rel_offset, target); + } + } +} + +/* + * Undo the 'E8' preprocessing used in LZX. Before compression, the + * uncompressed data was preprocessed by changing the targets of suspected x86 + * CALL instructions from relative offsets to absolute offsets. After + * match/literal decoding, the decompressor must undo the translation. 
+ */ +static void lzx_postprocess(u8 *data, u32 size) +{ + /* + * A worthwhile optimization is to push the end-of-buffer check into the + * relatively rare E8 case. This is possible if we replace the last six + * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte + * before reaching end-of-buffer. In addition, this scheme guarantees + * that no translation can begin following an E8 byte in the last 10 + * bytes because a 4-byte offset containing E8 as its high byte is a + * large negative number that is not valid for translation. That is + * exactly what we need. + * + * The real last six bytes are saved before being overwritten and are + * restored once the scan is finished. After each E8 byte that is + * handled, the scan resumes 5 bytes later, i.e. past the E8 byte and + * its 4-byte target. + */ + u8 *tail; + u8 saved_bytes[6]; + u8 *p; + /* Buffers of 10 bytes or fewer are returned unmodified. */ + if (size <= 10) + return; + + tail = &data[size - 6]; + memcpy(saved_bytes, tail, 6); + memset(tail, 0xE8, 6); + p = data; + for (;;) { + while (*p != 0xE8) + p++; + if (p >= tail) + break; + undo_e8_translation(p + 1, p - data); + p += 5; + } + memcpy(tail, saved_bytes, 6); +} + +/* Read a Huffman-encoded symbol using the precode. */ +static forceinline u32 read_presym(const struct lzx_decompressor *d, + struct input_bitstream *is) +{ + return read_huffsym(is, d->precode_decode_table, + LZX_PRECODE_TABLEBITS, LZX_MAX_PRE_CODEWORD_LEN); +} + +/* Read a Huffman-encoded symbol using the main code. */ +static forceinline u32 read_mainsym(const struct lzx_decompressor *d, + struct input_bitstream *is) +{ + return read_huffsym(is, d->maincode_decode_table, + LZX_MAINCODE_TABLEBITS, LZX_MAX_MAIN_CODEWORD_LEN); +} + +/* Read a Huffman-encoded symbol using the length code. */ +static forceinline u32 read_lensym(const struct lzx_decompressor *d, + struct input_bitstream *is) +{ + return read_huffsym(is, d->lencode_decode_table, + LZX_LENCODE_TABLEBITS, LZX_MAX_LEN_CODEWORD_LEN); +} + +/* Read a Huffman-encoded symbol using the aligned offset code. 
*/ +static forceinline u32 read_alignedsym(const struct lzx_decompressor *d, + struct input_bitstream *is) +{ + return read_huffsym(is, d->alignedcode_decode_table, + LZX_ALIGNEDCODE_TABLEBITS, + LZX_MAX_ALIGNED_CODEWORD_LEN); +} + +/* + * Read the precode from the compressed input bitstream, then use it to decode + * @num_lens codeword length values. + * + * @d: The decompressor. Its precode lengths, precode decode table and + * working space are overwritten. + * + * @is: The input bitstream. + * + * @lens: An array that contains the length values from the previous time + * the codeword lengths for this Huffman code were read, or all 0's + * if this is the first time. This array must have at least + * (@num_lens + LZX_READ_LENS_MAX_OVERRUN) entries. + * + * @num_lens: Number of length values to decode. + * + * Returns 0 on success, or -1 if the data was invalid. + */ +static int lzx_read_codeword_lens(struct lzx_decompressor *d, + struct input_bitstream *is, + u8 *lens, u32 num_lens) +{ + u8 *len_ptr = lens; + u8 *lens_end = lens + num_lens; + int i; + + /* Read the lengths of the precode codewords. These are given + * explicitly. + */ + for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) { + d->precode_lens[i] = + bitstream_read_bits(is, LZX_PRECODE_ELEMENT_SIZE); + } + + /* Make the decoding table for the precode. */ + if (make_huffman_decode_table(d->precode_decode_table, + LZX_PRECODE_NUM_SYMBOLS, + LZX_PRECODE_TABLEBITS, + d->precode_lens, + LZX_MAX_PRE_CODEWORD_LEN, + d->working_space)) + return -1; + + /* Decode the codeword lengths. */ + do { + u32 presym; + u8 len; + + /* Read the next precode symbol. Symbols below 17 encode a single + * length; 17, 18 and 19 introduce runs. 
*/ + presym = read_presym(d, is); + if (presym < 17) { + /* Difference from old length, taken modulo 17 */ + len = *len_ptr - presym; + if ((s8)len < 0) + len += 17; + *len_ptr++ = len; + } else { + /* Special RLE values */ + + u32 run_len; + + if (presym == 17) { + /* Run of 0's */ + run_len = 4 + bitstream_read_bits(is, 4); + len = 0; + } else if (presym == 18) { + /* Longer run of 0's */ + run_len = 20 + bitstream_read_bits(is, 5); + len = 0; + } else { + /* Run of identical lengths. The length is + * given by a second precode symbol, again as + * a difference from the old length; a second + * symbol above 17 is rejected. + */ + run_len = 4 + bitstream_read_bits(is, 1); + presym = read_presym(d, is); + if (presym > 17) + return -1; + len = *len_ptr - presym; + if ((s8)len < 0) + len += 17; + } + + do { + *len_ptr++ = len; + } while (--run_len); + /* Worst case overrun is when presym == 18, + * run_len == 20 + 31, and only 1 length was remaining. + * So LZX_READ_LENS_MAX_OVERRUN == 50. + * + * Overrun while reading the first half of maincode_lens + * can corrupt the previous values in the second half. + * This doesn't really matter because the resulting + * lengths will still be in range, and data that + * generates overruns is invalid anyway. + */ + } + } while (len_ptr < lens_end); + + return 0; +} + +/* + * Read the header of an LZX block and save the block type and (uncompressed) + * size in *block_type_ret and *block_size_ret, respectively. + * + * If the block is compressed, also update the Huffman decode tables in @d with + * the new Huffman codes. If the block is uncompressed, also update + * @recent_offsets with the new match offsets. + * + * Return 0 on success, or -1 if the data was invalid. + */ +static int lzx_read_block_header(struct lzx_decompressor *d, + struct input_bitstream *is, + int *block_type_ret, + u32 *block_size_ret, + u32 recent_offsets[]) +{ + int block_type; + u32 block_size; + int i; + + bitstream_ensure_bits(is, 4); + + /* The first three bits tell us what kind of block it is, and should be + * one of the LZX_BLOCKTYPE_* values. 
+ */ + block_type = bitstream_pop_bits(is, 3); + + /* Read the block size. A set flag bit selects LZX_DEFAULT_BLOCK_SIZE; + * otherwise an explicit 16-bit size follows, high byte first. + */ + if (bitstream_pop_bits(is, 1)) { + block_size = LZX_DEFAULT_BLOCK_SIZE; + } else { + block_size = 0; + block_size |= bitstream_read_bits(is, 8); + block_size <<= 8; + block_size |= bitstream_read_bits(is, 8); + } + + switch (block_type) { + + case LZX_BLOCKTYPE_ALIGNED: + + /* Read the aligned offset code and prepare its decode table. + * The codeword lengths are stored explicitly, + * LZX_ALIGNEDCODE_ELEMENT_SIZE bits each. + */ + + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + d->alignedcode_lens[i] = + bitstream_read_bits(is, + LZX_ALIGNEDCODE_ELEMENT_SIZE); + } + + if (make_huffman_decode_table(d->alignedcode_decode_table, + LZX_ALIGNEDCODE_NUM_SYMBOLS, + LZX_ALIGNEDCODE_TABLEBITS, + d->alignedcode_lens, + LZX_MAX_ALIGNED_CODEWORD_LEN, + d->working_space)) + return -1; + + /* Fall through, since the rest of the header for aligned offset + * blocks is the same as that for verbatim blocks. + */ + fallthrough; + + case LZX_BLOCKTYPE_VERBATIM: + + /* Read the main code and prepare its decode table. + * + * Note that the codeword lengths in the main code are encoded + * in two parts: one part for literal symbols, and one part for + * match symbols. + */ + + if (lzx_read_codeword_lens(d, is, d->maincode_lens, + LZX_NUM_CHARS)) + return -1; + + if (lzx_read_codeword_lens(d, is, + d->maincode_lens + LZX_NUM_CHARS, + LZX_MAINCODE_NUM_SYMBOLS - LZX_NUM_CHARS)) + return -1; + + if (make_huffman_decode_table(d->maincode_decode_table, + LZX_MAINCODE_NUM_SYMBOLS, + LZX_MAINCODE_TABLEBITS, + d->maincode_lens, + LZX_MAX_MAIN_CODEWORD_LEN, + d->working_space)) + return -1; + + /* Read the length code and prepare its decode table. 
*/ + + if (lzx_read_codeword_lens(d, is, d->lencode_lens, + LZX_LENCODE_NUM_SYMBOLS)) + return -1; + + if (make_huffman_decode_table(d->lencode_decode_table, + LZX_LENCODE_NUM_SYMBOLS, + LZX_LENCODE_TABLEBITS, + d->lencode_lens, + LZX_MAX_LEN_CODEWORD_LEN, + d->working_space)) + return -1; + + break; + + case LZX_BLOCKTYPE_UNCOMPRESSED: + + /* Before reading the three recent offsets from the uncompressed + * block header, the stream must be aligned on a 16-bit + * boundary. But if the stream is *already* aligned, then the + * next 16 bits must be discarded. + * + * bitstream_ensure_bits(is, 1) loads the next 16-bit unit only + * when the bit buffer is empty, i.e. when the stream is already + * aligned; bitstream_align() then drops whatever is buffered. + */ + bitstream_ensure_bits(is, 1); + bitstream_align(is); + + recent_offsets[0] = bitstream_read_u32(is); + recent_offsets[1] = bitstream_read_u32(is); + recent_offsets[2] = bitstream_read_u32(is); + + /* Offsets of 0 are invalid. bitstream_read_u32() also returns + * 0 when the input is exhausted, so a truncated header is + * rejected here as well. + */ + if (recent_offsets[0] == 0 || recent_offsets[1] == 0 || + recent_offsets[2] == 0) + return -1; + break; + + default: + /* Unrecognized block type. */ + return -1; + } + + *block_type_ret = block_type; + *block_size_ret = block_size; + return 0; +} + +/* Decompress a block of LZX-compressed data. + * + * Decodes exactly @block_size bytes starting at @out_next, using the decode + * tables in @d. @out_begin is the start of the whole output buffer and bounds + * how far back a match may reach. @recent_offsets is read and updated. + * + * Returns 0 on success, or -1 if a match is too long for the block or reaches + * before @out_begin. + */ +static int lzx_decompress_block(const struct lzx_decompressor *d, + struct input_bitstream *is, + int block_type, u32 block_size, + u8 * const out_begin, u8 *out_next, + u32 recent_offsets[]) +{ + u8 * const block_end = out_next + block_size; + u32 ones_if_aligned = 0U - (block_type == LZX_BLOCKTYPE_ALIGNED); + + do { + u32 mainsym; + u32 match_len; + u32 match_offset; + u32 offset_slot; + u32 num_extra_bits; + + mainsym = read_mainsym(d, is); + if (mainsym < LZX_NUM_CHARS) { + /* Literal */ + *out_next++ = mainsym; + continue; + } + + /* Match */ + + /* Decode the length header and offset slot. */ + mainsym -= LZX_NUM_CHARS; + match_len = mainsym % LZX_NUM_LEN_HEADERS; + offset_slot = mainsym / LZX_NUM_LEN_HEADERS; + + /* If needed, read a length symbol to decode the full length. 
*/ + if (match_len == LZX_NUM_PRIMARY_LENS) + match_len += read_lensym(d, is); + match_len += LZX_MIN_MATCH_LEN; + + if (offset_slot < LZX_NUM_RECENT_OFFSETS) { + /* Repeat offset */ + + /* Note: This isn't a real LRU queue, since using the R2 + * offset doesn't bump the R1 offset down to R2. This + * quirk allows all 3 recent offsets to be handled by + * the same code. (For R0, the swap is a no-op.) + */ + match_offset = recent_offsets[offset_slot]; + recent_offsets[offset_slot] = recent_offsets[0]; + recent_offsets[0] = match_offset; + } else { + /* Explicit offset */ + + /* Look up the number of extra bits that need to be read + * to decode offsets with this offset slot. + */ + num_extra_bits = lzx_extra_offset_bits[offset_slot]; + + /* Start with the offset slot base value. */ + match_offset = lzx_offset_slot_base[offset_slot]; + + /* In aligned offset blocks, the low-order 3 bits of + * each offset are encoded using the aligned offset + * code. Otherwise, all the extra bits are literal. + * + * ones_if_aligned is all-ones in aligned offset blocks + * and 0 otherwise, so for other block types the masked + * value is 0 and the else branch is always taken. + */ + + if ((num_extra_bits & ones_if_aligned) >= LZX_NUM_ALIGNED_OFFSET_BITS) { + match_offset += + bitstream_read_bits(is, num_extra_bits - + LZX_NUM_ALIGNED_OFFSET_BITS) + << LZX_NUM_ALIGNED_OFFSET_BITS; + match_offset += read_alignedsym(d, is); + } else { + match_offset += bitstream_read_bits(is, num_extra_bits); + } + + /* Adjust the offset by subtracting LZX_NUM_RECENT_OFFSETS - 1. */ + match_offset -= (LZX_NUM_RECENT_OFFSETS - 1); + + /* Update the recent offsets. */ + recent_offsets[2] = recent_offsets[1]; + recent_offsets[1] = recent_offsets[0]; + recent_offsets[0] = match_offset; + } + + /* Validate the match, then copy it to the current position. 
The + * match must fit in what is left of the block and must not start + * before the beginning of the output buffer. */ + + if (match_len > (size_t)(block_end - out_next)) + return -1; + + if (match_offset > (size_t)(out_next - out_begin)) + return -1; + + out_next = lz_copy(out_next, match_len, match_offset, + block_end, LZX_MIN_MATCH_LEN); + + } while (out_next != block_end); + + return 0; +} + +/* + * lzx_allocate_decompressor - Allocate an LZX decompressor + * + * Return the pointer to the decompressor on success, or NULL if the kmalloc() + * allocation fails. + */ +struct lzx_decompressor *lzx_allocate_decompressor(void) +{ + return kmalloc(sizeof(struct lzx_decompressor), GFP_NOFS); +} + +/* + * lzx_decompress - Decompress a buffer of LZX-compressed data + * + * @decompressor: A decompressor allocated with lzx_allocate_decompressor() + * @compressed_data: The buffer of data to decompress + * @compressed_size: Number of bytes of compressed data + * @uncompressed_data: The buffer in which to store the decompressed data + * @uncompressed_size: The number of bytes the data decompresses into + * + * Return 0 on success, or -1 if the compressed data is invalid. + */ +int lzx_decompress(struct lzx_decompressor *decompressor, + const void *compressed_data, size_t compressed_size, + void *uncompressed_data, size_t uncompressed_size) +{ + struct lzx_decompressor *d = decompressor; + u8 * const out_begin = uncompressed_data; + u8 *out_next = out_begin; + u8 * const out_end = out_begin + uncompressed_size; + struct input_bitstream is; + u32 recent_offsets[LZX_NUM_RECENT_OFFSETS] = {1, 1, 1}; + int e8_status = 0; + + init_input_bitstream(&is, compressed_data, compressed_size); + + /* Codeword lengths begin as all 0's for delta encoding purposes. */ + memset(d->maincode_lens, 0, LZX_MAINCODE_NUM_SYMBOLS); + memset(d->lencode_lens, 0, LZX_LENCODE_NUM_SYMBOLS); + + /* Decompress blocks until we have all the uncompressed data. 
*/ + + while (out_next != out_end) { + int block_type; + u32 block_size; + + if (lzx_read_block_header(d, &is, &block_type, &block_size, + recent_offsets)) + goto invalid; + + if (block_size < 1 || block_size > (size_t)(out_end - out_next)) + goto invalid; + + if (block_type != LZX_BLOCKTYPE_UNCOMPRESSED) { + + /* Compressed block. If the 0xe8 literal has a non-zero + * codeword length in any block, the output may contain + * 0xe8 bytes and postprocessing is requested. + */ + + if (lzx_decompress_block(d, + &is, + block_type, + block_size, + out_begin, + out_next, + recent_offsets)) + goto invalid; + + e8_status |= d->maincode_lens[0xe8]; + out_next += block_size; + } else { + /* Uncompressed block: the bytes are copied straight from + * the input, and postprocessing is always requested. + */ + + out_next = bitstream_read_bytes(&is, out_next, + block_size); + if (!out_next) + goto invalid; + /* Skip one extra input byte after an odd-sized block. */ + if (block_size & 1) + bitstream_read_byte(&is); + + e8_status = 1; + } + } + + /* Postprocess the data unless it cannot possibly contain 0xe8 bytes. */ + if (e8_status) + lzx_postprocess(uncompressed_data, uncompressed_size); + + return 0; + +invalid: + return -1; +} + +/* + * lzx_free_decompressor - Free an LZX decompressor + * + * @decompressor: A decompressor that was allocated with + * lzx_allocate_decompressor(), or NULL. + */ +void lzx_free_decompressor(struct lzx_decompressor *decompressor) +{ + kfree(decompressor); +} diff --git a/fs/ntfs3/lib/xpress_decompress.c b/fs/ntfs3/lib/xpress_decompress.c new file mode 100644 index 000000000000..769c6d3dde67 --- /dev/null +++ b/fs/ntfs3/lib/xpress_decompress.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * xpress_decompress.c - A decompressor for the XPRESS compression format + * (Huffman variant), which can be used in "System Compressed" files. This is + * based on the code from wimlib. + * + * Copyright (C) 2015 Eric Biggers + */ + +#include "decompress_common.h" +#include "lib.h" +/* Number of symbols in the Huffman code (256 literals followed by 256 match + * symbols), the longest codeword in bits, and the shortest match length. + */ +#define XPRESS_NUM_SYMBOLS 512 +#define XPRESS_MAX_CODEWORD_LEN 15 +#define XPRESS_MIN_MATCH_LEN 3 + +/* This value is chosen for fast decompression. 
*/ +#define XPRESS_TABLEBITS 12 + +/* Reusable heap-allocated memory for XPRESS decompression */ +struct xpress_decompressor { + + /* The Huffman decoding table */ + u16 decode_table[(1 << XPRESS_TABLEBITS) + 2 * XPRESS_NUM_SYMBOLS]; + + /* An array that maps symbols to codeword lengths */ + u8 lens[XPRESS_NUM_SYMBOLS]; + + /* Temporary space for make_huffman_decode_table() */ + u16 working_space[2 * (1 + XPRESS_MAX_CODEWORD_LEN) + + XPRESS_NUM_SYMBOLS]; +}; + +/* + * xpress_allocate_decompressor - Allocate an XPRESS decompressor + * + * Return the pointer to the decompressor on success, or NULL if the kmalloc() + * allocation fails. + */ +struct xpress_decompressor *xpress_allocate_decompressor(void) +{ + return kmalloc(sizeof(struct xpress_decompressor), GFP_NOFS); +} + +/* + * xpress_decompress - Decompress a buffer of XPRESS-compressed data + * + * @decompressor: A decompressor that was allocated with + * xpress_allocate_decompressor() + * @compressed_data: The buffer of data to decompress + * @compressed_size: Number of bytes of compressed data + * @uncompressed_data: The buffer in which to store the decompressed data + * @uncompressed_size: The number of bytes the data decompresses into + * + * The input starts with XPRESS_NUM_SYMBOLS / 2 bytes of codeword lengths; the + * Huffman-coded literals and matches follow. + * + * Return 0 on success, or -1 if the compressed data is invalid. + */ +int xpress_decompress(struct xpress_decompressor *decompressor, + const void *compressed_data, size_t compressed_size, + void *uncompressed_data, size_t uncompressed_size) +{ + struct xpress_decompressor *d = decompressor; + const u8 * const in_begin = compressed_data; + u8 * const out_begin = uncompressed_data; + u8 *out_next = out_begin; + u8 * const out_end = out_begin + uncompressed_size; + struct input_bitstream is; + u32 i; + + /* Read the Huffman codeword lengths. Each input byte holds two 4-bit + * lengths, the low nibble belonging to the lower-numbered symbol. 
*/ + if (compressed_size < XPRESS_NUM_SYMBOLS / 2) + goto invalid; + for (i = 0; i < XPRESS_NUM_SYMBOLS / 2; i++) { + d->lens[i*2 + 0] = in_begin[i] & 0xF; + d->lens[i*2 + 1] = in_begin[i] >> 4; + } + + /* Build a decoding table for the Huffman code. */ + if (make_huffman_decode_table(d->decode_table, XPRESS_NUM_SYMBOLS, + XPRESS_TABLEBITS, d->lens, + XPRESS_MAX_CODEWORD_LEN, + d->working_space)) + goto invalid; + + /* Decode the matches and literals. */ + + init_input_bitstream(&is, in_begin + XPRESS_NUM_SYMBOLS / 2, + compressed_size - XPRESS_NUM_SYMBOLS / 2); + + while (out_next != out_end) { + u32 sym; + u32 log2_offset; + u32 length; + u32 offset; + + sym = read_huffsym(&is, d->decode_table, + XPRESS_TABLEBITS, XPRESS_MAX_CODEWORD_LEN); + if (sym < 256) { + /* Literal */ + *out_next++ = sym; + } else { + /* Match: the low 4 bits of the symbol are the length + * header and the next 4 bits are log2 of the offset; + * the remaining offset bits follow in the bitstream. + */ + length = sym & 0xf; + log2_offset = (sym >> 4) & 0xf; + + bitstream_ensure_bits(&is, 16); + + offset = ((u32)1 << log2_offset) | + bitstream_pop_bits(&is, log2_offset); + /* A length header of 0xf is extended by a literal byte; if that byte is 0xff, a literal 16-bit value replaces the length. */ + if (length == 0xf) { + length += bitstream_read_byte(&is); + if (length == 0xf + 0xff) + length = bitstream_read_u16(&is); + } + length += XPRESS_MIN_MATCH_LEN; + + if (offset > (size_t)(out_next - out_begin)) + goto invalid; + + if (length > (size_t)(out_end - out_next)) + goto invalid; + + out_next = lz_copy(out_next, length, offset, out_end, + XPRESS_MIN_MATCH_LEN); + } + } + return 0; + +invalid: + return -1; +} + +/* + * xpress_free_decompressor - Free an XPRESS decompressor + * + * @decompressor: A decompressor that was allocated with + * xpress_allocate_decompressor(), or NULL. + */ +void xpress_free_decompressor(struct xpress_decompressor *decompressor) +{ + kfree(decompressor); +} diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c new file mode 100644 index 000000000000..f1f691a67cc4 --- /dev/null +++ b/fs/ntfs3/lznt.c @@ -0,0 +1,453 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. 
+ * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +// clang-format off +/* Returned by compress_chunk() when every literal byte it emitted was zero, i.e. the source chunk is all zeros. */ +#define LZNT_ERROR_ALL_ZEROS 1 +#define LZNT_CHUNK_SIZE 0x1000 +// clang-format on +/* One hash bucket: the two source positions most recently recorded by longest_match_std(), newest in p1. */ +struct lznt_hash { + const u8 *p1; + const u8 *p2; +}; +/* Compressor state shared with the match finders: [unc, unc_end) is the chunk being compressed, best_match receives the position of the match just found, and max_len caps the match length. */ +struct lznt { + const u8 *unc; + const u8 *unc_end; + const u8 *best_match; + size_t max_len; + bool std; + + struct lznt_hash hash[LZNT_CHUNK_SIZE]; +}; +/* Count how many bytes starting at @ptr equal the bytes starting at @prev, stopping at @end or once @max_len bytes have matched. Callers pass a non-zero @max_len. */ +static inline size_t get_match_len(const u8 *ptr, const u8 *end, const u8 *prev, + size_t max_len) +{ + size_t len = 0; + + while (ptr + len < end && ptr[len] == prev[len] && ++len < max_len) + ; + return len; +} +/* Match finder backed by ctx->hash. Hashes the three bytes at @src (the caller must make sure they are readable), checks the two positions stored in that bucket, ignoring any that lie outside [ctx->unc, @src), and extends each candidate up to ctx->max_len. Stores the chosen position in ctx->best_match, records @src in the bucket and returns the match length, or 0 if neither candidate matched. */ +static size_t longest_match_std(const u8 *src, struct lznt *ctx) +{ + size_t hash_index; + size_t len1 = 0, len2 = 0; + const u8 **hash; + + hash_index = + ((40543U * ((((src[0] << 4) ^ src[1]) << 4) ^ src[2])) >> 4) & + (LZNT_CHUNK_SIZE - 1); + + hash = &(ctx->hash[hash_index].p1); + + if (hash[0] >= ctx->unc && hash[0] < src && hash[0][0] == src[0] && + hash[0][1] == src[1] && hash[0][2] == src[2]) { + len1 = 3; + if (ctx->max_len > 3) + len1 += get_match_len(src + 3, ctx->unc_end, + hash[0] + 3, ctx->max_len - 3); + } + + if (hash[1] >= ctx->unc && hash[1] < src && hash[1][0] == src[0] && + hash[1][1] == src[1] && hash[1][2] == src[2]) { + len2 = 3; + if (ctx->max_len > 3) + len2 += get_match_len(src + 3, ctx->unc_end, + hash[1] + 3, ctx->max_len - 3); + } + + /* Compare two matches and select the best one. 
*/ + if (len1 < len2) { + ctx->best_match = hash[1]; + len1 = len2; + } else { + ctx->best_match = hash[0]; + } + + hash[1] = hash[0]; + hash[0] = src; + return len1; +} + +static size_t longest_match_best(const u8 *src, struct lznt *ctx) +{ + size_t max_len; + const u8 *ptr; + + if (ctx->unc >= src || !ctx->max_len) + return 0; + + max_len = 0; + for (ptr = ctx->unc; ptr < src; ++ptr) { + size_t len = + get_match_len(src, ctx->unc_end, ptr, ctx->max_len); + if (len >= max_len) { + max_len = len; + ctx->best_match = ptr; + } + } + + return max_len >= 3 ? max_len : 0; +} + +static const size_t s_max_len[] = { + 0x1002, 0x802, 0x402, 0x202, 0x102, 0x82, 0x42, 0x22, 0x12, +}; + +static const size_t s_max_off[] = { + 0x10, 0x20, 0x40, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, +}; + +static inline u16 make_pair(size_t offset, size_t len, size_t index) +{ + return ((offset - 1) << (12 - index)) | + ((len - 3) & (((1 << (12 - index)) - 1))); +} + +static inline size_t parse_pair(u16 pair, size_t *offset, size_t index) +{ + *offset = 1 + (pair >> (12 - index)); + return 3 + (pair & ((1 << (12 - index)) - 1)); +} + +/* + * compress_chunk + * + * Return: + * * 0 - Ok, @cmpr contains @cmpr_chunk_size bytes of compressed data. + * * 1 - Input buffer is full zero. + * * -2 - The compressed buffer is too small to hold the compressed data. + */ +static inline int compress_chunk(size_t (*match)(const u8 *, struct lznt *), + const u8 *unc, const u8 *unc_end, u8 *cmpr, + u8 *cmpr_end, size_t *cmpr_chunk_size, + struct lznt *ctx) +{ + size_t cnt = 0; + size_t idx = 0; + const u8 *up = unc; + u8 *cp = cmpr + 3; + u8 *cp2 = cmpr + 2; + u8 not_zero = 0; + /* Control byte of 8-bit values: ( 0 - means byte as is, 1 - short pair ). 
*/ + u8 ohdr = 0; + u8 *last; + u16 t16; + + if (unc + LZNT_CHUNK_SIZE < unc_end) + unc_end = unc + LZNT_CHUNK_SIZE; + + last = min(cmpr + LZNT_CHUNK_SIZE + sizeof(short), cmpr_end); + + ctx->unc = unc; + ctx->unc_end = unc_end; + ctx->max_len = s_max_len[0]; + + while (up < unc_end) { + size_t max_len; + + while (unc + s_max_off[idx] < up) + ctx->max_len = s_max_len[++idx]; + + /* Find match. */ + max_len = up + 3 <= unc_end ? (*match)(up, ctx) : 0; + + if (!max_len) { + if (cp >= last) + goto NotCompressed; + not_zero |= *cp++ = *up++; + } else if (cp + 1 >= last) { + goto NotCompressed; + } else { + t16 = make_pair(up - ctx->best_match, max_len, idx); + *cp++ = t16; + *cp++ = t16 >> 8; + + ohdr |= 1 << cnt; + up += max_len; + } + + cnt = (cnt + 1) & 7; + if (!cnt) { + *cp2 = ohdr; + ohdr = 0; + cp2 = cp; + cp += 1; + } + } + + if (cp2 < last) + *cp2 = ohdr; + else + cp -= 1; + + *cmpr_chunk_size = cp - cmpr; + + t16 = (*cmpr_chunk_size - 3) | 0xB000; + cmpr[0] = t16; + cmpr[1] = t16 >> 8; + + return not_zero ? 0 : LZNT_ERROR_ALL_ZEROS; + +NotCompressed: + + if ((cmpr + LZNT_CHUNK_SIZE + sizeof(short)) > last) + return -2; + + /* + * Copy non cmpr data. + * 0x3FFF == ((LZNT_CHUNK_SIZE + 2 - 3) | 0x3000) + */ + cmpr[0] = 0xff; + cmpr[1] = 0x3f; + + memcpy(cmpr + sizeof(short), unc, LZNT_CHUNK_SIZE); + *cmpr_chunk_size = LZNT_CHUNK_SIZE + sizeof(short); + + return 0; +} + +static inline ssize_t decompress_chunk(u8 *unc, u8 *unc_end, const u8 *cmpr, + const u8 *cmpr_end) +{ + u8 *up = unc; + u8 ch = *cmpr++; + size_t bit = 0; + size_t index = 0; + u16 pair; + size_t offset, length; + + /* Do decompression until pointers are inside range. */ + while (up < unc_end && cmpr < cmpr_end) { + /* Correct index */ + while (unc + s_max_off[index] < up) + index += 1; + + /* Check the current flag for zero. */ + if (!(ch & (1 << bit))) { + /* Just copy byte. */ + *up++ = *cmpr++; + goto next; + } + + /* Check for boundary. 
*/ + if (cmpr + 1 >= cmpr_end) + return -EINVAL; + + /* Read a short from little endian stream. */ + pair = cmpr[1]; + pair <<= 8; + pair |= cmpr[0]; + + cmpr += 2; + + /* Translate packed information into offset and length. */ + length = parse_pair(pair, &offset, index); + + /* Check offset for boundary. */ + if (unc + offset > up) + return -EINVAL; + + /* Truncate the length if necessary. */ + if (up + length >= unc_end) + length = unc_end - up; + + /* Now we copy bytes. This is the heart of LZ algorithm. */ + for (; length > 0; length--, up++) + *up = *(up - offset); + +next: + /* Advance flag bit value. */ + bit = (bit + 1) & 7; + + if (!bit) { + if (cmpr >= cmpr_end) + break; + + ch = *cmpr++; + } + } + + /* Return the size of uncompressed data. */ + return up - unc; +} + +/* + * get_lznt_ctx + * @level: 0 - Standard compression. + * !0 - Best compression, requires a lot of cpu. + */ +struct lznt *get_lznt_ctx(int level) +{ + struct lznt *r = kzalloc(level ? offsetof(struct lznt, hash) + : sizeof(struct lznt), + GFP_NOFS); + + if (r) + r->std = !level; + return r; +} + +/* + * compress_lznt - Compresses @unc into @cmpr + * + * Return: + * * +x - Ok, @cmpr contains 'final_compressed_size' bytes of compressed data. + * * 0 - Input buffer is full zero. + */ +size_t compress_lznt(const void *unc, size_t unc_size, void *cmpr, + size_t cmpr_size, struct lznt *ctx) +{ + int err; + size_t (*match)(const u8 *src, struct lznt *ctx); + u8 *p = cmpr; + u8 *end = p + cmpr_size; + const u8 *unc_chunk = unc; + const u8 *unc_end = unc_chunk + unc_size; + bool is_zero = true; + + if (ctx->std) { + match = &longest_match_std; + memset(ctx->hash, 0, sizeof(ctx->hash)); + } else { + match = &longest_match_best; + } + + /* Compression cycle. 
/*
 * decompress_lznt - Decompress @cmpr into @unc.
 *
 * @cmpr:      Compressed stream: a sequence of chunks, each starting with a
 *             16-bit little-endian header.
 * @cmpr_size: Size of @cmpr in bytes.
 * @unc:       Output buffer.
 * @unc_size:  Size of @unc in bytes.
 *
 * Header layout as used below: the low 12 bits plus 3 give the number of
 * stream bytes the chunk occupies (header included); bit 0x8000 set means the
 * payload is compressed, clear means it is stored as-is. A header of 0 ends
 * the stream.
 *
 * Return: number of bytes produced in @unc, or -EINVAL on a malformed stream.
 */
ssize_t decompress_lznt(const void *cmpr, size_t cmpr_size, void *unc,
			size_t unc_size)
{
	const u8 *cmpr_chunk = cmpr;
	const u8 *cmpr_end = cmpr_chunk + cmpr_size;
	u8 *unc_chunk = unc;
	u8 *unc_end = unc_chunk + unc_size;
	u16 chunk_hdr;

	/* Need at least one chunk header. */
	if (cmpr_size < sizeof(short))
		return -EINVAL;

	/* Read chunk header. */
	chunk_hdr = cmpr_chunk[1];
	chunk_hdr <<= 8;
	chunk_hdr |= cmpr_chunk[0];

	/* Loop through decompressing chunks. */
	for (;;) {
		size_t chunk_size_saved;
		size_t unc_use;
		size_t cmpr_use = 3 + (chunk_hdr & (LZNT_CHUNK_SIZE - 1));

		/* Check that the chunk actually fits the supplied buffer. */
		if (cmpr_chunk + cmpr_use > cmpr_end)
			return -EINVAL;

		/* First make sure the chunk contains compressed data. */
		if (chunk_hdr & 0x8000) {
			/* Decompress a chunk and return if we get an error. */
			ssize_t err =
				decompress_chunk(unc_chunk, unc_end,
						 cmpr_chunk + sizeof(chunk_hdr),
						 cmpr_chunk + cmpr_use);
			if (err < 0)
				return err;
			unc_use = err;
		} else {
			/*
			 * This chunk does not contain compressed data.
			 * Copy at most LZNT_CHUNK_SIZE bytes, less if the
			 * output buffer ends sooner.
			 */
			unc_use = unc_chunk + LZNT_CHUNK_SIZE > unc_end
					  ? unc_end - unc_chunk
					  : LZNT_CHUNK_SIZE;

			/* The stored bytes must all be inside @cmpr. */
			if (cmpr_chunk + sizeof(chunk_hdr) + unc_use >
			    cmpr_end) {
				return -EINVAL;
			}

			memcpy(unc_chunk, cmpr_chunk + sizeof(chunk_hdr),
			       unc_use);
		}

		/* Advance pointers. */
		cmpr_chunk += cmpr_use;
		unc_chunk += unc_use;

		/* Check for the end of unc buffer. */
		if (unc_chunk >= unc_end)
			break;

		/* Proceed to the next chunk; stop if no header is left. */
		if (cmpr_chunk > cmpr_end - 2)
			break;

		chunk_size_saved = LZNT_CHUNK_SIZE;

		/* Read chunk header. */
		chunk_hdr = cmpr_chunk[1];
		chunk_hdr <<= 8;
		chunk_hdr |= cmpr_chunk[0];

		/* A zero header terminates the stream. */
		if (!chunk_hdr)
			break;

		/*
		 * Another chunk follows, but the previous one produced fewer
		 * than LZNT_CHUNK_SIZE bytes: pad the gap with zeros so the
		 * next chunk starts on a chunk-sized boundary of the output.
		 */
		if (unc_use < chunk_size_saved) {
			size_t t1 = chunk_size_saved - unc_use;
			u8 *t2 = unc_chunk + t1;

			/* 'Zero' memory, unless the padding would not fit. */
			if (t2 >= unc_end)
				break;

			memset(unc_chunk, 0, t1);
			unc_chunk = t2;
		}
	}

	/* Check compression boundary. */
	if (cmpr_chunk > cmpr_end)
		return -EINVAL;

	/*
	 * The unc size is just a difference between current
	 * pointer and original one.
	 */
	return PtrOffset(unc, unc_chunk);
}
*/ + err = ntfs_nls_to_utf16(sbi, name->name, name->len, + (struct cpu_str *)&fname->name_len, + NTFS_NAME_LEN, UTF16_LITTLE_ENDIAN); + if (err < 0) + return err; + } + + fname->type = FILE_NAME_POSIX; + data_size = fname_full_size(fname); + + e->size = cpu_to_le16(ALIGN(data_size, 8) + sizeof(struct NTFS_DE)); + e->key_size = cpu_to_le16(data_size); + e->flags = 0; + e->res = 0; + + return 0; +} + +/* + * ntfs_lookup - inode_operations::lookup + */ +static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, + u32 flags) +{ + struct ntfs_inode *ni = ntfs_i(dir); + struct cpu_str *uni = __getname(); + struct inode *inode; + int err; + + if (!uni) + inode = ERR_PTR(-ENOMEM); + else { + err = ntfs_nls_to_utf16(ni->mi.sbi, dentry->d_name.name, + dentry->d_name.len, uni, NTFS_NAME_LEN, + UTF16_HOST_ENDIAN); + if (err < 0) + inode = ERR_PTR(err); + else { + ni_lock(ni); + inode = dir_search_u(dir, uni, NULL); + ni_unlock(ni); + } + __putname(uni); + } + + return d_splice_alias(inode, dentry); +} + +/* + * ntfs_create - inode_operations::create + */ +static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + struct ntfs_inode *ni = ntfs_i(dir); + struct inode *inode; + + ni_lock_dir(ni); + + inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode, + 0, NULL, 0, NULL); + + ni_unlock(ni); + + return IS_ERR(inode) ? PTR_ERR(inode) : 0; +} + +/* + * ntfs_mknod + * + * inode_operations::mknod + */ +static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) +{ + struct ntfs_inode *ni = ntfs_i(dir); + struct inode *inode; + + ni_lock_dir(ni); + + inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev, + NULL, 0, NULL); + + ni_unlock(ni); + + return IS_ERR(inode) ? 
PTR_ERR(inode) : 0; +} + +/* + * ntfs_link - inode_operations::link + */ +static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de) +{ + int err; + struct inode *inode = d_inode(ode); + struct ntfs_inode *ni = ntfs_i(inode); + + if (S_ISDIR(inode->i_mode)) + return -EPERM; + + if (inode->i_nlink >= NTFS_LINK_MAX) + return -EMLINK; + + ni_lock_dir(ntfs_i(dir)); + if (inode != dir) + ni_lock(ni); + + inc_nlink(inode); + ihold(inode); + + err = ntfs_link_inode(inode, de); + + if (!err) { + dir->i_ctime = dir->i_mtime = inode->i_ctime = + current_time(dir); + mark_inode_dirty(inode); + mark_inode_dirty(dir); + d_instantiate(de, inode); + } else { + drop_nlink(inode); + iput(inode); + } + + if (inode != dir) + ni_unlock(ni); + ni_unlock(ntfs_i(dir)); + + return err; +} + +/* + * ntfs_unlink - inode_operations::unlink + */ +static int ntfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct ntfs_inode *ni = ntfs_i(dir); + int err; + + ni_lock_dir(ni); + + err = ntfs_unlink_inode(dir, dentry); + + ni_unlock(ni); + + return err; +} + +/* + * ntfs_symlink - inode_operations::symlink + */ +static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) +{ + u32 size = strlen(symname); + struct inode *inode; + struct ntfs_inode *ni = ntfs_i(dir); + + ni_lock_dir(ni); + + inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777, + 0, symname, size, NULL); + + ni_unlock(ni); + + return IS_ERR(inode) ? PTR_ERR(inode) : 0; +} + +/* + * ntfs_mkdir- inode_operations::mkdir + */ +static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + struct ntfs_inode *ni = ntfs_i(dir); + + ni_lock_dir(ni); + + inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode, + 0, NULL, 0, NULL); + + ni_unlock(ni); + + return IS_ERR(inode) ? 
/*
 * ntfs_rename - inode_operations::rename
 *
 * Only RENAME_NOREPLACE is accepted in @flags; any other bit yields -EINVAL.
 * Renaming to the identical name in the same directory is a no-op, and meta
 * files (per ntfs_is_meta_file()) are refused with -EINVAL.
 *
 * If the target name already exists it is unlinked first, before the rename
 * itself is attempted; nothing in this function restores it if a later step
 * fails.
 *
 * Return: 0 on success or a negative errno.
 */
static int ntfs_rename(struct user_namespace *mnt_userns, struct inode *dir,
		       struct dentry *dentry, struct inode *new_dir,
		       struct dentry *new_dentry, u32 flags)
{
	int err;
	struct super_block *sb = dir->i_sb;
	struct ntfs_sb_info *sbi = sb->s_fs_info;
	struct ntfs_inode *dir_ni = ntfs_i(dir);
	struct ntfs_inode *new_dir_ni = ntfs_i(new_dir);
	struct inode *inode = d_inode(dentry);
	struct ntfs_inode *ni = ntfs_i(inode);
	struct inode *new_inode = d_inode(new_dentry);
	struct NTFS_DE *de, *new_de;
	bool is_same, is_bad;
	/*
	 * de - memory of PATH_MAX bytes:
	 * [0-1024)	- original name (dentry->d_name)
	 * [1024-2048)	- paired to original name, usually DOS variant of dentry->d_name
	 * [2048-3072)	- new name (new_dentry->d_name)
	 */
	static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + SIZEOF_RESIDENT < 1024);
	static_assert(SIZEOF_ATTRIBUTE_FILENAME_MAX + sizeof(struct NTFS_DE) <
		      1024);
	static_assert(PATH_MAX >= 4 * 1024);

	if (flags & ~RENAME_NOREPLACE)
		return -EINVAL;

	/* Byte-wise comparison of the old and new names. */
	is_same = dentry->d_name.len == new_dentry->d_name.len &&
		  !memcmp(dentry->d_name.name, new_dentry->d_name.name,
			  dentry->d_name.len);

	if (is_same && dir == new_dir) {
		/* Nothing to do. */
		return 0;
	}

	if (ntfs_is_meta_file(sbi, inode->i_ino)) {
		/* Should we print an error? */
		return -EINVAL;
	}

	if (new_inode) {
		/* Target name exists. Unlink it. */
		dget(new_dentry);
		ni_lock_dir(new_dir_ni);
		err = ntfs_unlink_inode(new_dir, new_dentry);
		ni_unlock(new_dir_ni);
		dput(new_dentry);
		if (err)
			return err;
	}

	/* Allocate PATH_MAX bytes. */
	de = __getname();
	if (!de)
		return -ENOMEM;

	/* Translate dentry->d_name into unicode form. */
	err = fill_name_de(sbi, de, &dentry->d_name, NULL);
	if (err < 0)
		goto out;

	if (is_same) {
		/* Reuse 'de'. */
		new_de = de;
	} else {
		/* Translate new_dentry->d_name into unicode form. */
		new_de = Add2Ptr(de, 2048);
		err = fill_name_de(sbi, new_de, &new_dentry->d_name, NULL);
		if (err < 0)
			goto out;
	}

	/* Source directory first, then the inode being renamed. */
	ni_lock_dir(dir_ni);
	ni_lock(ni);

	is_bad = false;
	err = ni_rename(dir_ni, new_dir_ni, ni, de, new_de, &is_bad);
	if (is_bad) {
		/* The rename failed and undoing it failed as well. */
		make_bad_inode(inode);
		ntfs_inode_err(inode, "failed to undo rename");
		ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
	} else if (!err) {
		inode->i_ctime = dir->i_ctime = dir->i_mtime =
			current_time(dir);
		mark_inode_dirty(inode);
		mark_inode_dirty(dir);
		if (dir != new_dir) {
			new_dir->i_mtime = new_dir->i_ctime = dir->i_ctime;
			mark_inode_dirty(new_dir);
		}

		if (IS_DIRSYNC(dir))
			ntfs_sync_inode(dir);

		/*
		 * NOTE(review): the condition tests new_dir but the call
		 * syncs 'inode', not 'new_dir' - confirm this is intended.
		 */
		if (IS_DIRSYNC(new_dir))
			ntfs_sync_inode(inode);
	}

	ni_unlock(ni);
	ni_unlock(dir_ni);
out:
	__putname(de);
	return err;
}
/*
 * Inode operations table providing only attribute, xattr-listing and ACL
 * callbacks (no lookup/create/rename family).
 * NOTE(review): the users of this table are not visible here; presumably it
 * is installed on non-regular, non-directory inodes - confirm at the call
 * site.
 */
const struct inode_operations ntfs_special_inode_operations = {
	.setattr	= ntfs3_setattr,
	.getattr	= ntfs_getattr,
	.listxattr	= ntfs_listxattr,
	.get_acl	= ntfs_get_acl,
	.set_acl	= ntfs_set_acl,
};
/*
 * Fixed MFT record numbers.
 *
 * Records below MFT_REC_FREE carry dedicated names; presumably these are the
 * NTFS system files matching the NAME_* constants declared in this header.
 * Note that MFT_REC_EXTEND and MFT_REC_RESERVED share the value 11.
 */
enum RECORD_NUM {
	MFT_REC_MFT		= 0,
	MFT_REC_MIRR		= 1,
	MFT_REC_LOG		= 2,
	MFT_REC_VOL		= 3,
	MFT_REC_ATTR		= 4,
	MFT_REC_ROOT		= 5,
	MFT_REC_BITMAP		= 6,
	MFT_REC_BOOT		= 7,
	MFT_REC_BADCLUST	= 8,
	//MFT_REC_QUOTA		= 9,
	MFT_REC_SECURE		= 9, // NTFS 3.0
	MFT_REC_UPCASE		= 10,
	MFT_REC_EXTEND		= 11, // NTFS 3.0
	MFT_REC_RESERVED	= 11,
	MFT_REC_FREE		= 16,
	MFT_REC_USER		= 24,
};
cpu_to_le32(0x00000100), + FILE_ATTRIBUTE_SPARSE_FILE = cpu_to_le32(0x00000200), + FILE_ATTRIBUTE_REPARSE_POINT = cpu_to_le32(0x00000400), + FILE_ATTRIBUTE_COMPRESSED = cpu_to_le32(0x00000800), + FILE_ATTRIBUTE_OFFLINE = cpu_to_le32(0x00001000), + FILE_ATTRIBUTE_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), + FILE_ATTRIBUTE_ENCRYPTED = cpu_to_le32(0x00004000), + FILE_ATTRIBUTE_VALID_FLAGS = cpu_to_le32(0x00007fb7), + FILE_ATTRIBUTE_DIRECTORY = cpu_to_le32(0x10000000), +}; + +static_assert(sizeof(enum FILE_ATTRIBUTE) == 4); + +extern const struct cpu_str NAME_MFT; +extern const struct cpu_str NAME_MIRROR; +extern const struct cpu_str NAME_LOGFILE; +extern const struct cpu_str NAME_VOLUME; +extern const struct cpu_str NAME_ATTRDEF; +extern const struct cpu_str NAME_ROOT; +extern const struct cpu_str NAME_BITMAP; +extern const struct cpu_str NAME_BOOT; +extern const struct cpu_str NAME_BADCLUS; +extern const struct cpu_str NAME_QUOTA; +extern const struct cpu_str NAME_SECURE; +extern const struct cpu_str NAME_UPCASE; +extern const struct cpu_str NAME_EXTEND; +extern const struct cpu_str NAME_OBJID; +extern const struct cpu_str NAME_REPARSE; +extern const struct cpu_str NAME_USNJRNL; + +extern const __le16 I30_NAME[4]; +extern const __le16 SII_NAME[4]; +extern const __le16 SDH_NAME[4]; +extern const __le16 SO_NAME[2]; +extern const __le16 SQ_NAME[2]; +extern const __le16 SR_NAME[2]; + +extern const __le16 BAD_NAME[4]; +extern const __le16 SDS_NAME[4]; +extern const __le16 WOF_NAME[17]; /* WofCompressedData */ + +/* MFT record number structure. */ +struct MFT_REF { + __le32 low; // The low part of the number. + __le16 high; // The high part of the number. + __le16 seq; // The sequence number of MFT record. 
+}; + +static_assert(sizeof(__le64) == sizeof(struct MFT_REF)); + +static inline CLST ino_get(const struct MFT_REF *ref) +{ +#ifdef CONFIG_NTFS3_64BIT_CLUSTER + return le32_to_cpu(ref->low) | ((u64)le16_to_cpu(ref->high) << 32); +#else + return le32_to_cpu(ref->low); +#endif +} + +struct NTFS_BOOT { + u8 jump_code[3]; // 0x00: Jump to boot code. + u8 system_id[8]; // 0x03: System ID, equals "NTFS " + + // NOTE: This member is not aligned(!) + // bytes_per_sector[0] must be 0. + // bytes_per_sector[1] must be multiplied by 256. + u8 bytes_per_sector[2]; // 0x0B: Bytes per sector. + + u8 sectors_per_clusters;// 0x0D: Sectors per cluster. + u8 unused1[7]; + u8 media_type; // 0x15: Media type (0xF8 - harddisk) + u8 unused2[2]; + __le16 sct_per_track; // 0x18: number of sectors per track. + __le16 heads; // 0x1A: number of heads per cylinder. + __le32 hidden_sectors; // 0x1C: number of 'hidden' sectors. + u8 unused3[4]; + u8 bios_drive_num; // 0x24: BIOS drive number =0x80. + u8 unused4; + u8 signature_ex; // 0x26: Extended BOOT signature =0x80. + u8 unused5; + __le64 sectors_per_volume;// 0x28: Size of volume in sectors. + __le64 mft_clst; // 0x30: First cluster of $MFT + __le64 mft2_clst; // 0x38: First cluster of $MFTMirr + s8 record_size; // 0x40: Size of MFT record in clusters(sectors). + u8 unused6[3]; + s8 index_size; // 0x44: Size of INDX record in clusters(sectors). + u8 unused7[3]; + __le64 serial_num; // 0x48: Volume serial number + __le32 check_sum; // 0x50: Simple additive checksum of all + // of the u32's which precede the 'check_sum'. 
+ + u8 boot_code[0x200 - 0x50 - 2 - 4]; // 0x54: + u8 boot_magic[2]; // 0x1FE: Boot signature =0x55 + 0xAA +}; + +static_assert(sizeof(struct NTFS_BOOT) == 0x200); + +enum NTFS_SIGNATURE { + NTFS_FILE_SIGNATURE = cpu_to_le32(0x454C4946), // 'FILE' + NTFS_INDX_SIGNATURE = cpu_to_le32(0x58444E49), // 'INDX' + NTFS_CHKD_SIGNATURE = cpu_to_le32(0x444B4843), // 'CHKD' + NTFS_RSTR_SIGNATURE = cpu_to_le32(0x52545352), // 'RSTR' + NTFS_RCRD_SIGNATURE = cpu_to_le32(0x44524352), // 'RCRD' + NTFS_BAAD_SIGNATURE = cpu_to_le32(0x44414142), // 'BAAD' + NTFS_HOLE_SIGNATURE = cpu_to_le32(0x454C4F48), // 'HOLE' + NTFS_FFFF_SIGNATURE = cpu_to_le32(0xffffffff), +}; + +static_assert(sizeof(enum NTFS_SIGNATURE) == 4); + +/* MFT Record header structure. */ +struct NTFS_RECORD_HEADER { + /* Record magic number, equals 'FILE'/'INDX'/'RSTR'/'RCRD'. */ + enum NTFS_SIGNATURE sign; // 0x00: + __le16 fix_off; // 0x04: + __le16 fix_num; // 0x06: + __le64 lsn; // 0x08: Log file sequence number, +}; + +static_assert(sizeof(struct NTFS_RECORD_HEADER) == 0x10); + +static inline int is_baad(const struct NTFS_RECORD_HEADER *hdr) +{ + return hdr->sign == NTFS_BAAD_SIGNATURE; +} + +/* Possible bits in struct MFT_REC.flags. */ +enum RECORD_FLAG { + RECORD_FLAG_IN_USE = cpu_to_le16(0x0001), + RECORD_FLAG_DIR = cpu_to_le16(0x0002), + RECORD_FLAG_SYSTEM = cpu_to_le16(0x0004), + RECORD_FLAG_UNKNOWN = cpu_to_le16(0x0008), +}; + +/* MFT Record structure. */ +struct MFT_REC { + struct NTFS_RECORD_HEADER rhdr; // 'FILE' + + __le16 seq; // 0x10: Sequence number for this record. + __le16 hard_links; // 0x12: The number of hard links to record. + __le16 attr_off; // 0x14: Offset to attributes. + __le16 flags; // 0x16: See RECORD_FLAG. + __le32 used; // 0x18: The size of used part. + __le32 total; // 0x1C: Total record size. + + struct MFT_REF parent_ref; // 0x20: Parent MFT record. + __le16 next_attr_id; // 0x28: The next attribute Id. + + __le16 res; // 0x2A: High part of MFT record? 
+ __le32 mft_record; // 0x2C: Current MFT record number. + __le16 fixups[]; // 0x30: +}; + +#define MFTRECORD_FIXUP_OFFSET_1 offsetof(struct MFT_REC, res) +#define MFTRECORD_FIXUP_OFFSET_3 offsetof(struct MFT_REC, fixups) + +static_assert(MFTRECORD_FIXUP_OFFSET_1 == 0x2A); +static_assert(MFTRECORD_FIXUP_OFFSET_3 == 0x30); + +static inline bool is_rec_base(const struct MFT_REC *rec) +{ + const struct MFT_REF *r = &rec->parent_ref; + + return !r->low && !r->high && !r->seq; +} + +static inline bool is_mft_rec5(const struct MFT_REC *rec) +{ + return le16_to_cpu(rec->rhdr.fix_off) >= + offsetof(struct MFT_REC, fixups); +} + +static inline bool is_rec_inuse(const struct MFT_REC *rec) +{ + return rec->flags & RECORD_FLAG_IN_USE; +} + +static inline bool clear_rec_inuse(struct MFT_REC *rec) +{ + return rec->flags &= ~RECORD_FLAG_IN_USE; +} + +/* Possible values of ATTR_RESIDENT.flags */ +#define RESIDENT_FLAG_INDEXED 0x01 + +struct ATTR_RESIDENT { + __le32 data_size; // 0x10: The size of data. + __le16 data_off; // 0x14: Offset to data. + u8 flags; // 0x16: Resident flags ( 1 - indexed ). + u8 res; // 0x17: +}; // sizeof() = 0x18 + +struct ATTR_NONRESIDENT { + __le64 svcn; // 0x10: Starting VCN of this segment. + __le64 evcn; // 0x18: End VCN of this segment. + __le16 run_off; // 0x20: Offset to packed runs. + // Unit of Compression size for this stream, expressed + // as a log of the cluster size. + // + // 0 means file is not compressed + // 1, 2, 3, and 4 are potentially legal values if the + // stream is compressed, however the implementation + // may only choose to use 4, or possibly 3. Note + // that 4 means cluster size time 16. If convenient + // the implementation may wish to accept a + // reasonable range of legal values here (1-5?), + // even if the implementation only generates + // a smaller set of values itself. + u8 c_unit; // 0x22: + u8 res1[5]; // 0x23: + __le64 alloc_size; // 0x28: The allocated size of attribute in bytes. 
+ // (multiple of cluster size) + __le64 data_size; // 0x30: The size of attribute in bytes <= alloc_size. + __le64 valid_size; // 0x38: The size of valid part in bytes <= data_size. + __le64 total_size; // 0x40: The sum of the allocated clusters for a file. + // (present only for the first segment (0 == vcn) + // of compressed attribute) + +}; // sizeof()=0x40 or 0x48 (if compressed) + +/* Possible values of ATTRIB.flags: */ +#define ATTR_FLAG_COMPRESSED cpu_to_le16(0x0001) +#define ATTR_FLAG_COMPRESSED_MASK cpu_to_le16(0x00FF) +#define ATTR_FLAG_ENCRYPTED cpu_to_le16(0x4000) +#define ATTR_FLAG_SPARSED cpu_to_le16(0x8000) + +struct ATTRIB { + enum ATTR_TYPE type; // 0x00: The type of this attribute. + __le32 size; // 0x04: The size of this attribute. + u8 non_res; // 0x08: Is this attribute non-resident? + u8 name_len; // 0x09: This attribute name length. + __le16 name_off; // 0x0A: Offset to the attribute name. + __le16 flags; // 0x0C: See ATTR_FLAG_XXX. + __le16 id; // 0x0E: Unique id (per record). + + union { + struct ATTR_RESIDENT res; // 0x10 + struct ATTR_NONRESIDENT nres; // 0x10 + }; +}; + +/* Define attribute sizes. */ +#define SIZEOF_RESIDENT 0x18 +#define SIZEOF_NONRESIDENT_EX 0x48 +#define SIZEOF_NONRESIDENT 0x40 + +#define SIZEOF_RESIDENT_LE cpu_to_le16(0x18) +#define SIZEOF_NONRESIDENT_EX_LE cpu_to_le16(0x48) +#define SIZEOF_NONRESIDENT_LE cpu_to_le16(0x40) + +static inline u64 attr_ondisk_size(const struct ATTRIB *attr) +{ + return attr->non_res ? ((attr->flags & + (ATTR_FLAG_COMPRESSED | ATTR_FLAG_SPARSED)) ? + le64_to_cpu(attr->nres.total_size) : + le64_to_cpu(attr->nres.alloc_size)) + : ALIGN(le32_to_cpu(attr->res.data_size), 8); +} + +static inline u64 attr_size(const struct ATTRIB *attr) +{ + return attr->non_res ? 
le64_to_cpu(attr->nres.data_size) : + le32_to_cpu(attr->res.data_size); +} + +static inline bool is_attr_encrypted(const struct ATTRIB *attr) +{ + return attr->flags & ATTR_FLAG_ENCRYPTED; +} + +static inline bool is_attr_sparsed(const struct ATTRIB *attr) +{ + return attr->flags & ATTR_FLAG_SPARSED; +} + +static inline bool is_attr_compressed(const struct ATTRIB *attr) +{ + return attr->flags & ATTR_FLAG_COMPRESSED; +} + +static inline bool is_attr_ext(const struct ATTRIB *attr) +{ + return attr->flags & (ATTR_FLAG_SPARSED | ATTR_FLAG_COMPRESSED); +} + +static inline bool is_attr_indexed(const struct ATTRIB *attr) +{ + return !attr->non_res && (attr->res.flags & RESIDENT_FLAG_INDEXED); +} + +static inline __le16 const *attr_name(const struct ATTRIB *attr) +{ + return Add2Ptr(attr, le16_to_cpu(attr->name_off)); +} + +static inline u64 attr_svcn(const struct ATTRIB *attr) +{ + return attr->non_res ? le64_to_cpu(attr->nres.svcn) : 0; +} + +/* The size of resident attribute by its resident size. */ +#define BYTES_PER_RESIDENT(b) (0x18 + (b)) + +static_assert(sizeof(struct ATTRIB) == 0x48); +static_assert(sizeof(((struct ATTRIB *)NULL)->res) == 0x08); +static_assert(sizeof(((struct ATTRIB *)NULL)->nres) == 0x38); + +static inline void *resident_data_ex(const struct ATTRIB *attr, u32 datasize) +{ + u32 asize, rsize; + u16 off; + + if (attr->non_res) + return NULL; + + asize = le32_to_cpu(attr->size); + off = le16_to_cpu(attr->res.data_off); + + if (asize < datasize + off) + return NULL; + + rsize = le32_to_cpu(attr->res.data_size); + if (rsize < datasize) + return NULL; + + return Add2Ptr(attr, off); +} + +static inline void *resident_data(const struct ATTRIB *attr) +{ + return Add2Ptr(attr, le16_to_cpu(attr->res.data_off)); +} + +static inline void *attr_run(const struct ATTRIB *attr) +{ + return Add2Ptr(attr, le16_to_cpu(attr->nres.run_off)); +} + +/* Standard information attribute (0x10). */ +struct ATTR_STD_INFO { + __le64 cr_time; // 0x00: File creation file. 
/*
 * Extended standard information attribute.
 *
 * The first 0x30 bytes declare the same fields as struct ATTR_STD_INFO; the
 * remaining fields (owner_id .. usn) bring the size to 0x48.
 */
struct ATTR_STD_INFO5 {
	__le64 cr_time;		// 0x00: File creation time.
	__le64 m_time;		// 0x08: File modification time.
	__le64 c_time;		// 0x10: Last time any attribute was modified.
	__le64 a_time;		// 0x18: File last access time.
	enum FILE_ATTRIBUTE fa;	// 0x20: Standard DOS attributes & more.
	__le32 max_ver_num;	// 0x24: Maximum Number of Versions.
	__le32 ver_num;		// 0x28: Version Number.
	__le32 class_id;	// 0x2C: Class Id from bidirectional Class Id index.

	__le32 owner_id;	// 0x30: Owner Id of the user owning the file.
	__le32 security_id;	// 0x34: The Security Id is a key in the $SII Index and $SDS.
	__le64 quota_charge;	// 0x38:
	__le64 usn;		// 0x40: Last Update Sequence Number of the file. This is a direct
				// index into the file $UsnJrnl. If zero, the USN Journal is
				// disabled.
};
+ +}; // sizeof(0x20) + +static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20); + +static inline u32 le_size(u8 name_len) +{ + return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) + + name_len * sizeof(short), 8); +} + +/* Returns 0 if 'attr' has the same type and name. */ +static inline int le_cmp(const struct ATTR_LIST_ENTRY *le, + const struct ATTRIB *attr) +{ + return le->type != attr->type || le->name_len != attr->name_len || + (!le->name_len && + memcmp(Add2Ptr(le, le->name_off), + Add2Ptr(attr, le16_to_cpu(attr->name_off)), + le->name_len * sizeof(short))); +} + +static inline __le16 const *le_name(const struct ATTR_LIST_ENTRY *le) +{ + return Add2Ptr(le, le->name_off); +} + +/* File name types (the field type in struct ATTR_FILE_NAME). */ +#define FILE_NAME_POSIX 0 +#define FILE_NAME_UNICODE 1 +#define FILE_NAME_DOS 2 +#define FILE_NAME_UNICODE_AND_DOS (FILE_NAME_DOS | FILE_NAME_UNICODE) + +/* Filename attribute structure (0x30). */ +struct NTFS_DUP_INFO { + __le64 cr_time; // 0x00: File creation file. + __le64 m_time; // 0x08: File modification time. + __le64 c_time; // 0x10: Last time any attribute was modified. + __le64 a_time; // 0x18: File last access time. + __le64 alloc_size; // 0x20: Data attribute allocated size, multiple of cluster size. + __le64 data_size; // 0x28: Data attribute size <= Dataalloc_size. + enum FILE_ATTRIBUTE fa; // 0x30: Standard DOS attributes & more. + __le16 ea_size; // 0x34: Packed EAs. + __le16 reparse; // 0x36: Used by Reparse. + +}; // 0x38 + +struct ATTR_FILE_NAME { + struct MFT_REF home; // 0x00: MFT record for directory. + struct NTFS_DUP_INFO dup;// 0x08: + u8 name_len; // 0x40: File name length in words. + u8 type; // 0x41: File name type. + __le16 name[]; // 0x42: File name. 
+}; + +static_assert(sizeof(((struct ATTR_FILE_NAME *)NULL)->dup) == 0x38); +static_assert(offsetof(struct ATTR_FILE_NAME, name) == 0x42); +#define SIZEOF_ATTRIBUTE_FILENAME 0x44 +#define SIZEOF_ATTRIBUTE_FILENAME_MAX (0x42 + 255 * 2) + +static inline struct ATTRIB *attr_from_name(struct ATTR_FILE_NAME *fname) +{ + return (struct ATTRIB *)((char *)fname - SIZEOF_RESIDENT); +} + +static inline u16 fname_full_size(const struct ATTR_FILE_NAME *fname) +{ + /* Don't return struct_size(fname, name, fname->name_len); */ + return offsetof(struct ATTR_FILE_NAME, name) + + fname->name_len * sizeof(short); +} + +static inline u8 paired_name(u8 type) +{ + if (type == FILE_NAME_UNICODE) + return FILE_NAME_DOS; + if (type == FILE_NAME_DOS) + return FILE_NAME_UNICODE; + return FILE_NAME_POSIX; +} + +/* Index entry defines ( the field flags in NtfsDirEntry ). */ +#define NTFS_IE_HAS_SUBNODES cpu_to_le16(1) +#define NTFS_IE_LAST cpu_to_le16(2) + +/* Directory entry structure. */ +struct NTFS_DE { + union { + struct MFT_REF ref; // 0x00: MFT record number with this file. + struct { + __le16 data_off; // 0x00: + __le16 data_size; // 0x02: + __le32 res; // 0x04: Must be 0. + } view; + }; + __le16 size; // 0x08: The size of this entry. + __le16 key_size; // 0x0A: The size of File name length in bytes + 0x42. + __le16 flags; // 0x0C: Entry flags: NTFS_IE_XXX. + __le16 res; // 0x0E: + + // Here any indexed attribute can be placed. + // One of them is: + // struct ATTR_FILE_NAME AttrFileName; + // + + // The last 8 bytes of this structure contains + // the VBN of subnode. + // !!! Note !!! 
+ // This field is presented only if (flags & NTFS_IE_HAS_SUBNODES) + // __le64 vbn; +}; + +static_assert(sizeof(struct NTFS_DE) == 0x10); + +static inline void de_set_vbn_le(struct NTFS_DE *e, __le64 vcn) +{ + __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); + + *v = vcn; +} + +static inline void de_set_vbn(struct NTFS_DE *e, CLST vcn) +{ + __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); + + *v = cpu_to_le64(vcn); +} + +static inline __le64 de_get_vbn_le(const struct NTFS_DE *e) +{ + return *(__le64 *)Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); +} + +static inline CLST de_get_vbn(const struct NTFS_DE *e) +{ + __le64 *v = Add2Ptr(e, le16_to_cpu(e->size) - sizeof(__le64)); + + return le64_to_cpu(*v); +} + +static inline struct NTFS_DE *de_get_next(const struct NTFS_DE *e) +{ + return Add2Ptr(e, le16_to_cpu(e->size)); +} + +static inline struct ATTR_FILE_NAME *de_get_fname(const struct NTFS_DE *e) +{ + return le16_to_cpu(e->key_size) >= SIZEOF_ATTRIBUTE_FILENAME ? + Add2Ptr(e, sizeof(struct NTFS_DE)) : + NULL; +} + +static inline bool de_is_last(const struct NTFS_DE *e) +{ + return e->flags & NTFS_IE_LAST; +} + +static inline bool de_has_vcn(const struct NTFS_DE *e) +{ + return e->flags & NTFS_IE_HAS_SUBNODES; +} + +static inline bool de_has_vcn_ex(const struct NTFS_DE *e) +{ + return (e->flags & NTFS_IE_HAS_SUBNODES) && + (u64)(-1) != *((u64 *)Add2Ptr(e, le16_to_cpu(e->size) - + sizeof(__le64))); +} + +#define MAX_BYTES_PER_NAME_ENTRY \ + ALIGN(sizeof(struct NTFS_DE) + \ + offsetof(struct ATTR_FILE_NAME, name) + \ + NTFS_NAME_LEN * sizeof(short), 8) + +struct INDEX_HDR { + __le32 de_off; // 0x00: The offset from the start of this structure + // to the first NTFS_DE. + __le32 used; // 0x04: The size of this structure plus all + // entries (quad-word aligned). + __le32 total; // 0x08: The allocated size of for this structure plus all entries. + u8 flags; // 0x0C: 0x00 = Small directory, 0x01 = Large directory. 
+ u8 res[3]; + + // + // de_off + used <= total + // +}; + +static_assert(sizeof(struct INDEX_HDR) == 0x10); + +static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr) +{ + u32 de_off = le32_to_cpu(hdr->de_off); + u32 used = le32_to_cpu(hdr->used); + struct NTFS_DE *e = Add2Ptr(hdr, de_off); + u16 esize; + + if (de_off >= used || de_off >= le32_to_cpu(hdr->total)) + return NULL; + + esize = le16_to_cpu(e->size); + if (esize < sizeof(struct NTFS_DE) || de_off + esize > used) + return NULL; + + return e; +} + +static inline struct NTFS_DE *hdr_next_de(const struct INDEX_HDR *hdr, + const struct NTFS_DE *e) +{ + size_t off = PtrOffset(hdr, e); + u32 used = le32_to_cpu(hdr->used); + u16 esize; + + if (off >= used) + return NULL; + + esize = le16_to_cpu(e->size); + + if (esize < sizeof(struct NTFS_DE) || + off + esize + sizeof(struct NTFS_DE) > used) + return NULL; + + return Add2Ptr(e, esize); +} + +static inline bool hdr_has_subnode(const struct INDEX_HDR *hdr) +{ + return hdr->flags & 1; +} + +struct INDEX_BUFFER { + struct NTFS_RECORD_HEADER rhdr; // 'INDX' + __le64 vbn; // 0x10: vcn if index >= cluster or vsn id index < cluster + struct INDEX_HDR ihdr; // 0x18: +}; + +static_assert(sizeof(struct INDEX_BUFFER) == 0x28); + +static inline bool ib_is_empty(const struct INDEX_BUFFER *ib) +{ + const struct NTFS_DE *first = hdr_first_de(&ib->ihdr); + + return !first || de_is_last(first); +} + +static inline bool ib_is_leaf(const struct INDEX_BUFFER *ib) +{ + return !(ib->ihdr.flags & 1); +} + +/* Index root structure ( 0x90 ). 
*/ +enum COLLATION_RULE { + NTFS_COLLATION_TYPE_BINARY = cpu_to_le32(0), + // $I30 + NTFS_COLLATION_TYPE_FILENAME = cpu_to_le32(0x01), + // $SII of $Secure and $Q of Quota + NTFS_COLLATION_TYPE_UINT = cpu_to_le32(0x10), + // $O of Quota + NTFS_COLLATION_TYPE_SID = cpu_to_le32(0x11), + // $SDH of $Secure + NTFS_COLLATION_TYPE_SECURITY_HASH = cpu_to_le32(0x12), + // $O of ObjId and "$R" for Reparse + NTFS_COLLATION_TYPE_UINTS = cpu_to_le32(0x13) +}; + +static_assert(sizeof(enum COLLATION_RULE) == 4); + +// +struct INDEX_ROOT { + enum ATTR_TYPE type; // 0x00: The type of attribute to index on. + enum COLLATION_RULE rule; // 0x04: The rule. + __le32 index_block_size;// 0x08: The size of index record. + u8 index_block_clst; // 0x0C: The number of clusters or sectors per index. + u8 res[3]; + struct INDEX_HDR ihdr; // 0x10: +}; + +static_assert(sizeof(struct INDEX_ROOT) == 0x20); +static_assert(offsetof(struct INDEX_ROOT, ihdr) == 0x10); + +#define VOLUME_FLAG_DIRTY cpu_to_le16(0x0001) +#define VOLUME_FLAG_RESIZE_LOG_FILE cpu_to_le16(0x0002) + +struct VOLUME_INFO { + __le64 res1; // 0x00 + u8 major_ver; // 0x08: NTFS major version number (before .) + u8 minor_ver; // 0x09: NTFS minor version number (after .) + __le16 flags; // 0x0A: Volume flags, see VOLUME_FLAG_XXX + +}; // sizeof=0xC + +#define SIZEOF_ATTRIBUTE_VOLUME_INFO 0xc + +#define NTFS_LABEL_MAX_LENGTH (0x100 / sizeof(short)) +#define NTFS_ATTR_INDEXABLE cpu_to_le32(0x00000002) +#define NTFS_ATTR_DUPALLOWED cpu_to_le32(0x00000004) +#define NTFS_ATTR_MUST_BE_INDEXED cpu_to_le32(0x00000010) +#define NTFS_ATTR_MUST_BE_NAMED cpu_to_le32(0x00000020) +#define NTFS_ATTR_MUST_BE_RESIDENT cpu_to_le32(0x00000040) +#define NTFS_ATTR_LOG_ALWAYS cpu_to_le32(0x00000080) + +/* $AttrDef file entry. */ +struct ATTR_DEF_ENTRY { + __le16 name[0x40]; // 0x00: Attr name. + enum ATTR_TYPE type; // 0x80: struct ATTRIB type. + __le32 res; // 0x84: + enum COLLATION_RULE rule; // 0x88: + __le32 flags; // 0x8C: NTFS_ATTR_XXX (see above). 
+ __le64 min_sz; // 0x90: Minimum attribute data size. + __le64 max_sz; // 0x98: Maximum attribute data size. +}; + +static_assert(sizeof(struct ATTR_DEF_ENTRY) == 0xa0); + +/* Object ID (0x40) */ +struct OBJECT_ID { + struct GUID ObjId; // 0x00: Unique Id assigned to file. + struct GUID BirthVolumeId; // 0x10: Birth Volume Id is the Object Id of the Volume on. + // which the Object Id was allocated. It never changes. + struct GUID BirthObjectId; // 0x20: Birth Object Id is the first Object Id that was + // ever assigned to this MFT Record. I.e. If the Object Id + // is changed for some reason, this field will reflect the + // original value of the Object Id. + struct GUID DomainId; // 0x30: Domain Id is currently unused but it is intended to be + // used in a network environment where the local machine is + // part of a Windows 2000 Domain. This may be used in a Windows + // 2000 Advanced Server managed domain. +}; + +static_assert(sizeof(struct OBJECT_ID) == 0x40); + +/* O Directory entry structure ( rule = 0x13 ) */ +struct NTFS_DE_O { + struct NTFS_DE de; + struct GUID ObjId; // 0x10: Unique Id assigned to file. + struct MFT_REF ref; // 0x20: MFT record number with this file. + struct GUID BirthVolumeId; // 0x28: Birth Volume Id is the Object Id of the Volume on + // which the Object Id was allocated. It never changes. + struct GUID BirthObjectId; // 0x38: Birth Object Id is the first Object Id that was + // ever assigned to this MFT Record. I.e. If the Object Id + // is changed for some reason, this field will reflect the + // original value of the Object Id. + // This field is valid if data_size == 0x48. + struct GUID BirthDomainId; // 0x48: Domain Id is currently unused but it is intended + // to be used in a network environment where the local + // machine is part of a Windows 2000 Domain. This may be + // used in a Windows 2000 Advanced Server managed domain. 
+}; + +static_assert(sizeof(struct NTFS_DE_O) == 0x58); + +#define NTFS_OBJECT_ENTRY_DATA_SIZE1 \ + 0x38 // struct NTFS_DE_O.BirthDomainId is not used +#define NTFS_OBJECT_ENTRY_DATA_SIZE2 \ + 0x48 // struct NTFS_DE_O.BirthDomainId is used + +/* Q Directory entry structure ( rule = 0x11 ) */ +struct NTFS_DE_Q { + struct NTFS_DE de; + __le32 owner_id; // 0x10: Unique Id assigned to file + __le32 Version; // 0x14: 0x02 + __le32 flags2; // 0x18: Quota flags, see above + __le64 BytesUsed; // 0x1C: + __le64 ChangeTime; // 0x24: + __le64 WarningLimit; // 0x28: + __le64 HardLimit; // 0x34: + __le64 ExceededTime; // 0x3C: + + // SID is placed here +}; // sizeof() = 0x44 + +#define SIZEOF_NTFS_DE_Q 0x44 + +#define SecurityDescriptorsBlockSize 0x40000 // 256K +#define SecurityDescriptorMaxSize 0x20000 // 128K +#define Log2OfSecurityDescriptorsBlockSize 18 + +struct SECURITY_KEY { + __le32 hash; // Hash value for descriptor + __le32 sec_id; // Security Id (guaranteed unique) +}; + +/* Security descriptors (the content of $Secure::SDS data stream) */ +struct SECURITY_HDR { + struct SECURITY_KEY key; // 0x00: Security Key. + __le64 off; // 0x08: Offset of this entry in the file. + __le32 size; // 0x10: Size of this entry, 8 byte aligned. + /* + * Security descriptor itself is placed here. + * Total size is 16 byte aligned. 
+ */ +} __packed; + +#define SIZEOF_SECURITY_HDR 0x14 + +/* SII Directory entry structure */ +struct NTFS_DE_SII { + struct NTFS_DE de; + __le32 sec_id; // 0x10: Key: sizeof(security_id) = wKeySize + struct SECURITY_HDR sec_hdr; // 0x14: +} __packed; + +#define SIZEOF_SII_DIRENTRY 0x28 + +/* SDH Directory entry structure */ +struct NTFS_DE_SDH { + struct NTFS_DE de; + struct SECURITY_KEY key; // 0x10: Key + struct SECURITY_HDR sec_hdr; // 0x18: Data + __le16 magic[2]; // 0x2C: 0x00490049 "I I" +}; + +#define SIZEOF_SDH_DIRENTRY 0x30 + +struct REPARSE_KEY { + __le32 ReparseTag; // 0x00: Reparse Tag + struct MFT_REF ref; // 0x04: MFT record number with this file +}; // sizeof() = 0x0C + +static_assert(offsetof(struct REPARSE_KEY, ref) == 0x04); +#define SIZEOF_REPARSE_KEY 0x0C + +/* Reparse Directory entry structure */ +struct NTFS_DE_R { + struct NTFS_DE de; + struct REPARSE_KEY key; // 0x10: Reparse Key. + u32 zero; // 0x1c: +}; // sizeof() = 0x20 + +static_assert(sizeof(struct NTFS_DE_R) == 0x20); + +/* CompressReparseBuffer.WofVersion */ +#define WOF_CURRENT_VERSION cpu_to_le32(1) +/* CompressReparseBuffer.WofProvider */ +#define WOF_PROVIDER_WIM cpu_to_le32(1) +/* CompressReparseBuffer.WofProvider */ +#define WOF_PROVIDER_SYSTEM cpu_to_le32(2) +/* CompressReparseBuffer.ProviderVer */ +#define WOF_PROVIDER_CURRENT_VERSION cpu_to_le32(1) + +#define WOF_COMPRESSION_XPRESS4K cpu_to_le32(0) // 4k +#define WOF_COMPRESSION_LZX32K cpu_to_le32(1) // 32k +#define WOF_COMPRESSION_XPRESS8K cpu_to_le32(2) // 8k +#define WOF_COMPRESSION_XPRESS16K cpu_to_le32(3) // 16k + +/* + * ATTR_REPARSE (0xC0) + * + * The reparse struct GUID structure is used by all 3rd party layered drivers to + * store data in a reparse point. For non-Microsoft tags, The struct GUID field + * cannot be GUID_NULL. + * The constraints on reparse tags are defined below. + * Microsoft tags can also be used with this format of the reparse point buffer. 
+ */ +struct REPARSE_POINT { + __le32 ReparseTag; // 0x00: + __le16 ReparseDataLength;// 0x04: + __le16 Reserved; + + struct GUID Guid; // 0x08: + + // + // Here GenericReparseBuffer is placed + // +}; + +static_assert(sizeof(struct REPARSE_POINT) == 0x18); + +/* Maximum allowed size of the reparse data. */ +#define MAXIMUM_REPARSE_DATA_BUFFER_SIZE (16 * 1024) + +/* + * The value of the following constant needs to satisfy the following + * conditions: + * (1) Be at least as large as the largest of the reserved tags. + * (2) Be strictly smaller than all the tags in use. + */ +#define IO_REPARSE_TAG_RESERVED_RANGE 1 + +/* + * The reparse tags are a ULONG. The 32 bits are laid out as follows: + * + * 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-----------------------+-------------------------------+ + * |M|R|N|R| Reserved bits | Reparse Tag Value | + * +-+-+-+-+-----------------------+-------------------------------+ + * + * M is the Microsoft bit. When set to 1, it denotes a tag owned by Microsoft. + * All ISVs must use a tag with a 0 in this position. + * Note: If a Microsoft tag is used by non-Microsoft software, the + * behavior is not defined. + * + * R is reserved. Must be zero for non-Microsoft tags. + * + * N is name surrogate. When set to 1, the file represents another named + * entity in the system. + * + * The M and N bits are OR-able. + * The following macros check for the M and N bit values: + */ + +/* + * Macro to determine whether a reparse point tag corresponds to a tag + * owned by Microsoft. + */ +#define IsReparseTagMicrosoft(_tag) (((_tag)&IO_REPARSE_TAG_MICROSOFT)) + +/* Macro to determine whether a reparse point tag is a name surrogate. */ +#define IsReparseTagNameSurrogate(_tag) (((_tag)&IO_REPARSE_TAG_NAME_SURROGATE)) + +/* + * The following constant represents the bits that are valid to use in + * reparse tags. 
+ */ +#define IO_REPARSE_TAG_VALID_VALUES 0xF000FFFF + +/* + * Macro to determine whether a reparse tag is a valid tag. + */ +#define IsReparseTagValid(_tag) \ + (!((_tag) & ~IO_REPARSE_TAG_VALID_VALUES) && \ + ((_tag) > IO_REPARSE_TAG_RESERVED_RANGE)) + +/* Microsoft tags for reparse points. */ + +enum IO_REPARSE_TAG { + IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0), + IO_REPARSE_TAG_NAME_SURROGATE = cpu_to_le32(0x20000000), + IO_REPARSE_TAG_MICROSOFT = cpu_to_le32(0x80000000), + IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0xA0000003), + IO_REPARSE_TAG_SYMLINK = cpu_to_le32(0xA000000C), + IO_REPARSE_TAG_HSM = cpu_to_le32(0xC0000004), + IO_REPARSE_TAG_SIS = cpu_to_le32(0x80000007), + IO_REPARSE_TAG_DEDUP = cpu_to_le32(0x80000013), + IO_REPARSE_TAG_COMPRESS = cpu_to_le32(0x80000017), + + /* + * The reparse tag 0x80000008 is reserved for Microsoft internal use. + * May be published in the future. + */ + + /* Microsoft reparse tag reserved for DFS */ + IO_REPARSE_TAG_DFS = cpu_to_le32(0x8000000A), + + /* Microsoft reparse tag reserved for the file system filter manager. */ + IO_REPARSE_TAG_FILTER_MANAGER = cpu_to_le32(0x8000000B), + + /* Non-Microsoft tags for reparse points */ + + /* Tag allocated to CONGRUENT, May 2000. Used by IFSTEST. */ + IO_REPARSE_TAG_IFSTEST_CONGRUENT = cpu_to_le32(0x00000009), + + /* Tag allocated to ARKIVIO. */ + IO_REPARSE_TAG_ARKIVIO = cpu_to_le32(0x0000000C), + + /* Tag allocated to SOLUTIONSOFT. */ + IO_REPARSE_TAG_SOLUTIONSOFT = cpu_to_le32(0x2000000D), + + /* Tag allocated to COMMVAULT. */ + IO_REPARSE_TAG_COMMVAULT = cpu_to_le32(0x0000000E), + + /* OneDrive?? 
*/ + IO_REPARSE_TAG_CLOUD = cpu_to_le32(0x9000001A), + IO_REPARSE_TAG_CLOUD_1 = cpu_to_le32(0x9000101A), + IO_REPARSE_TAG_CLOUD_2 = cpu_to_le32(0x9000201A), + IO_REPARSE_TAG_CLOUD_3 = cpu_to_le32(0x9000301A), + IO_REPARSE_TAG_CLOUD_4 = cpu_to_le32(0x9000401A), + IO_REPARSE_TAG_CLOUD_5 = cpu_to_le32(0x9000501A), + IO_REPARSE_TAG_CLOUD_6 = cpu_to_le32(0x9000601A), + IO_REPARSE_TAG_CLOUD_7 = cpu_to_le32(0x9000701A), + IO_REPARSE_TAG_CLOUD_8 = cpu_to_le32(0x9000801A), + IO_REPARSE_TAG_CLOUD_9 = cpu_to_le32(0x9000901A), + IO_REPARSE_TAG_CLOUD_A = cpu_to_le32(0x9000A01A), + IO_REPARSE_TAG_CLOUD_B = cpu_to_le32(0x9000B01A), + IO_REPARSE_TAG_CLOUD_C = cpu_to_le32(0x9000C01A), + IO_REPARSE_TAG_CLOUD_D = cpu_to_le32(0x9000D01A), + IO_REPARSE_TAG_CLOUD_E = cpu_to_le32(0x9000E01A), + IO_REPARSE_TAG_CLOUD_F = cpu_to_le32(0x9000F01A), + +}; + +#define SYMLINK_FLAG_RELATIVE 1 + +/* Microsoft reparse buffer. (see DDK for details) */ +struct REPARSE_DATA_BUFFER { + __le32 ReparseTag; // 0x00: + __le16 ReparseDataLength; // 0x04: + __le16 Reserved; + + union { + /* If ReparseTag == 0xA0000003 (IO_REPARSE_TAG_MOUNT_POINT) */ + struct { + __le16 SubstituteNameOffset; // 0x08 + __le16 SubstituteNameLength; // 0x0A + __le16 PrintNameOffset; // 0x0C + __le16 PrintNameLength; // 0x0E + __le16 PathBuffer[]; // 0x10 + } MountPointReparseBuffer; + + /* + * If ReparseTag == 0xA000000C (IO_REPARSE_TAG_SYMLINK) + * https://msdn.microsoft.com/en-us/library/cc232006.aspx + */ + struct { + __le16 SubstituteNameOffset; // 0x08 + __le16 SubstituteNameLength; // 0x0A + __le16 PrintNameOffset; // 0x0C + __le16 PrintNameLength; // 0x0E + // 0-absolute path 1- relative path, SYMLINK_FLAG_RELATIVE + __le32 Flags; // 0x10 + __le16 PathBuffer[]; // 0x14 + } SymbolicLinkReparseBuffer; + + /* If ReparseTag == 0x80000017U */ + struct { + __le32 WofVersion; // 0x08 == 1 + /* + * 1 - WIM backing provider ("WIMBoot"), + * 2 - System compressed file provider + */ + __le32 WofProvider; // 0x0C: + __le32 
ProviderVer; // 0x10: == 1 WOF_FILE_PROVIDER_CURRENT_VERSION == 1 + __le32 CompressionFormat; // 0x14: 0, 1, 2, 3. See WOF_COMPRESSION_XXX + } CompressReparseBuffer; + + struct { + u8 DataBuffer[1]; // 0x08: + } GenericReparseBuffer; + }; +}; + +/* ATTR_EA_INFO (0xD0) */ + +#define FILE_NEED_EA 0x80 // See ntifs.h +/* + *FILE_NEED_EA, indicates that the file to which the EA belongs cannot be + * interpreted without understanding the associated extended attributes. + */ +struct EA_INFO { + __le16 size_pack; // 0x00: Size of buffer to hold in packed form. + __le16 count; // 0x02: Count of EA's with FILE_NEED_EA bit set. + __le32 size; // 0x04: Size of buffer to hold in unpacked form. +}; + +static_assert(sizeof(struct EA_INFO) == 8); + +/* ATTR_EA (0xE0) */ +struct EA_FULL { + __le32 size; // 0x00: (not in packed) + u8 flags; // 0x04: + u8 name_len; // 0x05: + __le16 elength; // 0x06: + u8 name[]; // 0x08: +}; + +static_assert(offsetof(struct EA_FULL, name) == 8); + +#define ACL_REVISION 2 +#define ACL_REVISION_DS 4 + +#define SE_SELF_RELATIVE cpu_to_le16(0x8000) + +struct SECURITY_DESCRIPTOR_RELATIVE { + u8 Revision; + u8 Sbz1; + __le16 Control; + __le32 Owner; + __le32 Group; + __le32 Sacl; + __le32 Dacl; +}; +static_assert(sizeof(struct SECURITY_DESCRIPTOR_RELATIVE) == 0x14); + +struct ACE_HEADER { + u8 AceType; + u8 AceFlags; + __le16 AceSize; +}; +static_assert(sizeof(struct ACE_HEADER) == 4); + +struct ACL { + u8 AclRevision; + u8 Sbz1; + __le16 AclSize; + __le16 AceCount; + __le16 Sbz2; +}; +static_assert(sizeof(struct ACL) == 8); + +struct SID { + u8 Revision; + u8 SubAuthorityCount; + u8 IdentifierAuthority[6]; + __le32 SubAuthority[]; +}; +static_assert(offsetof(struct SID, SubAuthority) == 8); + +#endif /* _LINUX_NTFS3_NTFS_H */ +// clang-format on diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h new file mode 100644 index 000000000000..dc71c59fd445 --- /dev/null +++ b/fs/ntfs3/ntfs_fs.h @@ -0,0 +1,1111 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + 
* + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +// clang-format off +#ifndef _LINUX_NTFS3_NTFS_FS_H +#define _LINUX_NTFS3_NTFS_FS_H + +#define MINUS_ONE_T ((size_t)(-1)) +/* Biggest MFT / smallest cluster */ +#define MAXIMUM_BYTES_PER_MFT 4096 +#define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512) + +#define MAXIMUM_BYTES_PER_INDEX 4096 +#define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512) + +/* NTFS specific error code when fixup failed. */ +#define E_NTFS_FIXUP 555 +/* NTFS specific error code about resident->nonresident. */ +#define E_NTFS_NONRESIDENT 556 +/* NTFS specific error code about punch hole. */ +#define E_NTFS_NOTALIGNED 557 + + +/* sbi->flags */ +#define NTFS_FLAGS_NODISCARD 0x00000001 +/* Set when LogFile is replaying. */ +#define NTFS_FLAGS_LOG_REPLAYING 0x00000008 +/* Set when we changed first MFT's which copy must be updated in $MftMirr. */ +#define NTFS_FLAGS_MFTMIRR 0x00001000 +#define NTFS_FLAGS_NEED_REPLAY 0x04000000 + + +/* ni->ni_flags */ +/* + * Data attribute is external compressed (LZX/Xpress) + * 1 - WOF_COMPRESSION_XPRESS4K + * 2 - WOF_COMPRESSION_XPRESS8K + * 3 - WOF_COMPRESSION_XPRESS16K + * 4 - WOF_COMPRESSION_LZX32K + */ +#define NI_FLAG_COMPRESSED_MASK 0x0000000f +/* Data attribute is deduplicated. */ +#define NI_FLAG_DEDUPLICATED 0x00000010 +#define NI_FLAG_EA 0x00000020 +#define NI_FLAG_DIR 0x00000040 +#define NI_FLAG_RESIDENT 0x00000080 +#define NI_FLAG_UPDATE_PARENT 0x00000100 +// clang-format on + +struct ntfs_mount_options { + struct nls_table *nls; + + kuid_t fs_uid; + kgid_t fs_gid; + u16 fs_fmask_inv; + u16 fs_dmask_inv; + + unsigned uid : 1, /* uid was set. */ + gid : 1, /* gid was set. */ + fmask : 1, /* fmask was set. */ + dmask : 1, /* dmask was set. */ + sys_immutable : 1, /* Immutable system files. */ + discard : 1, /* Issue discard requests on deletions. */ + sparse : 1, /* Create sparse files. */ + showmeta : 1, /* Show meta files. 
*/ + nohidden : 1, /* Do not show hidden files. */ + force : 1, /* Rw mount dirty volume. */ + no_acs_rules : 1, /*Exclude acs rules. */ + prealloc : 1 /* Preallocate space when file is growing. */ + ; +}; + +/* Special value to unpack and deallocate. */ +#define RUN_DEALLOCATE ((struct runs_tree *)(size_t)1) + +/* TODO: Use rb tree instead of array. */ +struct runs_tree { + struct ntfs_run *runs; + size_t count; /* Currently used size a ntfs_run storage. */ + size_t allocated; /* Currently allocated ntfs_run storage size. */ +}; + +struct ntfs_buffers { + /* Biggest MFT / smallest cluster = 4096 / 512 = 8 */ + /* Biggest index / smallest cluster = 4096 / 512 = 8 */ + struct buffer_head *bh[PAGE_SIZE >> SECTOR_SHIFT]; + u32 bytes; + u32 nbufs; + u32 off; +}; + +enum ALLOCATE_OPT { + ALLOCATE_DEF = 0, // Allocate all clusters. + ALLOCATE_MFT = 1, // Allocate for MFT. +}; + +enum bitmap_mutex_classes { + BITMAP_MUTEX_CLUSTERS = 0, + BITMAP_MUTEX_MFT = 1, +}; + +struct wnd_bitmap { + struct super_block *sb; + struct rw_semaphore rw_lock; + + struct runs_tree run; + size_t nbits; + + size_t total_zeroes; // Total number of free bits. + u16 *free_bits; // Free bits in each window. + size_t nwnd; + u32 bits_last; // Bits in last window. + + struct rb_root start_tree; // Extents, sorted by 'start'. + struct rb_root count_tree; // Extents, sorted by 'count + start'. + size_t count; // Extents count. + + /* + * -1 Tree is activated but not updated (too many fragments). + * 0 - Tree is not activated. + * 1 - Tree is activated and updated. + */ + int uptodated; + size_t extent_min; // Minimal extent used while building. + size_t extent_max; // Upper estimate of biggest free block. + + /* Zone [bit, end) */ + size_t zone_bit; + size_t zone_end; + + bool set_tail; // Not necessary in driver. 
+ bool inited; +}; + +typedef int (*NTFS_CMP_FUNC)(const void *key1, size_t len1, const void *key2, + size_t len2, const void *param); + +enum index_mutex_classed { + INDEX_MUTEX_I30 = 0, + INDEX_MUTEX_SII = 1, + INDEX_MUTEX_SDH = 2, + INDEX_MUTEX_SO = 3, + INDEX_MUTEX_SQ = 4, + INDEX_MUTEX_SR = 5, + INDEX_MUTEX_TOTAL +}; + +/* ntfs_index - Allocation unit inside directory. */ +struct ntfs_index { + struct runs_tree bitmap_run; + struct runs_tree alloc_run; + /* read/write access to 'bitmap_run'/'alloc_run' while ntfs_readdir */ + struct rw_semaphore run_lock; + + /*TODO: Remove 'cmp'. */ + NTFS_CMP_FUNC cmp; + + u8 index_bits; // log2(root->index_block_size) + u8 idx2vbn_bits; // log2(root->index_block_clst) + u8 vbn2vbo_bits; // index_block_size < cluster? 9 : cluster_bits + u8 type; // index_mutex_classed +}; + +/* Minimum MFT zone. */ +#define NTFS_MIN_MFT_ZONE 100 + +/* Ntfs file system in-core superblock data. */ +struct ntfs_sb_info { + struct super_block *sb; + + u32 discard_granularity; + u64 discard_granularity_mask_inv; // ~(discard_granularity_mask_inv-1) + + u32 cluster_size; // bytes per cluster + u32 cluster_mask; // == cluster_size - 1 + u64 cluster_mask_inv; // ~(cluster_size - 1) + u32 block_mask; // sb->s_blocksize - 1 + u32 blocks_per_cluster; // cluster_size / sb->s_blocksize + + u32 record_size; + u32 sector_size; + u32 index_size; + + u8 sector_bits; + u8 cluster_bits; + u8 record_bits; + + u64 maxbytes; // Maximum size for normal files. + u64 maxbytes_sparse; // Maximum size for sparse file. + + u32 flags; // See NTFS_FLAGS_XXX. + + CLST bad_clusters; // The count of marked bad clusters. + + u16 max_bytes_per_attr; // Maximum attribute size in record. + u16 attr_size_tr; // Attribute size threshold (320 bytes). + + /* Records in $Extend. */ + CLST objid_no; + CLST quota_no; + CLST reparse_no; + CLST usn_jrnl_no; + + struct ATTR_DEF_ENTRY *def_table; // Attribute definition table. 
+ u32 def_entries; + u32 ea_max_size; + + struct MFT_REC *new_rec; + + u16 *upcase; + + struct { + u64 lbo, lbo2; + struct ntfs_inode *ni; + struct wnd_bitmap bitmap; // $MFT::Bitmap + /* + * MFT records [11-24) used to expand MFT itself. + * They always marked as used in $MFT::Bitmap + * 'reserved_bitmap' contains real bitmap of these records. + */ + ulong reserved_bitmap; // Bitmap of used records [11 - 24) + size_t next_free; // The next record to allocate from + size_t used; // MFT valid size in records. + u32 recs_mirr; // Number of records in MFTMirr + u8 next_reserved; + u8 reserved_bitmap_inited; + } mft; + + struct { + struct wnd_bitmap bitmap; // $Bitmap::Data + CLST next_free_lcn; + } used; + + struct { + u64 size; // In bytes. + u64 blocks; // In blocks. + u64 ser_num; + struct ntfs_inode *ni; + __le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY. + u8 major_ver; + u8 minor_ver; + char label[65]; + bool real_dirty; // Real fs state. + } volume; + + struct { + struct ntfs_index index_sii; + struct ntfs_index index_sdh; + struct ntfs_inode *ni; + u32 next_id; + u64 next_off; + + __le32 def_security_id; + } security; + + struct { + struct ntfs_index index_r; + struct ntfs_inode *ni; + u64 max_size; // 16K + } reparse; + + struct { + struct ntfs_index index_o; + struct ntfs_inode *ni; + } objid; + + struct { + struct mutex mtx_lznt; + struct lznt *lznt; +#ifdef CONFIG_NTFS3_LZX_XPRESS + struct mutex mtx_xpress; + struct xpress_decompressor *xpress; + struct mutex mtx_lzx; + struct lzx_decompressor *lzx; +#endif + } compress; + + struct ntfs_mount_options options; + struct ratelimit_state msg_ratelimit; +}; + +/* One MFT record(usually 1024 bytes), consists of attributes. */ +struct mft_inode { + struct rb_node node; + struct ntfs_sb_info *sbi; + + struct MFT_REC *mrec; + struct ntfs_buffers nb; + + CLST rno; + bool dirty; +}; + +/* Nested class for ntfs_inode::ni_lock. 
*/ +enum ntfs_inode_mutex_lock_class { + NTFS_INODE_MUTEX_DIRTY, + NTFS_INODE_MUTEX_SECURITY, + NTFS_INODE_MUTEX_OBJID, + NTFS_INODE_MUTEX_REPARSE, + NTFS_INODE_MUTEX_NORMAL, + NTFS_INODE_MUTEX_PARENT, +}; + +/* + * struct ntfs_inode + * + * Ntfs inode - extends the Linux inode and consists of one or more MFT inodes. + */ +struct ntfs_inode { + struct mft_inode mi; // base record + + /* + * Valid size: [0 - i_valid) - this range of the file contains valid data. + * Range [i_valid - inode->i_size) - contains 0. + * Usually i_valid <= inode->i_size. + */ + u64 i_valid; + struct timespec64 i_crtime; + + struct mutex ni_lock; + + /* File attributes from std. */ + enum FILE_ATTRIBUTE std_fa; + __le32 std_security_id; + + /* + * Tree of mft_inode. + * Not empty when primary MFT record (usually 1024 bytes) can't save all attributes + * e.g. file becomes too fragmented or contains a lot of names. + */ + struct rb_root mi_tree; + + /* + * This member is used in ntfs_readdir to ensure that all subrecords are loaded + */ + u8 mi_loaded; + + union { + struct ntfs_index dir; + struct { + struct rw_semaphore run_lock; + struct runs_tree run; +#ifdef CONFIG_NTFS3_LZX_XPRESS + struct page *offs_page; +#endif + } file; + }; + + struct { + struct runs_tree run; + struct ATTR_LIST_ENTRY *le; // 1K aligned memory. 
+ size_t size; + bool dirty; + } attr_list; + + size_t ni_flags; // NI_FLAG_XXX + + struct inode vfs_inode; +}; + +struct indx_node { + struct ntfs_buffers nb; + struct INDEX_BUFFER *index; +}; + +struct ntfs_fnd { + int level; + struct indx_node *nodes[20]; + struct NTFS_DE *de[20]; + struct NTFS_DE *root_de; +}; + +enum REPARSE_SIGN { + REPARSE_NONE = 0, + REPARSE_COMPRESSED = 1, + REPARSE_DEDUPLICATED = 2, + REPARSE_LINK = 3 +}; + +/* Functions from attrib.c */ +int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni, + struct runs_tree *run, const CLST *vcn); +int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, + CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, + enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, + CLST *new_lcn); +int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, + u64 new_size, struct runs_tree *run, + struct ATTRIB **ins_attr, struct page *page); +int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + u64 new_size, const u64 *new_valid, bool keep_prealloc, + struct ATTRIB **ret); +int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, + CLST *len, bool *new); +int attr_data_read_resident(struct ntfs_inode *ni, struct page *page); +int attr_data_write_resident(struct ntfs_inode *ni, struct page *page); +int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + CLST vcn); +int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, struct runs_tree *run, + u64 from, u64 to); +int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, + struct runs_tree *run, u64 frame, u64 frames, + u8 frame_bits, u32 *ondisk_size, u64 *vbo_data); +int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, + CLST frame, CLST *clst_data); 
+int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, + u64 new_valid); +int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); +int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size); + +/* Functions from attrlist.c */ +void al_destroy(struct ntfs_inode *ni); +bool al_verify(struct ntfs_inode *ni); +int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr); +struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le); +struct ATTR_LIST_ENTRY *al_find_le(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le, + const struct ATTRIB *attr); +struct ATTR_LIST_ENTRY *al_find_ex(struct ntfs_inode *ni, + struct ATTR_LIST_ENTRY *le, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, const CLST *vcn); +int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, + u8 name_len, CLST svcn, __le16 id, const struct MFT_REF *ref, + struct ATTR_LIST_ENTRY **new_le); +bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); +bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, + const __le16 *name, size_t name_len, + const struct MFT_REF *ref); +int al_update(struct ntfs_inode *ni); +static inline size_t al_aligned(size_t size) +{ + return (size + 1023) & ~(size_t)1023; +} + +/* Globals from bitfunc.c */ +bool are_bits_clear(const ulong *map, size_t bit, size_t nbits); +bool are_bits_set(const ulong *map, size_t bit, size_t nbits); +size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits); + +/* Globals from dir.c */ +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, + u8 *buf, int buf_len); +int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, + struct cpu_str *uni, u32 max_ulen, + enum utf16_endian endian); +struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni, + struct ntfs_fnd *fnd); +bool dir_is_empty(struct inode *dir); +extern const struct 
file_operations ntfs_dir_operations; + +/* Globals from file.c */ +int ntfs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, u32 flags); +void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, + CLST len); +int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int ntfs_file_open(struct inode *inode, struct file *file); +int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); +extern const struct inode_operations ntfs_special_inode_operations; +extern const struct inode_operations ntfs_file_inode_operations; +extern const struct file_operations ntfs_file_operations; + +/* Globals from frecord.c */ +void ni_remove_mi(struct ntfs_inode *ni, struct mft_inode *mi); +struct ATTR_STD_INFO *ni_std(struct ntfs_inode *ni); +struct ATTR_STD_INFO5 *ni_std5(struct ntfs_inode *ni); +void ni_clear(struct ntfs_inode *ni); +int ni_load_mi_ex(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); +int ni_load_mi(struct ntfs_inode *ni, const struct ATTR_LIST_ENTRY *le, + struct mft_inode **mi); +struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY **entry_o, + enum ATTR_TYPE type, const __le16 *name, + u8 name_len, const CLST *vcn, + struct mft_inode **mi); +struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, + struct ATTR_LIST_ENTRY **le, + struct mft_inode **mi); +struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, CLST vcn, + struct mft_inode **pmi); +int ni_load_all_mi(struct ntfs_inode *ni); +bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); +int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, size_t name_len, bool base_only, + const __le16 *id); +int ni_create_attr_list(struct ntfs_inode *ni); +int ni_expand_list(struct 
ntfs_inode *ni); +int ni_insert_nonresident(struct ntfs_inode *ni, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, + const struct runs_tree *run, CLST svcn, CLST len, + __le16 flags, struct ATTRIB **new_attr, + struct mft_inode **mi); +int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, + enum ATTR_TYPE type, const __le16 *name, u8 name_len, + struct ATTRIB **new_attr, struct mft_inode **mi, + struct ATTR_LIST_ENTRY **le); +void ni_remove_attr_le(struct ntfs_inode *ni, struct ATTRIB *attr, + struct mft_inode *mi, struct ATTR_LIST_ENTRY *le); +int ni_delete_all(struct ntfs_inode *ni); +struct ATTR_FILE_NAME *ni_fname_name(struct ntfs_inode *ni, + const struct cpu_str *uni, + const struct MFT_REF *home, + struct mft_inode **mi, + struct ATTR_LIST_ENTRY **entry); +struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, + struct mft_inode **mi, + struct ATTR_LIST_ENTRY **entry); +int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa); +enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, + void *buffer); +int ni_write_inode(struct inode *inode, int sync, const char *hint); +#define _ni_write_inode(i, w) ni_write_inode(i, w, __func__) +int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, + __u64 vbo, __u64 len); +int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page); +int ni_decompress_file(struct ntfs_inode *ni); +int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages, + u32 pages_per_frame); +int ni_write_frame(struct ntfs_inode *ni, struct page **pages, + u32 pages_per_frame); +int ni_remove_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de, struct NTFS_DE **de2, int *undo_step); + +bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de, struct NTFS_DE *de2, + int undo_step); + +int ni_add_name(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, + struct NTFS_DE *de); + +int 
ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, + struct ntfs_inode *ni, struct NTFS_DE *de, struct NTFS_DE *new_de, + bool *is_bad); + +bool ni_is_dirty(struct inode *inode); + +/* Globals from fslog.c */ +int log_replay(struct ntfs_inode *ni, bool *initialized); + +/* Globals from fsntfs.c */ +bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes); +int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes, + bool simple); +int ntfs_extend_init(struct ntfs_sb_info *sbi); +int ntfs_loadlog_and_replay(struct ntfs_inode *ni, struct ntfs_sb_info *sbi); +const struct ATTR_DEF_ENTRY *ntfs_query_def(struct ntfs_sb_info *sbi, + enum ATTR_TYPE Type); +int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, + CLST *new_lcn, CLST *new_len, + enum ALLOCATE_OPT opt); +int ntfs_look_free_mft(struct ntfs_sb_info *sbi, CLST *rno, bool mft, + struct ntfs_inode *ni, struct mft_inode **mi); +void ntfs_mark_rec_free(struct ntfs_sb_info *sbi, CLST rno); +int ntfs_clear_mft_tail(struct ntfs_sb_info *sbi, size_t from, size_t to); +int ntfs_refresh_zone(struct ntfs_sb_info *sbi); +int ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait); +enum NTFS_DIRTY_FLAGS { + NTFS_DIRTY_CLEAR = 0, + NTFS_DIRTY_DIRTY = 1, + NTFS_DIRTY_ERROR = 2, +}; +int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty); +int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer); +int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, + const void *buffer, int wait); +int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, const void *buf, size_t bytes); +struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, + const struct runs_tree *run, u64 vbo); +int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, void *buf, u32 bytes, struct ntfs_buffers *nb); +int ntfs_read_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 
vbo, + struct NTFS_RECORD_HEADER *rhdr, u32 bytes, + struct ntfs_buffers *nb); +int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, + u32 bytes, struct ntfs_buffers *nb); +int ntfs_write_bh(struct ntfs_sb_info *sbi, struct NTFS_RECORD_HEADER *rhdr, + struct ntfs_buffers *nb, int sync); +int ntfs_bio_pages(struct ntfs_sb_info *sbi, const struct runs_tree *run, + struct page **pages, u32 nr_pages, u64 vbo, u32 bytes, + u32 op); +int ntfs_bio_fill_1(struct ntfs_sb_info *sbi, const struct runs_tree *run); +int ntfs_vbo_to_lbo(struct ntfs_sb_info *sbi, const struct runs_tree *run, + u64 vbo, u64 *lbo, u64 *bytes); +struct ntfs_inode *ntfs_new_inode(struct ntfs_sb_info *sbi, CLST nRec, + bool dir); +extern const u8 s_default_security[0x50]; +bool is_sd_valid(const struct SECURITY_DESCRIPTOR_RELATIVE *sd, u32 len); +int ntfs_security_init(struct ntfs_sb_info *sbi); +int ntfs_get_security_by_id(struct ntfs_sb_info *sbi, __le32 security_id, + struct SECURITY_DESCRIPTOR_RELATIVE **sd, + size_t *size); +int ntfs_insert_security(struct ntfs_sb_info *sbi, + const struct SECURITY_DESCRIPTOR_RELATIVE *sd, + u32 size, __le32 *security_id, bool *inserted); +int ntfs_reparse_init(struct ntfs_sb_info *sbi); +int ntfs_objid_init(struct ntfs_sb_info *sbi); +int ntfs_objid_remove(struct ntfs_sb_info *sbi, struct GUID *guid); +int ntfs_insert_reparse(struct ntfs_sb_info *sbi, __le32 rtag, + const struct MFT_REF *ref); +int ntfs_remove_reparse(struct ntfs_sb_info *sbi, __le32 rtag, + const struct MFT_REF *ref); +void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim); +int run_deallocate(struct ntfs_sb_info *sbi, struct runs_tree *run, bool trim); + +/* Globals from index.c */ +int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit); +void fnd_clear(struct ntfs_fnd *fnd); +static inline struct ntfs_fnd *fnd_get(void) +{ + return kzalloc(sizeof(struct ntfs_fnd), GFP_NOFS); +} +static inline void fnd_put(struct 
ntfs_fnd *fnd) +{ + if (fnd) { + fnd_clear(fnd); + kfree(fnd); + } +} +void indx_clear(struct ntfs_index *idx); +int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, + const struct ATTRIB *attr, enum index_mutex_classed type); +struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, + struct ATTRIB **attr, struct mft_inode **mi); +int indx_read(struct ntfs_index *idx, struct ntfs_inode *ni, CLST vbn, + struct indx_node **node); +int indx_find(struct ntfs_index *indx, struct ntfs_inode *dir, + const struct INDEX_ROOT *root, const void *Key, size_t KeyLen, + const void *param, int *diff, struct NTFS_DE **entry, + struct ntfs_fnd *fnd); +int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, struct NTFS_DE **entry, + struct ntfs_fnd *fnd); +int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct INDEX_ROOT *root, struct NTFS_DE **entry, + size_t *off, struct ntfs_fnd *fnd); +int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, + const struct NTFS_DE *new_de, const void *param, + struct ntfs_fnd *fnd, bool undo); +int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, + const void *key, u32 key_len, const void *param); +int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, + const struct ATTR_FILE_NAME *fname, + const struct NTFS_DUP_INFO *dup, int sync); + +/* Globals from inode.c */ +struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, + const struct cpu_str *name); +int ntfs_set_size(struct inode *inode, u64 new_size); +int reset_log_file(struct inode *inode); +int ntfs_get_block(struct inode *inode, sector_t vbn, + struct buffer_head *bh_result, int create); +int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); +int ntfs_sync_inode(struct inode *inode); +int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, + struct inode *i2); +int inode_write_data(struct 
inode *inode, const void *data, size_t bytes); +struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const struct cpu_str *uni, umode_t mode, + dev_t dev, const char *symname, u32 size, + struct ntfs_fnd *fnd); +int ntfs_link_inode(struct inode *inode, struct dentry *dentry); +int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry); +void ntfs_evict_inode(struct inode *inode); +extern const struct inode_operations ntfs_link_inode_operations; +extern const struct address_space_operations ntfs_aops; +extern const struct address_space_operations ntfs_aops_cmpr; + +/* Globals from name_i.c */ +int fill_name_de(struct ntfs_sb_info *sbi, void *buf, const struct qstr *name, + const struct cpu_str *uni); +struct dentry *ntfs3_get_parent(struct dentry *child); + +extern const struct inode_operations ntfs_dir_inode_operations; +extern const struct inode_operations ntfs_special_inode_operations; + +/* Globals from record.c */ +int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi); +void mi_put(struct mft_inode *mi); +int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno); +int mi_read(struct mft_inode *mi, bool is_mft); +struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr); +// TODO: id? 
+struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr, + enum ATTR_TYPE type, const __le16 *name, + size_t name_len, const __le16 *id); +static inline struct ATTRIB *rec_find_attr_le(struct mft_inode *rec, + struct ATTR_LIST_ENTRY *le) +{ + return mi_find_attr(rec, NULL, le->type, le_name(le), le->name_len, + &le->id); +} +int mi_write(struct mft_inode *mi, int wait); +int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno, + __le16 flags, bool is_mft); +void mi_mark_free(struct mft_inode *mi); +struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, u32 asize, + u16 name_off); + +bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr); +bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes); +int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr, + struct runs_tree *run, CLST len); +static inline bool mi_is_ref(const struct mft_inode *mi, + const struct MFT_REF *ref) +{ + if (le32_to_cpu(ref->low) != mi->rno) + return false; + if (ref->seq != mi->mrec->seq) + return false; + +#ifdef CONFIG_NTFS3_64BIT_CLUSTER + return le16_to_cpu(ref->high) == (mi->rno >> 32); +#else + return !ref->high; +#endif +} + +static inline void mi_get_ref(const struct mft_inode *mi, struct MFT_REF *ref) +{ + ref->low = cpu_to_le32(mi->rno); +#ifdef CONFIG_NTFS3_64BIT_CLUSTER + ref->high = cpu_to_le16(mi->rno >> 32); +#else + ref->high = 0; +#endif + ref->seq = mi->mrec->seq; +} + +/* Globals from run.c */ +bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn, + CLST *len, size_t *index); +void run_truncate(struct runs_tree *run, CLST vcn); +void run_truncate_head(struct runs_tree *run, CLST vcn); +void run_truncate_around(struct runs_tree *run, CLST vcn); +bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *Index); +bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len, + bool is_mft); +bool 
run_collapse_range(struct runs_tree *run, CLST vcn, CLST len);
+bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn,
+		   CLST *lcn, CLST *len);
+bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn);
+
+int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf,
+	     u32 run_buf_size, CLST *packed_vcns);
+int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
+	       CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
+	       u32 run_buf_size);
+
+#ifdef NTFS3_CHECK_FREE_CLST
+int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
+		  CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
+		  u32 run_buf_size);
+#else
+#define run_unpack_ex run_unpack
+#endif
+int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn);
+
+/* Globals from super.c */
+void *ntfs_set_shared(void *ptr, u32 bytes);
+void *ntfs_put_shared(void *ptr);
+void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len);
+int ntfs_discard(struct ntfs_sb_info *sbi, CLST Lcn, CLST Len);
+
+/* Globals from bitmap.c */
+int __init ntfs3_init_bitmap(void);
+void ntfs3_exit_bitmap(void);
+void wnd_close(struct wnd_bitmap *wnd);
+static inline size_t wnd_zeroes(const struct wnd_bitmap *wnd)
+{
+	return wnd->total_zeroes;
+}
+int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits);
+int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+bool wnd_is_free(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits);
+
+/* Possible values for the 'flags' argument of wnd_find(). */
+#define BITMAP_FIND_MARK_AS_USED 0x01
+#define BITMAP_FIND_FULL 0x02
+size_t wnd_find(struct wnd_bitmap *wnd, size_t to_alloc, size_t hint,
+		size_t flags, size_t *allocated);
+int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits);
+void wnd_zone_set(struct wnd_bitmap *wnd, size_t Lcn, size_t Len);
+int ntfs_trim_fs(struct ntfs_sb_info *sbi, struct fstrim_range *range);
+
+/* Globals from upcase.c */
+int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2,
+		   const u16 *upcase, bool bothcase);
+int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
+		       const u16 *upcase, bool bothcase);
+
+/* Globals from xattr.c */
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu);
+int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		 struct posix_acl *acl, int type);
+int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode,
+		  struct inode *dir);
+#else
+#define ntfs_get_acl NULL
+#define ntfs_set_acl NULL
+#endif
+
+int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode);
+int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode,
+		    int mask);
+ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
+extern const struct xattr_handler *ntfs_xattr_handlers[];
+
+int ntfs_save_wsl_perm(struct inode *inode);
+void ntfs_get_wsl_perm(struct inode *inode);
+
+/* Globals from lznt.c */
+struct lznt *get_lznt_ctx(int level);
+size_t compress_lznt(const void *uncompressed, size_t uncompressed_size,
+		     void *compressed, size_t compressed_size,
+		     struct lznt *ctx);
+ssize_t decompress_lznt(const void *compressed, size_t compressed_size,
+			void *uncompressed, size_t uncompressed_size);
+
+static inline bool is_ntfs3(struct ntfs_sb_info *sbi)
+{
+	return sbi->volume.major_ver >= 3;
+}
+
+/*
+ * is_mounted - True once the superblock has a root dentry (sb->s_root set).
+ * NOTE(review): the original note here read "(sb->s_flags & SB_ACTIVE)",
+ * presumably naming the equivalent flag test - confirm.
+ */
+static inline bool is_mounted(struct ntfs_sb_info *sbi)
+{
+	return
!!sbi->sb->s_root; +} + +static inline bool ntfs_is_meta_file(struct ntfs_sb_info *sbi, CLST rno) +{ + return rno < MFT_REC_FREE || rno == sbi->objid_no || + rno == sbi->quota_no || rno == sbi->reparse_no || + rno == sbi->usn_jrnl_no; +} + +static inline void ntfs_unmap_page(struct page *page) +{ + kunmap(page); + put_page(page); +} + +static inline struct page *ntfs_map_page(struct address_space *mapping, + unsigned long index) +{ + struct page *page = read_mapping_page(mapping, index, NULL); + + if (!IS_ERR(page)) { + kmap(page); + if (!PageError(page)) + return page; + ntfs_unmap_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +static inline size_t wnd_zone_bit(const struct wnd_bitmap *wnd) +{ + return wnd->zone_bit; +} + +static inline size_t wnd_zone_len(const struct wnd_bitmap *wnd) +{ + return wnd->zone_end - wnd->zone_bit; +} + +static inline void run_init(struct runs_tree *run) +{ + run->runs = NULL; + run->count = 0; + run->allocated = 0; +} + +static inline struct runs_tree *run_alloc(void) +{ + return kzalloc(sizeof(struct runs_tree), GFP_NOFS); +} + +static inline void run_close(struct runs_tree *run) +{ + kvfree(run->runs); + memset(run, 0, sizeof(*run)); +} + +static inline void run_free(struct runs_tree *run) +{ + if (run) { + kvfree(run->runs); + kfree(run); + } +} + +static inline bool run_is_empty(struct runs_tree *run) +{ + return !run->count; +} + +/* NTFS uses quad aligned bitmaps. */ +static inline size_t bitmap_size(size_t bits) +{ + return ALIGN((bits + 7) >> 3, 8); +} + +#define _100ns2seconds 10000000 +#define SecondsToStartOf1970 0x00000002B6109100 + +#define NTFS_TIME_GRAN 100 + +/* + * kernel2nt - Converts in-memory kernel timestamp into nt time. 
+ */
+static inline __le64 kernel2nt(const struct timespec64 *ts)
+{
+	/* One second is 10^7 units of 100 nanoseconds. */
+	return cpu_to_le64(_100ns2seconds *
+				   (ts->tv_sec + SecondsToStartOf1970) +
+			   ts->tv_nsec / NTFS_TIME_GRAN);
+}
+
+/*
+ * nt2kernel - Converts on-disk nt time into kernel timestamp.
+ */
+static inline void nt2kernel(const __le64 tm, struct timespec64 *ts)
+{
+	u64 t = le64_to_cpu(tm) - _100ns2seconds * SecondsToStartOf1970;
+
+	/*
+	 * WARNING: do_div() modifies its first argument: afterwards 't'
+	 * holds the quotient (seconds) and the remainder is returned.
+	 */
+	ts->tv_nsec = do_div(t, _100ns2seconds) * 100;
+	ts->tv_sec = t;
+}
+
+/* ntfs_sb - Return the ntfs_sb_info stored in sb->s_fs_info. */
+static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/*
+ * ntfs_up_cluster - Align up on cluster boundary.
+ */
+static inline u64 ntfs_up_cluster(const struct ntfs_sb_info *sbi, u64 size)
+{
+	return (size + sbi->cluster_mask) & sbi->cluster_mask_inv;
+}
+
+/*
+ * ntfs_up_block - Align up on block (sb->s_blocksize) boundary.
+ */
+static inline u64 ntfs_up_block(const struct super_block *sb, u64 size)
+{
+	return (size + sb->s_blocksize - 1) & ~(u64)(sb->s_blocksize - 1);
+}
+
+/* bytes_to_cluster - Number of clusters needed to hold @size bytes. */
+static inline CLST bytes_to_cluster(const struct ntfs_sb_info *sbi, u64 size)
+{
+	return (size + sbi->cluster_mask) >> sbi->cluster_bits;
+}
+
+/* bytes_to_block - Number of blocks needed to hold @size bytes. */
+static inline u64 bytes_to_block(const struct super_block *sb, u64 size)
+{
+	return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
+}
+
+/*
+ * ntfs_bread - sb_bread() wrapper that logs the failing byte offset.
+ *
+ * Return: the buffer head, or NULL after reporting the error.
+ */
+static inline struct buffer_head *ntfs_bread(struct super_block *sb,
+					     sector_t block)
+{
+	struct buffer_head *bh = sb_bread(sb, block);
+
+	if (!bh) {
+		ntfs_err(sb, "failed to read volume at offset 0x%llx",
+			 (u64)block << sb->s_blocksize_bits);
+		return NULL;
+	}
+
+	return bh;
+}
+
+/* ntfs_i - Get the ntfs_inode that embeds the VFS @inode. */
+static inline struct ntfs_inode *ntfs_i(struct inode *inode)
+{
+	return container_of(inode, struct ntfs_inode, vfs_inode);
+}
+
+/*
+ * is_compressed - True if either the standard-information attribute flags
+ * carry FILE_ATTRIBUTE_COMPRESSED or any NI_FLAG_COMPRESSED_MASK bit is set.
+ */
+static inline bool is_compressed(const struct ntfs_inode *ni)
+{
+	if (ni->std_fa & FILE_ATTRIBUTE_COMPRESSED)
+		return true;
+
+	return ni->ni_flags & NI_FLAG_COMPRESSED_MASK;
+}
+
+static inline int
ni_ext_compress_bits(const struct ntfs_inode *ni)
+{
+	return 0xb + (ni->ni_flags & NI_FLAG_COMPRESSED_MASK);
+}
+
+/*
+ * ni_set_ext_compress_bits - Store @bits (0xc, 0xd, 0xe, 0xf or 0x10) in
+ * ni_flags as (bits - 0xb); ni_ext_compress_bits() adds the 0xb back.
+ * The value is OR-ed in; previously set mask bits are not cleared here.
+ */
+static inline void ni_set_ext_compress_bits(struct ntfs_inode *ni, u8 bits)
+{
+	ni->ni_flags |= (bits - 0xb) & NI_FLAG_COMPRESSED_MASK;
+}
+
+/* is_dedup - True if NI_FLAG_DEDUPLICATED is set in ni_flags. */
+static inline bool is_dedup(const struct ntfs_inode *ni)
+{
+	return ni->ni_flags & NI_FLAG_DEDUPLICATED;
+}
+
+/* is_encrypted - True if std_fa carries FILE_ATTRIBUTE_ENCRYPTED. */
+static inline bool is_encrypted(const struct ntfs_inode *ni)
+{
+	return ni->std_fa & FILE_ATTRIBUTE_ENCRYPTED;
+}
+
+/* is_sparsed - True if std_fa carries FILE_ATTRIBUTE_SPARSE_FILE. */
+static inline bool is_sparsed(const struct ntfs_inode *ni)
+{
+	return ni->std_fa & FILE_ATTRIBUTE_SPARSE_FILE;
+}
+
+/* is_resident - Non-zero if NI_FLAG_RESIDENT is set in ni_flags. */
+static inline int is_resident(struct ntfs_inode *ni)
+{
+	return ni->ni_flags & NI_FLAG_RESIDENT;
+}
+
+/* le16_sub_cpu - Subtract CPU-order @val from the little-endian *@var. */
+static inline void le16_sub_cpu(__le16 *var, u16 val)
+{
+	*var = cpu_to_le16(le16_to_cpu(*var) - val);
+}
+
+/* le32_sub_cpu - Subtract CPU-order @val from the little-endian *@var. */
+static inline void le32_sub_cpu(__le32 *var, u32 val)
+{
+	*var = cpu_to_le32(le32_to_cpu(*var) - val);
+}
+
+/*
+ * nb_put - Drop the reference on every buffer head held in @nb and mark
+ * @nb as empty. Nothing is touched when @nb holds no buffers.
+ */
+static inline void nb_put(struct ntfs_buffers *nb)
+{
+	const u32 count = nb->nbufs;
+	u32 idx;
+
+	if (count == 0)
+		return;
+
+	for (idx = 0; idx < count; idx++)
+		put_bh(nb->bh[idx]);
+
+	nb->nbufs = 0;
+}
+
+/*
+ * put_indx_node - Free an index node: its index buffer, its buffer heads
+ * and the node itself. A NULL @in is accepted.
+ */
+static inline void put_indx_node(struct indx_node *in)
+{
+	if (in) {
+		kfree(in->index);
+		nb_put(&in->nb);
+		kfree(in);
+	}
+}
+
+/*
+ * mi_clear - Release the buffers and the record memory of @mi and reset
+ * mi->mrec to NULL; @mi itself is not freed.
+ */
+static inline void mi_clear(struct mft_inode *mi)
+{
+	nb_put(&mi->nb);
+	kfree(mi->mrec);
+	mi->mrec = NULL;
+}
+
+/* ni_lock - Take ni->ni_lock with lockdep subclass NTFS_INODE_MUTEX_NORMAL. */
+static inline void ni_lock(struct ntfs_inode *ni)
+{
+	mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_NORMAL);
+}
+
+/* ni_lock_dir - Take ni->ni_lock with subclass NTFS_INODE_MUTEX_PARENT. */
+static inline void ni_lock_dir(struct ntfs_inode *ni)
+{
+	mutex_lock_nested(&ni->ni_lock, NTFS_INODE_MUTEX_PARENT);
+}
+
+/* ni_unlock - Release ni->ni_lock. */
+static inline void ni_unlock(struct ntfs_inode *ni)
+{
+	mutex_unlock(&ni->ni_lock);
+}
+
+/* ni_trylock - mutex_trylock() on ni->ni_lock; returns its result. */
+static inline int ni_trylock(struct ntfs_inode *ni)
+{
+	return mutex_trylock(&ni->ni_lock);
+}
+
+/*
+ * attr_load_runs_attr - attr_load_runs_vcn() keyed by the type and name of
+ * an attribute that is already at hand.
+ */
+static inline int attr_load_runs_attr(struct ntfs_inode *ni,
+				      struct ATTRIB *attr,
+				      struct runs_tree *run, CLST vcn)
+{
+	return attr_load_runs_vcn(ni,
attr->type, attr_name(attr), + attr->name_len, run, vcn); +} + +static inline void le64_sub_cpu(__le64 *var, u64 val) +{ + *var = cpu_to_le64(le64_to_cpu(*var) - val); +} + +#endif /* _LINUX_NTFS3_NTFS_FS_H */ diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c new file mode 100644 index 000000000000..103705c86772 --- /dev/null +++ b/fs/ntfs3/record.c @@ -0,0 +1,605 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +static inline int compare_attr(const struct ATTRIB *left, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, + const u16 *upcase) +{ + /* First, compare the type codes. */ + int diff = le32_to_cpu(left->type) - le32_to_cpu(type); + + if (diff) + return diff; + + /* They have the same type code, so we have to compare the names. */ + return ntfs_cmp_names(attr_name(left), left->name_len, name, name_len, + upcase, true); +} + +/* + * mi_new_attt_id + * + * Return: Unused attribute id that is less than mrec->next_attr_id. + */ +static __le16 mi_new_attt_id(struct mft_inode *mi) +{ + u16 free_id, max_id, t16; + struct MFT_REC *rec = mi->mrec; + struct ATTRIB *attr; + __le16 id; + + id = rec->next_attr_id; + free_id = le16_to_cpu(id); + if (free_id < 0x7FFF) { + rec->next_attr_id = cpu_to_le16(free_id + 1); + return id; + } + + /* One record can store up to 1024/24 ~= 42 attributes. 
*/ + free_id = 0; + max_id = 0; + + attr = NULL; + + for (;;) { + attr = mi_enum_attr(mi, attr); + if (!attr) { + rec->next_attr_id = cpu_to_le16(max_id + 1); + mi->dirty = true; + return cpu_to_le16(free_id); + } + + t16 = le16_to_cpu(attr->id); + if (t16 == free_id) { + free_id += 1; + attr = NULL; + } else if (max_id < t16) + max_id = t16; + } +} + +int mi_get(struct ntfs_sb_info *sbi, CLST rno, struct mft_inode **mi) +{ + int err; + struct mft_inode *m = kzalloc(sizeof(struct mft_inode), GFP_NOFS); + + if (!m) + return -ENOMEM; + + err = mi_init(m, sbi, rno); + if (err) { + kfree(m); + return err; + } + + err = mi_read(m, false); + if (err) { + mi_put(m); + return err; + } + + *mi = m; + return 0; +} + +void mi_put(struct mft_inode *mi) +{ + mi_clear(mi); + kfree(mi); +} + +int mi_init(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno) +{ + mi->sbi = sbi; + mi->rno = rno; + mi->mrec = kmalloc(sbi->record_size, GFP_NOFS); + if (!mi->mrec) + return -ENOMEM; + + return 0; +} + +/* + * mi_read - Read MFT data. + */ +int mi_read(struct mft_inode *mi, bool is_mft) +{ + int err; + struct MFT_REC *rec = mi->mrec; + struct ntfs_sb_info *sbi = mi->sbi; + u32 bpr = sbi->record_size; + u64 vbo = (u64)mi->rno << sbi->record_bits; + struct ntfs_inode *mft_ni = sbi->mft.ni; + struct runs_tree *run = mft_ni ? 
&mft_ni->file.run : NULL; + struct rw_semaphore *rw_lock = NULL; + + if (is_mounted(sbi)) { + if (!is_mft) { + rw_lock = &mft_ni->file.run_lock; + down_read(rw_lock); + } + } + + err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb); + if (rw_lock) + up_read(rw_lock); + if (!err) + goto ok; + + if (err == -E_NTFS_FIXUP) { + mi->dirty = true; + goto ok; + } + + if (err != -ENOENT) + goto out; + + if (rw_lock) { + ni_lock(mft_ni); + down_write(rw_lock); + } + err = attr_load_runs_vcn(mft_ni, ATTR_DATA, NULL, 0, &mft_ni->file.run, + vbo >> sbi->cluster_bits); + if (rw_lock) { + up_write(rw_lock); + ni_unlock(mft_ni); + } + if (err) + goto out; + + if (rw_lock) + down_read(rw_lock); + err = ntfs_read_bh(sbi, run, vbo, &rec->rhdr, bpr, &mi->nb); + if (rw_lock) + up_read(rw_lock); + + if (err == -E_NTFS_FIXUP) { + mi->dirty = true; + goto ok; + } + if (err) + goto out; + +ok: + /* Check field 'total' only here. */ + if (le32_to_cpu(rec->total) != bpr) { + err = -EINVAL; + goto out; + } + + return 0; + +out: + return err; +} + +struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) +{ + const struct MFT_REC *rec = mi->mrec; + u32 used = le32_to_cpu(rec->used); + u32 t32, off, asize; + u16 t16; + + if (!attr) { + u32 total = le32_to_cpu(rec->total); + + off = le16_to_cpu(rec->attr_off); + + if (used > total) + return NULL; + + if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 || + !IS_ALIGNED(off, 4)) { + return NULL; + } + + /* Skip non-resident records. */ + if (!is_rec_inuse(rec)) + return NULL; + + attr = Add2Ptr(rec, off); + } else { + /* Check if input attr inside record. */ + off = PtrOffset(rec, attr); + if (off >= used) + return NULL; + + asize = le32_to_cpu(attr->size); + if (asize < SIZEOF_RESIDENT) { + /* Impossible 'cause we should not return such attribute. */ + return NULL; + } + + attr = Add2Ptr(attr, asize); + off += asize; + } + + asize = le32_to_cpu(attr->size); + + /* Can we use the first field (attr->type). 
*/ + if (off + 8 > used) { + static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8); + return NULL; + } + + if (attr->type == ATTR_END) { + /* End of enumeration. */ + return NULL; + } + + /* 0x100 is last known attribute for now. */ + t32 = le32_to_cpu(attr->type); + if ((t32 & 0xf) || (t32 > 0x100)) + return NULL; + + /* Check boundary. */ + if (off + asize > used) + return NULL; + + /* Check size of attribute. */ + if (!attr->non_res) { + if (asize < SIZEOF_RESIDENT) + return NULL; + + t16 = le16_to_cpu(attr->res.data_off); + + if (t16 > asize) + return NULL; + + t32 = le32_to_cpu(attr->res.data_size); + if (t16 + t32 > asize) + return NULL; + + return attr; + } + + /* Check some nonresident fields. */ + if (attr->name_len && + le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > + le16_to_cpu(attr->nres.run_off)) { + return NULL; + } + + if (attr->nres.svcn || !is_attr_ext(attr)) { + if (asize + 8 < SIZEOF_NONRESIDENT) + return NULL; + + if (attr->nres.c_unit) + return NULL; + } else if (asize + 8 < SIZEOF_NONRESIDENT_EX) + return NULL; + + return attr; +} + +/* + * mi_find_attr - Find the attribute by type and name and id. 
+ */
+struct ATTRIB *mi_find_attr(struct mft_inode *mi, struct ATTRIB *attr,
+			    enum ATTR_TYPE type, const __le16 *name,
+			    size_t name_len, const __le16 *id)
+{
+	const u32 wanted = le32_to_cpu(type);
+
+	while ((attr = mi_enum_attr(mi, attr))) {
+		u32 cur = le32_to_cpu(attr->type);
+
+		/* A larger type code ends the search without a match. */
+		if (cur > wanted)
+			return NULL;
+
+		if (cur < wanted)
+			continue;
+
+		if (attr->name_len != name_len)
+			continue;
+
+		if (name_len &&
+		    memcmp(attr_name(attr), name, name_len * sizeof(short)))
+			continue;
+
+		/* The id is only compared when the caller supplied one. */
+		if (id && *id != attr->id)
+			continue;
+
+		return attr;
+	}
+
+	return NULL;
+}
+
+/*
+ * mi_write - Write the record back if it is dirty.
+ *
+ * Sets NTFS_FLAGS_MFTMIRR in sbi->flags when the record number is below
+ * sbi->mft.recs_mirr, and clears mi->dirty after a successful write.
+ *
+ * Return: 0 (also when there was nothing to write) or the error from
+ * ntfs_write_bh().
+ */
+int mi_write(struct mft_inode *mi, int wait)
+{
+	struct ntfs_sb_info *sbi = mi->sbi;
+	int err;
+
+	if (!mi->dirty)
+		return 0;
+
+	err = ntfs_write_bh(sbi, &mi->mrec->rhdr, &mi->nb, wait);
+	if (err)
+		return err;
+
+	if (mi->rno < sbi->mft.recs_mirr)
+		sbi->flags |= NTFS_FLAGS_MFTMIRR;
+
+	mi->dirty = false;
+	return 0;
+}
+
+int mi_format_new(struct mft_inode *mi, struct ntfs_sb_info *sbi, CLST rno,
+		  __le16 flags, bool is_mft)
+{
+	int err;
+	u16 seq = 1;
+	struct MFT_REC *rec;
+	u64 vbo = (u64)rno << sbi->record_bits;
+
+	err = mi_init(mi, sbi, rno);
+	if (err)
+		return err;
+
+	rec = mi->mrec;
+
+	if (rno == MFT_REC_MFT) {
+		;
+	} else if (rno < MFT_REC_FREE) {
+		seq = rno;
+	} else if (rno >= sbi->mft.used) {
+		;
+	} else if (mi_read(mi, is_mft)) {
+		;
+	} else if (rec->rhdr.sign == NTFS_FILE_SIGNATURE) {
+		/* Record is reused. Update its sequence number.
*/ + seq = le16_to_cpu(rec->seq) + 1; + if (!seq) + seq = 1; + } + + memcpy(rec, sbi->new_rec, sbi->record_size); + + rec->seq = cpu_to_le16(seq); + rec->flags = RECORD_FLAG_IN_USE | flags; + + mi->dirty = true; + + if (!mi->nb.nbufs) { + struct ntfs_inode *ni = sbi->mft.ni; + bool lock = false; + + if (is_mounted(sbi) && !is_mft) { + down_read(&ni->file.run_lock); + lock = true; + } + + err = ntfs_get_bh(sbi, &ni->file.run, vbo, sbi->record_size, + &mi->nb); + if (lock) + up_read(&ni->file.run_lock); + } + + return err; +} + +/* + * mi_mark_free - Mark record as unused and marks it as free in bitmap. + */ +void mi_mark_free(struct mft_inode *mi) +{ + CLST rno = mi->rno; + struct ntfs_sb_info *sbi = mi->sbi; + + if (rno >= MFT_REC_RESERVED && rno < MFT_REC_FREE) { + ntfs_clear_mft_tail(sbi, rno, rno + 1); + mi->dirty = false; + return; + } + + if (mi->mrec) { + clear_rec_inuse(mi->mrec); + mi->dirty = true; + mi_write(mi, 0); + } + ntfs_mark_rec_free(sbi, rno); +} + +/* + * mi_insert_attr - Reserve space for new attribute. + * + * Return: Not full constructed attribute or NULL if not possible to create. + */ +struct ATTRIB *mi_insert_attr(struct mft_inode *mi, enum ATTR_TYPE type, + const __le16 *name, u8 name_len, u32 asize, + u16 name_off) +{ + size_t tail; + struct ATTRIB *attr; + __le16 id; + struct MFT_REC *rec = mi->mrec; + struct ntfs_sb_info *sbi = mi->sbi; + u32 used = le32_to_cpu(rec->used); + const u16 *upcase = sbi->upcase; + int diff; + + /* Can we insert mi attribute? */ + if (used + asize > mi->sbi->record_size) + return NULL; + + /* + * Scan through the list of attributes to find the point + * at which we should insert it. + */ + attr = NULL; + while ((attr = mi_enum_attr(mi, attr))) { + diff = compare_attr(attr, type, name, name_len, upcase); + if (diff > 0) + break; + if (diff < 0) + continue; + + if (!is_attr_indexed(attr)) + return NULL; + break; + } + + if (!attr) { + tail = 8; /* Not used, just to suppress warning. 
*/ + attr = Add2Ptr(rec, used - 8); + } else { + tail = used - PtrOffset(rec, attr); + } + + id = mi_new_attt_id(mi); + + memmove(Add2Ptr(attr, asize), attr, tail); + memset(attr, 0, asize); + + attr->type = type; + attr->size = cpu_to_le32(asize); + attr->name_len = name_len; + attr->name_off = cpu_to_le16(name_off); + attr->id = id; + + memmove(Add2Ptr(attr, name_off), name, name_len * sizeof(short)); + rec->used = cpu_to_le32(used + asize); + + mi->dirty = true; + + return attr; +} + +/* + * mi_remove_attr - Remove the attribute from record. + * + * NOTE: The source attr will point to next attribute. + */ +bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi, + struct ATTRIB *attr) +{ + struct MFT_REC *rec = mi->mrec; + u32 aoff = PtrOffset(rec, attr); + u32 used = le32_to_cpu(rec->used); + u32 asize = le32_to_cpu(attr->size); + + if (aoff + asize > used) + return false; + + if (ni && is_attr_indexed(attr)) { + le16_add_cpu(&ni->mi.mrec->hard_links, -1); + ni->mi.dirty = true; + } + + used -= asize; + memmove(attr, Add2Ptr(attr, asize), used - aoff); + rec->used = cpu_to_le32(used); + mi->dirty = true; + + return true; +} + +/* bytes = "new attribute size" - "old attribute size" */ +bool mi_resize_attr(struct mft_inode *mi, struct ATTRIB *attr, int bytes) +{ + struct MFT_REC *rec = mi->mrec; + u32 aoff = PtrOffset(rec, attr); + u32 total, used = le32_to_cpu(rec->used); + u32 nsize, asize = le32_to_cpu(attr->size); + u32 rsize = le32_to_cpu(attr->res.data_size); + int tail = (int)(used - aoff - asize); + int dsize; + char *next; + + if (tail < 0 || aoff >= used) + return false; + + if (!bytes) + return true; + + total = le32_to_cpu(rec->total); + next = Add2Ptr(attr, asize); + + if (bytes > 0) { + dsize = ALIGN(bytes, 8); + if (used + dsize > total) + return false; + nsize = asize + dsize; + /* Move tail */ + memmove(next + dsize, next, tail); + memset(next, 0, dsize); + used += dsize; + rsize += dsize; + } else { + dsize = ALIGN(-bytes, 8); + if 
(dsize > asize) + return false; + nsize = asize - dsize; + memmove(next - dsize, next, tail); + used -= dsize; + rsize -= dsize; + } + + rec->used = cpu_to_le32(used); + attr->size = cpu_to_le32(nsize); + if (!attr->non_res) + attr->res.data_size = cpu_to_le32(rsize); + mi->dirty = true; + + return true; +} + +int mi_pack_runs(struct mft_inode *mi, struct ATTRIB *attr, + struct runs_tree *run, CLST len) +{ + int err = 0; + struct ntfs_sb_info *sbi = mi->sbi; + u32 new_run_size; + CLST plen; + struct MFT_REC *rec = mi->mrec; + CLST svcn = le64_to_cpu(attr->nres.svcn); + u32 used = le32_to_cpu(rec->used); + u32 aoff = PtrOffset(rec, attr); + u32 asize = le32_to_cpu(attr->size); + char *next = Add2Ptr(attr, asize); + u16 run_off = le16_to_cpu(attr->nres.run_off); + u32 run_size = asize - run_off; + u32 tail = used - aoff - asize; + u32 dsize = sbi->record_size - used; + + /* Make a maximum gap in current record. */ + memmove(next + dsize, next, tail); + + /* Pack as much as possible. */ + err = run_pack(run, svcn, len, Add2Ptr(attr, run_off), run_size + dsize, + &plen); + if (err < 0) { + memmove(next, next + dsize, tail); + return err; + } + + new_run_size = ALIGN(err, 8); + + memmove(next + new_run_size - run_size, next + dsize, tail); + + attr->size = cpu_to_le32(asize + new_run_size - run_size); + attr->nres.evcn = cpu_to_le64(svcn + plen - 1); + rec->used = cpu_to_le32(used + new_run_size - run_size); + mi->dirty = true; + + return 0; +} diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c new file mode 100644 index 000000000000..26ed2b64345e --- /dev/null +++ b/fs/ntfs3/run.c @@ -0,0 +1,1113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. 
+ * + * TODO: try to use extents tree (instead of array) + */ + +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/log2.h> +#include <linux/nls.h> + +#include "debug.h" +#include "ntfs.h" +#include "ntfs_fs.h" + +/* runs_tree is a continues memory. Try to avoid big size. */ +#define NTFS3_RUN_MAX_BYTES 0x10000 + +struct ntfs_run { + CLST vcn; /* Virtual cluster number. */ + CLST len; /* Length in clusters. */ + CLST lcn; /* Logical cluster number. */ +}; + +/* + * run_lookup - Lookup the index of a MCB entry that is first <= vcn. + * + * Case of success it will return non-zero value and set + * @index parameter to index of entry been found. + * Case of entry missing from list 'index' will be set to + * point to insertion position for the entry question. + */ +bool run_lookup(const struct runs_tree *run, CLST vcn, size_t *index) +{ + size_t min_idx, max_idx, mid_idx; + struct ntfs_run *r; + + if (!run->count) { + *index = 0; + return false; + } + + min_idx = 0; + max_idx = run->count - 1; + + /* Check boundary cases specially, 'cause they cover the often requests. */ + r = run->runs; + if (vcn < r->vcn) { + *index = 0; + return false; + } + + if (vcn < r->vcn + r->len) { + *index = 0; + return true; + } + + r += max_idx; + if (vcn >= r->vcn + r->len) { + *index = run->count; + return false; + } + + if (vcn >= r->vcn) { + *index = max_idx; + return true; + } + + do { + mid_idx = min_idx + ((max_idx - min_idx) >> 1); + r = run->runs + mid_idx; + + if (vcn < r->vcn) { + max_idx = mid_idx - 1; + if (!mid_idx) + break; + } else if (vcn >= r->vcn + r->len) { + min_idx = mid_idx + 1; + } else { + *index = mid_idx; + return true; + } + } while (min_idx <= max_idx); + + *index = max_idx + 1; + return false; +} + +/* + * run_consolidate - Consolidate runs starting from a given one. 
+ *
+ * Beginning with the run at @index, each run is compared with its
+ * successor. A successor that starts beyond the end of the current run
+ * stops the walk. An overlapped successor is trimmed at its head, or
+ * removed when it is covered completely. Two runs are joined when both are
+ * sparse, or when the successor's lcn directly follows the current run.
+ */
+static void run_consolidate(struct runs_tree *run, size_t index)
+{
+	size_t i;
+	struct ntfs_run *r = run->runs + index;
+
+	while (index + 1 < run->count) {
+		/*
+		 * I should merge current run with next
+		 * if start of the next run lies inside one being tested.
+		 */
+		struct ntfs_run *n = r + 1;
+		CLST end = r->vcn + r->len;
+		CLST dl;
+
+		/* Stop if runs are not aligned one to another. */
+		if (n->vcn > end)
+			break;
+
+		dl = end - n->vcn;
+
+		/*
+		 * If range at index overlaps with next one
+		 * then I will either adjust its start position
+		 * or (if completely covered) just remove it from the list.
+		 */
+		if (dl > 0) {
+			if (n->len <= dl)
+				goto remove_next_range;
+
+			n->len -= dl;
+			n->vcn += dl;
+			if (n->lcn != SPARSE_LCN)
+				n->lcn += dl;
+			dl = 0;
+		}
+
+		/*
+		 * If sparse mode of current and next runs differs
+		 * they cannot be joined: go on with the next pair.
+		 */
+		if ((n->lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) {
+			index += 1;
+			r = n;
+			continue;
+		}
+
+		/*
+		 * Check if volume block
+		 * of a next run lcn does not match
+		 * last volume block of the current run.
+		 */
+		if (n->lcn != SPARSE_LCN && n->lcn != r->lcn + r->len)
+			break;
+
+		/*
+		 * Next and current are siblings.
+		 * Eat/join.
+		 */
+		r->len += n->len - dl;
+
+remove_next_range:
+		i = run->count - (index + 1); /* Number of runs after the current one, 'n' included. */
+		if (i > 1)
+			memmove(n, n + 1, sizeof(*n) * (i - 1));
+
+		run->count -= 1;
+	}
+}
+
+/*
+ * run_is_mapped_full
+ *
+ * Return: True if range [svcn - evcn] is mapped.
+ *
+ * The walk starts at the run containing @svcn and follows consecutive
+ * runs; the first gap in vcn, or the end of the array, before @evcn is
+ * passed gives false.
+ */
+bool run_is_mapped_full(const struct runs_tree *run, CLST svcn, CLST evcn)
+{
+	size_t i;
+	const struct ntfs_run *r, *end;
+	CLST next_vcn;
+
+	if (!run_lookup(run, svcn, &i))
+		return false;
+
+	end = run->runs + run->count;
+	r = run->runs + i;
+
+	for (;;) {
+		next_vcn = r->vcn + r->len;
+		if (next_vcn > evcn)
+			return true;
+
+		if (++r >= end)
+			return false;
+
+		if (r->vcn != next_vcn)
+			return false;
+	}
+}
+/*
+ * run_lookup_entry - Translate @vcn through the run that contains it.
+ *
+ * On success *@lcn is SPARSE_LCN for a sparse run, otherwise the lcn that
+ * corresponds to @vcn. If @len is not NULL it receives the number of
+ * clusters left in the run counted from @vcn; if @index is not NULL it
+ * receives the position of the run.
+ *
+ * Return: false if the runs array is not allocated or no run covers @vcn.
+ */
+bool run_lookup_entry(const struct runs_tree *run, CLST vcn, CLST *lcn,
+		      CLST *len, size_t *index)
+{
+	size_t idx;
+	CLST gap;
+	struct ntfs_run *r;
+
+	/* Fail immediately if run was not touched yet. */
+	if (!run->runs)
+		return false;
+
+	if (!run_lookup(run, vcn, &idx))
+		return false;
+
+	r = run->runs + idx;
+
+	if (vcn >= r->vcn + r->len)
+		return false;
+
+	gap = vcn - r->vcn;
+	if (r->len <= gap)
+		return false;
+
+	*lcn = r->lcn == SPARSE_LCN ? SPARSE_LCN : (r->lcn + gap);
+
+	if (len)
+		*len = r->len - gap;
+	if (index)
+		*index = idx;
+
+	return true;
+}
+
+/*
+ * run_truncate_head - Decommit the range before vcn.
+ *
+ * A run that contains @vcn is trimmed so that it starts at @vcn; all runs
+ * before it (or before the insertion position, if no run contains @vcn)
+ * are dropped. The array is freed when no runs remain.
+ */
+void run_truncate_head(struct runs_tree *run, CLST vcn)
+{
+	size_t index;
+	struct ntfs_run *r;
+
+	if (run_lookup(run, vcn, &index)) {
+		r = run->runs + index;
+
+		if (vcn > r->vcn) {
+			CLST dlen = vcn - r->vcn;
+
+			r->vcn = vcn;
+			r->len -= dlen;
+			if (r->lcn != SPARSE_LCN)
+				r->lcn += dlen;
+		}
+
+		if (!index)
+			return;
+	}
+	r = run->runs;
+	memmove(r, r + index, sizeof(*r) * (run->count - index));
+
+	run->count -= index;
+
+	if (!run->count) {
+		kvfree(run->runs);
+		run->runs = NULL;
+		run->allocated = 0;
+	}
+}
+
+/*
+ * run_truncate - Decommit the range after vcn.
+ */
+void run_truncate(struct runs_tree *run, CLST vcn)
+{
+	size_t index;
+
+	/*
+	 * If I hit the range then
+	 * I have to truncate one.
+	 * If range to be truncated is becoming empty
+	 * then it will entirely be removed.
+	 */
+	if (run_lookup(run, vcn, &index)) {
+		struct ntfs_run *r = run->runs + index;
+
+		r->len = vcn - r->vcn;
+
+		if (r->len > 0)
+			index += 1;
+	}
+
+	/*
+	 * At this point 'index' is set to position that
+	 * should be thrown away (including index itself)
+	 * Simple one - just set the limit.
+	 */
+	run->count = index;
+
+	/* Do not reallocate array 'runs'. Only free if possible. */
+	if (!index) {
+		kvfree(run->runs);
+		run->runs = NULL;
+		run->allocated = 0;
+	}
+}
+
+/*
+ * run_truncate_around - Trim head and tail if necessary.
+ *
+ * Everything before @vcn is always dropped. The tail is cut at the middle
+ * run only when the number of remaining runs reaches half of what fits
+ * into NTFS3_RUN_MAX_BYTES.
+ */
+void run_truncate_around(struct runs_tree *run, CLST vcn)
+{
+	run_truncate_head(run, vcn);
+
+	if (run->count >= NTFS3_RUN_MAX_BYTES / sizeof(struct ntfs_run) / 2)
+		run_truncate(run, (run->runs + (run->count >> 1))->vcn);
+}
+
+/*
+ * run_add_entry
+ *
+ * Sets location to known state.
+ * Run to be added may overlap with existing location.
+ *
+ * @is_mft only suppresses the warning about the array growing beyond
+ * NTFS3_RUN_MAX_BYTES.
+ *
+ * Return: false if out of memory.
+ */
+bool run_add_entry(struct runs_tree *run, CLST vcn, CLST lcn, CLST len,
+		   bool is_mft)
+{
+	size_t used, index;
+	struct ntfs_run *r;
+	bool inrange;
+	CLST tail_vcn = 0, tail_len = 0, tail_lcn = 0;
+	bool should_add_tail = false;
+
+	/*
+	 * Lookup the insertion point.
+	 *
+	 * Execute bsearch for the entry containing
+	 * start position question.
+	 */
+	inrange = run_lookup(run, vcn, &index);
+
+	/*
+	 * Shortcut here would be case of
+	 * range not been found but one been added
+	 * continues previous run.
+	 * This case I can directly make use of
+	 * existing range as my start point.
+	 */
+	if (!inrange && index > 0) {
+		struct ntfs_run *t = run->runs + index - 1;
+
+		if (t->vcn + t->len == vcn &&
+		    (t->lcn == SPARSE_LCN) == (lcn == SPARSE_LCN) &&
+		    (lcn == SPARSE_LCN || lcn == t->lcn + t->len)) {
+			inrange = true;
+			index -= 1;
+		}
+	}
+
+	/*
+	 * At this point 'index' either points to the range
+	 * containing start position or to the insertion position
+	 * for a new range.
+	 * So first let's check if range I'm probing is here already.
+	 */
+	if (!inrange) {
+requires_new_range:
+		/*
+		 * Range was not found.
+		 * Insert at position 'index'
+		 */
+		used = run->count * sizeof(struct ntfs_run);
+
+		/*
+		 * Check allocated space.
+		 * If one is not enough to get one more entry
+		 * then it will be reallocated.
+		 */
+		if (run->allocated < used + sizeof(struct ntfs_run)) {
+			size_t bytes;
+			struct ntfs_run *new_ptr;
+
+			/* Use power of 2 for 'bytes'. */
+			if (!used) {
+				bytes = 64;
+			} else if (used <= 16 * PAGE_SIZE) {
+				if (is_power_of_2(run->allocated))
+					bytes = run->allocated << 1;
+				else
+					bytes = (size_t)1
+						<< (2 + blksize_bits(used));
+			} else {
+				bytes = run->allocated + (16 * PAGE_SIZE);
+			}
+
+			WARN_ON(!is_mft && bytes > NTFS3_RUN_MAX_BYTES);
+
+			new_ptr = kvmalloc(bytes, GFP_KERNEL);
+
+			if (!new_ptr)
+				return false;
+
+			r = new_ptr + index; /* Slot of the new entry; old entries are copied around it. */
+			memcpy(new_ptr, run->runs,
+			       index * sizeof(struct ntfs_run));
+			memcpy(r + 1, run->runs + index,
+			       sizeof(struct ntfs_run) * (run->count - index));
+
+			kvfree(run->runs);
+			run->runs = new_ptr;
+			run->allocated = bytes;
+
+		} else {
+			size_t i = run->count - index;
+
+			r = run->runs + index;
+
+			/* memmove appears to be a bottle neck here... */
+			if (i > 0)
+				memmove(r + 1, r, sizeof(struct ntfs_run) * i);
+		}
+
+		r->vcn = vcn;
+		r->lcn = lcn;
+		r->len = len;
+		run->count += 1;
+	} else {
+		r = run->runs + index;
+
+		/*
+		 * If one of ranges was not allocated then we
+		 * have to split location we just matched and
+		 * insert current one.
+		 * In the common case this requires the tail to be
+		 * reinserted by a recursive call.
+		 */
+		if (((lcn == SPARSE_LCN) != (r->lcn == SPARSE_LCN)) ||
+		    (lcn != SPARSE_LCN && lcn != r->lcn + (vcn - r->vcn))) {
+			CLST to_eat = vcn - r->vcn;
+			CLST Tovcn = to_eat + len;
+
+			should_add_tail = Tovcn < r->len;
+
+			if (should_add_tail) {
+				tail_lcn = r->lcn == SPARSE_LCN
+						   ? SPARSE_LCN
+						   : (r->lcn + Tovcn);
+				tail_vcn = r->vcn + Tovcn;
+				tail_len = r->len - Tovcn;
+			}
+
+			if (to_eat > 0) {
+				r->len = to_eat;
+				inrange = false;
+				index += 1;
+				goto requires_new_range;
+			}
+
+			/* lcn should match the one we're going to add. */
+			r->lcn = lcn;
+		}
+
+		/*
+		 * If existing range fits then we're done.
+		 * Otherwise extend the found one; run_consolidate() below
+		 * takes care of the runs it now overlaps.
+		 */
+		if (r->vcn + r->len < vcn + len)
+			r->len += len - ((r->vcn + r->len) - vcn);
+	}
+
+	/*
+	 * And normalize it starting from insertion point.
+	 * It's possible that no insertion needed case if
+	 * start point lies within the range of an entry
+	 * that 'index' points to.
+	 */
+	if (inrange && index > 0)
+		index -= 1;
+	run_consolidate(run, index);
+	run_consolidate(run, index + 1);
+
+	/*
+	 * A special case.
+	 * We have to add the saved tail as an extra range.
+	 */
+	if (should_add_tail &&
+	    !run_add_entry(run, tail_vcn, tail_lcn, tail_len, is_mft))
+		return false;
+
+	return true;
+}
+
+/* run_collapse_range
+ *
+ * Helper for attr_collapse_range(),
+ * which is helper for fallocate(collapse_range).
+ *
+ * Removes @len clusters starting at @vcn: runs inside the range are
+ * dropped, runs crossing its borders are shortened, and every run behind
+ * the range has its vcn reduced by @len.
+ *
+ * Return: false only if run_add_entry() fails while splitting a run.
+ */
+bool run_collapse_range(struct runs_tree *run, CLST vcn, CLST len)
+{
+	size_t index, eat;
+	struct ntfs_run *r, *e, *eat_start, *eat_end;
+	CLST end;
+
+	if (WARN_ON(!run_lookup(run, vcn, &index)))
+		return true; /* Should never be here. */
+
+	e = run->runs + run->count;
+	r = run->runs + index;
+	end = vcn + len;
+
+	if (vcn > r->vcn) {
+		if (r->vcn + r->len <= end) {
+			/* Collapse tail of run. */
+			r->len = vcn - r->vcn;
+		} else if (r->lcn == SPARSE_LCN) {
+			/* Collapse a middle part of sparsed run. */
+			r->len -= len;
+		} else {
+			/* Collapse a middle part of normal run, split. */
+			if (!run_add_entry(run, vcn, SPARSE_LCN, len, false))
+				return false;
+			return run_collapse_range(run, vcn, len);
+		}
+
+		r += 1;
+	}
+
+	eat_start = r;
+	eat_end = r;
+
+	for (; r < e; r++) {
+		CLST d;
+
+		if (r->vcn >= end) {
+			r->vcn -= len;
+			continue;
+		}
+
+		if (r->vcn + r->len <= end) {
+			/* Eat this run. */
+			eat_end = r + 1;
+			continue;
+		}
+
+		d = end - r->vcn; /* Part of this run that lies inside the collapsed range. */
+		if (r->lcn != SPARSE_LCN)
+			r->lcn += d;
+		r->len -= d;
+		r->vcn -= len - d;
+	}
+
+	eat = eat_end - eat_start;
+	memmove(eat_start, eat_end, (e - eat_end) * sizeof(*r));
+	run->count -= eat;
+
+	return true;
+}
+
+/*
+ * run_get_entry - Return index-th mapped region.
+ *
+ * Each of @vcn, @lcn and @len may be NULL if the caller does not need it.
+ *
+ * Return: false if @index is out of range or the stored length is zero.
+ */
+bool run_get_entry(const struct runs_tree *run, size_t index, CLST *vcn,
+		   CLST *lcn, CLST *len)
+{
+	const struct ntfs_run *r;
+
+	if (index >= run->count)
+		return false;
+
+	r = run->runs + index;
+
+	if (!r->len)
+		return false;
+
+	if (vcn)
+		*vcn = r->vcn;
+	if (lcn)
+		*lcn = r->lcn;
+	if (len)
+		*len = r->len;
+	return true;
+}
+
+/*
+ * run_packed_size - Calculate the size of packed int64.
+ *
+ * Counts how many low-order bytes of @n must be kept so that the value,
+ * sign included, is preserved. Big-endian host variant.
+ */
+#ifdef __BIG_ENDIAN
+static inline int run_packed_size(const s64 n)
+{
+	const u8 *p = (const u8 *)&n + sizeof(n) - 1;
+
+	if (n >= 0) {
+		if (p[-7] || p[-6] || p[-5] || p[-4])
+			p -= 4;
+		if (p[-3] || p[-2])
+			p -= 2;
+		if (p[-1])
+			p -= 1;
+		if (p[0] & 0x80)
+			p -= 1;
+	} else {
+		if (p[-7] != 0xff || p[-6] != 0xff || p[-5] != 0xff ||
+		    p[-4] != 0xff)
+			p -= 4;
+		if (p[-3] != 0xff || p[-2] != 0xff)
+			p -= 2;
+		if (p[-1] != 0xff)
+			p -= 1;
+		if (!(p[0] & 0x80))
+			p -= 1;
+	}
+	return (const u8 *)&n + sizeof(n) - p;
+}
+
+/*
+ * Fully trusted function. It does not check 'size' for errors.
+ * Stores the 'size' low-order bytes of 'v' into run_buf, lowest byte first.
+ */
+static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v)
+{
+	const u8 *p = (u8 *)&v;
+
+	switch (size) {
+	case 8:
+		run_buf[7] = p[0];
+		fallthrough;
+	case 7:
+		run_buf[6] = p[1];
+		fallthrough;
+	case 6:
+		run_buf[5] = p[2];
+		fallthrough;
+	case 5:
+		run_buf[4] = p[3];
+		fallthrough;
+	case 4:
+		run_buf[3] = p[4];
+		fallthrough;
+	case 3:
+		run_buf[2] = p[5];
+		fallthrough;
+	case 2:
+		run_buf[1] = p[6];
+		fallthrough;
+	case 1:
+		run_buf[0] = p[7];
+	}
+}
+
+/* Full trusted function. It does not check 'size' for errors.
*/ +static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v) +{ + u8 *p = (u8 *)&v; + + switch (size) { + case 8: + p[0] = run_buf[7]; + fallthrough; + case 7: + p[1] = run_buf[6]; + fallthrough; + case 6: + p[2] = run_buf[5]; + fallthrough; + case 5: + p[3] = run_buf[4]; + fallthrough; + case 4: + p[4] = run_buf[3]; + fallthrough; + case 3: + p[5] = run_buf[2]; + fallthrough; + case 2: + p[6] = run_buf[1]; + fallthrough; + case 1: + p[7] = run_buf[0]; + } + return v; +} + +#else + +static inline int run_packed_size(const s64 n) +{ + const u8 *p = (const u8 *)&n; + + if (n >= 0) { + if (p[7] || p[6] || p[5] || p[4]) + p += 4; + if (p[3] || p[2]) + p += 2; + if (p[1]) + p += 1; + if (p[0] & 0x80) + p += 1; + } else { + if (p[7] != 0xff || p[6] != 0xff || p[5] != 0xff || + p[4] != 0xff) + p += 4; + if (p[3] != 0xff || p[2] != 0xff) + p += 2; + if (p[1] != 0xff) + p += 1; + if (!(p[0] & 0x80)) + p += 1; + } + + return 1 + p - (const u8 *)&n; +} + +/* Full trusted function. It does not check 'size' for errors. */ +static inline void run_pack_s64(u8 *run_buf, u8 size, s64 v) +{ + const u8 *p = (u8 *)&v; + + /* memcpy( run_buf, &v, size); Is it faster? */ + switch (size) { + case 8: + run_buf[7] = p[7]; + fallthrough; + case 7: + run_buf[6] = p[6]; + fallthrough; + case 6: + run_buf[5] = p[5]; + fallthrough; + case 5: + run_buf[4] = p[4]; + fallthrough; + case 4: + run_buf[3] = p[3]; + fallthrough; + case 3: + run_buf[2] = p[2]; + fallthrough; + case 2: + run_buf[1] = p[1]; + fallthrough; + case 1: + run_buf[0] = p[0]; + } +} + +/* full trusted function. It does not check 'size' for errors */ +static inline s64 run_unpack_s64(const u8 *run_buf, u8 size, s64 v) +{ + u8 *p = (u8 *)&v; + + /* memcpy( &v, run_buf, size); Is it faster? 
*/ + switch (size) { + case 8: + p[7] = run_buf[7]; + fallthrough; + case 7: + p[6] = run_buf[6]; + fallthrough; + case 6: + p[5] = run_buf[5]; + fallthrough; + case 5: + p[4] = run_buf[4]; + fallthrough; + case 4: + p[3] = run_buf[3]; + fallthrough; + case 3: + p[2] = run_buf[2]; + fallthrough; + case 2: + p[1] = run_buf[1]; + fallthrough; + case 1: + p[0] = run_buf[0]; + } + return v; +} +#endif + +/* + * run_pack - Pack runs into buffer. + * + * packed_vcns - How much runs we have packed. + * packed_size - How much bytes we have used run_buf. + */ +int run_pack(const struct runs_tree *run, CLST svcn, CLST len, u8 *run_buf, + u32 run_buf_size, CLST *packed_vcns) +{ + CLST next_vcn, vcn, lcn; + CLST prev_lcn = 0; + CLST evcn1 = svcn + len; + int packed_size = 0; + size_t i; + bool ok; + s64 dlcn; + int offset_size, size_size, tmp; + + next_vcn = vcn = svcn; + + *packed_vcns = 0; + + if (!len) + goto out; + + ok = run_lookup_entry(run, vcn, &lcn, &len, &i); + + if (!ok) + goto error; + + if (next_vcn != vcn) + goto error; + + for (;;) { + next_vcn = vcn + len; + if (next_vcn > evcn1) + len = evcn1 - vcn; + + /* How much bytes required to pack len. */ + size_size = run_packed_size(len); + + /* offset_size - How much bytes is packed dlcn. */ + if (lcn == SPARSE_LCN) { + offset_size = 0; + dlcn = 0; + } else { + /* NOTE: lcn can be less than prev_lcn! */ + dlcn = (s64)lcn - prev_lcn; + offset_size = run_packed_size(dlcn); + prev_lcn = lcn; + } + + tmp = run_buf_size - packed_size - 2 - offset_size; + if (tmp <= 0) + goto out; + + /* Can we store this entire run. */ + if (tmp < size_size) + goto out; + + if (run_buf) { + /* Pack run header. */ + run_buf[0] = ((u8)(size_size | (offset_size << 4))); + run_buf += 1; + + /* Pack the length of run. */ + run_pack_s64(run_buf, size_size, len); + + run_buf += size_size; + /* Pack the offset from previous LCN. 
*/ + run_pack_s64(run_buf, offset_size, dlcn); + run_buf += offset_size; + } + + packed_size += 1 + offset_size + size_size; + *packed_vcns += len; + + if (packed_size + 1 >= run_buf_size || next_vcn >= evcn1) + goto out; + + ok = run_get_entry(run, ++i, &vcn, &lcn, &len); + if (!ok) + goto error; + + if (next_vcn != vcn) + goto error; + } + +out: + /* Store last zero. */ + if (run_buf) + run_buf[0] = 0; + + return packed_size + 1; + +error: + return -EOPNOTSUPP; +} + +/* + * run_unpack - Unpack packed runs from @run_buf. + * + * Return: Error if negative, or real used bytes. + */ +int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, + CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf, + u32 run_buf_size) +{ + u64 prev_lcn, vcn64, lcn, next_vcn; + const u8 *run_last, *run_0; + bool is_mft = ino == MFT_REC_MFT; + + /* Check for empty. */ + if (evcn + 1 == svcn) + return 0; + + if (evcn < svcn) + return -EINVAL; + + run_0 = run_buf; + run_last = run_buf + run_buf_size; + prev_lcn = 0; + vcn64 = svcn; + + /* Read all runs the chain. */ + /* size_size - How much bytes is packed len. */ + while (run_buf < run_last) { + /* size_size - How much bytes is packed len. */ + u8 size_size = *run_buf & 0xF; + /* offset_size - How much bytes is packed dlcn. */ + u8 offset_size = *run_buf++ >> 4; + u64 len; + + if (!size_size) + break; + + /* + * Unpack runs. + * NOTE: Runs are stored little endian order + * "len" is unsigned value, "dlcn" is signed. + * Large positive number requires to store 5 bytes + * e.g.: 05 FF 7E FF FF 00 00 00 + */ + if (size_size > 8) + return -EINVAL; + + len = run_unpack_s64(run_buf, size_size, 0); + /* Skip size_size. */ + run_buf += size_size; + + if (!len) + return -EINVAL; + + if (!offset_size) + lcn = SPARSE_LCN64; + else if (offset_size <= 8) { + s64 dlcn; + + /* Initial value of dlcn is -1 or 0. */ + dlcn = (run_buf[offset_size - 1] & 0x80) ? 
(s64)-1 : 0; + dlcn = run_unpack_s64(run_buf, offset_size, dlcn); + /* Skip offset_size. */ + run_buf += offset_size; + + if (!dlcn) + return -EINVAL; + lcn = prev_lcn + dlcn; + prev_lcn = lcn; + } else + return -EINVAL; + + next_vcn = vcn64 + len; + /* Check boundary. */ + if (next_vcn > evcn + 1) + return -EINVAL; + +#ifndef CONFIG_NTFS3_64BIT_CLUSTER + if (next_vcn > 0x100000000ull || (lcn + len) > 0x100000000ull) { + ntfs_err( + sbi->sb, + "This driver is compiled without CONFIG_NTFS3_64BIT_CLUSTER (like windows driver).\n" + "Volume contains 64 bits run: vcn %llx, lcn %llx, len %llx.\n" + "Activate CONFIG_NTFS3_64BIT_CLUSTER to process this case", + vcn64, lcn, len); + return -EOPNOTSUPP; + } +#endif + if (lcn != SPARSE_LCN64 && lcn + len > sbi->used.bitmap.nbits) { + /* LCN range is out of volume. */ + return -EINVAL; + } + + if (!run) + ; /* Called from check_attr(fslog.c) to check run. */ + else if (run == RUN_DEALLOCATE) { + /* + * Called from ni_delete_all to free clusters + * without storing in run. + */ + if (lcn != SPARSE_LCN64) + mark_as_free_ex(sbi, lcn, len, true); + } else if (vcn64 >= vcn) { + if (!run_add_entry(run, vcn64, lcn, len, is_mft)) + return -ENOMEM; + } else if (next_vcn > vcn) { + u64 dlen = vcn - vcn64; + + if (!run_add_entry(run, vcn, lcn + dlen, len - dlen, + is_mft)) + return -ENOMEM; + } + + vcn64 = next_vcn; + } + + if (vcn64 != evcn + 1) { + /* Not expected length of unpacked runs. */ + return -EINVAL; + } + + return run_buf - run_0; +} + +#ifdef NTFS3_CHECK_FREE_CLST +/* + * run_unpack_ex - Unpack packed runs from "run_buf". + * + * Checks unpacked runs to be used in bitmap. + * + * Return: Error if negative, or real used bytes. 
+ *
+ * After run_unpack() has succeeded, every non-sparse run in [svcn, evcn]
+ * is looked up in sbi->used.bitmap. If some of its clusters are not marked
+ * as used, the volume state is set to NTFS_DIRTY_ERROR and, provided the
+ * bitmap lock can be taken for writing without blocking, the free clusters
+ * of the run are marked as used.
+ *
+ * The bitmap check is skipped altogether when the bitmap has no 'sb', when
+ * @run is NULL or RUN_DEALLOCATE, and for MFT_REC_BADCLUST; single runs are
+ * skipped while NTFS_FLAGS_NEED_REPLAY is set.
+ */
+int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino,
+		  CLST svcn, CLST evcn, CLST vcn, const u8 *run_buf,
+		  u32 run_buf_size)
+{
+	int ret, err;
+	CLST next_vcn, lcn, len;
+	size_t index;
+	bool ok;
+	struct wnd_bitmap *wnd;
+
+	ret = run_unpack(run, sbi, ino, svcn, evcn, vcn, run_buf, run_buf_size);
+	if (ret <= 0)
+		return ret;
+
+	if (!sbi->used.bitmap.sb || !run || run == RUN_DEALLOCATE)
+		return ret;
+
+	if (ino == MFT_REC_BADCLUST)
+		return ret;
+
+	next_vcn = vcn = svcn; /* From here on 'vcn' is reused as the walk cursor. */
+	wnd = &sbi->used.bitmap;
+
+	for (ok = run_lookup_entry(run, vcn, &lcn, &len, &index);
+	     next_vcn <= evcn;
+	     ok = run_get_entry(run, ++index, &vcn, &lcn, &len)) {
+		if (!ok || next_vcn != vcn)
+			return -EINVAL;
+
+		next_vcn = vcn + len;
+
+		if (lcn == SPARSE_LCN)
+			continue;
+
+		if (sbi->flags & NTFS_FLAGS_NEED_REPLAY)
+			continue;
+
+		down_read_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS);
+		/* Check for free blocks. */
+		ok = wnd_is_used(wnd, lcn, len);
+		up_read(&wnd->rw_lock);
+		if (ok)
+			continue;
+
+		/* Looks like volume is corrupted. */
+		ntfs_set_state(sbi, NTFS_DIRTY_ERROR);
+
+		if (down_write_trylock(&wnd->rw_lock)) {
+			/* Mark all zero bits as used in range [lcn, lcn+len). */
+			CLST i, lcn_f = 0, len_f = 0; /* lcn_f/len_f: current stretch of free clusters. */
+
+			err = 0;
+			for (i = 0; i < len; i++) {
+				if (wnd_is_free(wnd, lcn + i, 1)) {
+					if (!len_f)
+						lcn_f = lcn + i;
+					len_f += 1;
+				} else if (len_f) {
+					err = wnd_set_used(wnd, lcn_f, len_f);
+					len_f = 0;
+					if (err)
+						break;
+				}
+			}
+
+			if (len_f) /* The run ended inside a free stretch. */
+				err = wnd_set_used(wnd, lcn_f, len_f);
+
+			up_write(&wnd->rw_lock);
+			if (err)
+				return err;
+		}
+	}
+
+	return ret;
+}
+#endif
+
+/*
+ * run_get_highest_vcn
+ *
+ * Return the highest vcn from a mapping pairs array
+ * it used while replaying log file.
+ */ +int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn) +{ + u64 vcn64 = vcn; + u8 size_size; + + while ((size_size = *run_buf & 0xF)) { + u8 offset_size = *run_buf++ >> 4; + u64 len; + + if (size_size > 8 || offset_size > 8) + return -EINVAL; + + len = run_unpack_s64(run_buf, size_size, 0); + if (!len) + return -EINVAL; + + run_buf += size_size + offset_size; + vcn64 += len; + +#ifndef CONFIG_NTFS3_64BIT_CLUSTER + if (vcn64 > 0x100000000ull) + return -EINVAL; +#endif + } + + *highest_vcn = vcn64 - 1; + return 0; +} diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c new file mode 100644 index 000000000000..55bbc9200a10 --- /dev/null +++ b/fs/ntfs3/super.c @@ -0,0 +1,1512 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. + * + * + * terminology + * + * cluster - allocation unit - 512,1K,2K,4K,...,2M + * vcn - virtual cluster number - Offset inside the file in clusters. + * vbo - virtual byte offset - Offset inside the file in bytes. + * lcn - logical cluster number - 0 based cluster in clusters heap. + * lbo - logical byte offset - Absolute position inside volume. + * run - maps VCN to LCN - Stored in attributes in packed form. + * attr - attribute segment - std/name/data etc records inside MFT. + * mi - MFT inode - One MFT record(usually 1024 bytes or 4K), consists of attributes. + * ni - NTFS inode - Extends linux inode. consists of one or more mft inodes. + * index - unit inside directory - 2K, 4K, <=page size, does not depend on cluster size. 
+ *
+ * WSL - Windows Subsystem for Linux
+ * https://docs.microsoft.com/en-us/windows/wsl/file-permissions
+ * It stores uid/gid/mode/dev in xattr
+ *
+ */
+
+#include <linux/backing-dev.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/iversion.h>
+#include <linux/log2.h>
+#include <linux/module.h>
+#include <linux/nls.h>
+#include <linux/parser.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+#include "lib/lib.h"
+#endif
+
+#ifdef CONFIG_PRINTK
+/*
+ * ntfs_printk - Trace warnings/notices/errors.
+ *
+ * The message is dropped when the per-volume rate limit
+ * (sbi->msg_ratelimit) is hit. The printk level is taken from the prefix
+ * of @fmt and the message is prefixed with "ntfs3: <sb->s_id>: ".
+ *
+ * Thanks Joe Perches <joe@perches.com> for implementation
+ */
+void ntfs_printk(const struct super_block *sb, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+	int level;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+	/* Should we use different ratelimits for warnings/notices/errors? */
+	if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3"))
+		return;
+
+	va_start(args, fmt);
+
+	level = printk_get_level(fmt);
+	vaf.fmt = printk_skip_level(fmt);
+	vaf.va = &args;
+	printk("%c%cntfs3: %s: %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf);
+
+	va_end(args);
+}
+
+static char s_name_buf[512];
+static atomic_t s_name_buf_cnt = ATOMIC_INIT(1); // 1 means 'free s_name_buf'.
+
+/*
+ * ntfs_inode_printk
+ *
+ * Print warnings/notices/errors about inode using name or inode number.
+ *
+ * The name is taken from one alias dentry of the inode (d_find_alias());
+ * it is left empty if there is none. The static buffer s_name_buf is used
+ * by the caller that brings s_name_buf_cnt down to zero; any other
+ * concurrent caller allocates a temporary buffer with GFP_NOFS and, if
+ * that fails, prints no name at all.
+ */
+void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
+{
+	struct super_block *sb = inode->i_sb;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	char *name;
+	va_list args;
+	struct va_format vaf;
+	int level;
+
+	if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3"))
+		return;
+
+	/* Use static allocated buffer, if possible. */
+	name = atomic_dec_and_test(&s_name_buf_cnt)
+		       ? s_name_buf
+		       : kmalloc(sizeof(s_name_buf), GFP_NOFS);
+
+	if (name) {
+		struct dentry *de = d_find_alias(inode);
+		const u32 name_len = ARRAY_SIZE(s_name_buf) - 1;
+
+		if (de) {
+			spin_lock(&de->d_lock);
+			snprintf(name, name_len, " \"%s\"", de->d_name.name);
+			spin_unlock(&de->d_lock);
+			name[name_len] = 0; /* To be sure. */
+		} else {
+			name[0] = 0;
+		}
+		dput(de); /* Cocci warns if placed in branch "if (de)" */
+	}
+
+	va_start(args, fmt);
+
+	level = printk_get_level(fmt);
+	vaf.fmt = printk_skip_level(fmt);
+	vaf.va = &args;
+
+	printk("%c%cntfs3: %s: ino=%lx,%s %pV\n", KERN_SOH_ASCII, level,
+	       sb->s_id, inode->i_ino, name ? name : "", &vaf);
+
+	va_end(args);
+
+	atomic_inc(&s_name_buf_cnt); /* Balances the decrement above, whichever buffer was used. */
+	if (name != s_name_buf)
+		kfree(name);
+}
+#endif
+
+/*
+ * Shared memory struct.
+ *
+ * On-disk ntfs's upcase table is created by ntfs formatter.
+ * 'upcase' table is 128K bytes of memory.
+ * We should read it into memory when mounting.
+ * Several ntfs volumes likely use the same 'upcase' table.
+ * It is good idea to share in-memory 'upcase' table between different volumes.
+ * Unfortunately winxp/vista/win7 use different upcase tables.
+ */
+static DEFINE_SPINLOCK(s_shared_lock);
+
+static struct {
+	void *ptr;
+	u32 len;
+	int cnt;
+} s_shared[8];
+
+/*
+ * ntfs_set_shared
+ *
+ * Return:
+ * * @ptr - If pointer was saved in shared memory.
+ * * NULL - If pointer was not shared.
+ */
+void *ntfs_set_shared(void *ptr, u32 bytes)
+{
+	void *ret = NULL;
+	int i, j = -1;
+
+	spin_lock(&s_shared_lock);
+	for (i = 0; i < ARRAY_SIZE(s_shared); i++) {
+		if (!s_shared[i].cnt) {
+			/* Remember a free slot in case no match is found. */
+			j = i;
+		} else if (bytes == s_shared[i].len &&
+			   !memcmp(s_shared[i].ptr, ptr, bytes)) {
+			/* Identical content is already registered: reuse it. */
+			s_shared[i].cnt += 1;
+			ret = s_shared[i].ptr;
+			break;
+		}
+	}
+
+	/* No match: register @ptr itself if a free slot was seen. */
+	if (!ret && j != -1) {
+		s_shared[j].ptr = ptr;
+		s_shared[j].len = bytes;
+		s_shared[j].cnt = 1;
+		ret = ptr;
+	}
+	spin_unlock(&s_shared_lock);
+
+	return ret;
+}
+
+/*
+ * ntfs_put_shared
+ *
+ * Return:
+ * * @ptr - If pointer is not shared anymore.
+ * * NULL - If pointer is still shared.
+ */
+void *ntfs_put_shared(void *ptr)
+{
+	void *ret = ptr;
+	int i;
+
+	spin_lock(&s_shared_lock);
+	for (i = 0; i < ARRAY_SIZE(s_shared); i++) {
+		if (s_shared[i].cnt && s_shared[i].ptr == ptr) {
+			/* Other users remain: the caller must not free @ptr. */
+			if (--s_shared[i].cnt)
+				ret = NULL;
+			break;
+		}
+	}
+	spin_unlock(&s_shared_lock);
+
+	return ret;
+}
+
+/* Release the resources held by @options (currently only the NLS table). */
+static inline void clear_mount_options(struct ntfs_mount_options *options)
+{
+	unload_nls(options->nls);
+}
+
+enum Opt {
+	Opt_uid,
+	Opt_gid,
+	Opt_umask,
+	Opt_dmask,
+	Opt_fmask,
+	Opt_immutable,
+	Opt_discard,
+	Opt_force,
+	Opt_sparse,
+	Opt_nohidden,
+	Opt_showmeta,
+	Opt_acl,
+	Opt_noatime,
+	Opt_nls,
+	Opt_prealloc,
+	Opt_no_acs_rules,
+	Opt_err,
+};
+
+static const match_table_t ntfs_tokens = {
+	{ Opt_uid, "uid=%u" },
+	{ Opt_gid, "gid=%u" },
+	{ Opt_umask, "umask=%o" },
+	{ Opt_dmask, "dmask=%o" },
+	{ Opt_fmask, "fmask=%o" },
+	{ Opt_immutable, "sys_immutable" },
+	{ Opt_discard, "discard" },
+	{ Opt_force, "force" },
+	{ Opt_sparse, "sparse" },
+	{ Opt_nohidden, "nohidden" },
+	{ Opt_acl, "acl" },
+	{ Opt_noatime, "noatime" },
+	{ Opt_showmeta, "showmeta" },
+	{ Opt_nls, "nls=%s" },
+	{ Opt_prealloc, "prealloc" },
+	{ Opt_no_acs_rules, "no_acs_rules" },
+	{ Opt_err, NULL },
+};
+
+/*
+ * ntfs_parse_options - Parse the comma separated mount option string.
+ * @sb:      Super block; "acl" and "noatime" set flags on it directly.
+ * @options: Option string, split in place by strsep(). May be NULL.
+ * @silent:  If non-zero, unrecognized options are not reported.
+ * @opts:    Receives the parsed options and the loaded NLS table.
+ *
+ * Return: 0 on success, -EINVAL on a bad value or if NLS cannot be loaded.
+ * On error @opts->nls is left untouched (no table has been loaded).
+ */
+static noinline int ntfs_parse_options(struct super_block *sb, char *options,
+				       int silent,
+				       struct ntfs_mount_options *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	char nls_name[30];
+	struct nls_table *nls;
+
+	/* Defaults are taken from the mounting task. */
+	opts->fs_uid = current_uid();
+	opts->fs_gid = current_gid();
+	opts->fs_fmask_inv = opts->fs_dmask_inv = ~current_umask();
+	nls_name[0] = 0;
+
+	if (!options)
+		goto out;
+
+	while ((p = strsep(&options, ","))) {
+		int token;
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, ntfs_tokens, args);
+		switch (token) {
+		case Opt_immutable:
+			opts->sys_immutable = 1;
+			break;
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			opts->fs_uid = make_kuid(current_user_ns(), option);
+			if (!uid_valid(opts->fs_uid))
+				return -EINVAL;
+			opts->uid = 1;
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			opts->fs_gid = make_kgid(current_user_ns(), option);
+			if (!gid_valid(opts->fs_gid))
+				return -EINVAL;
+			opts->gid = 1;
+			break;
+		case Opt_umask:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->fs_fmask_inv = opts->fs_dmask_inv = ~option;
+			opts->fmask = opts->dmask = 1;
+			break;
+		case Opt_dmask:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->fs_dmask_inv = ~option;
+			opts->dmask = 1;
+			break;
+		case Opt_fmask:
+			if (match_octal(&args[0], &option))
+				return -EINVAL;
+			opts->fs_fmask_inv = ~option;
+			opts->fmask = 1;
+			break;
+		case Opt_discard:
+			opts->discard = 1;
+			break;
+		case Opt_force:
+			opts->force = 1;
+			break;
+		case Opt_sparse:
+			opts->sparse = 1;
+			break;
+		case Opt_nohidden:
+			opts->nohidden = 1;
+			break;
+		case Opt_acl:
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+			sb->s_flags |= SB_POSIXACL;
+			break;
+#else
+			ntfs_err(sb, "support for ACL not compiled in!");
+			return -EINVAL;
+#endif
+		case Opt_noatime:
+			sb->s_flags |= SB_NOATIME;
+			break;
+		case Opt_showmeta:
+			opts->showmeta = 1;
+			break;
+		case Opt_nls:
+			match_strlcpy(nls_name, &args[0], sizeof(nls_name));
+			break;
+		case Opt_prealloc:
+			opts->prealloc = 1;
+			break;
+		case Opt_no_acs_rules:
+			opts->no_acs_rules = 1;
+			break;
+		default:
+			/*
+			 * Unknown options are reported but deliberately not
+			 * fatal: parsing continues with the next option.
+			 */
+			if (!silent)
+				ntfs_err(
+					sb,
+					"Unrecognized mount option \"%s\" or missing value",
+					p);
+		}
+	}
+
+out:
+	if (!strcmp(nls_name[0] ? nls_name : CONFIG_NLS_DEFAULT, "utf8")) {
+		/*
+		 * For UTF-8 use utf16s_to_utf8s()/utf8s_to_utf16s()
+		 * instead of NLS.
+		 */
+		nls = NULL;
+	} else if (nls_name[0]) {
+		nls = load_nls(nls_name);
+		if (!nls) {
+			ntfs_err(sb, "failed to load \"%s\"", nls_name);
+			return -EINVAL;
+		}
+	} else {
+		nls = load_nls_default();
+		if (!nls) {
+			ntfs_err(sb, "failed to load default nls");
+			return -EINVAL;
+		}
+	}
+	opts->nls = nls;
+
+	return 0;
+}
+
+/*
+ * ntfs_remount - super_operations::remount_fs
+ *
+ * Parse @data into sbi->options. If parsing fails or the read-only to
+ * read-write transition is refused, the previous options are put back
+ * unchanged.
+ *
+ * Return: 0 on success, -ENOMEM or -EINVAL on failure.
+ */
+static int ntfs_remount(struct super_block *sb, int *flags, char *data)
+{
+	int err, ro_rw;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	struct ntfs_mount_options old_opts;
+	/* ntfs_parse_options() splits @data in place; keep a copy to log. */
+	char *orig_data = kstrdup(data, GFP_KERNEL);
+
+	if (data && !orig_data)
+		return -ENOMEM;
+
+	/*
+	 * Store original options. The old NLS table must stay loaded here:
+	 * it is restored on every error path below, and it is released
+	 * exactly once, after the new options have been accepted.
+	 */
+	memcpy(&old_opts, &sbi->options, sizeof(old_opts));
+	memset(&sbi->options, 0, sizeof(sbi->options));
+
+	err = ntfs_parse_options(sb, data, 0, &sbi->options);
+	if (err)
+		goto restore_opts;
+
+	/* True only for a read-only -> read-write transition. */
+	ro_rw = sb_rdonly(sb) && !(*flags & SB_RDONLY);
+	if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) {
+		ntfs_warn(
+			sb,
+			"Couldn't remount rw because journal is not replayed. Please umount/remount instead\n");
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
+	sync_filesystem(sb);
+
+	if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) &&
+	    !sbi->options.force) {
+		ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!");
+		err = -EINVAL;
+		goto restore_opts;
+	}
+
+	/* New options are in effect: drop what the old ones were holding. */
+	clear_mount_options(&old_opts);
+
+	*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME) |
+		 SB_NODIRATIME | SB_NOATIME;
+	ntfs_info(sb, "re-mounted. Opts: %s", orig_data);
+	err = 0;
+	goto out;
+
+restore_opts:
+	/* Drop whatever the failed attempt loaded, then put the old set back. */
+	clear_mount_options(&sbi->options);
+	memcpy(&sbi->options, &old_opts, sizeof(old_opts));
+
+out:
+	kfree(orig_data);
+	return err;
+}
+
+static struct kmem_cache *ntfs_inode_cachep;
+
+/* super_operations::alloc_inode - allocate an ntfs_inode from the slab. */
+static struct inode *ntfs_alloc_inode(struct super_block *sb)
+{
+	struct ntfs_inode *ni = kmem_cache_alloc(ntfs_inode_cachep, GFP_NOFS);
+
+	if (!ni)
+		return NULL;
+
+	/* Zero only the fields that precede the embedded vfs_inode. */
+	memset(ni, 0, offsetof(struct ntfs_inode, vfs_inode));
+
+	mutex_init(&ni->ni_lock);
+
+	return &ni->vfs_inode;
+}
+
+/* RCU callback queued by ntfs_destroy_inode(): actually frees the inode. */
+static void ntfs_i_callback(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+	struct ntfs_inode *ni = ntfs_i(inode);
+
+	mutex_destroy(&ni->ni_lock);
+
+	kmem_cache_free(ntfs_inode_cachep, ni);
+}
+
+/* super_operations::destroy_inode - defer the free through call_rcu(). */
+static void ntfs_destroy_inode(struct inode *inode)
+{
+	call_rcu(&inode->i_rcu, ntfs_i_callback);
+}
+
+/* Slab constructor passed to kmem_cache_create(). */
+static void init_once(void *foo)
+{
+	struct ntfs_inode *ni = foo;
+
+	inode_init_once(&ni->vfs_inode);
+}
+
+/*
+ * put_ntfs - Noinline to reduce binary size.
+ */
+static noinline void put_ntfs(struct ntfs_sb_info *sbi)
+{
+	kfree(sbi->new_rec);
+	/* ntfs_put_shared() returns NULL while other users still share it. */
+	kvfree(ntfs_put_shared(sbi->upcase));
+	kfree(sbi->def_table);
+
+	wnd_close(&sbi->mft.bitmap);
+	wnd_close(&sbi->used.bitmap);
+
+	if (sbi->mft.ni)
+		iput(&sbi->mft.ni->vfs_inode);
+
+	if (sbi->security.ni)
+		iput(&sbi->security.ni->vfs_inode);
+
+	if (sbi->reparse.ni)
+		iput(&sbi->reparse.ni->vfs_inode);
+
+	if (sbi->objid.ni)
+		iput(&sbi->objid.ni->vfs_inode);
+
+	if (sbi->volume.ni)
+		iput(&sbi->volume.ni->vfs_inode);
+
+	ntfs_update_mftmirr(sbi, 0);
+
+	indx_clear(&sbi->security.index_sii);
+	indx_clear(&sbi->security.index_sdh);
+	indx_clear(&sbi->reparse.index_r);
+	indx_clear(&sbi->objid.index_o);
+	kfree(sbi->compress.lznt);
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+	xpress_free_decompressor(sbi->compress.xpress);
+	lzx_free_decompressor(sbi->compress.lzx);
+#endif
+	clear_mount_options(&sbi->options);
+
+	kfree(sbi);
+}
+
+/* super_operations::put_super */
+static void ntfs_put_super(struct super_block *sb)
+{
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+
+	/* Mark rw ntfs as clear, if possible. */
+	ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
+
+	put_ntfs(sbi);
+
+	sync_blockdev(sb->s_bdev);
+}
+
+/* super_operations::statfs - sizes are reported in clusters. */
+static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	struct wnd_bitmap *wnd = &sbi->used.bitmap;
+
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = sbi->cluster_size;
+	buf->f_blocks = wnd->nbits;
+
+	buf->f_bfree = buf->f_bavail = wnd_zeroes(wnd);
+	buf->f_fsid.val[0] = sbi->volume.ser_num;
+	buf->f_fsid.val[1] = (sbi->volume.ser_num >> 32);
+	buf->f_namelen = NTFS_NAME_LEN;
+
+	return 0;
+}
+
+/* super_operations::show_options - print the options that are in effect. */
+static int ntfs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct super_block *sb = root->d_sb;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	struct ntfs_mount_options *opts = &sbi->options;
+	struct user_namespace *user_ns = seq_user_ns(m);
+
+	if (opts->uid)
+		seq_printf(m, ",uid=%u",
+			   from_kuid_munged(user_ns, opts->fs_uid));
+	if (opts->gid)
+		seq_printf(m, ",gid=%u",
+			   from_kgid_munged(user_ns, opts->fs_gid));
+	if (opts->fmask)
+		seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv);
+	if (opts->dmask)
+		seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv);
+	/* A NULL table is what ntfs_parse_options() stores for utf8. */
+	if (opts->nls)
+		seq_printf(m, ",nls=%s", opts->nls->charset);
+	else
+		seq_puts(m, ",nls=utf8");
+	if (opts->sys_immutable)
+		seq_puts(m, ",sys_immutable");
+	if (opts->discard)
+		seq_puts(m, ",discard");
+	if (opts->sparse)
+		seq_puts(m, ",sparse");
+	if (opts->showmeta)
+		seq_puts(m, ",showmeta");
+	if (opts->nohidden)
+		seq_puts(m, ",nohidden");
+	if (opts->force)
+		seq_puts(m, ",force");
+	if (opts->no_acs_rules)
+		seq_puts(m, ",no_acs_rules");
+	if (opts->prealloc)
+		seq_puts(m, ",prealloc");
+	if (sb->s_flags & SB_POSIXACL)
+		seq_puts(m, ",acl");
+	if (sb->s_flags & SB_NOATIME)
+		seq_puts(m, ",noatime");
+
+	return 0;
+}
+
+/*
+ * ntfs_sync_fs - super_operations::sync_fs
+ *
+ * Write the $Secure, $ObjId and $Reparse inodes and return the first error
+ * seen. The dirty state is cleared only if all of them were written.
+ */
+static int ntfs_sync_fs(struct super_block *sb, int wait)
+{
+	int err = 0, err2;
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	struct ntfs_inode *ni;
+	struct inode *inode;
+
+	ni = sbi->security.ni;
+	if (ni) {
+		inode = &ni->vfs_inode;
+		err2 = _ni_write_inode(inode, wait);
+		if (err2 && !err)
+			err = err2;
+	}
+
+	ni = sbi->objid.ni;
+	if (ni) {
+		inode = &ni->vfs_inode;
+		err2 = _ni_write_inode(inode, wait);
+		if (err2 && !err)
+			err = err2;
+	}
+
+	ni = sbi->reparse.ni;
+	if (ni) {
+		inode = &ni->vfs_inode;
+		err2 = _ni_write_inode(inode, wait);
+		if (err2 && !err)
+			err = err2;
+	}
+
+	if (!err)
+		ntfs_set_state(sbi, NTFS_DIRTY_CLEAR);
+
+	ntfs_update_mftmirr(sbi, wait);
+
+	return err;
+}
+
+static const struct super_operations ntfs_sops = {
+	.alloc_inode = ntfs_alloc_inode,
+	.destroy_inode = ntfs_destroy_inode,
+	.evict_inode = ntfs_evict_inode,
+	.put_super = ntfs_put_super,
+	.statfs = ntfs_statfs,
+	.show_options = ntfs_show_options,
+	.sync_fs = ntfs_sync_fs,
+	.remount_fs = ntfs_remount,
+	.write_inode = ntfs3_write_inode,
+};
+
+/*
+ * Inode lookup used by the file handle helpers below. @ino and @generation
+ * are packed into an MFT reference; a bad inode is reported as -ESTALE.
+ */
+static struct inode *ntfs_export_get_inode(struct super_block *sb, u64 ino,
+					   u32 generation)
+{
+	struct MFT_REF ref;
+	struct inode *inode;
+
+	ref.low = cpu_to_le32(ino);
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+	ref.high = cpu_to_le16(ino >> 32);
+#else
+	ref.high = 0;
+#endif
+	ref.seq = cpu_to_le16(generation);
+
+	inode = ntfs_iget5(sb, &ref, NULL);
+	if (!IS_ERR(inode) && is_bad_inode(inode)) {
+		iput(inode);
+		inode = ERR_PTR(-ESTALE);
+	}
+
+	return inode;
+}
+
+static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    ntfs_export_get_inode);
+}
+
+static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    ntfs_export_get_inode);
+}
+
+/* TODO: == ntfs_sync_inode */
+static int ntfs_nfs_commit_metadata(struct inode *inode)
+{
+	return _ni_write_inode(inode, 1);
+}
+
+static const struct export_operations ntfs_export_ops = {
+	.fh_to_dentry = ntfs_fh_to_dentry,
+	.fh_to_parent = ntfs_fh_to_parent,
+	.get_parent = ntfs3_get_parent,
+	.commit_metadata = ntfs_nfs_commit_metadata,
+};
+
+/*
+ * format_size_gb - Return Gb,Mb to print with "%u.%02u Gb".
+ */
+static u32 format_size_gb(const u64 bytes, u32 *mb)
+{
+	/* Do simple right 30 bit shift of 64 bit value. */
+	u64 kbytes = bytes >> 10;
+	u32 kbytes32 = kbytes;
+
+	/* Hundredths of the remainder, rounded and capped at 99. */
+	*mb = (100 * (kbytes32 & 0xfffff) + 0x7ffff) >> 20;
+	if (*mb >= 100)
+		*mb = 99;
+
+	return (kbytes32 >> 20) | (((u32)(kbytes >> 32)) << 12);
+}
+
+/*
+ * true_sectors_per_clst - Decode the sectors-per-cluster boot field.
+ *
+ * Values up to 0x80 are the sector count itself. Larger values are treated
+ * as a negated shift count (e.g. 0xF4 gives 1 << 12).
+ *
+ * Return: sectors per cluster, or 0 if the encoded shift does not fit in
+ * 32 bits. The caller rejects 0 with its is_power_of_2() check.
+ */
+static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot)
+{
+	u32 shift;
+
+	if (boot->sectors_per_clusters <= 0x80)
+		return boot->sectors_per_clusters;
+
+	/*
+	 * '0 - value' is negative after integer promotion, and shifting by
+	 * a negative count is undefined. Use the magnitude modulo 256.
+	 */
+	shift = (0 - boot->sectors_per_clusters) & 0xff;
+	if (shift >= 32)
+		return 0;
+
+	return 1u << shift;
+}
+
+/*
+ * ntfs_init_from_boot - Init internal info from on-disk boot sector.
+ * @sb:          Super block being mounted.
+ * @sector_size: Logical sector size of the underlying device.
+ * @dev_size:    Size of the underlying device in bytes.
+ *
+ * Return: 0 on success, -EIO if the boot sector cannot be read, -EINVAL if
+ * it fails validation, -ENOMEM if the template MFT record cannot be
+ * allocated.
+ */
+static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
+			       u64 dev_size)
+{
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	int err;
+	u32 mb, gb, boot_sector_size, sct_per_clst, record_size;
+	u64 sectors, clusters, fs_size, mlcn, mlcn2;
+	struct NTFS_BOOT *boot;
+	struct buffer_head *bh;
+	struct MFT_REC *rec;
+	u16 fn, ao;
+
+	sbi->volume.blocks = dev_size >> PAGE_SHIFT;
+
+	bh = ntfs_bread(sb, 0);
+	if (!bh)
+		return -EIO;
+
+	/* Every validation failure below leaves with -EINVAL. */
+	err = -EINVAL;
+	boot = (struct NTFS_BOOT *)bh->b_data;
+
+	if (memcmp(boot->system_id, "NTFS    ", sizeof("NTFS    ") - 1))
+		goto out;
+
+	/* 0x55AA is not mandatory. Thanks Maxim Suhanov*/
+	/*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1])
+	 *	goto out;
+	 */
+
+	boot_sector_size = (u32)boot->bytes_per_sector[1] << 8;
+	if (boot->bytes_per_sector[0] || boot_sector_size < SECTOR_SIZE ||
+	    !is_power_of_2(boot_sector_size)) {
+		goto out;
+	}
+
+	/* cluster size: 512, 1K, 2K, 4K, ... 2M */
+	sct_per_clst = true_sectors_per_clst(boot);
+	if (!is_power_of_2(sct_per_clst))
+		goto out;
+
+	mlcn = le64_to_cpu(boot->mft_clst);
+	mlcn2 = le64_to_cpu(boot->mft2_clst);
+	sectors = le64_to_cpu(boot->sectors_per_volume);
+
+	if (mlcn * sct_per_clst >= sectors)
+		goto out;
+
+	if (mlcn2 * sct_per_clst >= sectors)
+		goto out;
+
+	/* Check MFT record size. */
+	if ((boot->record_size < 0 &&
+	     SECTOR_SIZE > (2U << (-boot->record_size))) ||
+	    (boot->record_size >= 0 && !is_power_of_2(boot->record_size))) {
+		goto out;
+	}
+
+	/* Check index record size. */
+	if ((boot->index_size < 0 &&
+	     SECTOR_SIZE > (2U << (-boot->index_size))) ||
+	    (boot->index_size >= 0 && !is_power_of_2(boot->index_size))) {
+		goto out;
+	}
+
+	sbi->sector_size = boot_sector_size;
+	sbi->sector_bits = blksize_bits(boot_sector_size);
+	fs_size = (sectors + 1) << sbi->sector_bits;
+
+	gb = format_size_gb(fs_size, &mb);
+
+	/*
+	 * - Volume formatted and mounted with the same sector size.
+	 * - Volume formatted 4K and mounted as 512.
+	 * - Volume formatted 512 and mounted as 4K.
+	 */
+	if (sbi->sector_size != sector_size) {
+		ntfs_warn(sb,
+			  "Different NTFS' sector size and media sector size");
+		dev_size += sector_size - 1;
+	}
+
+	sbi->cluster_size = boot_sector_size * sct_per_clst;
+	sbi->cluster_bits = blksize_bits(sbi->cluster_size);
+
+	sbi->mft.lbo = mlcn << sbi->cluster_bits;
+	sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits;
+
+	if (sbi->cluster_size < sbi->sector_size)
+		goto out;
+
+	sbi->cluster_mask = sbi->cluster_size - 1;
+	sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask;
+	/* A negative record_size is a shift count, otherwise it is clusters. */
+	sbi->record_size = record_size = boot->record_size < 0
+						 ? 1 << (-boot->record_size)
+						 : (u32)boot->record_size
+							   << sbi->cluster_bits;
+
+	if (record_size > MAXIMUM_BYTES_PER_MFT)
+		goto out;
+
+	sbi->record_bits = blksize_bits(record_size);
+	sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes
+
+	sbi->max_bytes_per_attr =
+		record_size - ALIGN(MFTRECORD_FIXUP_OFFSET_1, 8) -
+		ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) -
+		ALIGN(sizeof(enum ATTR_TYPE), 8);
+
+	sbi->index_size = boot->index_size < 0
+				  ? 1u << (-boot->index_size)
+				  : (u32)boot->index_size << sbi->cluster_bits;
+
+	sbi->volume.ser_num = le64_to_cpu(boot->serial_num);
+	sbi->volume.size = sectors << sbi->sector_bits;
+
+	/* Warning if RAW volume. */
+	if (dev_size < fs_size) {
+		u32 mb0, gb0;
+
+		gb0 = format_size_gb(dev_size, &mb0);
+		ntfs_warn(
+			sb,
+			"RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only",
+			gb, mb, gb0, mb0);
+		sb->s_flags |= SB_RDONLY;
+	}
+
+	clusters = sbi->volume.size >> sbi->cluster_bits;
+#ifndef CONFIG_NTFS3_64BIT_CLUSTER
+	/* 32 bits per cluster. */
+	if (clusters >> 32) {
+		ntfs_notice(
+			sb,
+			"NTFS %u.%02u Gb is too big to use 32 bits per cluster",
+			gb, mb);
+		goto out;
+	}
+#elif BITS_PER_LONG < 64
+#error "CONFIG_NTFS3_64BIT_CLUSTER incompatible in 32 bit OS"
+#endif
+
+	sbi->used.bitmap.nbits = clusters;
+
+	rec = kzalloc(record_size, GFP_NOFS);
+	if (!rec) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Pre-build an empty MFT record; it is kept in sbi->new_rec. */
+	sbi->new_rec = rec;
+	rec->rhdr.sign = NTFS_FILE_SIGNATURE;
+	rec->rhdr.fix_off = cpu_to_le16(MFTRECORD_FIXUP_OFFSET_1);
+	fn = (sbi->record_size >> SECTOR_SHIFT) + 1;
+	rec->rhdr.fix_num = cpu_to_le16(fn);
+	ao = ALIGN(MFTRECORD_FIXUP_OFFSET_1 + sizeof(short) * fn, 8);
+	rec->attr_off = cpu_to_le16(ao);
+	rec->used = cpu_to_le32(ao + ALIGN(sizeof(enum ATTR_TYPE), 8));
+	rec->total = cpu_to_le32(sbi->record_size);
+	((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END;
+
+	if (sbi->cluster_size < PAGE_SIZE)
+		sb_set_blocksize(sb, sbi->cluster_size);
+
+	sbi->block_mask = sb->s_blocksize - 1;
+	sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits;
+	sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits;
+
+	/* Maximum size for normal files. */
+	sbi->maxbytes = (clusters << sbi->cluster_bits) - 1;
+
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+	if (clusters >= (1ull << (64 - sbi->cluster_bits)))
+		sbi->maxbytes = -1;
+	sbi->maxbytes_sparse = -1;
+#else
+	/* Maximum size for sparse file. */
+	sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1;
+#endif
+
+	err = 0;
+
+out:
+	brelse(bh);
+
+	return err;
+}
+
+/*
+ * ntfs_fill_super - Try to mount.
+ *
+ * Loads the metadata files in a fixed order: $Volume, $MFTMirr, $LogFile
+ * (replay), $MFT, $BadClus, $Bitmap, $AttrDef, $UpCase, optional NTFS 3.x
+ * files, and the root directory.
+ *
+ * 'inode' always holds the reference the 'out' path has to drop. It is
+ * reset to NULL whenever that reference has been released or handed over
+ * to 'sbi', which put_ntfs() releases.
+ */
+static int ntfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	int err;
+	struct ntfs_sb_info *sbi;
+	struct block_device *bdev = sb->s_bdev;
+	struct inode *bd_inode = bdev->bd_inode;
+	struct request_queue *rq = bdev_get_queue(bdev);
+	struct inode *inode = NULL;
+	struct ntfs_inode *ni;
+	size_t i, tt;
+	CLST vcn, lcn, len;
+	struct ATTRIB *attr;
+	const struct VOLUME_INFO *info;
+	u32 idx, done, bytes;
+	struct ATTR_DEF_ENTRY *t;
+	u16 *upcase = NULL;
+	u16 *shared;
+	bool is_ro;
+	struct MFT_REF ref;
+
+	ref.high = 0;
+
+	sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS);
+	if (!sbi)
+		return -ENOMEM;
+
+	sb->s_fs_info = sbi;
+	sbi->sb = sb;
+	sb->s_flags |= SB_NODIRATIME;
+	sb->s_magic = 0x7366746e; // "ntfs"
+	sb->s_op = &ntfs_sops;
+	sb->s_export_op = &ntfs_export_ops;
+	sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec
+	sb->s_xattr = ntfs_xattr_handlers;
+
+	ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+			     DEFAULT_RATELIMIT_BURST);
+
+	err = ntfs_parse_options(sb, data, silent, &sbi->options);
+	if (err)
+		goto out;
+
+	if (!rq || !blk_queue_discard(rq) || !rq->limits.discard_granularity) {
+		;
+	} else {
+		sbi->discard_granularity = rq->limits.discard_granularity;
+		sbi->discard_granularity_mask_inv =
+			~(u64)(sbi->discard_granularity - 1);
+	}
+
+	sb_set_blocksize(sb, PAGE_SIZE);
+
+	/* Parse boot. */
+	err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512,
+				  bd_inode->i_size);
+	if (err)
+		goto out;
+
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+#else
+	sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits;
+#endif
+
+	mutex_init(&sbi->compress.mtx_lznt);
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+	mutex_init(&sbi->compress.mtx_xpress);
+	mutex_init(&sbi->compress.mtx_lzx);
+#endif
+
+	/*
+	 * Load $Volume. This should be done before $LogFile
+	 * 'cause 'sbi->volume.ni' is used 'ntfs_set_state'.
+	 */
+	ref.low = cpu_to_le32(MFT_REC_VOL);
+	ref.seq = cpu_to_le16(MFT_REC_VOL);
+	inode = ntfs_iget5(sb, &ref, &NAME_VOLUME);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $Volume.");
+		inode = NULL;
+		goto out;
+	}
+
+	ni = ntfs_i(inode);
+
+	/* Load and save label (not necessary). */
+	attr = ni_find_attr(ni, NULL, NULL, ATTR_LABEL, NULL, 0, NULL, NULL);
+
+	if (!attr) {
+		/* It is ok if no ATTR_LABEL */
+	} else if (!attr->non_res && !is_attr_ext(attr)) {
+		/* $AttrDef allows labels to be up to 128 symbols. */
+		err = utf16s_to_utf8s(resident_data(attr),
+				      le32_to_cpu(attr->res.data_size) >> 1,
+				      UTF16_LITTLE_ENDIAN, sbi->volume.label,
+				      sizeof(sbi->volume.label));
+		if (err < 0)
+			sbi->volume.label[0] = 0;
+	} else {
+		/* Should we break mounting here? */
+		//err = -EINVAL;
+		//goto out;
+	}
+
+	attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL);
+	if (!attr || is_attr_ext(attr)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO);
+	if (!info) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	sbi->volume.major_ver = info->major_ver;
+	sbi->volume.minor_ver = info->minor_ver;
+	sbi->volume.flags = info->flags;
+
+	/* The reference is now owned by sbi and dropped in put_ntfs(). */
+	sbi->volume.ni = ni;
+	inode = NULL;
+
+	/* Load $MFTMirr to estimate recs_mirr. */
+	ref.low = cpu_to_le32(MFT_REC_MIRR);
+	ref.seq = cpu_to_le16(MFT_REC_MIRR);
+	inode = ntfs_iget5(sb, &ref, &NAME_MIRROR);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $MFTMirr.");
+		inode = NULL;
+		goto out;
+	}
+
+	sbi->mft.recs_mirr =
+		ntfs_up_cluster(sbi, inode->i_size) >> sbi->record_bits;
+
+	iput(inode);
+
+	/* Load LogFile to replay. */
+	ref.low = cpu_to_le32(MFT_REC_LOG);
+	ref.seq = cpu_to_le16(MFT_REC_LOG);
+	inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load \x24LogFile.");
+		inode = NULL;
+		goto out;
+	}
+
+	ni = ntfs_i(inode);
+
+	err = ntfs_loadlog_and_replay(ni, sbi);
+	if (err)
+		goto out;
+
+	iput(inode);
+	inode = NULL;
+
+	is_ro = sb_rdonly(sbi->sb);
+
+	if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) {
+		if (!is_ro) {
+			ntfs_warn(sb,
+				  "failed to replay log file. Can't mount rw!");
+			err = -EINVAL;
+			goto out;
+		}
+	} else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) {
+		if (!is_ro && !sbi->options.force) {
+			ntfs_warn(
+				sb,
+				"volume is dirty and \"force\" flag is not set!");
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	/* Load $MFT. */
+	ref.low = cpu_to_le32(MFT_REC_MFT);
+	ref.seq = cpu_to_le16(1);
+
+	inode = ntfs_iget5(sb, &ref, &NAME_MFT);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $MFT.");
+		inode = NULL;
+		goto out;
+	}
+
+	ni = ntfs_i(inode);
+
+	sbi->mft.used = ni->i_valid >> sbi->record_bits;
+	tt = inode->i_size >> sbi->record_bits;
+	sbi->mft.next_free = MFT_REC_USER;
+
+	err = wnd_init(&sbi->mft.bitmap, sb, tt);
+	if (err)
+		goto out;
+
+	err = ni_load_all_mi(ni);
+	if (err)
+		goto out;
+
+	/* The reference is now owned by sbi and dropped in put_ntfs(). */
+	sbi->mft.ni = ni;
+
+	/* Load $BadClus. */
+	ref.low = cpu_to_le32(MFT_REC_BADCLUST);
+	ref.seq = cpu_to_le16(MFT_REC_BADCLUST);
+	inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $BadClus.");
+		inode = NULL;
+		goto out;
+	}
+
+	ni = ntfs_i(inode);
+
+	for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) {
+		if (lcn == SPARSE_LCN)
+			continue;
+
+		if (!sbi->bad_clusters)
+			ntfs_notice(sb, "Volume contains bad blocks");
+
+		sbi->bad_clusters += len;
+	}
+
+	iput(inode);
+
+	/* Load $Bitmap. */
+	ref.low = cpu_to_le32(MFT_REC_BITMAP);
+	ref.seq = cpu_to_le16(MFT_REC_BITMAP);
+	inode = ntfs_iget5(sb, &ref, &NAME_BITMAP);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $Bitmap.");
+		inode = NULL;
+		goto out;
+	}
+
+	ni = ntfs_i(inode);
+
+#ifndef CONFIG_NTFS3_64BIT_CLUSTER
+	if (inode->i_size >> 32) {
+		err = -EINVAL;
+		goto out;
+	}
+#endif
+
+	/* Check bitmap boundary. */
+	tt = sbi->used.bitmap.nbits;
+	if (inode->i_size < bitmap_size(tt)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	/* Not necessary. */
+	sbi->used.bitmap.set_tail = true;
+	err = wnd_init(&sbi->used.bitmap, sbi->sb, tt);
+	if (err)
+		goto out;
+
+	iput(inode);
+	/* Already released: 'out' must not drop it a second time. */
+	inode = NULL;
+
+	/* Compute the MFT zone. */
+	err = ntfs_refresh_zone(sbi);
+	if (err)
+		goto out;
+
+	/* Load $AttrDef. */
+	ref.low = cpu_to_le32(MFT_REC_ATTR);
+	ref.seq = cpu_to_le16(MFT_REC_ATTR);
+	inode = ntfs_iget5(sbi->sb, &ref, &NAME_ATTRDEF);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $AttrDef -> %d", err);
+		inode = NULL;
+		goto out;
+	}
+
+	if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) {
+		err = -EINVAL;
+		goto out;
+	}
+	bytes = inode->i_size;
+	sbi->def_table = t = kmalloc(bytes, GFP_NOFS);
+	if (!t) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	/* Copy the whole table into memory, one page at a time. */
+	for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) {
+		unsigned long tail = bytes - done;
+		struct page *page = ntfs_map_page(inode->i_mapping, idx);
+
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto out;
+		}
+		memcpy(Add2Ptr(t, done), page_address(page),
+		       min(PAGE_SIZE, tail));
+		ntfs_unmap_page(page);
+
+		/* The first entry must describe ATTR_STD. */
+		if (!idx && ATTR_STD != t->type) {
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	t += 1;
+	sbi->def_entries = 1;
+	done = sizeof(struct ATTR_DEF_ENTRY);
+	sbi->reparse.max_size = MAXIMUM_REPARSE_DATA_BUFFER_SIZE;
+	sbi->ea_max_size = 0x10000; /* default formatter value */
+
+	/* Stop at the first entry that is misaligned or out of order. */
+	while (done + sizeof(struct ATTR_DEF_ENTRY) <= bytes) {
+		u32 t32 = le32_to_cpu(t->type);
+		u64 sz = le64_to_cpu(t->max_sz);
+
+		if ((t32 & 0xF) || le32_to_cpu(t[-1].type) >= t32)
+			break;
+
+		if (t->type == ATTR_REPARSE)
+			sbi->reparse.max_size = sz;
+		else if (t->type == ATTR_EA)
+			sbi->ea_max_size = sz;
+
+		done += sizeof(struct ATTR_DEF_ENTRY);
+		t += 1;
+		sbi->def_entries += 1;
+	}
+	iput(inode);
+
+	/* Load $UpCase. */
+	ref.low = cpu_to_le32(MFT_REC_UPCASE);
+	ref.seq = cpu_to_le16(MFT_REC_UPCASE);
+	inode = ntfs_iget5(sb, &ref, &NAME_UPCASE);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load $UpCase.");
+		inode = NULL;
+		goto out;
+	}
+
+	ni = ntfs_i(inode);
+
+	if (inode->i_size != 0x10000 * sizeof(short)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	sbi->upcase = upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL);
+	if (!upcase) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) {
+		const __le16 *src;
+		u16 *dst = Add2Ptr(upcase, idx << PAGE_SHIFT);
+		struct page *page = ntfs_map_page(inode->i_mapping, idx);
+
+		if (IS_ERR(page)) {
+			err = PTR_ERR(page);
+			goto out;
+		}
+
+		src = page_address(page);
+
+#ifdef __BIG_ENDIAN
+		for (i = 0; i < PAGE_SIZE / sizeof(u16); i++)
+			*dst++ = le16_to_cpu(*src++);
+#else
+		memcpy(dst, src, PAGE_SIZE);
+#endif
+		ntfs_unmap_page(page);
+	}
+
+	/* If an identical table is already registered, use it and free ours. */
+	shared = ntfs_set_shared(upcase, 0x10000 * sizeof(short));
+	if (shared && upcase != shared) {
+		sbi->upcase = shared;
+		kvfree(upcase);
+	}
+
+	iput(inode);
+	inode = NULL;
+
+	if (is_ntfs3(sbi)) {
+		/* Load $Secure. */
+		err = ntfs_security_init(sbi);
+		if (err)
+			goto out;
+
+		/* Load $Extend. */
+		err = ntfs_extend_init(sbi);
+		if (err)
+			goto load_root;
+
+		/* Load $Extend\$Reparse. */
+		err = ntfs_reparse_init(sbi);
+		if (err)
+			goto load_root;
+
+		/* Load $Extend\$ObjId. */
+		err = ntfs_objid_init(sbi);
+		if (err)
+			goto load_root;
+	}
+
+load_root:
+	/* Load root. */
+	ref.low = cpu_to_le32(MFT_REC_ROOT);
+	ref.seq = cpu_to_le16(MFT_REC_ROOT);
+	inode = ntfs_iget5(sb, &ref, &NAME_ROOT);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		ntfs_err(sb, "Failed to load root.");
+		inode = NULL;
+		goto out;
+	}
+
+	ni = ntfs_i(inode);
+
+	sb->s_root = d_make_root(inode);
+
+	if (!sb->s_root) {
+		/*
+		 * d_make_root() drops the inode reference itself when it
+		 * fails, so 'out' must not drop it again.
+		 */
+		inode = NULL;
+		err = -EINVAL;
+		goto out;
+	}
+
+	return 0;
+
+out:
+	iput(inode);
+
+	if (sb->s_root) {
+		d_drop(sb->s_root);
+		sb->s_root = NULL;
+	}
+
+	put_ntfs(sbi);
+
+	sb->s_fs_info = NULL;
+	return err;
+}
+
+/*
+ * Drop block device aliases for @len clusters starting at @lcn, syncing the
+ * block device every 'limit' blocks. The limit is derived from the number
+ * of free pages.
+ */
+void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len)
+{
+	struct ntfs_sb_info *sbi = sb->s_fs_info;
+	struct block_device *bdev = sb->s_bdev;
+	sector_t devblock = (u64)lcn * sbi->blocks_per_cluster;
+	unsigned long blocks = (u64)len * sbi->blocks_per_cluster;
+	unsigned long cnt = 0;
+	unsigned long limit = global_zone_page_state(NR_FREE_PAGES)
+			      << (PAGE_SHIFT - sb->s_blocksize_bits);
+
+	if (limit >= 0x2000)
+		limit -= 0x1000;
+	else if (limit < 32)
+		limit = 32;
+	else
+		limit >>= 1;
+
+	while (blocks--) {
+		clean_bdev_aliases(bdev, devblock++, 1);
+		if (cnt++ >= limit) {
+			sync_blockdev(bdev);
+			cnt = 0;
+		}
+	}
+}
+
+/*
+ * ntfs_discard - Issue a discard request (trim for SSD).
+ *
+ * Return: 0 on success or if the range holds no whole discard granule,
+ * -EOPNOTSUPP if discard is not enabled or not supported, otherwise the
+ * error from blkdev_issue_discard().
+ */
+int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len)
+{
+	int err;
+	u64 lbo, bytes, start, end;
+	struct super_block *sb;
+
+	if (sbi->used.next_free_lcn == lcn + len)
+		sbi->used.next_free_lcn = lcn;
+
+	if (sbi->flags & NTFS_FLAGS_NODISCARD)
+		return -EOPNOTSUPP;
+
+	if (!sbi->options.discard)
+		return -EOPNOTSUPP;
+
+	lbo = (u64)lcn << sbi->cluster_bits;
+	bytes = (u64)len << sbi->cluster_bits;
+
+	/* Align up 'start' on discard_granularity. */
+	start = (lbo + sbi->discard_granularity - 1) &
+		sbi->discard_granularity_mask_inv;
+	/* Align down 'end' on discard_granularity. */
+	end = (lbo + bytes) & sbi->discard_granularity_mask_inv;
+
+	sb = sbi->sb;
+	if (start >= end)
+		return 0;
+
+	err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9,
+				   GFP_NOFS, 0);
+
+	/* Remember the refusal so later calls return early. */
+	if (err == -EOPNOTSUPP)
+		sbi->flags |= NTFS_FLAGS_NODISCARD;
+
+	return err;
+}
+
+static struct dentry *ntfs_mount(struct file_system_type *fs_type, int flags,
+				 const char *dev_name, void *data)
+{
+	return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
+}
+
+// clang-format off
+static struct file_system_type ntfs_fs_type = {
+	.owner		= THIS_MODULE,
+	.name		= "ntfs3",
+	.mount		= ntfs_mount,
+	.kill_sb	= kill_block_super,
+	.fs_flags	= FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
+};
+// clang-format on
+
+/* Module init: set up the bitmap code and inode cache, then register. */
+static int __init init_ntfs_fs(void)
+{
+	int err;
+
+	pr_info("ntfs3: Max link count %u\n", NTFS_LINK_MAX);
+
+	if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL))
+		pr_info("ntfs3: Enabled Linux POSIX ACLs support\n");
+	if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER))
+		pr_notice("ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n");
+	if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS))
+		pr_info("ntfs3: Read-only LZX/Xpress compression included\n");
+
+	err = ntfs3_init_bitmap();
+	if (err)
+		return err;
+
+	ntfs_inode_cachep = kmem_cache_create(
+		"ntfs_inode_cache", sizeof(struct ntfs_inode), 0,
+		(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+		init_once);
+	if (!ntfs_inode_cachep) {
+		err = -ENOMEM;
+		goto out1;
+	}
+
+	err = register_filesystem(&ntfs_fs_type);
+	if (err)
+		goto out;
+
+	return 0;
+out:
+	kmem_cache_destroy(ntfs_inode_cachep);
+out1:
+	ntfs3_exit_bitmap();
+	return err;
+}
+
+static void __exit exit_ntfs_fs(void)
+{
+	if (ntfs_inode_cachep) {
+		/* Wait for pending ntfs_i_callback() frees before destroying. */
+		rcu_barrier();
+		kmem_cache_destroy(ntfs_inode_cachep);
+	}
+
+	unregister_filesystem(&ntfs_fs_type);
+	ntfs3_exit_bitmap();
+}
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("ntfs3 read/write filesystem");
+#ifdef CONFIG_NTFS3_FS_POSIX_ACL
+MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support");
+#endif
+#ifdef CONFIG_NTFS3_64BIT_CLUSTER
+MODULE_INFO(cluster, "Warning: Activated 64 bits per cluster. Windows does not support this");
+#endif
+#ifdef CONFIG_NTFS3_LZX_XPRESS
+MODULE_INFO(compression, "Read-only lzx/xpress compression included");
+#endif
+
+MODULE_AUTHOR("Konstantin Komarov");
+MODULE_ALIAS_FS("ntfs3");
+
+module_init(init_ntfs_fs);
+module_exit(exit_ntfs_fs);
diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c
new file mode 100644
index 000000000000..bbeba778237e
--- /dev/null
+++ b/fs/ntfs3/upcase.c
@@ -0,0 +1,108 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/nls.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+/*
+ * Upper-case one character: anything below 'a' is returned unchanged,
+ * 'a'..'z' is converted arithmetically, everything else is looked up in
+ * the @upcase table.
+ */
+static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr)
+{
+	if (chr < 'a')
+		return chr;
+
+	if (chr <= 'z')
+		return chr - ('a' - 'A');
+
+	return upcase[chr];
+}
+
+/*
+ * ntfs_cmp_names
+ *
+ * Thanks Kari Argillander <kari.argillander@gmail.com> for idea and implementation 'bothcase'
+ *
+ * Straight way to compare names:
+ * - Case insensitive
+ * - If name equals and 'bothcases' then
+ * - Case sensitive
+ * 'Straight way' code scans input names twice in worst case.
+ * Optimized code scans input names only once.
+ */
+int ntfs_cmp_names(const __le16 *s1, size_t l1, const __le16 *s2, size_t l2,
+		   const u16 *upcase, bool bothcase)
+{
+	int diff1 = 0;
+	int diff2;
+	size_t len = min(l1, l2);
+
+	/* Pure case-insensitive compare: skip the case-sensitive pass. */
+	if (!bothcase && upcase)
+		goto case_insentive;
+
+	for (; len; s1++, s2++, len--) {
+		diff1 = le16_to_cpu(*s1) - le16_to_cpu(*s2);
+		if (diff1) {
+			/*
+			 * First case-sensitive difference. diff1 is kept as
+			 * the tie-breaker and the scan continues from here,
+			 * case-insensitively.
+			 */
+			if (bothcase && upcase)
+				goto case_insentive;
+
+			return diff1;
+		}
+	}
+	return l1 - l2;
+
+case_insentive:
+	for (; len; s1++, s2++, len--) {
+		diff2 = upcase_unicode_char(upcase, le16_to_cpu(*s1)) -
+			upcase_unicode_char(upcase, le16_to_cpu(*s2));
+		if (diff2)
+			return diff2;
+	}
+
+	/* Equal ignoring case: order by length, then by diff1. */
+	diff2 = l1 - l2;
+	return diff2 ? diff2 : diff1;
+}
+
+/*
+ * Same comparison as ntfs_cmp_names(), except that the characters of @uni1
+ * are used as is, while those of @uni2 go through le16_to_cpu().
+ */
+int ntfs_cmp_names_cpu(const struct cpu_str *uni1, const struct le_str *uni2,
+		       const u16 *upcase, bool bothcase)
+{
+	const u16 *s1 = uni1->name;
+	const __le16 *s2 = uni2->name;
+	size_t l1 = uni1->len;
+	size_t l2 = uni2->len;
+	size_t len = min(l1, l2);
+	int diff1 = 0;
+	int diff2;
+
+	if (!bothcase && upcase)
+		goto case_insentive;
+
+	for (; len; s1++, s2++, len--) {
+		diff1 = *s1 - le16_to_cpu(*s2);
+		if (diff1) {
+			if (bothcase && upcase)
+				goto case_insentive;
+
+			return diff1;
+		}
+	}
+	return l1 - l2;
+
+case_insentive:
+	for (; len; s1++, s2++, len--) {
+		diff2 = upcase_unicode_char(upcase, *s1) -
+			upcase_unicode_char(upcase, le16_to_cpu(*s2));
+		if (diff2)
+			return diff2;
+	}
+
+	diff2 = l1 - l2;
+	return diff2 ? diff2 : diff1;
+}
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
new file mode 100644
index 000000000000..7282d85c4ece
--- /dev/null
+++ b/fs/ntfs3/xattr.c
@@ -0,0 +1,1122 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ *
+ * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/nls.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+#include "debug.h"
+#include "ntfs.h"
+#include "ntfs_fs.h"
+
+// clang-format off
+#define SYSTEM_DOS_ATTRIB    "system.dos_attrib"
+#define SYSTEM_NTFS_ATTRIB   "system.ntfs_attrib"
+#define SYSTEM_NTFS_SECURITY "system.ntfs_security"
+// clang-format on
+
+/*
+ * Size of one EA entry: the stored 'size' field if it is non-zero,
+ * otherwise the size computed from the name and value lengths, rounded up
+ * to a multiple of 4.
+ */
+static inline size_t unpacked_ea_size(const struct EA_FULL *ea)
+{
+	return ea->size ? le32_to_cpu(ea->size)
+			: ALIGN(struct_size(ea, name,
+					    1 + ea->name_len +
+						    le16_to_cpu(ea->elength)),
+				4);
+}
+
+/*
+ * Size of one EA entry counted from its 'flags' field, with no rounding.
+ */
+static inline size_t packed_ea_size(const struct EA_FULL *ea)
+{
+	return struct_size(ea, name,
+			   1 + ea->name_len + le16_to_cpu(ea->elength)) -
+	       offsetof(struct EA_FULL, flags);
+}
+
+/*
+ * find_ea
+ *
+ * Assume there is at least one xattr in the list.
+ */ +static inline bool find_ea(const struct EA_FULL *ea_all, u32 bytes, + const char *name, u8 name_len, u32 *off) +{ + *off = 0; + + if (!ea_all || !bytes) + return false; + + for (;;) { + const struct EA_FULL *ea = Add2Ptr(ea_all, *off); + u32 next_off = *off + unpacked_ea_size(ea); + + if (next_off > bytes) + return false; + + if (ea->name_len == name_len && + !memcmp(ea->name, name, name_len)) + return true; + + *off = next_off; + if (next_off >= bytes) + return false; + } +} + +/* + * ntfs_read_ea - Read all extended attributes. + * @ea: New allocated memory. + * @info: Pointer into resident data. + */ +static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, + size_t add_bytes, const struct EA_INFO **info) +{ + int err; + struct ATTR_LIST_ENTRY *le = NULL; + struct ATTRIB *attr_info, *attr_ea; + void *ea_p; + u32 size; + + static_assert(le32_to_cpu(ATTR_EA_INFO) < le32_to_cpu(ATTR_EA)); + + *ea = NULL; + *info = NULL; + + attr_info = + ni_find_attr(ni, NULL, &le, ATTR_EA_INFO, NULL, 0, NULL, NULL); + attr_ea = + ni_find_attr(ni, attr_info, &le, ATTR_EA, NULL, 0, NULL, NULL); + + if (!attr_ea || !attr_info) + return 0; + + *info = resident_data_ex(attr_info, sizeof(struct EA_INFO)); + if (!*info) + return -EINVAL; + + /* Check Ea limit. */ + size = le32_to_cpu((*info)->size); + if (size > ni->mi.sbi->ea_max_size) + return -EFBIG; + + if (attr_size(attr_ea) > ni->mi.sbi->ea_max_size) + return -EFBIG; + + /* Allocate memory for packed Ea. 
*/ + ea_p = kmalloc(size + add_bytes, GFP_NOFS); + if (!ea_p) + return -ENOMEM; + + if (attr_ea->non_res) { + struct runs_tree run; + + run_init(&run); + + err = attr_load_runs(attr_ea, ni, &run, NULL); + if (!err) + err = ntfs_read_run_nb(ni->mi.sbi, &run, 0, ea_p, size, + NULL); + run_close(&run); + + if (err) + goto out; + } else { + void *p = resident_data_ex(attr_ea, size); + + if (!p) { + err = -EINVAL; + goto out; + } + memcpy(ea_p, p, size); + } + + memset(Add2Ptr(ea_p, size), 0, add_bytes); + *ea = ea_p; + return 0; + +out: + kfree(ea_p); + *ea = NULL; + return err; +} + +/* + * ntfs_list_ea + * + * Copy a list of xattrs names into the buffer + * provided, or compute the buffer size required. + * + * Return: + * * Number of bytes used / required on + * * -ERRNO - on failure + */ +static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, + size_t bytes_per_buffer) +{ + const struct EA_INFO *info; + struct EA_FULL *ea_all = NULL; + const struct EA_FULL *ea; + u32 off, size; + int err; + size_t ret; + + err = ntfs_read_ea(ni, &ea_all, 0, &info); + if (err) + return err; + + if (!info || !ea_all) + return 0; + + size = le32_to_cpu(info->size); + + /* Enumerate all xattrs. */ + for (ret = 0, off = 0; off < size; off += unpacked_ea_size(ea)) { + ea = Add2Ptr(ea_all, off); + + if (buffer) { + if (ret + ea->name_len + 1 > bytes_per_buffer) { + err = -ERANGE; + goto out; + } + + memcpy(buffer + ret, ea->name, ea->name_len); + buffer[ret + ea->name_len] = 0; + } + + ret += ea->name_len + 1; + } + +out: + kfree(ea_all); + return err ? 
err : ret; +} + +static int ntfs_get_ea(struct inode *inode, const char *name, size_t name_len, + void *buffer, size_t size, size_t *required) +{ + struct ntfs_inode *ni = ntfs_i(inode); + const struct EA_INFO *info; + struct EA_FULL *ea_all = NULL; + const struct EA_FULL *ea; + u32 off, len; + int err; + + if (!(ni->ni_flags & NI_FLAG_EA)) + return -ENODATA; + + if (!required) + ni_lock(ni); + + len = 0; + + if (name_len > 255) { + err = -ENAMETOOLONG; + goto out; + } + + err = ntfs_read_ea(ni, &ea_all, 0, &info); + if (err) + goto out; + + if (!info) + goto out; + + /* Enumerate all xattrs. */ + if (!find_ea(ea_all, le32_to_cpu(info->size), name, name_len, &off)) { + err = -ENODATA; + goto out; + } + ea = Add2Ptr(ea_all, off); + + len = le16_to_cpu(ea->elength); + if (!buffer) { + err = 0; + goto out; + } + + if (len > size) { + err = -ERANGE; + if (required) + *required = len; + goto out; + } + + memcpy(buffer, ea->name + ea->name_len + 1, len); + err = 0; + +out: + kfree(ea_all); + if (!required) + ni_unlock(ni); + + return err ? 
err : len; +} + +static noinline int ntfs_set_ea(struct inode *inode, const char *name, + size_t name_len, const void *value, + size_t val_size, int flags, int locked) +{ + struct ntfs_inode *ni = ntfs_i(inode); + struct ntfs_sb_info *sbi = ni->mi.sbi; + int err; + struct EA_INFO ea_info; + const struct EA_INFO *info; + struct EA_FULL *new_ea; + struct EA_FULL *ea_all = NULL; + size_t add, new_pack; + u32 off, size; + __le16 size_pack; + struct ATTRIB *attr; + struct ATTR_LIST_ENTRY *le; + struct mft_inode *mi; + struct runs_tree ea_run; + u64 new_sz; + void *p; + + if (!locked) + ni_lock(ni); + + run_init(&ea_run); + + if (name_len > 255) { + err = -ENAMETOOLONG; + goto out; + } + + add = ALIGN(struct_size(ea_all, name, 1 + name_len + val_size), 4); + + err = ntfs_read_ea(ni, &ea_all, add, &info); + if (err) + goto out; + + if (!info) { + memset(&ea_info, 0, sizeof(ea_info)); + size = 0; + size_pack = 0; + } else { + memcpy(&ea_info, info, sizeof(ea_info)); + size = le32_to_cpu(ea_info.size); + size_pack = ea_info.size_pack; + } + + if (info && find_ea(ea_all, size, name, name_len, &off)) { + struct EA_FULL *ea; + size_t ea_sz; + + if (flags & XATTR_CREATE) { + err = -EEXIST; + goto out; + } + + ea = Add2Ptr(ea_all, off); + + /* + * Check simple case when we try to insert xattr with the same value + * e.g. ntfs_save_wsl_perm + */ + if (val_size && le16_to_cpu(ea->elength) == val_size && + !memcmp(ea->name + ea->name_len + 1, value, val_size)) { + /* xattr already contains the required value. */ + goto out; + } + + /* Remove current xattr. */ + if (ea->flags & FILE_NEED_EA) + le16_add_cpu(&ea_info.count, -1); + + ea_sz = unpacked_ea_size(ea); + + le16_add_cpu(&ea_info.size_pack, 0 - packed_ea_size(ea)); + + memmove(ea, Add2Ptr(ea, ea_sz), size - off - ea_sz); + + size -= ea_sz; + memset(Add2Ptr(ea_all, size), 0, ea_sz); + + ea_info.size = cpu_to_le32(size); + + if ((flags & XATTR_REPLACE) && !val_size) { + /* Remove xattr. 
*/ + goto update_ea; + } + } else { + if (flags & XATTR_REPLACE) { + err = -ENODATA; + goto out; + } + + if (!ea_all) { + ea_all = kzalloc(add, GFP_NOFS); + if (!ea_all) { + err = -ENOMEM; + goto out; + } + } + } + + /* Append new xattr. */ + new_ea = Add2Ptr(ea_all, size); + new_ea->size = cpu_to_le32(add); + new_ea->flags = 0; + new_ea->name_len = name_len; + new_ea->elength = cpu_to_le16(val_size); + memcpy(new_ea->name, name, name_len); + new_ea->name[name_len] = 0; + memcpy(new_ea->name + name_len + 1, value, val_size); + new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea); + + /* Should fit into 16 bits. */ + if (new_pack > 0xffff) { + err = -EFBIG; // -EINVAL? + goto out; + } + ea_info.size_pack = cpu_to_le16(new_pack); + + /* New size of ATTR_EA. */ + size += add; + if (size > sbi->ea_max_size) { + err = -EFBIG; // -EINVAL? + goto out; + } + ea_info.size = cpu_to_le32(size); + +update_ea: + + if (!info) { + /* Create xattr. */ + if (!size) { + err = 0; + goto out; + } + + err = ni_insert_resident(ni, sizeof(struct EA_INFO), + ATTR_EA_INFO, NULL, 0, NULL, NULL, + NULL); + if (err) + goto out; + + err = ni_insert_resident(ni, 0, ATTR_EA, NULL, 0, NULL, NULL, + NULL); + if (err) + goto out; + } + + new_sz = size; + err = attr_set_size(ni, ATTR_EA, NULL, 0, &ea_run, new_sz, &new_sz, + false, NULL); + if (err) + goto out; + + le = NULL; + attr = ni_find_attr(ni, NULL, &le, ATTR_EA_INFO, NULL, 0, NULL, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + if (!size) { + /* Delete xattr, ATTR_EA_INFO */ + ni_remove_attr_le(ni, attr, mi, le); + } else { + p = resident_data_ex(attr, sizeof(struct EA_INFO)); + if (!p) { + err = -EINVAL; + goto out; + } + memcpy(p, &ea_info, sizeof(struct EA_INFO)); + mi->dirty = true; + } + + le = NULL; + attr = ni_find_attr(ni, NULL, &le, ATTR_EA, NULL, 0, NULL, &mi); + if (!attr) { + err = -EINVAL; + goto out; + } + + if (!size) { + /* Delete xattr, ATTR_EA */ + ni_remove_attr_le(ni, attr, mi, le); + } else if 
/*
 * ntfs_posix_acl_release - Drop one reference to a POSIX ACL.
 * @acl: ACL to release, may be NULL.
 *
 * ACLs handled here may also be cached on the inode (see the
 * set_cached_acl() callers in this file), where they can be looked up
 * under RCU. The final free therefore has to go through the generic
 * helper rather than an immediate kfree().
 */
static inline void ntfs_posix_acl_release(struct posix_acl *acl)
{
	posix_acl_release(acl);
}
NULL : ERR_PTR(err); + } + + __putname(buf); + + return acl; +} + +/* + * ntfs_get_acl - inode_operations::get_acl + */ +struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) +{ + if (rcu) + return ERR_PTR(-ECHILD); + + /* TODO: init_user_ns? */ + return ntfs_get_acl_ex(&init_user_ns, inode, type, 0); +} + +static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, + int type, int locked) +{ + const char *name; + size_t size, name_len; + void *value = NULL; + int err = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + if (acl) { + umode_t mode = inode->i_mode; + + err = posix_acl_equiv_mode(acl, &mode); + if (err < 0) + return err; + + if (inode->i_mode != mode) { + inode->i_mode = mode; + mark_inode_dirty(inode); + } + + if (!err) { + /* + * ACL can be exactly represented in the + * traditional file mode permission bits. + */ + acl = NULL; + } + } + name = XATTR_NAME_POSIX_ACL_ACCESS; + name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; + break; + + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EACCES : 0; + name = XATTR_NAME_POSIX_ACL_DEFAULT; + name_len = sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1; + break; + + default: + return -EINVAL; + } + + if (!acl) { + size = 0; + value = NULL; + } else { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) + return -ENOMEM; + + err = posix_acl_to_xattr(mnt_userns, acl, value, size); + if (err < 0) + goto out; + } + + err = ntfs_set_ea(inode, name, name_len, value, size, 0, locked); + if (!err) + set_cached_acl(inode, type, acl); + +out: + kfree(value); + + return err; +} + +/* + * ntfs_set_acl - inode_operations::set_acl + */ +int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) +{ + return ntfs_set_acl_ex(mnt_userns, inode, acl, type, 0); +} + +static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, void *buffer, + size_t size) +{ + struct posix_acl *acl; + int err; + + if (!(inode->i_sb->s_flags & SB_POSIXACL)) { + ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); + return -EOPNOTSUPP; + } + + acl = ntfs_get_acl(inode, type, false); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (!acl) + return -ENODATA; + + err = posix_acl_to_xattr(mnt_userns, acl, buffer, size); + ntfs_posix_acl_release(acl); + + return err; +} + +static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, int type, const void *value, + size_t size) +{ + struct posix_acl *acl; + int err; + + if (!(inode->i_sb->s_flags & SB_POSIXACL)) { + ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); + return -EOPNOTSUPP; + } + + if (!inode_owner_or_capable(mnt_userns, inode)) + return -EPERM; + + if (!value) { + acl = NULL; + } else { + acl = posix_acl_from_xattr(mnt_userns, value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + + if (acl) { + err = posix_acl_valid(mnt_userns, acl); + if (err) + goto release_and_out; + } + } + + err = ntfs_set_acl(mnt_userns, 
inode, acl, type); + +release_and_out: + ntfs_posix_acl_release(acl); + return err; +} + +/* + * ntfs_init_acl - Initialize the ACLs of a new inode. + * + * Called from ntfs_create_inode(). + */ +int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct inode *dir) +{ + struct posix_acl *default_acl, *acl; + int err; + + /* + * TODO: Refactoring lock. + * ni_lock(dir) ... -> posix_acl_create(dir,...) -> ntfs_get_acl -> ni_lock(dir) + */ + inode->i_default_acl = NULL; + + default_acl = ntfs_get_acl_ex(mnt_userns, dir, ACL_TYPE_DEFAULT, 1); + + if (!default_acl || default_acl == ERR_PTR(-EOPNOTSUPP)) { + inode->i_mode &= ~current_umask(); + err = 0; + goto out; + } + + if (IS_ERR(default_acl)) { + err = PTR_ERR(default_acl); + goto out; + } + + acl = default_acl; + err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); + if (err < 0) + goto out1; + if (!err) { + posix_acl_release(acl); + acl = NULL; + } + + if (!S_ISDIR(inode->i_mode)) { + posix_acl_release(default_acl); + default_acl = NULL; + } + + if (default_acl) + err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, + ACL_TYPE_DEFAULT, 1); + + if (!acl) + inode->i_acl = NULL; + else if (!err) + err = ntfs_set_acl_ex(mnt_userns, inode, acl, ACL_TYPE_ACCESS, + 1); + + posix_acl_release(acl); +out1: + posix_acl_release(default_acl); + +out: + return err; +} +#endif + +/* + * ntfs_acl_chmod - Helper for ntfs3_setattr(). + */ +int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!(sb->s_flags & SB_POSIXACL)) + return 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + return posix_acl_chmod(mnt_userns, inode, inode->i_mode); +} + +/* + * ntfs_permission - inode_operations::permission + */ +int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) +{ + if (ntfs_sb(inode->i_sb)->options.no_acs_rules) { + /* "No access rules" mode - Allow all changes. 
*/ + return 0; + } + + return generic_permission(mnt_userns, inode, mask); +} + +/* + * ntfs_listxattr - inode_operations::listxattr + */ +ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct inode *inode = d_inode(dentry); + struct ntfs_inode *ni = ntfs_i(inode); + ssize_t ret; + + if (!(ni->ni_flags & NI_FLAG_EA)) { + /* no xattr in file */ + return 0; + } + + ni_lock(ni); + + ret = ntfs_list_ea(ni, buffer, size); + + ni_unlock(ni); + + return ret; +} + +static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, + struct inode *inode, const char *name, void *buffer, + size_t size) +{ + int err; + struct ntfs_inode *ni = ntfs_i(inode); + size_t name_len = strlen(name); + + /* Dispatch request. */ + if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 && + !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) { + /* system.dos_attrib */ + if (!buffer) { + err = sizeof(u8); + } else if (size < sizeof(u8)) { + err = -ENODATA; + } else { + err = sizeof(u8); + *(u8 *)buffer = le32_to_cpu(ni->std_fa); + } + goto out; + } + + if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 && + !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) { + /* system.ntfs_attrib */ + if (!buffer) { + err = sizeof(u32); + } else if (size < sizeof(u32)) { + err = -ENODATA; + } else { + err = sizeof(u32); + *(u32 *)buffer = le32_to_cpu(ni->std_fa); + } + goto out; + } + + if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 && + !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) { + /* system.ntfs_security*/ + struct SECURITY_DESCRIPTOR_RELATIVE *sd = NULL; + size_t sd_size = 0; + + if (!is_ntfs3(ni->mi.sbi)) { + /* We should get nt4 security. 
*/ + err = -EINVAL; + goto out; + } else if (le32_to_cpu(ni->std_security_id) < + SECURITY_ID_FIRST) { + err = -ENOENT; + goto out; + } + + err = ntfs_get_security_by_id(ni->mi.sbi, ni->std_security_id, + &sd, &sd_size); + if (err) + goto out; + + if (!is_sd_valid(sd, sd_size)) { + ntfs_inode_warn( + inode, + "looks like you get incorrect security descriptor id=%u", + ni->std_security_id); + } + + if (!buffer) { + err = sd_size; + } else if (size < sd_size) { + err = -ENODATA; + } else { + err = sd_size; + memcpy(buffer, sd, sd_size); + } + kfree(sd); + goto out; + } + +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, + sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || + (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, + sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { + /* TODO: init_user_ns? */ + err = ntfs_xattr_get_acl( + &init_user_ns, inode, + name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 + ? ACL_TYPE_ACCESS + : ACL_TYPE_DEFAULT, + buffer, size); + goto out; + } +#endif + /* Deal with NTFS extended attribute. */ + err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); + +out: + return err; +} + +/* + * ntfs_setxattr - inode_operations::setxattr + */ +static noinline int ntfs_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *de, struct inode *inode, + const char *name, const void *value, + size_t size, int flags) +{ + int err = -EINVAL; + struct ntfs_inode *ni = ntfs_i(inode); + size_t name_len = strlen(name); + enum FILE_ATTRIBUTE new_fa; + + /* Dispatch request. 
*/ + if (name_len == sizeof(SYSTEM_DOS_ATTRIB) - 1 && + !memcmp(name, SYSTEM_DOS_ATTRIB, sizeof(SYSTEM_DOS_ATTRIB))) { + if (sizeof(u8) != size) + goto out; + new_fa = cpu_to_le32(*(u8 *)value); + goto set_new_fa; + } + + if (name_len == sizeof(SYSTEM_NTFS_ATTRIB) - 1 && + !memcmp(name, SYSTEM_NTFS_ATTRIB, sizeof(SYSTEM_NTFS_ATTRIB))) { + if (size != sizeof(u32)) + goto out; + new_fa = cpu_to_le32(*(u32 *)value); + + if (S_ISREG(inode->i_mode)) { + /* Process compressed/sparsed in special way. */ + ni_lock(ni); + err = ni_new_attr_flags(ni, new_fa); + ni_unlock(ni); + if (err) + goto out; + } +set_new_fa: + /* + * Thanks Mark Harmstone: + * Keep directory bit consistency. + */ + if (S_ISDIR(inode->i_mode)) + new_fa |= FILE_ATTRIBUTE_DIRECTORY; + else + new_fa &= ~FILE_ATTRIBUTE_DIRECTORY; + + if (ni->std_fa != new_fa) { + ni->std_fa = new_fa; + if (new_fa & FILE_ATTRIBUTE_READONLY) + inode->i_mode &= ~0222; + else + inode->i_mode |= 0222; + /* Std attribute always in primary record. */ + ni->mi.dirty = true; + mark_inode_dirty(inode); + } + err = 0; + + goto out; + } + + if (name_len == sizeof(SYSTEM_NTFS_SECURITY) - 1 && + !memcmp(name, SYSTEM_NTFS_SECURITY, sizeof(SYSTEM_NTFS_SECURITY))) { + /* system.ntfs_security*/ + __le32 security_id; + bool inserted; + struct ATTR_STD_INFO5 *std; + + if (!is_ntfs3(ni->mi.sbi)) { + /* + * We should replace ATTR_SECURE. + * Skip this way cause it is nt4 feature. + */ + err = -EINVAL; + goto out; + } + + if (!is_sd_valid(value, size)) { + err = -EINVAL; + ntfs_inode_warn( + inode, + "you try to set invalid security descriptor"); + goto out; + } + + err = ntfs_insert_security(ni->mi.sbi, value, size, + &security_id, &inserted); + if (err) + goto out; + + ni_lock(ni); + std = ni_std5(ni); + if (!std) { + err = -EINVAL; + } else if (std->security_id != security_id) { + std->security_id = ni->std_security_id = security_id; + /* Std attribute always in primary record. 
*/ + ni->mi.dirty = true; + mark_inode_dirty(&ni->vfs_inode); + } + ni_unlock(ni); + goto out; + } + +#ifdef CONFIG_NTFS3_FS_POSIX_ACL + if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, + sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || + (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && + !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, + sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { + err = ntfs_xattr_set_acl( + mnt_userns, inode, + name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 + ? ACL_TYPE_ACCESS + : ACL_TYPE_DEFAULT, + value, size); + goto out; + } +#endif + /* Deal with NTFS extended attribute. */ + err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + +out: + return err; +} + +/* + * ntfs_save_wsl_perm + * + * save uid/gid/mode in xattr + */ +int ntfs_save_wsl_perm(struct inode *inode) +{ + int err; + __le32 value; + + value = cpu_to_le32(i_uid_read(inode)); + err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, + sizeof(value), 0, 0); + if (err) + goto out; + + value = cpu_to_le32(i_gid_read(inode)); + err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, + sizeof(value), 0, 0); + if (err) + goto out; + + value = cpu_to_le32(inode->i_mode); + err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, + sizeof(value), 0, 0); + if (err) + goto out; + + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + value = cpu_to_le32(inode->i_rdev); + err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, + sizeof(value), 0, 0); + if (err) + goto out; + } + +out: + /* In case of error should we delete all WSL xattr? 
*/ + return err; +} + +/* + * ntfs_get_wsl_perm + * + * get uid/gid/mode from xattr + * it is called from ntfs_iget5->ntfs_read_mft + */ +void ntfs_get_wsl_perm(struct inode *inode) +{ + size_t sz; + __le32 value[3]; + + if (ntfs_get_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value[0], + sizeof(value[0]), &sz) == sizeof(value[0]) && + ntfs_get_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value[1], + sizeof(value[1]), &sz) == sizeof(value[1]) && + ntfs_get_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value[2], + sizeof(value[2]), &sz) == sizeof(value[2])) { + i_uid_write(inode, (uid_t)le32_to_cpu(value[0])); + i_gid_write(inode, (gid_t)le32_to_cpu(value[1])); + inode->i_mode = le32_to_cpu(value[2]); + + if (ntfs_get_ea(inode, "$LXDEV", sizeof("$$LXDEV") - 1, + &value[0], sizeof(value), + &sz) == sizeof(value[0])) { + inode->i_rdev = le32_to_cpu(value[0]); + } + } +} + +static bool ntfs_xattr_user_list(struct dentry *dentry) +{ + return true; +} + +// clang-format off +static const struct xattr_handler ntfs_xattr_handler = { + .prefix = "", + .get = ntfs_getxattr, + .set = ntfs_setxattr, + .list = ntfs_xattr_user_list, +}; + +const struct xattr_handler *ntfs_xattr_handlers[] = { + &ntfs_xattr_handler, + NULL, +}; +// clang-format on diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 5c72a7e6d6c5..23a72a423955 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -289,7 +289,7 @@ unlock: return status; } -struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu) { struct ocfs2_super *osb; struct buffer_head *di_bh = NULL; @@ -297,6 +297,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) int had_lock; struct ocfs2_lock_holder oh; + if (rcu) + return ERR_PTR(-ECHILD); + osb = OCFS2_SB(inode->i_sb); if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return NULL; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index f59d8d0a61fa..95a57c888ab6 100644 --- a/fs/ocfs2/acl.h +++ 
b/fs/ocfs2/acl.h @@ -16,7 +16,7 @@ struct ocfs2_acl_entry { __le32 e_id; }; -struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); +struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type, bool rcu); int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 48fd369c29a4..359524b7341f 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -16,6 +16,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/time.h> +#include <linux/delay.h> #include <linux/quotaops.h> #include <linux/sched/signal.h> @@ -2721,7 +2722,7 @@ int ocfs2_inode_lock_tracker(struct inode *inode, return status; } } - return tmp_oh ? 1 : 0; + return 1; } void ocfs2_inode_unlock_tracker(struct inode *inode, @@ -3912,6 +3913,17 @@ downconvert: spin_unlock_irqrestore(&lockres->l_lock, flags); ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb, gen); + /* The dlm lock convert is being cancelled in background, + * ocfs2_cancel_convert() is asynchronous in fs/dlm, + * requeue it, try again later. 
+ */ + if (ret == -EBUSY) { + ctl->requeue = 1; + mlog(ML_BASTS, "lockres %s, ReQ: Downconvert busy\n", + lockres->l_name); + ret = 0; + msleep(20); + } leave: if (ret) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index eda83487c9ec..f033de733adb 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -357,7 +357,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) } oinfo->dqi_gi.dqi_sb = sb; oinfo->dqi_gi.dqi_type = type; - ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; oinfo->dqi_gqi_bh = NULL; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b1a8b046f4c2..0e4b16d4c037 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -702,6 +702,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) info->dqi_priv = oinfo; oinfo->dqi_type = type; INIT_LIST_HEAD(&oinfo->dqi_chunk); + oinfo->dqi_gqinode = NULL; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); oinfo->dqi_rec = NULL; oinfo->dqi_lqi_bh = NULL; oinfo->dqi_libh = NULL; diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index 18852b9ed82b..605e5a3506ec 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -10,12 +10,15 @@ #include "orangefs-bufmap.h" #include <linux/posix_acl_xattr.h> -struct posix_acl *orangefs_get_acl(struct inode *inode, int type) +struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu) { struct posix_acl *acl; int ret; char *key = NULL, *value = NULL; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: key = XATTR_NAME_POSIX_ACL_ACCESS; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 16ac617df7d7..c1bb4c4b5d67 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -882,12 +882,7 @@ int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, if (!(request_mask & STATX_SIZE)) 
stat->result_mask &= ~STATX_SIZE; - stat->attributes_mask = STATX_ATTR_IMMUTABLE | - STATX_ATTR_APPEND; - if (inode->i_flags & S_IMMUTABLE) - stat->attributes |= STATX_ATTR_IMMUTABLE; - if (inode->i_flags & S_APPEND) - stat->attributes |= STATX_ATTR_APPEND; + generic_fill_statx_attr(inode, stat); } return ret; } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 0e6b97682e41..b5940ec1836a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -106,7 +106,7 @@ enum orangefs_vfs_op_states { extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; -extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type); +extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); extern int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 2846b943e80c..4e7d5bfa2949 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/fileattr.h> #include <linux/splice.h> #include <linux/xattr.h> #include <linux/security.h> @@ -62,7 +63,7 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, return list_size; } - buf = kzalloc(list_size, GFP_KERNEL); + buf = kvzalloc(list_size, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -105,11 +106,12 @@ retry: if (size > value_size) { void *new; - new = krealloc(value, size, GFP_KERNEL); + new = kvmalloc(size, GFP_KERNEL); if (!new) { error = -ENOMEM; break; } + kvfree(value); value = new; value_size = size; goto retry; @@ -124,12 +126,50 @@ retry: error = 0; } } - kfree(value); + kvfree(value); out: - kfree(buf); + kvfree(buf); return error; } +static int ovl_copy_fileattr(struct inode *inode, struct path *old, + struct path *new) +{ + struct 
fileattr oldfa = { .flags_valid = true }; + struct fileattr newfa = { .flags_valid = true }; + int err; + + err = ovl_real_fileattr_get(old, &oldfa); + if (err) + return err; + + err = ovl_real_fileattr_get(new, &newfa); + if (err) + return err; + + /* + * We cannot set immutable and append-only flags on upper inode, + * because we would not be able to link upper inode to upper dir + * not set overlay private xattr on upper inode. + * Store these flags in overlay.protattr xattr instead. + */ + if (oldfa.flags & OVL_PROT_FS_FLAGS_MASK) { + err = ovl_set_protattr(inode, new->dentry, &oldfa); + if (err) + return err; + } + + BUILD_BUG_ON(OVL_COPY_FS_FLAGS_MASK & ~FS_COMMON_FL); + newfa.flags &= ~OVL_COPY_FS_FLAGS_MASK; + newfa.flags |= (oldfa.flags & OVL_COPY_FS_FLAGS_MASK); + + BUILD_BUG_ON(OVL_COPY_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON); + newfa.fsx_xflags &= ~OVL_COPY_FSX_FLAGS_MASK; + newfa.fsx_xflags |= (oldfa.fsx_xflags & OVL_COPY_FSX_FLAGS_MASK); + + return ovl_real_fileattr_set(new, &newfa); +} + static int ovl_copy_up_data(struct ovl_fs *ofs, struct path *old, struct path *new, loff_t len) { @@ -331,8 +371,8 @@ out_err: return ERR_PTR(err); } -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry, - struct dentry *lower, struct dentry *upper) +int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, + struct dentry *upper) { const struct ovl_fh *fh = NULL; int err; @@ -351,7 +391,7 @@ int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry, /* * Do not fail when upper doesn't support xattrs. */ - err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh->buf, + err = ovl_check_setxattr(ofs, upper, OVL_XATTR_ORIGIN, fh->buf, fh ? 
fh->fb.len : 0, 0); kfree(fh); @@ -493,20 +533,21 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) { struct ovl_fs *ofs = OVL_FS(c->dentry->d_sb); + struct inode *inode = d_inode(c->dentry); + struct path upperpath, datapath; int err; + ovl_path_upper(c->dentry, &upperpath); + if (WARN_ON(upperpath.dentry != NULL)) + return -EIO; + + upperpath.dentry = temp; + /* * Copy up data first and then xattrs. Writing data after * xattrs will remove security.capability xattr automatically. */ if (S_ISREG(c->stat.mode) && !c->metacopy) { - struct path upperpath, datapath; - - ovl_path_upper(c->dentry, &upperpath); - if (WARN_ON(upperpath.dentry != NULL)) - return -EIO; - upperpath.dentry = temp; - ovl_path_lowerdata(c->dentry, &datapath); err = ovl_copy_up_data(ofs, &datapath, &upperpath, c->stat.size); @@ -518,6 +559,16 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) if (err) return err; + if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) { + /* + * Copy the fileattr inode flags that are the source of already + * copied i_flags + */ + err = ovl_copy_fileattr(inode, &c->lowerpath, &upperpath); + if (err) + return err; + } + /* * Store identifier of lower inode in upper inode xattr to * allow lookup of the copy up origin inode. @@ -526,13 +577,13 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) * hard link. 
*/ if (c->origin) { - err = ovl_set_origin(ofs, c->dentry, c->lowerpath.dentry, temp); + err = ovl_set_origin(ofs, c->lowerpath.dentry, temp); if (err) return err; } if (c->metacopy) { - err = ovl_check_setxattr(c->dentry, temp, OVL_XATTR_METACOPY, + err = ovl_check_setxattr(ofs, temp, OVL_XATTR_METACOPY, NULL, 0, -EOPNOTSUPP); if (err) return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 93efe7048a77..1fefb2b8960e 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -233,9 +233,10 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr) static int ovl_set_opaque_xerr(struct dentry *dentry, struct dentry *upper, int xerr) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); int err; - err = ovl_check_setxattr(dentry, upper, OVL_XATTR_OPAQUE, "y", 1, xerr); + err = ovl_check_setxattr(ofs, upper, OVL_XATTR_OPAQUE, "y", 1, xerr); if (!err) ovl_dentry_set_opaque(dentry); @@ -320,6 +321,7 @@ static bool ovl_type_origin(struct dentry *dentry) static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct ovl_cattr *attr) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *udir = upperdir->d_inode; struct dentry *newdentry; @@ -338,7 +340,8 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, if (IS_ERR(newdentry)) goto out_unlock; - if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) { + if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry) && + !ovl_allow_offline_changes(ofs)) { /* Setting opaque here is just an optimization, allow to fail */ ovl_set_opaque(dentry, newdentry); } @@ -542,8 +545,10 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, goto out_cleanup; } err = ovl_instantiate(dentry, inode, newdentry, hardlink); - if (err) - goto out_cleanup; + if (err) { + ovl_cleanup(udir, newdentry); + dput(newdentry); + } out_dput: dput(upper); out_unlock: @@ -1043,6 +1048,7 
@@ static bool ovl_need_absolute_redirect(struct dentry *dentry, bool samedir) static int ovl_set_redirect(struct dentry *dentry, bool samedir) { int err; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); const char *redirect = ovl_dentry_get_redirect(dentry); bool absolute_redirect = ovl_need_absolute_redirect(dentry, samedir); @@ -1053,7 +1059,7 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) if (IS_ERR(redirect)) return PTR_ERR(redirect); - err = ovl_check_setxattr(dentry, ovl_dentry_upper(dentry), + err = ovl_check_setxattr(ofs, ovl_dentry_upper(dentry), OVL_XATTR_REDIRECT, redirect, strlen(redirect), -EXDEV); if (!err) { diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 5e828a1c98a8..832b17589733 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -13,6 +13,7 @@ #include <linux/fiemap.h> #include <linux/fileattr.h> #include <linux/security.h> +#include <linux/namei.h> #include "overlayfs.h" @@ -33,12 +34,6 @@ int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, goto out; if (attr->ia_valid & ATTR_SIZE) { - struct inode *realinode = d_inode(ovl_dentry_real(dentry)); - - err = -ETXTBSY; - if (atomic_read(&realinode->i_writecount) < 0) - goto out_drop_write; - /* Truncate should trigger data copy up as well */ full_copy_up = true; } @@ -162,7 +157,8 @@ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, enum ovl_path_type type; struct path realpath; const struct cred *old_cred; - bool is_dir = S_ISDIR(dentry->d_inode->i_mode); + struct inode *inode = d_inode(dentry); + bool is_dir = S_ISDIR(inode->i_mode); int fsid = 0; int err; bool metacopy_blocks = false; @@ -175,6 +171,9 @@ int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, if (err) goto out; + /* Report the effective immutable/append-only STATX flags */ + generic_fill_statx_attr(inode, stat); + /* * For non-dir or same fs, we use st_ino of the copy up origin. 
* This guaranties constant st_dev/st_ino across copy up. @@ -448,7 +447,7 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) return res; } -struct posix_acl *ovl_get_acl(struct inode *inode, int type) +struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) { struct inode *realinode = ovl_inode_real(inode); const struct cred *old_cred; @@ -457,6 +456,9 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type) if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode)) return NULL; + if (rcu) + return get_cached_acl_rcu(realinode, type); + old_cred = ovl_override_creds(inode->i_sb); acl = get_acl(realinode, type); revert_creds(old_cred); @@ -503,16 +505,14 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Introducing security_inode_fileattr_get/set() hooks would solve this issue * properly. */ -static int ovl_security_fileattr(struct dentry *dentry, struct fileattr *fa, +static int ovl_security_fileattr(struct path *realpath, struct fileattr *fa, bool set) { - struct path realpath; struct file *file; unsigned int cmd; int err; - ovl_path_real(dentry, &realpath); - file = dentry_open(&realpath, O_RDONLY, current_cred()); + file = dentry_open(realpath, O_RDONLY, current_cred()); if (IS_ERR(file)) return PTR_ERR(file); @@ -527,12 +527,24 @@ static int ovl_security_fileattr(struct dentry *dentry, struct fileattr *fa, return err; } +int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa) +{ + int err; + + err = ovl_security_fileattr(realpath, fa, true); + if (err) + return err; + + return vfs_fileattr_set(&init_user_ns, realpath->dentry, fa); +} + int ovl_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa) { struct inode *inode = d_inode(dentry); - struct dentry *upperdentry; + struct path upperpath; const struct cred *old_cred; + unsigned int flags; int err; err = ovl_want_write(dentry); @@ -541,31 +553,78 @@ int ovl_fileattr_set(struct 
user_namespace *mnt_userns, err = ovl_copy_up(dentry); if (!err) { - upperdentry = ovl_dentry_upper(dentry); + ovl_path_real(dentry, &upperpath); old_cred = ovl_override_creds(inode->i_sb); - err = ovl_security_fileattr(dentry, fa, true); + /* + * Store immutable/append-only flags in xattr and clear them + * in upper fileattr (in case they were set by older kernel) + * so children of "ovl-immutable" directories lower aliases of + * "ovl-immutable" hardlinks could be copied up. + * Clear xattr when flags are cleared. + */ + err = ovl_set_protattr(inode, upperpath.dentry, fa); if (!err) - err = vfs_fileattr_set(&init_user_ns, upperdentry, fa); + err = ovl_real_fileattr_set(&upperpath, fa); revert_creds(old_cred); - ovl_copyflags(ovl_inode_real(inode), inode); + + /* + * Merge real inode flags with inode flags read from + * overlay.protattr xattr + */ + flags = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK; + + BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK); + flags |= inode->i_flags & OVL_PROT_I_FLAGS_MASK; + inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK); + + /* Update ctime */ + ovl_copyattr(ovl_inode_real(inode), inode); } ovl_drop_write(dentry); out: return err; } +/* Convert inode protection flags to fileattr flags */ +static void ovl_fileattr_prot_flags(struct inode *inode, struct fileattr *fa) +{ + BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL); + BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON); + + if (inode->i_flags & S_APPEND) { + fa->flags |= FS_APPEND_FL; + fa->fsx_xflags |= FS_XFLAG_APPEND; + } + if (inode->i_flags & S_IMMUTABLE) { + fa->flags |= FS_IMMUTABLE_FL; + fa->fsx_xflags |= FS_XFLAG_IMMUTABLE; + } +} + +int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa) +{ + int err; + + err = ovl_security_fileattr(realpath, fa, false); + if (err) + return err; + + return vfs_fileattr_get(realpath->dentry, fa); +} + int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct inode 
*inode = d_inode(dentry); - struct dentry *realdentry = ovl_dentry_real(dentry); + struct path realpath; const struct cred *old_cred; int err; + ovl_path_real(dentry, &realpath); + old_cred = ovl_override_creds(inode->i_sb); - err = ovl_security_fileattr(dentry, fa, false); - if (!err) - err = vfs_fileattr_get(realdentry, fa); + err = ovl_real_fileattr_get(&realpath, fa); + ovl_fileattr_prot_flags(inode, fa); revert_creds(old_cred); return err; @@ -1118,6 +1177,10 @@ struct inode *ovl_get_inode(struct super_block *sb, } } + /* Check for immutable/append-only inode flags in xattr */ + if (upperdentry) + ovl_check_protattr(inode, upperdentry); + if (inode->i_state & I_NEW) unlock_new_inode(inode); out: diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 210cd6f66e28..1a9b515fc45d 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -392,7 +392,7 @@ invalid: upperdentry, d_inode(upperdentry)->i_mode & S_IFMT, d_inode(origin)->i_mode & S_IFMT); dput(origin); - return -EIO; + return -ESTALE; } static int ovl_check_origin(struct ovl_fs *ofs, struct dentry *upperdentry, @@ -811,7 +811,7 @@ static int ovl_fix_origin(struct ovl_fs *ofs, struct dentry *dentry, if (err) return err; - err = ovl_set_origin(ofs, dentry, lower, upper); + err = ovl_set_origin(ofs, lower, upper); if (!err) err = ovl_set_impure(dentry->d_parent, upper->d_parent); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6ec73db4bf9e..3894f3347955 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -34,6 +34,7 @@ enum ovl_xattr { OVL_XATTR_NLINK, OVL_XATTR_UPPER, OVL_XATTR_METACOPY, + OVL_XATTR_PROTATTR, }; enum ovl_inode_flag { @@ -262,6 +263,18 @@ static inline bool ovl_open_flags_need_copy_up(int flags) return ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC)); } +static inline bool ovl_allow_offline_changes(struct ovl_fs *ofs) +{ + /* + * To avoid regressions in existing setups with overlay lower offline + * changes, we allow lower 
changes only if none of the new features + * are used. + */ + return (!ofs->config.index && !ofs->config.metacopy && + !ofs->config.redirect_dir && ofs->config.xino != OVL_XINO_ON); +} + + /* util.c */ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); @@ -320,7 +333,7 @@ bool ovl_already_copied_up(struct dentry *dentry, int flags); bool ovl_check_origin_xattr(struct ovl_fs *ofs, struct dentry *dentry); bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, enum ovl_xattr ox); -int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, +int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr); int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); @@ -485,7 +498,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); -struct posix_acl *ovl_get_acl(struct inode *inode, int type); +struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu); int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); bool ovl_is_private_xattr(struct super_block *sb, const char *name); @@ -518,9 +531,28 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) i_size_write(to, i_size_read(from)); } +/* vfs inode flags copied from real to ovl inode */ +#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE) +/* vfs inode flags read from overlay.protattr xattr to ovl inode */ +#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE) + +/* + * fileattr flags copied from lower to upper inode on copy up. + * We cannot copy up immutable/append-only flags, because that would prevent + * linking temp inode to upper dir, so we store them in xattr instead. 
+ */ +#define OVL_COPY_FS_FLAGS_MASK (FS_SYNC_FL | FS_NOATIME_FL) +#define OVL_COPY_FSX_FLAGS_MASK (FS_XFLAG_SYNC | FS_XFLAG_NOATIME) +#define OVL_PROT_FS_FLAGS_MASK (FS_APPEND_FL | FS_IMMUTABLE_FL) +#define OVL_PROT_FSX_FLAGS_MASK (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE) + +void ovl_check_protattr(struct inode *inode, struct dentry *upper); +int ovl_set_protattr(struct inode *inode, struct dentry *upper, + struct fileattr *fa); + static inline void ovl_copyflags(struct inode *from, struct inode *to) { - unsigned int mask = S_SYNC | S_IMMUTABLE | S_APPEND | S_NOATIME; + unsigned int mask = OVL_COPY_I_FLAGS_MASK; inode_set_flags(to, from->i_flags & mask, mask); } @@ -548,6 +580,8 @@ struct dentry *ovl_create_temp(struct dentry *workdir, struct ovl_cattr *attr); extern const struct file_operations ovl_file_operations; int __init ovl_aio_request_cache_init(void); void ovl_aio_request_cache_destroy(void); +int ovl_real_fileattr_get(struct path *realpath, struct fileattr *fa); +int ovl_real_fileattr_set(struct path *realpath, struct fileattr *fa); int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa); int ovl_fileattr_set(struct user_namespace *mnt_userns, struct dentry *dentry, struct fileattr *fa); @@ -561,8 +595,8 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, int ovl_set_attr(struct dentry *upper, struct kstat *stat); struct ovl_fh *ovl_encode_real_fh(struct ovl_fs *ofs, struct dentry *real, bool is_upper); -int ovl_set_origin(struct ovl_fs *ofs, struct dentry *dentry, - struct dentry *lower, struct dentry *upper); +int ovl_set_origin(struct ovl_fs *ofs, struct dentry *lower, + struct dentry *upper); /* export.c */ extern const struct export_operations ovl_export_operations; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index b01d4147520d..178daa5e82c9 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1599,9 +1599,7 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid) * user opted-in to one of 
the new features that require following the * lower inode of non-dir upper. */ - if (!ofs->config.index && !ofs->config.metacopy && - ofs->config.xino != OVL_XINO_ON && - uuid_is_null(uuid)) + if (ovl_allow_offline_changes(ofs) && uuid_is_null(uuid)) return false; for (i = 0; i < ofs->numfs; i++) { diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index b9d03627f364..f48284a2a896 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -10,6 +10,7 @@ #include <linux/cred.h> #include <linux/xattr.h> #include <linux/exportfs.h> +#include <linux/fileattr.h> #include <linux/uuid.h> #include <linux/namei.h> #include <linux/ratelimit.h> @@ -585,6 +586,7 @@ bool ovl_check_dir_xattr(struct super_block *sb, struct dentry *dentry, #define OVL_XATTR_NLINK_POSTFIX "nlink" #define OVL_XATTR_UPPER_POSTFIX "upper" #define OVL_XATTR_METACOPY_POSTFIX "metacopy" +#define OVL_XATTR_PROTATTR_POSTFIX "protattr" #define OVL_XATTR_TAB_ENTRY(x) \ [x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \ @@ -598,14 +600,14 @@ const char *const ovl_xattr_table[][2] = { OVL_XATTR_TAB_ENTRY(OVL_XATTR_NLINK), OVL_XATTR_TAB_ENTRY(OVL_XATTR_UPPER), OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY), + OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR), }; -int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, +int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry, enum ovl_xattr ox, const void *value, size_t size, int xerr) { int err; - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; if (ofs->noxattr) return xerr; @@ -623,6 +625,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) { + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); int err; if (ovl_test_flag(OVL_IMPURE, d_inode(dentry))) @@ -632,14 +635,95 @@ int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry) * Do not fail when upper doesn't support xattrs. 
* Upper inodes won't have origin nor redirect xattr anyway. */ - err = ovl_check_setxattr(dentry, upperdentry, OVL_XATTR_IMPURE, - "y", 1, 0); + err = ovl_check_setxattr(ofs, upperdentry, OVL_XATTR_IMPURE, "y", 1, 0); if (!err) ovl_set_flag(OVL_IMPURE, d_inode(dentry)); return err; } + +#define OVL_PROTATTR_MAX 32 /* Reserved for future flags */ + +void ovl_check_protattr(struct inode *inode, struct dentry *upper) +{ + struct ovl_fs *ofs = OVL_FS(inode->i_sb); + u32 iflags = inode->i_flags & OVL_PROT_I_FLAGS_MASK; + char buf[OVL_PROTATTR_MAX+1]; + int res, n; + + res = ovl_do_getxattr(ofs, upper, OVL_XATTR_PROTATTR, buf, + OVL_PROTATTR_MAX); + if (res < 0) + return; + + /* + * Initialize inode flags from overlay.protattr xattr and upper inode + * flags. If upper inode has those fileattr flags set (i.e. from old + * kernel), we do not clear them on ovl_get_inode(), but we will clear + * them on next fileattr_set(). + */ + for (n = 0; n < res; n++) { + if (buf[n] == 'a') + iflags |= S_APPEND; + else if (buf[n] == 'i') + iflags |= S_IMMUTABLE; + else + break; + } + + if (!res || n < res) { + pr_warn_ratelimited("incompatible overlay.protattr format (%pd2, len=%d)\n", + upper, res); + } else { + inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK); + } +} + +int ovl_set_protattr(struct inode *inode, struct dentry *upper, + struct fileattr *fa) +{ + struct ovl_fs *ofs = OVL_FS(inode->i_sb); + char buf[OVL_PROTATTR_MAX]; + int len = 0, err = 0; + u32 iflags = 0; + + BUILD_BUG_ON(HWEIGHT32(OVL_PROT_FS_FLAGS_MASK) > OVL_PROTATTR_MAX); + + if (fa->flags & FS_APPEND_FL) { + buf[len++] = 'a'; + iflags |= S_APPEND; + } + if (fa->flags & FS_IMMUTABLE_FL) { + buf[len++] = 'i'; + iflags |= S_IMMUTABLE; + } + + /* + * Do not allow to set protection flags when upper doesn't support + * xattrs, because we do not set those fileattr flags on upper inode. + * Remove xattr if it exist and all protection flags are cleared. 
+ */ + if (len) { + err = ovl_check_setxattr(ofs, upper, OVL_XATTR_PROTATTR, + buf, len, -EPERM); + } else if (inode->i_flags & OVL_PROT_I_FLAGS_MASK) { + err = ovl_do_removexattr(ofs, upper, OVL_XATTR_PROTATTR); + if (err == -EOPNOTSUPP || err == -ENODATA) + err = 0; + } + if (err) + return err; + + inode_set_flags(inode, iflags, OVL_PROT_I_FLAGS_MASK); + + /* Mask out the fileattr flags that should not be set in upper inode */ + fa->flags &= ~OVL_PROT_FS_FLAGS_MASK; + fa->fsx_xflags &= ~OVL_PROT_FSX_FLAGS_MASK; + + return 0; +} + /** * Caller must hold a reference to inode to prevent it from being freed while * it is marked inuse. diff --git a/fs/posix_acl.c b/fs/posix_acl.c index f3309a7edb49..f5c25f580dd9 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -22,6 +22,7 @@ #include <linux/xattr.h> #include <linux/export.h> #include <linux/user_namespace.h> +#include <linux/namei.h> static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -56,7 +57,17 @@ EXPORT_SYMBOL(get_cached_acl); struct posix_acl *get_cached_acl_rcu(struct inode *inode, int type) { - return rcu_dereference(*acl_by_type(inode, type)); + struct posix_acl *acl = rcu_dereference(*acl_by_type(inode, type)); + + if (acl == ACL_DONT_CACHE) { + struct posix_acl *ret; + + ret = inode->i_op->get_acl(inode, type, LOOKUP_RCU); + if (!IS_ERR(ret)) + acl = ret; + } + + return acl; } EXPORT_SYMBOL(get_cached_acl_rcu); @@ -138,7 +149,7 @@ struct posix_acl *get_acl(struct inode *inode, int type) set_cached_acl(inode, type, NULL); return NULL; } - acl = inode->i_op->get_acl(inode, type); + acl = inode->i_op->get_acl(inode, type, false); if (IS_ERR(acl)) { /* diff --git a/fs/proc/array.c b/fs/proc/array.c index ee0ce8cecc4a..49be8c8ef555 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -98,27 +98,17 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) { - char *buf; - size_t size; char tcomm[64]; - int ret; if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, 
sizeof(tcomm), p); else __get_task_comm(tcomm, sizeof(tcomm), p); - size = seq_get_buf(m, &buf); - if (escape) { - ret = string_escape_str(tcomm, buf, size, - ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); - if (ret >= size) - ret = -1; - } else { - ret = strscpy(buf, tcomm, size); - } - - seq_commit(m, ret); + if (escape) + seq_escape_str(m, tcomm, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + else + seq_printf(m, "%.64s", tcomm); } /* diff --git a/fs/proc/base.c b/fs/proc/base.c index e5b5f7709d48..533d5836eb9a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -95,6 +95,7 @@ #include <linux/posix-timers.h> #include <linux/time_namespace.h> #include <linux/resctrl.h> +#include <linux/cn_proc.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -1674,8 +1675,10 @@ static ssize_t comm_write(struct file *file, const char __user *buf, if (!p) return -ESRCH; - if (same_thread_group(current, p)) + if (same_thread_group(current, p)) { set_task_comm(p, buffer); + proc_comm_connector(p); + } else count = -EINVAL; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index eb97468dfe4c..cf25be3e0321 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -619,7 +619,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", - [ilog2(VM_DENYWRITE)] = "dw", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index fd58618da360..d9052b8ce6dd 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -48,7 +48,7 @@ static inline int reiserfs_acl_count(size_t size) } #ifdef CONFIG_REISERFS_FS_POSIX_ACL -struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu); int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); int 
reiserfs_acl_chmod(struct inode *inode); diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 500f2000eb41..30319dc33c18 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -8,7 +8,7 @@ #include <linux/string.h> #include <linux/buffer_head.h> -#include <stdarg.h> +#include <linux/stdarg.h> static char error_buf[1024]; static char fmt_buf[1024]; diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index a9547144a099..d6fcddc46f5b 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -190,13 +190,16 @@ fail: * inode->i_mutex: down * BKL held [before 2.5.x] */ -struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) +struct posix_acl *reiserfs_get_acl(struct inode *inode, int type, bool rcu) { char *name, *value; struct posix_acl *acl; int size; int retval; + if (rcu) + return ERR_PTR(-ECHILD); + switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; diff --git a/fs/signalfd.c b/fs/signalfd.c index 167b5889db4b..040e1cf90528 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -114,10 +114,10 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, break; case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: - case SIL_PERF_EVENT: + case SIL_FAULT_PERF_EVENT: /* * Fall through to the SIL_FAULT case. SIL_FAULT_BNDERR, - * SIL_FAULT_PKUERR, and SIL_PERF_EVENT are only + * SIL_FAULT_PKUERR, and SIL_FAULT_PERF_EVENT are only * generated by faults that deliver them synchronously to * userspace. In case someone injects one of these signals * and signalfd catches it treat it as SIL_FAULT. 
diff --git a/fs/stat.c b/fs/stat.c index 1fa38bdec1a6..28d2020ba1f4 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -60,6 +60,24 @@ void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, EXPORT_SYMBOL(generic_fillattr); /** + * generic_fill_statx_attr - Fill in the statx attributes from the inode flags + * @inode: Inode to use as the source + * @stat: Where to fill in the attribute flags + * + * Fill in the STATX_ATTR_* flags in the kstat structure for properties of the + * inode that are published on i_flags and enforced by the VFS. + */ +void generic_fill_statx_attr(struct inode *inode, struct kstat *stat) +{ + if (inode->i_flags & S_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (inode->i_flags & S_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + stat->attributes_mask |= KSTAT_ATTR_VFS_FLAGS; +} +EXPORT_SYMBOL(generic_fill_statx_attr); + +/** * vfs_getattr_nosec - getattr without security checks * @path: file to get attributes from * @stat: structure to return attributes in diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 9aefa7779b29..d019d6ac6ad0 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -174,8 +174,8 @@ static int sysfs_kf_bin_open(struct kernfs_open_file *of) { struct bin_attribute *battr = of->kn->priv; - if (battr->mapping) - of->file->f_mapping = battr->mapping; + if (battr->f_mapping) + of->file->f_mapping = battr->f_mapping(); return 0; } diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 64e6a6698935..f29d62004527 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -446,7 +446,7 @@ int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, if (!target) return -ENOENT; - entry = kernfs_find_and_get(target_kobj->sd, target_name); + entry = kernfs_find_and_get(target, target_name); if (!entry) { kernfs_put(target); return -ENOENT; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2e4e1d159969..5cfa28cd00cd 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1630,6 +1630,17 @@ static const 
char *ubifs_get_link(struct dentry *dentry, return fscrypt_get_symlink(inode, ui->data, ui->data_len, done); } +static int ubifs_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + ubifs_getattr(mnt_userns, path, stat, request_mask, query_flags); + + if (IS_ENCRYPTED(d_inode(path->dentry))) + return fscrypt_symlink_getattr(path, stat); + return 0; +} + const struct address_space_operations ubifs_file_address_operations = { .readpage = ubifs_readpage, .writepage = ubifs_writepage, @@ -1655,7 +1666,7 @@ const struct inode_operations ubifs_file_inode_operations = { const struct inode_operations ubifs_symlink_inode_operations = { .get_link = ubifs_get_link, .setattr = ubifs_setattr, - .getattr = ubifs_getattr, + .getattr = ubifs_symlink_getattr, .listxattr = ubifs_listxattr, .update_time = ubifs_update_time, }; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 74028b5a7b0a..00a01471ea05 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -70,7 +70,7 @@ #include <linux/module.h> #include <linux/bitops.h> -#include <stdarg.h> +#include <linux/stdarg.h> #include <linux/uaccess.h> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5c2d806e6ae5..003f0d31743e 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly; static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; -enum userfaultfd_state { - UFFD_STATE_WAIT_API, - UFFD_STATE_RUNNING, -}; - /* * Start with fault_pending_wqh and fault_wqh so they're more likely * to be in the same cacheline. 
@@ -69,12 +64,10 @@ struct userfaultfd_ctx { unsigned int flags; /* features requested from the userspace */ unsigned int features; - /* state machine */ - enum userfaultfd_state state; /* released */ bool released; /* memory mappings are changing because of non-cooperative event */ - bool mmap_changing; + atomic_t mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; }; @@ -104,6 +97,14 @@ struct userfaultfd_wake_range { unsigned long len; }; +/* internal indication that UFFD_API ioctl was successfully executed */ +#define UFFD_FEATURE_INITIALIZED (1u << 31) + +static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) +{ + return ctx->features & UFFD_FEATURE_INITIALIZED; +} + static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, int wake_flags, void *key) { @@ -623,7 +624,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * already released. */ out: - WRITE_ONCE(ctx->mmap_changing, false); + atomic_dec(&ctx->mmap_changing); + VM_BUG_ON(atomic_read(&ctx->mmap_changing) < 0); userfaultfd_ctx_put(ctx); } @@ -666,15 +668,14 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) refcount_set(&ctx->refcount, 1); ctx->flags = octx->flags; - ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; - ctx->mmap_changing = false; + atomic_set(&ctx->mmap_changing, 0); ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); - WRITE_ONCE(octx->mmap_changing, true); + atomic_inc(&octx->mmap_changing); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs); @@ -721,7 +722,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx->features & UFFD_FEATURE_EVENT_REMAP) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); } else { /* Drop uffd context if remap feature not enabled */ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; 
@@ -766,7 +767,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); mmap_read_unlock(mm); msg_init(&ewq.msg); @@ -810,7 +811,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, return -ENOMEM; userfaultfd_ctx_get(ctx); - WRITE_ONCE(ctx->mmap_changing, true); + atomic_inc(&ctx->mmap_changing); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end; @@ -943,38 +944,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) poll_wait(file, &ctx->fd_wqh, wait); - switch (ctx->state) { - case UFFD_STATE_WAIT_API: + if (!userfaultfd_is_initialized(ctx)) return EPOLLERR; - case UFFD_STATE_RUNNING: - /* - * poll() never guarantees that read won't block. - * userfaults can be waken before they're read(). - */ - if (unlikely(!(file->f_flags & O_NONBLOCK))) - return EPOLLERR; - /* - * lockless access to see if there are pending faults - * __pollwait last action is the add_wait_queue but - * the spin_unlock would allow the waitqueue_active to - * pass above the actual list_add inside - * add_wait_queue critical section. So use a full - * memory barrier to serialize the list_add write of - * add_wait_queue() with the waitqueue_active read - * below. - */ - ret = 0; - smp_mb(); - if (waitqueue_active(&ctx->fault_pending_wqh)) - ret = EPOLLIN; - else if (waitqueue_active(&ctx->event_wqh)) - ret = EPOLLIN; - return ret; - default: - WARN_ON_ONCE(1); + /* + * poll() never guarantees that read won't block. + * userfaults can be waken before they're read(). + */ + if (unlikely(!(file->f_flags & O_NONBLOCK))) return EPOLLERR; - } + /* + * lockless access to see if there are pending faults + * __pollwait last action is the add_wait_queue but + * the spin_unlock would allow the waitqueue_active to + * pass above the actual list_add inside + * add_wait_queue critical section. 
So use a full + * memory barrier to serialize the list_add write of + * add_wait_queue() with the waitqueue_active read + * below. + */ + ret = 0; + smp_mb(); + if (waitqueue_active(&ctx->fault_pending_wqh)) + ret = EPOLLIN; + else if (waitqueue_active(&ctx->event_wqh)) + ret = EPOLLIN; + + return ret; } static const struct file_operations userfaultfd_fops; @@ -1169,7 +1165,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, int no_wait = file->f_flags & O_NONBLOCK; struct inode *inode = file_inode(file); - if (ctx->state == UFFD_STATE_WAIT_API) + if (!userfaultfd_is_initialized(ctx)) return -EINVAL; for (;;) { @@ -1700,7 +1696,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -1757,7 +1753,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -1807,7 +1803,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, struct userfaultfd_wake_range range; bool mode_wp, mode_dontwake; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) return -EAGAIN; user_uffdio_wp = (struct uffdio_writeprotect __user *) arg; @@ -1855,7 +1851,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) user_uffdio_continue = (struct uffdio_continue __user *)arg; ret = -EAGAIN; - if (READ_ONCE(ctx->mmap_changing)) + if (atomic_read(&ctx->mmap_changing)) goto out; ret = -EFAULT; @@ -1908,9 +1904,10 @@ out: static inline unsigned int uffd_ctx_features(__u64 user_features) { /* - * For the current set of features the bits just coincide + * For the current set of features the bits just coincide. 
Set + * UFFD_FEATURE_INITIALIZED to mark the features as enabled. */ - return (unsigned int)user_features; + return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED; } /* @@ -1923,12 +1920,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, { struct uffdio_api uffdio_api; void __user *buf = (void __user *)arg; + unsigned int ctx_features; int ret; __u64 features; - ret = -EINVAL; - if (ctx->state != UFFD_STATE_WAIT_API) - goto out; ret = -EFAULT; if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api))) goto out; @@ -1952,9 +1947,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EFAULT; if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api))) goto out; - ctx->state = UFFD_STATE_RUNNING; + /* only enable the requested features for this uffd context */ - ctx->features = uffd_ctx_features(features); + ctx_features = uffd_ctx_features(features); + ret = -EINVAL; + if (cmpxchg(&ctx->features, 0, ctx_features) != 0) + goto err_out; + ret = 0; out: return ret; @@ -1971,7 +1970,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd, int ret = -EINVAL; struct userfaultfd_ctx *ctx = file->private_data; - if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API) + if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx)) return -EINVAL; switch(cmd) { @@ -2085,9 +2084,8 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) refcount_set(&ctx->refcount, 1); ctx->flags = flags; ctx->features = 0; - ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; - ctx->mmap_changing = false; + atomic_set(&ctx->mmap_changing, 0); ctx->mm = current->mm; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index e986b95d94c9..6f49bf39183c 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -29,67 +29,3 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } - - -/* - * __vmalloc() will allocate data pages and auxiliary structures (e.g. 
- * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context here. Hence - * we need to tell memory reclaim that we are in such a context via - * PF_MEMALLOC_NOFS to prevent memory reclaim re-entering the filesystem here - * and potentially deadlocking. - */ -static void * -__kmem_vmalloc(size_t size, xfs_km_flags_t flags) -{ - unsigned nofs_flag = 0; - void *ptr; - gfp_t lflags = kmem_flags_convert(flags); - - if (flags & KM_NOFS) - nofs_flag = memalloc_nofs_save(); - - ptr = __vmalloc(size, lflags); - - if (flags & KM_NOFS) - memalloc_nofs_restore(nofs_flag); - - return ptr; -} - -/* - * Same as kmem_alloc_large, except we guarantee the buffer returned is aligned - * to the @align_mask. We only guarantee alignment up to page size, we'll clamp - * alignment at page size if it is larger. vmalloc always returns a PAGE_SIZE - * aligned region. - */ -void * -kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags) -{ - void *ptr; - - trace_kmem_alloc_io(size, flags, _RET_IP_); - - if (WARN_ON_ONCE(align_mask >= PAGE_SIZE)) - align_mask = PAGE_SIZE - 1; - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) { - if (!((uintptr_t)ptr & align_mask)) - return ptr; - kfree(ptr); - } - return __kmem_vmalloc(size, flags); -} - -void * -kmem_alloc_large(size_t size, xfs_km_flags_t flags) -{ - void *ptr; - - trace_kmem_alloc_large(size, flags, _RET_IP_); - - ptr = kmem_alloc(size, flags | KM_MAYFAIL); - if (ptr) - return ptr; - return __kmem_vmalloc(size, flags); -} diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 38007117697e..54da6d717a06 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -57,8 +57,6 @@ kmem_flags_convert(xfs_km_flags_t flags) } extern void *kmem_alloc(size_t, xfs_km_flags_t); -extern void *kmem_alloc_io(size_t size, int align_mask, xfs_km_flags_t flags); -extern void *kmem_alloc_large(size_t size, xfs_km_flags_t); static inline void kmem_free(const void *ptr) { kvfree(ptr); diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c 
index ee9ec0c50bec..005abfd9fd34 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -313,7 +313,6 @@ xfs_get_aghdr_buf( if (error) return error; - bp->b_bn = blkno; bp->b_maps[0].bm_bn = blkno; bp->b_ops = ops; @@ -469,7 +468,7 @@ xfs_rmaproot_init( rrec->rm_offset = 0; /* account for refc btree root */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { rrec = XFS_RMAP_REC_ADDR(block, 5); rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp)); rrec->rm_blockcount = cpu_to_be32(1); @@ -528,7 +527,7 @@ xfs_agfblock_init( agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (xfs_has_rmapbt(mp)) { agf->agf_roots[XFS_BTNUM_RMAPi] = cpu_to_be32(XFS_RMAP_BLOCK(mp)); agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); @@ -541,9 +540,9 @@ xfs_agfblock_init( tmpsize = id->agsize - mp->m_ag_prealloc_blocks; agf->agf_freeblks = cpu_to_be32(tmpsize); agf->agf_longest = cpu_to_be32(tmpsize); - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { agf->agf_refcount_root = cpu_to_be32( xfs_refc_block(mp)); agf->agf_refcount_level = cpu_to_be32(1); @@ -569,7 +568,7 @@ xfs_agflblock_init( __be32 *agfl_bno; int bucket; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); agfl->agfl_seqno = cpu_to_be32(id->agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); @@ -599,17 +598,17 @@ xfs_agiblock_init( agi->agi_freecount = 0; agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (xfs_has_finobt(mp)) 
{ agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); agi->agi_free_level = cpu_to_be32(1); } for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + if (xfs_has_inobtcounts(mp)) { agi->agi_iblocks = cpu_to_be32(1); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) agi->agi_fblocks = cpu_to_be32(1); } } @@ -719,14 +718,14 @@ xfs_ag_init_headers( .ops = &xfs_finobt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_FINO, - .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) + .need_init = xfs_has_finobt(mp) }, { /* RMAP root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)), .numblks = BTOBB(mp->m_sb.sb_blocksize), .ops = &xfs_rmapbt_buf_ops, .work = &xfs_rmaproot_init, - .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb) + .need_init = xfs_has_rmapbt(mp) }, { /* REFC root block */ .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)), @@ -734,7 +733,7 @@ xfs_ag_init_headers( .ops = &xfs_refcountbt_buf_ops, .work = &xfs_btroot_init, .type = XFS_BTNUM_REFC, - .need_init = xfs_sb_version_hasreflink(&mp->m_sb) + .need_init = xfs_has_reflink(mp) }, { /* NULL terminating block */ .daddr = XFS_BUF_DADDR_NULL, diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 6929157d8d6e..95157f5a5a6c 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -51,7 +51,7 @@ xfs_agfl_size( { unsigned int size = mp->m_sb.sb_sectsize; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) size -= sizeof(struct xfs_agfl); return size / sizeof(xfs_agblock_t); @@ -61,9 +61,9 @@ unsigned int xfs_refc_block( struct xfs_mount *mp) { - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) return XFS_RMAP_BLOCK(mp) + 1; - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) return XFS_FIBT_BLOCK(mp) + 1; return XFS_IBT_BLOCK(mp) + 1; } @@ -72,11 +72,11 @@ xfs_extlen_t 
xfs_prealloc_blocks( struct xfs_mount *mp) { - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) return xfs_refc_block(mp) + 1; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) return XFS_RMAP_BLOCK(mp) + 1; - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) return XFS_FIBT_BLOCK(mp) + 1; return XFS_IBT_BLOCK(mp) + 1; } @@ -126,11 +126,11 @@ xfs_alloc_ag_max_usable( blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ blocks += XFS_ALLOC_AGFL_RESERVE; blocks += 3; /* AGF, AGI btree root blocks */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) blocks++; /* finobt root block */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) blocks++; /* rmap root block */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) blocks++; /* refcount root block */ return mp->m_sb.sb_agblocks - blocks; @@ -598,7 +598,7 @@ xfs_agfl_verify( * AGFL is what the AGF says is active. We can't get to the AGF, so we * can't verify just those entries are valid. */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return NULL; if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) @@ -638,7 +638,7 @@ xfs_agfl_read_verify( * AGFL is what the AGF says is active. We can't get to the AGF, so we * can't verify just those entries are valid. 
*/ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) @@ -659,7 +659,7 @@ xfs_agfl_write_verify( xfs_failaddr_t fa; /* no verification of non-crc AGFLs */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; fa = xfs_agfl_verify(bp); @@ -2264,7 +2264,7 @@ xfs_alloc_min_freelist( min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, mp->m_ag_maxlevels); /* space needed reverse mapping used space btree */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, mp->m_rmap_maxlevels); @@ -2373,7 +2373,7 @@ xfs_agfl_needs_reset( int active; /* no agfl header on v4 supers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return false; /* @@ -2877,7 +2877,7 @@ xfs_agf_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_agf *agf = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) @@ -2907,12 +2907,12 @@ xfs_agf_verify( be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > mp->m_ag_maxlevels) return __this_address; - if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + if (xfs_has_rmapbt(mp) && (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > mp->m_rmap_maxlevels)) return __this_address; - if (xfs_sb_version_hasrmapbt(&mp->m_sb) && + if (xfs_has_rmapbt(mp) && be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) return __this_address; @@ -2925,16 +2925,16 @@ xfs_agf_verify( if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) return __this_address; - if (xfs_sb_version_haslazysbcount(&mp->m_sb) && + if (xfs_has_lazysbcount(mp) && be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) return __this_address; - if (xfs_sb_version_hasreflink(&mp->m_sb) && + 
if (xfs_has_reflink(mp) && be32_to_cpu(agf->agf_refcount_blocks) > be32_to_cpu(agf->agf_length)) return __this_address; - if (xfs_sb_version_hasreflink(&mp->m_sb) && + if (xfs_has_reflink(mp) && (be32_to_cpu(agf->agf_refcount_level) < 1 || be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) return __this_address; @@ -2950,7 +2950,7 @@ xfs_agf_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -2975,7 +2975,7 @@ xfs_agf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -3073,13 +3073,13 @@ xfs_alloc_read_agf( * counter only tracks non-root blocks. */ allocbt_blks = pag->pagf_btreeblks; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; if (allocbt_blks > 0) atomic64_add(allocbt_blks, &mp->m_allocbt_blks); } #ifdef DEBUG - else if (!XFS_FORCED_SHUTDOWN(mp)) { + else if (!xfs_is_shutdown(mp)) { ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); @@ -3166,7 +3166,7 @@ xfs_alloc_vextent( * the first a.g. fails. 
*/ if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && - (mp->m_flags & XFS_MOUNT_32BITINODES)) { + xfs_is_inode32(mp)) { args->fsbno = XFS_AGB_TO_FSB(mp, ((mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount), 0); @@ -3392,7 +3392,7 @@ struct xfs_alloc_query_range_info { STATIC int xfs_alloc_query_range_helper( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { struct xfs_alloc_query_range_info *query = priv; @@ -3407,8 +3407,8 @@ xfs_alloc_query_range_helper( int xfs_alloc_query_range( struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *low_rec, - struct xfs_alloc_rec_incore *high_rec, + const struct xfs_alloc_rec_incore *low_rec, + const struct xfs_alloc_rec_incore *high_rec, xfs_alloc_query_range_fn fn, void *priv) { diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index e30900b6f8ba..df4aefaf0046 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -220,13 +220,13 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); typedef int (*xfs_alloc_query_range_fn)( - struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *rec, - void *priv); + struct xfs_btree_cur *cur, + const struct xfs_alloc_rec_incore *rec, + void *priv); int xfs_alloc_query_range(struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *low_rec, - struct xfs_alloc_rec_incore *high_rec, + const struct xfs_alloc_rec_incore *low_rec, + const struct xfs_alloc_rec_incore *high_rec, xfs_alloc_query_range_fn fn, void *priv); int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, void *priv); @@ -243,7 +243,7 @@ static inline __be32 * xfs_buf_to_agfl_bno( struct xfs_buf *bp) { - if (xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + if (xfs_has_crc(bp->b_mount)) return bp->b_addr + sizeof(struct xfs_agfl); return bp->b_addr; } diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c 
index 6b363f78cfa2..6746fd735550 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -31,9 +31,9 @@ xfs_allocbt_dup_cursor( STATIC void xfs_allocbt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -50,10 +50,10 @@ xfs_allocbt_set_root( STATIC int xfs_allocbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { int error; xfs_agblock_t bno; @@ -87,7 +87,7 @@ xfs_allocbt_free_block( xfs_agblock_t bno; int error; - bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1); if (error) return error; @@ -103,11 +103,11 @@ xfs_allocbt_free_block( */ STATIC void xfs_allocbt_update_lastrec( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - union xfs_btree_rec *rec, - int ptr, - int reason) + struct xfs_btree_cur *cur, + const struct xfs_btree_block *block, + const union xfs_btree_rec *rec, + int ptr, + int reason) { struct xfs_agf *agf = cur->bc_ag.agbp->b_addr; struct xfs_perag *pag; @@ -177,8 +177,8 @@ xfs_allocbt_get_maxrecs( STATIC void xfs_allocbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->alloc.ar_startblock = rec->alloc.ar_startblock; key->alloc.ar_blockcount = rec->alloc.ar_blockcount; @@ -186,10 +186,10 @@ xfs_allocbt_init_key_from_rec( STATIC void xfs_bnobt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - __u32 x; + __u32 x; x = 
be32_to_cpu(rec->alloc.ar_startblock); x += be32_to_cpu(rec->alloc.ar_blockcount) - 1; @@ -199,8 +199,8 @@ xfs_bnobt_init_high_key_from_rec( STATIC void xfs_cntbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->alloc.ar_blockcount = rec->alloc.ar_blockcount; key->alloc.ar_startblock = 0; @@ -229,23 +229,23 @@ xfs_allocbt_init_ptr_from_cur( STATIC int64_t xfs_bnobt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; - xfs_alloc_key_t *kp = &key->alloc; + struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; + const struct xfs_alloc_rec *kp = &key->alloc; return (int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } STATIC int64_t xfs_cntbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; - xfs_alloc_key_t *kp = &key->alloc; - int64_t diff; + struct xfs_alloc_rec_incore *rec = &cur->bc_rec.a; + const struct xfs_alloc_rec *kp = &key->alloc; + int64_t diff; diff = (int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; if (diff) @@ -256,9 +256,9 @@ xfs_cntbt_key_diff( STATIC int64_t xfs_bnobt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return (int64_t)be32_to_cpu(k1->alloc.ar_startblock) - be32_to_cpu(k2->alloc.ar_startblock); @@ -266,11 +266,11 @@ xfs_bnobt_diff_two_keys( STATIC int64_t xfs_cntbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { - int64_t diff; + int64_t diff; diff = 
be32_to_cpu(k1->alloc.ar_blockcount) - be32_to_cpu(k2->alloc.ar_blockcount); @@ -295,7 +295,7 @@ xfs_allocbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; @@ -376,9 +376,9 @@ const struct xfs_buf_ops xfs_cntbt_buf_ops = { STATIC int xfs_bnobt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->alloc.ar_startblock) < be32_to_cpu(k2->alloc.ar_startblock); @@ -386,9 +386,9 @@ xfs_bnobt_keys_inorder( STATIC int xfs_bnobt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->alloc.ar_startblock) + be32_to_cpu(r1->alloc.ar_blockcount) <= @@ -397,9 +397,9 @@ xfs_bnobt_recs_inorder( STATIC int xfs_cntbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->alloc.ar_blockcount) < be32_to_cpu(k2->alloc.ar_blockcount) || @@ -410,9 +410,9 @@ xfs_cntbt_keys_inorder( STATIC int xfs_cntbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->alloc.ar_blockcount) < be32_to_cpu(r2->alloc.ar_blockcount) || @@ -498,7 +498,7 @@ xfs_allocbt_init_common( atomic_inc(&pag->pag_ref); cur->bc_ag.pag = pag; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; return cur; diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h 
index 9eb4c667a6b8..2f6b816aaf9f 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -20,7 +20,7 @@ struct xbtree_afakeroot; * Btree block header size depends on a superblock flag. */ #define XFS_ALLOC_BLOCK_LEN(mp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + (xfs_has_crc(((mp))) ? \ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 191d51725988..fbc9d816882c 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -146,7 +146,7 @@ xfs_attr_get( XFS_STATS_INC(args->dp->i_mount, xs_attr_get); - if (XFS_FORCED_SHUTDOWN(args->dp->i_mount)) + if (xfs_is_shutdown(args->dp->i_mount)) return -EIO; args->geo = args->dp->i_mount->m_attr_geo; @@ -224,7 +224,7 @@ xfs_attr_try_sf_addname( if (!error && !(args->op_flags & XFS_DA_OP_NOTIME)) xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG); - if (dp->i_mount->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(dp->i_mount)) xfs_trans_set_sync(args->trans); return error; @@ -335,6 +335,7 @@ xfs_attr_sf_addname( * the attr fork to leaf format and will restart with the leaf * add. */ + trace_xfs_attr_sf_addname_return(XFS_DAS_UNINIT, args->dp); dac->flags |= XFS_DAC_DEFER_FINISH; return -EAGAIN; } @@ -394,6 +395,8 @@ xfs_attr_set_iter( * handling code below */ dac->flags |= XFS_DAC_DEFER_FINISH; + trace_xfs_attr_set_iter_return( + dac->dela_state, args->dp); return -EAGAIN; } else if (error) { return error; @@ -411,6 +414,7 @@ xfs_attr_set_iter( dac->dela_state = XFS_DAS_FOUND_NBLK; } + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; case XFS_DAS_FOUND_LBLK: /* @@ -438,6 +442,8 @@ xfs_attr_set_iter( error = xfs_attr_rmtval_set_blk(dac); if (error) return error; + trace_xfs_attr_set_iter_return(dac->dela_state, + args->dp); return -EAGAIN; } @@ -472,6 +478,7 @@ xfs_attr_set_iter( * series. 
*/ dac->dela_state = XFS_DAS_FLIP_LFLAG; + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; case XFS_DAS_FLIP_LFLAG: /* @@ -488,11 +495,15 @@ xfs_attr_set_iter( /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ dac->dela_state = XFS_DAS_RM_LBLK; if (args->rmtblkno) { - error = __xfs_attr_rmtval_remove(dac); + error = xfs_attr_rmtval_remove(dac); + if (error == -EAGAIN) + trace_xfs_attr_set_iter_return( + dac->dela_state, args->dp); if (error) return error; dac->dela_state = XFS_DAS_RD_LEAF; + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; } @@ -542,6 +553,8 @@ xfs_attr_set_iter( error = xfs_attr_rmtval_set_blk(dac); if (error) return error; + trace_xfs_attr_set_iter_return( + dac->dela_state, args->dp); return -EAGAIN; } @@ -577,6 +590,7 @@ xfs_attr_set_iter( * series */ dac->dela_state = XFS_DAS_FLIP_NFLAG; + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; case XFS_DAS_FLIP_NFLAG: @@ -595,11 +609,16 @@ xfs_attr_set_iter( /* Set state in case xfs_attr_rmtval_remove returns -EAGAIN */ dac->dela_state = XFS_DAS_RM_NBLK; if (args->rmtblkno) { - error = __xfs_attr_rmtval_remove(dac); + error = xfs_attr_rmtval_remove(dac); + if (error == -EAGAIN) + trace_xfs_attr_set_iter_return( + dac->dela_state, args->dp); + if (error) return error; dac->dela_state = XFS_DAS_CLR_FLAG; + trace_xfs_attr_set_iter_return(dac->dela_state, args->dp); return -EAGAIN; } @@ -623,8 +642,8 @@ out: /* * Return EEXIST if attr is found, or ENOATTR if not */ -int -xfs_has_attr( +static int +xfs_attr_lookup( struct xfs_da_args *args) { struct xfs_inode *dp = args->dp; @@ -691,7 +710,7 @@ xfs_attr_set( int rmt_blks = 0; unsigned int total; - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + if (xfs_is_shutdown(dp->i_mount)) return -EIO; error = xfs_qm_dqattach(dp); @@ -761,8 +780,8 @@ xfs_attr_set( goto out_trans_cancel; } + error = xfs_attr_lookup(args); if (args->value) { - error = xfs_has_attr(args); if (error == 
-EEXIST && (args->attr_flags & XATTR_CREATE)) goto out_trans_cancel; if (error == -ENOATTR && (args->attr_flags & XATTR_REPLACE)) @@ -777,7 +796,6 @@ xfs_attr_set( if (!args->trans) goto out_unlock; } else { - error = xfs_has_attr(args); if (error != -EEXIST) goto out_trans_cancel; @@ -790,7 +808,7 @@ xfs_attr_set( * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(args->trans); if (!(args->op_flags & XFS_DA_OP_NOTIME)) @@ -1176,6 +1194,8 @@ xfs_attr_node_addname( * this point. */ dac->flags |= XFS_DAC_DEFER_FINISH; + trace_xfs_attr_node_addname_return( + dac->dela_state, args->dp); return -EAGAIN; } @@ -1421,11 +1441,14 @@ xfs_attr_remove_iter( * May return -EAGAIN. Roll and repeat until all remote * blocks are removed. */ - error = __xfs_attr_rmtval_remove(dac); - if (error == -EAGAIN) + error = xfs_attr_rmtval_remove(dac); + if (error == -EAGAIN) { + trace_xfs_attr_remove_iter_return( + dac->dela_state, args->dp); return error; - else if (error) + } else if (error) { goto out; + } /* * Refill the state structure with buffers (the prior @@ -1438,6 +1461,7 @@ xfs_attr_remove_iter( goto out; dac->dela_state = XFS_DAS_RM_NAME; dac->flags |= XFS_DAC_DEFER_FINISH; + trace_xfs_attr_remove_iter_return(dac->dela_state, args->dp); return -EAGAIN; } @@ -1466,6 +1490,8 @@ xfs_attr_remove_iter( dac->flags |= XFS_DAC_DEFER_FINISH; dac->dela_state = XFS_DAS_RM_SHRINK; + trace_xfs_attr_remove_iter_return( + dac->dela_state, args->dp); return -EAGAIN; } @@ -1514,7 +1540,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->disk_blkno = xfs_buf_daddr(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; @@ -1529,7 +1555,7 @@ 
xfs_attr_fillstate(xfs_da_state_t *state) ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); for (blk = path->blk, level = 0; level < path->active; blk++, level++) { if (blk->bp) { - blk->disk_blkno = XFS_BUF_ADDR(blk->bp); + blk->disk_blkno = xfs_buf_daddr(blk->bp); blk->bp = NULL; } else { blk->disk_blkno = 0; diff --git a/fs/xfs/libxfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h index 8de5d1d2733e..5e71f719bdd5 100644 --- a/fs/xfs/libxfs/xfs_attr.h +++ b/fs/xfs/libxfs/xfs_attr.h @@ -490,7 +490,6 @@ int xfs_attr_get_ilocked(struct xfs_da_args *args); int xfs_attr_get(struct xfs_da_args *args); int xfs_attr_set(struct xfs_da_args *args); int xfs_attr_set_args(struct xfs_da_args *args); -int xfs_has_attr(struct xfs_da_args *args); int xfs_attr_remove_args(struct xfs_da_args *args); int xfs_attr_remove_iter(struct xfs_delattr_context *dac); bool xfs_attr_namecheck(const void *name, size_t length); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index b910bd209949..e1d11e314228 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -384,7 +384,7 @@ xfs_attr3_leaf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -406,7 +406,7 @@ xfs_attr3_leaf_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_ATTR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -489,7 +489,7 @@ xfs_attr_copy_value( } if (!args->value) { - args->value = kmem_alloc_large(valuelen, KM_NOLOCKDEP); + args->value = kvmalloc(valuelen, GFP_KERNEL | __GFP_NOLOCKDEP); if (!args->value) return -ENOMEM; } @@ -568,7 +568,7 @@ xfs_attr_shortform_bytesfit( * literal area, but for the old format we are done if there is no * space in the fixed attribute fork. 
*/ - if (!(mp->m_flags & XFS_MOUNT_ATTR2)) + if (!xfs_has_attr2(mp)) return 0; dsize = dp->i_df.if_bytes; @@ -576,7 +576,7 @@ xfs_attr_shortform_bytesfit( switch (dp->i_df.if_format) { case XFS_DINODE_FMT_EXTENTS: /* - * If there is no attr fork and the data fork is extents, + * If there is no attr fork and the data fork is extents, * determine if creating the default attr fork will result * in the extents form migrating to btree. If so, the * minimum offset only needs to be the space required for @@ -621,21 +621,27 @@ xfs_attr_shortform_bytesfit( } /* - * Switch on the ATTR2 superblock bit (implies also FEATURES2) + * Switch on the ATTR2 superblock bit (implies also FEATURES2) unless: + * - noattr2 mount option is set, + * - on-disk version bit says it is already set, or + * - the attr2 mount option is not set to enable automatic upgrade from attr1. */ STATIC void -xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp) +xfs_sbversion_add_attr2( + struct xfs_mount *mp, + struct xfs_trans *tp) { - if ((mp->m_flags & XFS_MOUNT_ATTR2) && - !(xfs_sb_version_hasattr2(&mp->m_sb))) { - spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr2(&mp->m_sb)) { - xfs_sb_version_addattr2(&mp->m_sb); - spin_unlock(&mp->m_sb_lock); - xfs_log_sb(tp); - } else - spin_unlock(&mp->m_sb_lock); - } + if (xfs_has_noattr2(mp)) + return; + if (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT) + return; + if (!xfs_has_attr2(mp)) + return; + + spin_lock(&mp->m_sb_lock); + xfs_add_attr2(mp); + spin_unlock(&mp->m_sb_lock); + xfs_log_sb(tp); } /* @@ -810,8 +816,7 @@ xfs_attr_sf_removename( * Fix up the start offset of the attribute fork */ totsize -= size; - if (totsize == sizeof(xfs_attr_sf_hdr_t) && - (mp->m_flags & XFS_MOUNT_ATTR2) && + if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & XFS_DA_OP_ADDNAME)) { xfs_attr_fork_remove(dp, args->trans); @@ -821,7 +826,7 @@ xfs_attr_sf_removename( 
ASSERT(dp->i_forkoff); ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) || (args->op_flags & XFS_DA_OP_ADDNAME) || - !(mp->m_flags & XFS_MOUNT_ATTR2) || + !xfs_has_attr2(mp) || dp->i_df.if_format == XFS_DINODE_FMT_BTREE); xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA); @@ -997,7 +1002,7 @@ xfs_attr_shortform_allfit( bytes += xfs_attr_sf_entsize_byname(name_loc->namelen, be16_to_cpu(name_loc->valuelen)); } - if ((dp->i_mount->m_flags & XFS_MOUNT_ATTR2) && + if (xfs_has_attr2(dp->i_mount) && (dp->i_df.if_format != XFS_DINODE_FMT_BTREE) && (bytes == sizeof(struct xfs_attr_sf_hdr))) return -1; @@ -1122,7 +1127,7 @@ xfs_attr3_leaf_to_shortform( goto out; if (forkoff == -1) { - ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); + ASSERT(xfs_has_attr2(dp->i_mount)); ASSERT(dp->i_df.if_format != XFS_DINODE_FMT_BTREE); xfs_attr_fork_remove(dp, args->trans); goto out; @@ -1199,9 +1204,9 @@ xfs_attr3_leaf_to_node( xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF); bp2->b_ops = bp1->b_ops; memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_blkinfo *hdr3 = bp2->b_addr; - hdr3->blkno = cpu_to_be64(bp2->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2)); } xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1); @@ -1264,12 +1269,12 @@ xfs_attr3_leaf_create( memset(&ichdr, 0, sizeof(ichdr)); ichdr.firstused = args->geo->blksize; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_blkinfo *hdr3 = bp->b_addr; ichdr.magic = XFS_ATTR3_LEAF_MAGIC; - hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 0c8bee3abc3b..83b95be9ded8 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -51,7 +51,7 @@ 
xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); return (attrlen + buflen - 1) / buflen; } @@ -126,11 +126,11 @@ __xfs_attr3_rmt_read_verify( int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return 0; ptr = bp->b_addr; - bno = bp->b_bn; + bno = xfs_buf_daddr(bp); len = BBTOB(bp->b_length); ASSERT(len >= blksize); @@ -191,11 +191,11 @@ xfs_attr3_rmt_write_verify( xfs_daddr_t bno; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; ptr = bp->b_addr; - bno = bp->b_bn; + bno = xfs_buf_daddr(bp); len = BBTOB(bp->b_length); ASSERT(len >= blksize); @@ -246,7 +246,7 @@ xfs_attr3_rmt_hdr_set( { struct xfs_attr3_rmt_hdr *rmt = ptr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return 0; rmt->rm_magic = cpu_to_be32(XFS_ATTR3_RMT_MAGIC); @@ -284,7 +284,7 @@ xfs_attr_rmtval_copyout( uint8_t **dst) { char *src = bp->b_addr; - xfs_daddr_t bno = bp->b_bn; + xfs_daddr_t bno = xfs_buf_daddr(bp); int len = BBTOB(bp->b_length); int blksize = mp->m_attr_geo->blksize; @@ -296,7 +296,7 @@ xfs_attr_rmtval_copyout( byte_cnt = min(*valuelen, byte_cnt); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (xfs_attr3_rmt_hdr_ok(src, ino, *offset, byte_cnt, bno)) { xfs_alert(mp, @@ -332,7 +332,7 @@ xfs_attr_rmtval_copyin( uint8_t **src) { char *dst = bp->b_addr; - xfs_daddr_t bno = bp->b_bn; + xfs_daddr_t bno = xfs_buf_daddr(bp); int len = BBTOB(bp->b_length); int blksize = mp->m_attr_geo->blksize; @@ -672,7 +672,7 @@ xfs_attr_rmtval_invalidate( * routine until it returns something other than -EAGAIN. 
*/ int -__xfs_attr_rmtval_remove( +xfs_attr_rmtval_remove( struct xfs_delattr_context *dac) { struct xfs_da_args *args = dac->da_args; @@ -696,6 +696,7 @@ __xfs_attr_rmtval_remove( */ if (!done) { dac->flags |= XFS_DAC_DEFER_FINISH; + trace_xfs_attr_rmtval_remove_return(dac->dela_state, args->dp); return -EAGAIN; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h index 61b85b918db8..d72eff30ca18 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.h +++ b/fs/xfs/libxfs/xfs_attr_remote.h @@ -12,7 +12,7 @@ int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map, xfs_buf_flags_t incore_flags); int xfs_attr_rmtval_invalidate(struct xfs_da_args *args); -int __xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); +int xfs_attr_rmtval_remove(struct xfs_delattr_context *dac); int xfs_attr_rmt_find_hole(struct xfs_da_args *args); int xfs_attr_rmtval_set_value(struct xfs_da_args *args); int xfs_attr_rmtval_set_blk(struct xfs_delattr_context *dac); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 948092babb6a..b48230f1a361 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -242,7 +242,7 @@ xfs_bmap_get_bp( for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { if (!cur->bc_bufs[i]) break; - if (XFS_BUF_ADDR(cur->bc_bufs[i]) == bno) + if (xfs_buf_daddr(cur->bc_bufs[i]) == bno) return cur->bc_bufs[i]; } @@ -251,7 +251,7 @@ xfs_bmap_get_bp( struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip; if (bip->bli_item.li_type == XFS_LI_BUF && - XFS_BUF_ADDR(bip->bli_buf) == bno) + xfs_buf_daddr(bip->bli_buf) == bno) return bip->bli_buf; } @@ -739,7 +739,7 @@ xfs_bmap_extents_to_btree( */ abp->b_ops = &xfs_bmbt_buf_ops; ablock = XFS_BUF_TO_BLOCK(abp); - xfs_btree_init_block_int(mp, ablock, abp->b_bn, + xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp), XFS_BTNUM_BMAP, 0, 0, ip->i_ino, XFS_BTREE_LONG_PTRS); @@ -1047,7 +1047,7 @@ 
xfs_bmap_set_attrforkoff( ip->i_forkoff = xfs_attr_shortform_bytesfit(ip, size); if (!ip->i_forkoff) ip->i_forkoff = default_size; - else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version) + else if (xfs_has_attr2(ip->i_mount) && version) *version = 2; break; default: @@ -1115,17 +1115,17 @@ xfs_bmap_add_attrfork( xfs_trans_log_inode(tp, ip, logflags); if (error) goto trans_cancel; - if (!xfs_sb_version_hasattr(&mp->m_sb) || - (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) { + if (!xfs_has_attr(mp) || + (!xfs_has_attr2(mp) && version == 2)) { bool log_sb = false; spin_lock(&mp->m_sb_lock); - if (!xfs_sb_version_hasattr(&mp->m_sb)) { - xfs_sb_version_addattr(&mp->m_sb); + if (!xfs_has_attr(mp)) { + xfs_add_attr(mp); log_sb = true; } - if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) { - xfs_sb_version_addattr2(&mp->m_sb); + if (!xfs_has_attr2(mp) && version == 2) { + xfs_add_attr2(mp); log_sb = true; } spin_unlock(&mp->m_sb_lock); @@ -3422,7 +3422,7 @@ xfs_bmap_compute_alignments( int stripe_align = 0; /* stripe alignment for allocation is determined by mount parameters */ - if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + if (mp->m_swidth && xfs_has_swalloc(mp)) stripe_align = mp->m_swidth; else if (mp->m_dalign) stripe_align = mp->m_dalign; @@ -3938,7 +3938,7 @@ xfs_bmapi_read( XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) return -EFSCORRUPTED; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapr); @@ -4420,7 +4420,7 @@ xfs_bmapi_write( return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; XFS_STATS_INC(mp, xs_blk_mapw); @@ -4703,7 +4703,7 @@ xfs_bmapi_remap( return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_iread_extents(tp, ip, whichfork); @@ -5361,7 +5361,7 @@ __xfs_bunmapi( ifp = XFS_IFORK_PTR(ip, whichfork); if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) return -EFSCORRUPTED; 
- if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -5852,7 +5852,7 @@ xfs_bmap_collapse_extents( return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); @@ -5930,7 +5930,7 @@ xfs_bmap_can_insert_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -5967,7 +5967,7 @@ xfs_bmap_insert_extents( return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL)); @@ -6070,7 +6070,7 @@ xfs_bmap_split_extent( return -EFSCORRUPTED; } - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* Read in all the extents */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 1ceba020940e..72444b8b38a6 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -58,7 +58,7 @@ xfs_bmdr_to_bmbt( void xfs_bmbt_disk_get_all( - struct xfs_bmbt_rec *rec, + const struct xfs_bmbt_rec *rec, struct xfs_bmbt_irec *irec) { uint64_t l0 = get_unaligned_be64(&rec->l0); @@ -78,7 +78,7 @@ xfs_bmbt_disk_get_all( */ xfs_filblks_t xfs_bmbt_disk_get_blockcount( - xfs_bmbt_rec_t *r) + const struct xfs_bmbt_rec *r) { return (xfs_filblks_t)(be64_to_cpu(r->l1) & xfs_mask64lo(21)); } @@ -88,7 +88,7 @@ xfs_bmbt_disk_get_blockcount( */ xfs_fileoff_t xfs_bmbt_disk_get_startoff( - xfs_bmbt_rec_t *r) + const struct xfs_bmbt_rec *r) { return ((xfs_fileoff_t)be64_to_cpu(r->l0) & xfs_mask64lo(64 - BMBT_EXNTFLAG_BITLEN)) >> 9; @@ -136,7 +136,7 @@ xfs_bmbt_to_bmdr( xfs_bmbt_key_t *tkp; __be64 *tpp; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { ASSERT(rblock->bb_magic == cpu_to_be32(XFS_BMAP_CRC_MAGIC)); ASSERT(uuid_equal(&rblock->bb_u.l.bb_uuid, 
&mp->m_sb.sb_meta_uuid)); @@ -193,10 +193,10 @@ xfs_bmbt_update_cursor( STATIC int xfs_bmbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -282,7 +282,7 @@ xfs_bmbt_free_block( struct xfs_mount *mp = cur->bc_mp; struct xfs_inode *ip = cur->bc_ino.ip; struct xfs_trans *tp = cur->bc_tp; - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); struct xfs_owner_info oinfo; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); @@ -352,8 +352,8 @@ xfs_bmbt_get_dmaxrecs( STATIC void xfs_bmbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->bmbt.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt)); @@ -361,8 +361,8 @@ xfs_bmbt_init_key_from_rec( STATIC void xfs_bmbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->bmbt.br_startoff = cpu_to_be64( xfs_bmbt_disk_get_startoff(&rec->bmbt) + @@ -387,8 +387,8 @@ xfs_bmbt_init_ptr_from_cur( STATIC int64_t xfs_bmbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { return (int64_t)be64_to_cpu(key->bmbt.br_startoff) - cur->bc_rec.b.br_startoff; @@ -396,12 +396,12 @@ xfs_bmbt_key_diff( STATIC int64_t xfs_bmbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { - uint64_t a = be64_to_cpu(k1->bmbt.br_startoff); - uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); + uint64_t 
a = be64_to_cpu(k1->bmbt.br_startoff); + uint64_t b = be64_to_cpu(k2->bmbt.br_startoff); /* * Note: This routine previously casted a and b to int64 and subtracted @@ -428,7 +428,7 @@ xfs_bmbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { /* * XXX: need a better way of verifying the owner here. Right now * just make sure there has been one set. @@ -497,9 +497,9 @@ const struct xfs_buf_ops xfs_bmbt_buf_ops = { STATIC int xfs_bmbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be64_to_cpu(k1->bmbt.br_startoff) < be64_to_cpu(k2->bmbt.br_startoff); @@ -507,9 +507,9 @@ xfs_bmbt_keys_inorder( STATIC int xfs_bmbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return xfs_bmbt_disk_get_startoff(&r1->bmbt) + xfs_bmbt_disk_get_blockcount(&r1->bmbt) <= @@ -563,7 +563,7 @@ xfs_bmbt_init_cursor( cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; cur->bc_ino.forksize = XFS_IFORK_SIZE(ip, whichfork); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 72bf74c79fb9..729e3bc569be 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -16,7 +16,7 @@ struct xfs_trans; * Btree block header size depends on a superblock flag. */ #define XFS_BMBT_BLOCK_LEN(mp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + (xfs_has_crc(((mp))) ? 
\ XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN) #define XFS_BMBT_REC_ADDR(mp, block, index) \ @@ -88,9 +88,10 @@ extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s); -extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); -extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); -extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); +extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(const struct xfs_bmbt_rec *r); +extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(const struct xfs_bmbt_rec *r); +void xfs_bmbt_disk_get_all(const struct xfs_bmbt_rec *r, + struct xfs_bmbt_irec *s); extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, xfs_bmdr_block_t *, int); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index be74a6b53689..298395481713 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -64,13 +64,13 @@ __xfs_btree_check_lblock( { struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_has_crc(mp); if (crc) { if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.l.bb_blkno != - cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) return __this_address; if (block->bb_u.l.bb_pad != cpu_to_be32(0)) return __this_address; @@ -129,13 +129,13 @@ __xfs_btree_check_sblock( { struct xfs_mount *mp = cur->bc_mp; xfs_btnum_t btnum = cur->bc_btnum; - int crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_has_crc(mp); if (crc) { if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (block->bb_u.s.bb_blkno != - cpu_to_be64(bp ? bp->b_bn : XFS_BUF_DADDR_NULL)) + cpu_to_be64(bp ? 
xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL)) return __this_address; } @@ -225,10 +225,10 @@ xfs_btree_check_sptr( */ static int xfs_btree_check_ptr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int index, - int level) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int index, + int level) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]), @@ -273,7 +273,7 @@ xfs_btree_lblock_calc_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; - if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); @@ -287,7 +287,7 @@ xfs_btree_lblock_verify_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF); @@ -311,7 +311,7 @@ xfs_btree_sblock_calc_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_buf_log_item *bip = bp->b_log_item; - if (!xfs_sb_version_hascrc(&bp->b_mount->m_sb)) + if (!xfs_has_crc(bp->b_mount)) return; if (bip) block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn); @@ -325,7 +325,7 @@ xfs_btree_sblock_verify_crc( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_mount *mp = bp->b_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn))) return false; return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF); @@ -374,7 +374,7 @@ xfs_btree_del_cursor( } ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 || - XFS_FORCED_SHUTDOWN(cur->bc_mp)); + xfs_is_shutdown(cur->bc_mp)); if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) kmem_free(cur->bc_ops); if (!(cur->bc_flags & 
XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) @@ -420,7 +420,7 @@ xfs_btree_dup_cursor( bp = cur->bc_bufs[i]; if (bp) { error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - XFS_BUF_ADDR(bp), mp->m_bsize, + xfs_buf_daddr(bp), mp->m_bsize, 0, &bp, cur->bc_ops->buf_ops); if (error) { @@ -935,9 +935,9 @@ xfs_btree_readahead( STATIC int xfs_btree_ptr_to_daddr( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - xfs_daddr_t *daddr) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + xfs_daddr_t *daddr) { xfs_fsblock_t fsbno; xfs_agblock_t agbno; @@ -1012,8 +1012,8 @@ xfs_btree_setbuf( bool xfs_btree_ptr_is_null( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr) { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return ptr->l == cpu_to_be64(NULLFSBLOCK); @@ -1059,10 +1059,10 @@ xfs_btree_get_sibling( void xfs_btree_set_sibling( - struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - union xfs_btree_ptr *ptr, - int lr) + struct xfs_btree_cur *cur, + struct xfs_btree_block *block, + const union xfs_btree_ptr *ptr, + int lr) { ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB); @@ -1090,7 +1090,7 @@ xfs_btree_init_block_int( __u64 owner, unsigned int flags) { - int crc = xfs_sb_version_hascrc(&mp->m_sb); + int crc = xfs_has_crc(mp); __u32 magic = xfs_btree_magic(crc, btnum); buf->bb_magic = cpu_to_be32(magic); @@ -1131,7 +1131,7 @@ xfs_btree_init_block( __u16 numrecs, __u64 owner) { - xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, + xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp), btnum, level, numrecs, owner, 0); } @@ -1155,9 +1155,9 @@ xfs_btree_init_block_cur( else owner = cur->bc_ag.pag->pag_agno; - xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn, - cur->bc_btnum, level, numrecs, - owner, cur->bc_flags); + xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), + xfs_buf_daddr(bp), cur->bc_btnum, level, + numrecs, owner, 
cur->bc_flags); } /* @@ -1192,10 +1192,10 @@ xfs_btree_buf_to_ptr( { if (cur->bc_flags & XFS_BTREE_LONG_PTRS) ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp, - XFS_BUF_ADDR(bp))); + xfs_buf_daddr(bp))); else { ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp, - XFS_BUF_ADDR(bp))); + xfs_buf_daddr(bp))); } } @@ -1229,10 +1229,10 @@ xfs_btree_set_refs( int xfs_btree_get_buf_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - struct xfs_btree_block **block, - struct xfs_buf **bpp) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + struct xfs_btree_block **block, + struct xfs_buf **bpp) { struct xfs_mount *mp = cur->bc_mp; xfs_daddr_t d; @@ -1257,11 +1257,11 @@ xfs_btree_get_buf_block( */ STATIC int xfs_btree_read_buf_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int flags, - struct xfs_btree_block **block, - struct xfs_buf **bpp) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int flags, + struct xfs_btree_block **block, + struct xfs_buf **bpp) { struct xfs_mount *mp = cur->bc_mp; xfs_daddr_t d; @@ -1289,10 +1289,10 @@ xfs_btree_read_buf_block( */ void xfs_btree_copy_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *dst_key, - union xfs_btree_key *src_key, - int numkeys) + struct xfs_btree_cur *cur, + union xfs_btree_key *dst_key, + const union xfs_btree_key *src_key, + int numkeys) { ASSERT(numkeys >= 0); memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len); @@ -1713,10 +1713,10 @@ error0: int xfs_btree_lookup_get_block( - struct xfs_btree_cur *cur, /* btree cursor */ - int level, /* level in the btree */ - union xfs_btree_ptr *pp, /* ptr to btree block */ - struct xfs_btree_block **blkp) /* return btree block */ + struct xfs_btree_cur *cur, /* btree cursor */ + int level, /* level in the btree */ + const union xfs_btree_ptr *pp, /* ptr to btree block */ + struct xfs_btree_block **blkp) /* return btree block */ { struct xfs_buf *bp; /* buffer pointer for btree block */ xfs_daddr_t daddr; 
@@ -1739,7 +1739,7 @@ xfs_btree_lookup_get_block( error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); if (error) return error; - if (bp && XFS_BUF_ADDR(bp) == daddr) { + if (bp && xfs_buf_daddr(bp) == daddr) { *blkp = XFS_BUF_TO_BLOCK(bp); return 0; } @@ -1749,7 +1749,7 @@ xfs_btree_lookup_get_block( return error; /* Check the inode owner since the verifiers don't. */ - if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + if (xfs_has_crc(cur->bc_mp) && !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != @@ -2923,10 +2923,11 @@ xfs_btree_new_iroot( */ memcpy(cblock, block, xfs_btree_block_len(cur)); if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + __be64 bno = cpu_to_be64(xfs_buf_daddr(cbp)); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) - cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + cblock->bb_u.l.bb_blkno = bno; else - cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + cblock->bb_u.s.bb_blkno = bno; } be16_add_cpu(&block->bb_level, 1); @@ -3225,7 +3226,7 @@ xfs_btree_insrec( /* Get pointers to the btree buffer and block. */ block = xfs_btree_get_block(cur, level, &bp); - old_bn = bp ? bp->b_bn : XFS_BUF_DADDR_NULL; + old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL; numrecs = xfs_btree_get_numrecs(block); #ifdef DEBUG @@ -3341,7 +3342,7 @@ xfs_btree_insrec( * some records into the new tree block), so use the regular key * update mechanism. 
*/ - if (bp && bp->b_bn != old_bn) { + if (bp && xfs_buf_daddr(bp) != old_bn) { xfs_btree_get_keys(cur, block, lkey); } else if (xfs_btree_needs_key_update(cur, optr)) { error = xfs_btree_update_keys(cur, level); @@ -4418,11 +4419,11 @@ xfs_btree_lblock_v5hdr_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return __this_address; if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (block->bb_u.l.bb_blkno != cpu_to_be64(bp->b_bn)) + if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; if (owner != XFS_RMAP_OWN_UNKNOWN && be64_to_cpu(block->bb_u.l.bb_owner) != owner) @@ -4468,11 +4469,11 @@ xfs_btree_sblock_v5hdr_verify( struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); struct xfs_perag *pag = bp->b_pag; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return __this_address; if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp))) return __this_address; if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) return __this_address; @@ -4499,7 +4500,7 @@ xfs_btree_sblock_verify( return __this_address; /* sibling pointer verification */ - agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); if (block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK) && !xfs_verify_agbno(mp, agno, be32_to_cpu(block->bb_u.s.bb_leftsib))) return __this_address; @@ -4536,8 +4537,8 @@ xfs_btree_compute_maxlevels( STATIC int xfs_btree_simple_query_range( struct xfs_btree_cur *cur, - union xfs_btree_key *low_key, - union xfs_btree_key *high_key, + const union xfs_btree_key *low_key, + const union xfs_btree_key *high_key, xfs_btree_query_range_fn fn, void *priv) { @@ -4627,8 +4628,8 @@ out: 
STATIC int xfs_btree_overlapped_query_range( struct xfs_btree_cur *cur, - union xfs_btree_key *low_key, - union xfs_btree_key *high_key, + const union xfs_btree_key *low_key, + const union xfs_btree_key *high_key, xfs_btree_query_range_fn fn, void *priv) { @@ -4769,8 +4770,8 @@ out: int xfs_btree_query_range( struct xfs_btree_cur *cur, - union xfs_btree_irec *low_rec, - union xfs_btree_irec *high_rec, + const union xfs_btree_irec *low_rec, + const union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv) { @@ -4877,7 +4878,7 @@ xfs_btree_diff_two_ptrs( STATIC int xfs_btree_has_record_helper( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { return -ECANCELED; @@ -4886,12 +4887,12 @@ xfs_btree_has_record_helper( /* Is there a record covering a given range of keys? */ int xfs_btree_has_record( - struct xfs_btree_cur *cur, - union xfs_btree_irec *low, - union xfs_btree_irec *high, - bool *exists) + struct xfs_btree_cur *cur, + const union xfs_btree_irec *low, + const union xfs_btree_irec *high, + bool *exists) { - int error; + int error; error = xfs_btree_query_range(cur, low, high, &xfs_btree_has_record_helper, NULL); diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4dbdc659c396..4eaf8517f850 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -106,19 +106,19 @@ struct xfs_btree_ops { /* update btree root pointer */ void (*set_root)(struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, int level_change); + const union xfs_btree_ptr *nptr, int level_change); /* block allocation / freeing */ int (*alloc_block)(struct xfs_btree_cur *cur, - union xfs_btree_ptr *start_bno, + const union xfs_btree_ptr *start_bno, union xfs_btree_ptr *new_bno, int *stat); int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp); /* update last record information */ void (*update_lastrec)(struct xfs_btree_cur *cur, - struct xfs_btree_block *block, - union xfs_btree_rec 
*rec, + const struct xfs_btree_block *block, + const union xfs_btree_rec *rec, int ptr, int reason); /* records in block/level */ @@ -130,37 +130,37 @@ struct xfs_btree_ops { /* init values of btree structures */ void (*init_key_from_rec)(union xfs_btree_key *key, - union xfs_btree_rec *rec); + const union xfs_btree_rec *rec); void (*init_rec_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_rec *rec); void (*init_ptr_from_cur)(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); void (*init_high_key_from_rec)(union xfs_btree_key *key, - union xfs_btree_rec *rec); + const union xfs_btree_rec *rec); /* difference between key value and cursor value */ int64_t (*key_diff)(struct xfs_btree_cur *cur, - union xfs_btree_key *key); + const union xfs_btree_key *key); /* * Difference between key2 and key1 -- positive if key1 > key2, * negative if key1 < key2, and zero if equal. */ int64_t (*diff_two_keys)(struct xfs_btree_cur *cur, - union xfs_btree_key *key1, - union xfs_btree_key *key2); + const union xfs_btree_key *key1, + const union xfs_btree_key *key2); const struct xfs_buf_ops *buf_ops; /* check that k1 is lower than k2 */ int (*keys_inorder)(struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2); + const union xfs_btree_key *k1, + const union xfs_btree_key *k2); /* check that r1 is lower than r2 */ int (*recs_inorder)(struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2); + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2); }; /* @@ -423,7 +423,7 @@ void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int); /* * Helpers. 
*/ -static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block) +static inline int xfs_btree_get_numrecs(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_numrecs); } @@ -434,7 +434,7 @@ static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block, block->bb_numrecs = cpu_to_be16(numrecs); } -static inline int xfs_btree_get_level(struct xfs_btree_block *block) +static inline int xfs_btree_get_level(const struct xfs_btree_block *block) { return be16_to_cpu(block->bb_level); } @@ -471,10 +471,11 @@ unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); * code on its own. */ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, void *priv); + const union xfs_btree_rec *rec, void *priv); int xfs_btree_query_range(struct xfs_btree_cur *cur, - union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec, + const union xfs_btree_irec *low_rec, + const union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv); int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, void *priv); @@ -502,10 +503,11 @@ union xfs_btree_key *xfs_btree_high_key_addr(struct xfs_btree_cur *cur, int n, union xfs_btree_ptr *xfs_btree_ptr_addr(struct xfs_btree_cur *cur, int n, struct xfs_btree_block *block); int xfs_btree_lookup_get_block(struct xfs_btree_cur *cur, int level, - union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); + const union xfs_btree_ptr *pp, struct xfs_btree_block **blkp); struct xfs_btree_block *xfs_btree_get_block(struct xfs_btree_cur *cur, int level, struct xfs_buf **bpp); -bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); +bool xfs_btree_ptr_is_null(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr); int64_t xfs_btree_diff_two_ptrs(struct xfs_btree_cur *cur, const union xfs_btree_ptr *a, const union xfs_btree_ptr *b); @@ -516,8 +518,9 @@ void xfs_btree_get_keys(struct xfs_btree_cur *cur, 
struct xfs_btree_block *block, union xfs_btree_key *key); union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); -int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, - union xfs_btree_irec *high, bool *exists); +int xfs_btree_has_record(struct xfs_btree_cur *cur, + const union xfs_btree_irec *low, + const union xfs_btree_irec *high, bool *exists); bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); @@ -540,10 +543,11 @@ xfs_btree_islastblock( void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr); -int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, - struct xfs_btree_block **block, struct xfs_buf **bpp); +int xfs_btree_get_buf_block(struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, struct xfs_btree_block **block, + struct xfs_buf **bpp); void xfs_btree_set_sibling(struct xfs_btree_cur *cur, - struct xfs_btree_block *block, union xfs_btree_ptr *ptr, + struct xfs_btree_block *block, const union xfs_btree_ptr *ptr, int lr); void xfs_btree_init_block_cur(struct xfs_btree_cur *cur, struct xfs_buf *bp, int level, int numrecs); @@ -551,7 +555,7 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur, union xfs_btree_ptr *dst_ptr, const union xfs_btree_ptr *src_ptr, int numptrs); void xfs_btree_copy_keys(struct xfs_btree_cur *cur, - union xfs_btree_key *dst_key, union xfs_btree_key *src_key, - int numkeys); + union xfs_btree_key *dst_key, + const union xfs_btree_key *src_key, int numkeys); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index aa8dc9521c39..ac9e80152b5c 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -59,10 +59,10 @@ xfs_btree_fakeroot_dup_cursor( */ STATIC int xfs_btree_fakeroot_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr 
*start_bno, - union xfs_btree_ptr *new_bno, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start_bno, + union xfs_btree_ptr *new_bno, + int *stat) { ASSERT(0); return -EFSCORRUPTED; @@ -112,9 +112,9 @@ xfs_btree_fakeroot_init_ptr_from_cur( /* Update the btree root information for a per-AG fake root. */ STATIC void xfs_btree_afakeroot_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { struct xbtree_afakeroot *afake = cur->bc_ag.afake; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 747ec77912c3..c062e2c85178 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -129,7 +129,7 @@ xfs_da3_node_hdr_from_disk( struct xfs_da3_icnode_hdr *to, struct xfs_da_intnode *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_intnode *from3 = (struct xfs_da3_intnode *)from; to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); @@ -156,7 +156,7 @@ xfs_da3_node_hdr_to_disk( struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_intnode *to3 = (struct xfs_da3_intnode *)to; ASSERT(from->magic == XFS_DA3_NODE_MAGIC); @@ -191,10 +191,10 @@ xfs_da3_blkinfo_verify( if (!xfs_verify_magic16(bp, hdr->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -253,7 +253,7 @@ xfs_da3_node_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -442,12 +442,12 @@ xfs_da3_node_create( xfs_trans_buf_set_type(tp, bp, 
XFS_BLFT_DA_NODE_BUF); node = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr)); ichdr.magic = XFS_DA3_NODE_MAGIC; - hdr3->info.blkno = cpu_to_be64(bp->b_bn); + hdr3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->info.owner = cpu_to_be64(args->dp->i_ino); uuid_copy(&hdr3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { @@ -711,7 +711,7 @@ xfs_da3_root_split( oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) { struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node; - node3->hdr.info.blkno = cpu_to_be64(bp->b_bn); + node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); } xfs_trans_log_buf(tp, bp, 0, size - 1); @@ -1219,7 +1219,7 @@ xfs_da3_root_join( xfs_trans_buf_copy_type(root_blk->bp, bp); if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) { struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr; - da3->blkno = cpu_to_be64(root_blk->bp->b_bn); + da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp)); } xfs_trans_log_buf(args->trans, root_blk->bp, 0, args->geo->blksize - 1); diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index b876b44c0204..5a49caa5c9df 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -789,7 +789,7 @@ struct xfs_attr3_rmt_hdr { #define XFS_ATTR3_RMT_CRC_OFF offsetof(struct xfs_attr3_rmt_hdr, rm_crc) #define XFS_ATTR3_RMT_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + ((bufsize) - (xfs_has_crc((mp)) ? \ sizeof(struct xfs_attr3_rmt_hdr) : 0)) /* Number of bytes in a directory block. 
*/ diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 050bdcc4fe73..50546eadaae2 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -115,7 +115,7 @@ xfs_da_mount( dageo->fsblog = mp->m_sb.sb_blocklog; dageo->blksize = xfs_dir2_dirblock_bytes(&mp->m_sb); dageo->fsbcount = 1 << mp->m_sb.sb_dirblklog; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { dageo->node_hdr_size = sizeof(struct xfs_da3_node_hdr); dageo->leaf_hdr_size = sizeof(struct xfs_dir3_leaf_hdr); dageo->free_hdr_size = sizeof(struct xfs_dir3_free_hdr); @@ -730,7 +730,7 @@ xfs_dir2_hashname( struct xfs_mount *mp, struct xfs_name *name) { - if (unlikely(xfs_sb_version_hasasciici(&mp->m_sb))) + if (unlikely(xfs_has_asciici(mp))) return xfs_ascii_ci_hashname(name); return xfs_da_hashname(name->name, name->len); } @@ -741,7 +741,7 @@ xfs_dir2_compname( const unsigned char *name, int len) { - if (unlikely(xfs_sb_version_hasasciici(&args->dp->i_mount->m_sb))) + if (unlikely(xfs_has_asciici(args->dp->i_mount))) return xfs_ascii_ci_compname(args, name, len); return xfs_da_compname(args, name, len); } diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 75e1421f69c4..df0869bba275 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -53,10 +53,10 @@ xfs_dir3_block_verify( if (!xfs_verify_magic(bp, hdr3->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -71,7 +71,7 @@ xfs_dir3_block_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) 
xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -96,7 +96,7 @@ xfs_dir3_block_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -121,7 +121,7 @@ xfs_dir3_block_header_check( { struct xfs_mount *mp = dp->i_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (be64_to_cpu(hdr3->owner) != dp->i_ino) @@ -171,10 +171,10 @@ xfs_dir3_block_init( bp->b_ops = &xfs_dir3_block_buf_ops; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_BLOCK_BUF); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC); - hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); return; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index e67fa086f2c1..dbcf58979a59 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -29,7 +29,7 @@ xfs_dir2_data_bestfree_p( struct xfs_mount *mp, struct xfs_dir2_data_hdr *hdr) { - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) return ((struct xfs_dir3_data_hdr *)hdr)->best_free; return hdr->bestfree; } @@ -51,7 +51,7 @@ xfs_dir2_data_get_ftype( struct xfs_mount *mp, struct xfs_dir2_data_entry *dep) { - if (xfs_sb_version_hasftype(&mp->m_sb)) { + if (xfs_has_ftype(mp)) { uint8_t ftype = dep->name[dep->namelen]; if (likely(ftype < XFS_DIR3_FT_MAX)) @@ -70,7 +70,7 @@ xfs_dir2_data_put_ftype( ASSERT(ftype < XFS_DIR3_FT_MAX); ASSERT(dep->namelen != 0); - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) dep->name[dep->namelen] = ftype; } @@ -297,10 +297,10 @@ xfs_dir3_data_verify( if (!xfs_verify_magic(bp, hdr3->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&hdr3->uuid, 
&mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -343,7 +343,7 @@ xfs_dir3_data_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_DATA_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -368,7 +368,7 @@ xfs_dir3_data_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -401,7 +401,7 @@ xfs_dir3_data_header_check( { struct xfs_mount *mp = dp->i_mount; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_data_hdr *hdr3 = bp->b_addr; if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino) @@ -717,12 +717,12 @@ xfs_dir3_data_init( * Initialize the header. */ hdr = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; memset(hdr3, 0, sizeof(*hdr3)); hdr3->magic = cpu_to_be32(XFS_DIR3_DATA_MAGIC); - hdr3->blkno = cpu_to_be64(bp->b_bn); + hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->uuid, &mp->m_sb.sb_meta_uuid); diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 5369d8bb2593..d9b66306a9a7 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -37,7 +37,7 @@ xfs_dir2_leaf_hdr_from_disk( struct xfs_dir3_icleaf_hdr *to, struct xfs_dir2_leaf *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_leaf *from3 = (struct xfs_dir3_leaf *)from; to->forw = be32_to_cpu(from3->hdr.info.hdr.forw); @@ -68,7 +68,7 @@ xfs_dir2_leaf_hdr_to_disk( struct xfs_dir2_leaf *to, struct xfs_dir3_icleaf_hdr *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if 
(xfs_has_crc(mp)) { struct xfs_dir3_leaf *to3 = (struct xfs_dir3_leaf *)to; ASSERT(from->magic == XFS_DIR3_LEAF1_MAGIC || @@ -108,7 +108,7 @@ xfs_dir3_leaf1_check( if (leafhdr.magic == XFS_DIR3_LEAF1_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp)) return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAF1_MAGIC) return __this_address; @@ -209,7 +209,7 @@ xfs_dir3_leaf_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_LEAF_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -234,7 +234,7 @@ xfs_dir3_leaf_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -308,7 +308,7 @@ xfs_dir3_leaf_init( ASSERT(type == XFS_DIR2_LEAF1_MAGIC || type == XFS_DIR2_LEAFN_MAGIC); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; memset(leaf3, 0, sizeof(*leaf3)); @@ -316,7 +316,7 @@ xfs_dir3_leaf_init( leaf3->info.hdr.magic = (type == XFS_DIR2_LEAF1_MAGIC) ? 
cpu_to_be16(XFS_DIR3_LEAF1_MAGIC) : cpu_to_be16(XFS_DIR3_LEAFN_MAGIC); - leaf3->info.blkno = cpu_to_be64(bp->b_bn); + leaf3->info.blkno = cpu_to_be64(xfs_buf_daddr(bp)); leaf3->info.owner = cpu_to_be64(owner); uuid_copy(&leaf3->info.uuid, &mp->m_sb.sb_meta_uuid); } else { diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index d0520afb913a..7a03aeb9f4c9 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -68,7 +68,7 @@ xfs_dir3_leafn_check( if (leafhdr.magic == XFS_DIR3_LEAFN_MAGIC) { struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr; - if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn) + if (be64_to_cpu(leaf3->info.blkno) != xfs_buf_daddr(bp)) return __this_address; } else if (leafhdr.magic != XFS_DIR2_LEAFN_MAGIC) return __this_address; @@ -105,12 +105,12 @@ xfs_dir3_free_verify( if (!xfs_verify_magic(bp, hdr->magic)) return __this_address; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr; if (!uuid_equal(&hdr3->uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (be64_to_cpu(hdr3->blkno) != bp->b_bn) + if (be64_to_cpu(hdr3->blkno) != xfs_buf_daddr(bp)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn))) return __this_address; @@ -128,7 +128,7 @@ xfs_dir3_free_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_DIR3_FREE_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -153,7 +153,7 @@ xfs_dir3_free_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -185,7 +185,7 @@ xfs_dir3_free_header_check( firstdb = (xfs_dir2_da_to_db(mp->m_dir_geo, fbno) - xfs_dir2_byte_to_db(mp->m_dir_geo, XFS_DIR2_FREE_OFFSET)) * maxbests; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; if 
(be32_to_cpu(hdr3->firstdb) != firstdb) @@ -247,7 +247,7 @@ xfs_dir2_free_hdr_from_disk( struct xfs_dir3_icfree_hdr *to, struct xfs_dir2_free *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free *from3 = (struct xfs_dir3_free *)from; to->magic = be32_to_cpu(from3->hdr.hdr.magic); @@ -274,7 +274,7 @@ xfs_dir2_free_hdr_to_disk( struct xfs_dir2_free *to, struct xfs_dir3_icfree_hdr *from) { - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free *to3 = (struct xfs_dir3_free *)to; ASSERT(from->magic == XFS_DIR3_FREE_MAGIC); @@ -341,12 +341,12 @@ xfs_dir3_free_get_buf( memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); memset(&hdr, 0, sizeof(hdr)); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; hdr.magic = XFS_DIR3_FREE_MAGIC; - hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); + hdr3->hdr.blkno = cpu_to_be64(xfs_buf_daddr(bp)); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_meta_uuid); } else diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index 94943ce49cab..711709a2aa53 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -196,7 +196,7 @@ xfs_dir2_data_entsize( len = offsetof(struct xfs_dir2_data_entry, name[0]) + namelen + sizeof(xfs_dir2_data_off_t) /* tag */; - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) len += sizeof(uint8_t); return round_up(len, XFS_DIR2_DATA_ALIGN); } diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c index 46d18bf9d5e1..5a97a87eaa20 100644 --- a/fs/xfs/libxfs/xfs_dir2_sf.c +++ b/fs/xfs/libxfs/xfs_dir2_sf.c @@ -48,7 +48,7 @@ xfs_dir2_sf_entsize( count += sizeof(struct xfs_dir2_sf_entry); /* namelen + offset */ count += hdr->i8count ? 
XFS_INO64_SIZE : XFS_INO32_SIZE; /* ino # */ - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) count += sizeof(uint8_t); return count; } @@ -76,7 +76,7 @@ xfs_dir2_sf_get_ino( { uint8_t *from = sfep->name + sfep->namelen; - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) from++; if (!hdr->i8count) @@ -95,7 +95,7 @@ xfs_dir2_sf_put_ino( ASSERT(ino <= XFS_MAXINUMBER); - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) to++; if (hdr->i8count) @@ -135,7 +135,7 @@ xfs_dir2_sf_get_ftype( struct xfs_mount *mp, struct xfs_dir2_sf_entry *sfep) { - if (xfs_sb_version_hasftype(&mp->m_sb)) { + if (xfs_has_ftype(mp)) { uint8_t ftype = sfep->name[sfep->namelen]; if (ftype < XFS_DIR3_FT_MAX) @@ -153,7 +153,7 @@ xfs_dir2_sf_put_ftype( { ASSERT(ftype < XFS_DIR3_FT_MAX); - if (xfs_sb_version_hasftype(&mp->m_sb)) + if (xfs_has_ftype(mp)) sfep->name[sfep->namelen] = ftype; } @@ -192,7 +192,7 @@ xfs_dir2_block_sfsize( * if there is a filetype field, add the extra byte to the namelen * for each entry that we see. */ - has_ftype = xfs_sb_version_hasftype(&mp->m_sb) ? 1 : 0; + has_ftype = xfs_has_ftype(mp) ? 
1 : 0; count = i8count = namelen = 0; btp = xfs_dir2_block_tail_p(geo, hdr); diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 6766417d5ba4..deeb74becabc 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -70,7 +70,7 @@ xfs_dquot_verify( return __this_address; if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && - !xfs_sb_version_hasbigtime(&mp->m_sb)) + !xfs_has_bigtime(mp)) return __this_address; if ((ddq->d_type & XFS_DQTYPE_BIGTIME) && !ddq->d_id) @@ -106,7 +106,7 @@ xfs_dqblk_verify( struct xfs_dqblk *dqb, xfs_dqid_t id) /* used only during quotacheck */ { - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; @@ -134,7 +134,7 @@ xfs_dqblk_repair( dqb->dd_diskdq.d_type = type; dqb->dd_diskdq.d_id = cpu_to_be32(id); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -151,7 +151,7 @@ xfs_dquot_buf_verify_crc( int ndquots; int i; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return true; /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 76e2461b9e66..2d7057b7984b 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -9,7 +9,7 @@ /* * XFS On Disk Format Definitions * - * This header file defines all the on-disk format definitions for + * This header file defines all the on-disk format definitions for * general XFS objects. Directory and attribute related objects are defined in * xfs_da_format.h, which log and log item formats are defined in * xfs_log_format.h. Everything else goes here. @@ -265,7 +265,6 @@ typedef struct xfs_dsb { /* must be padded to 64 bit alignment */ } xfs_dsb_t; - /* * Misc. Flags - warning - these will be cleared by xfs_repair unless * a feature bit is set when the flag is used. 
@@ -280,37 +279,9 @@ typedef struct xfs_dsb { #define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS) -/* - * The first XFS version we support is a v4 superblock with V2 directories. - */ -static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp) -{ - if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) - return false; - if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) - return false; - - /* check for unknown features in the fs */ - if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || - ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && - (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) - return false; - - return true; -} - -static inline bool xfs_sb_good_version(struct xfs_sb *sbp) +static inline bool xfs_sb_is_v5(struct xfs_sb *sbp) { - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) - return true; - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) - return xfs_sb_good_v4_features(sbp); - return false; -} - -static inline bool xfs_sb_version_hasrealtime(struct xfs_sb *sbp) -{ - return sbp->sb_rblocks > 0; + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; } /* @@ -322,9 +293,10 @@ static inline bool xfs_sb_has_mismatched_features2(struct xfs_sb *sbp) return sbp->sb_bad_features2 != sbp->sb_features2; } -static inline bool xfs_sb_version_hasattr(struct xfs_sb *sbp) +static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) { - return (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT); + return xfs_sb_is_v5(sbp) || + (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); } static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) @@ -332,87 +304,18 @@ static inline void xfs_sb_version_addattr(struct xfs_sb *sbp) sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT; } -static inline bool xfs_sb_version_hasquota(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT); -} - static inline void xfs_sb_version_addquota(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT; } -static inline bool 
xfs_sb_version_hasalign(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT)); -} - -static inline bool xfs_sb_version_hasdalign(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT); -} - -static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT); -} - -static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT); -} - -static inline bool xfs_sb_version_hasasciici(struct xfs_sb *sbp) -{ - return (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT); -} - -static inline bool xfs_sb_version_hasmorebits(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 || - (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT); -} - -/* - * sb_features2 bit version macros. - */ -static inline bool xfs_sb_version_haslazysbcount(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT)); -} - -static inline bool xfs_sb_version_hasattr2(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT)); -} - static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT; } -static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp) -{ - sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT; - if (!sbp->sb_features2) - sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT; -} - -static inline bool xfs_sb_version_hasprojid32bit(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & 
XFS_SB_VERSION2_PROJID32BIT)); -} - -static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp) +static inline void xfs_sb_version_addprojid32(struct xfs_sb *sbp) { sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT; sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT; @@ -495,106 +398,21 @@ xfs_sb_has_incompat_log_feature( return (sbp->sb_features_log_incompat & feature) != 0; } -/* - * V5 superblock specific feature checks - */ -static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -/* - * v5 file systems support V3 inodes only, earlier file systems support - * v2 and v1 inodes. - */ -static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline bool xfs_dinode_good_version(struct xfs_sb *sbp, - uint8_t version) -{ - if (xfs_sb_version_has_v3inode(sbp)) - return version == 3; - return version == 1 || version == 2; -} - -static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5; -} - -static inline int xfs_sb_version_hasftype(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_FTYPE)) || - (xfs_sb_version_hasmorebits(sbp) && - (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE)); -} - -static inline bool xfs_sb_version_hasfinobt(xfs_sb_t *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); -} - -static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); -} - -/* - * XFS_SB_FEAT_INCOMPAT_META_UUID indicates that the metadata UUID - * is stored separately from the user-visible UUID; this allows the - * user-visible UUID to be changed on V5 filesystems which have a - 
* filesystem UUID stamped into every piece of metadata. - */ -static inline bool xfs_sb_version_hasmetauuid(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID); -} - -static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp) -{ - return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT); -} - -static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK); -} - -static inline bool xfs_sb_version_hasbigtime(struct xfs_sb *sbp) -{ - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME); -} - -/* - * Inode btree block counter. We record the number of inobt and finobt blocks - * in the AGI header so that we can skip the finobt walk at mount time when - * setting up per-AG reservations. - */ -static inline bool xfs_sb_version_hasinobtcounts(struct xfs_sb *sbp) +static inline void +xfs_sb_remove_incompat_log_features( + struct xfs_sb *sbp) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT); + sbp->sb_features_log_incompat &= ~XFS_SB_FEAT_INCOMPAT_LOG_ALL; } -static inline bool xfs_sb_version_needsrepair(struct xfs_sb *sbp) +static inline void +xfs_sb_add_incompat_log_features( + struct xfs_sb *sbp, + unsigned int features) { - return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && - (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR); + sbp->sb_features_log_incompat |= features; } -/* - * end of superblock version macros - */ static inline bool xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino) @@ -1062,12 +880,12 @@ enum xfs_dinode_fmt { /* * Inode size for given fs. */ -#define XFS_DINODE_SIZE(sbp) \ - (xfs_sb_version_has_v3inode(sbp) ? 
\ +#define XFS_DINODE_SIZE(mp) \ + (xfs_has_v3inodes(mp) ? \ sizeof(struct xfs_dinode) : \ offsetof(struct xfs_dinode, di_crc)) #define XFS_LITINO(mp) \ - ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb)) + ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(mp)) /* * Inode data & attribute fork sizes, per inode. @@ -1454,7 +1272,7 @@ struct xfs_dsymlink_hdr { #define XFS_SYMLINK_MAPS 3 #define XFS_SYMLINK_BUF_SPACE(mp, bufsize) \ - ((bufsize) - (xfs_sb_version_hascrc(&(mp)->m_sb) ? \ + ((bufsize) - (xfs_has_crc((mp)) ? \ sizeof(struct xfs_dsymlink_hdr) : 0)) @@ -1686,7 +1504,7 @@ struct xfs_rmap_key { typedef __be32 xfs_rmap_ptr_t; #define XFS_RMAP_BLOCK(mp) \ - (xfs_sb_version_hasfinobt(&((mp)->m_sb)) ? \ + (xfs_has_finobt(((mp))) ? \ XFS_FIBT_BLOCK(mp) + 1 : \ XFS_IBT_BLOCK(mp) + 1) @@ -1918,7 +1736,7 @@ struct xfs_acl { * limited only by the maximum size of the xattr that stores the information. */ #define XFS_ACL_MAX_ENTRIES(mp) \ - (xfs_sb_version_hascrc(&mp->m_sb) \ + (xfs_has_crc(mp) \ ? 
(XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ sizeof(struct xfs_acl_entry) \ : 25) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index aaf8805a82df..994ad783d407 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -58,7 +58,7 @@ xfs_inobt_update( union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); - if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + if (xfs_has_sparseinodes(cur->bc_mp)) { rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); rec.inobt.ir_u.sp.ir_count = irec->ir_count; rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; @@ -74,11 +74,11 @@ xfs_inobt_update( void xfs_inobt_btrec_to_irec( struct xfs_mount *mp, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec) { irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) { + if (xfs_has_sparseinodes(mp)) { irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); irec->ir_count = rec->inobt.ir_u.sp.ir_count; irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; @@ -241,7 +241,7 @@ xfs_check_agi_freecount( } } while (i == 1); - if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) + if (!xfs_is_shutdown(cur->bc_mp)) ASSERT(freecount == cur->bc_ag.pag->pagi_freecount); } return 0; @@ -302,7 +302,7 @@ xfs_ialloc_inode_init( * That means for v3 inode we log the entire buffer rather than just the * inode cores. 
*/ - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { version = 3; ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno)); @@ -337,7 +337,6 @@ xfs_ialloc_inode_init( xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length)); for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) { int ioffset = i << mp->m_sb.sb_inodelog; - uint isize = XFS_DINODE_SIZE(&mp->m_sb); free = xfs_make_iptr(mp, fbuf, i); free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); @@ -354,7 +353,7 @@ xfs_ialloc_inode_init( } else if (tp) { /* just log the inode core */ xfs_trans_log_buf(tp, fbuf, ioffset, - ioffset + isize - 1); + ioffset + XFS_DINODE_SIZE(mp) - 1); } } @@ -635,7 +634,7 @@ xfs_ialloc_ag_alloc( #ifdef DEBUG /* randomly do sparse inode allocations */ - if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) && + if (xfs_has_sparseinodes(tp->t_mountp) && igeo->ialloc_min_blks < igeo->ialloc_blks) do_sparse = prandom_u32() & 1; #endif @@ -712,7 +711,7 @@ xfs_ialloc_ag_alloc( */ isaligned = 0; if (igeo->ialloc_align) { - ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); + ASSERT(!xfs_has_noalign(args.mp)); args.alignment = args.mp->m_dalign; isaligned = 1; } else @@ -754,7 +753,7 @@ xfs_ialloc_ag_alloc( * Finally, try a sparse allocation if the filesystem supports it and * the sparse allocation length is smaller than a full chunk. */ - if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && + if (xfs_has_sparseinodes(args.mp) && igeo->ialloc_min_blks < igeo->ialloc_blks && args.fsbno == NULLFSBLOCK) { sparse_alloc: @@ -856,7 +855,7 @@ sparse_alloc: * from the previous call. Set merge false to replace any * existing record with this one. 
*/ - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert_sprec(args.mp, tp, agbp, pag, XFS_BTNUM_FINO, &rec, false); if (error) @@ -869,7 +868,7 @@ sparse_alloc: if (error) return error; - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + if (xfs_has_finobt(args.mp)) { error = xfs_inobt_insert(args.mp, tp, agbp, pag, newino, newlen, XFS_BTNUM_FINO); if (error) @@ -1448,7 +1447,7 @@ xfs_dialloc_ag( int offset; int i; - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + if (!xfs_has_finobt(mp)) return xfs_dialloc_ag_inobt(tp, agbp, pag, parent, inop); /* @@ -1784,7 +1783,7 @@ xfs_dialloc( break; } - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { error = -EFSCORRUPTED; break; } @@ -1953,8 +1952,7 @@ xfs_difree_inobt( * remove the chunk if the block size is large enough for multiple inode * chunks (that might not be free). */ - if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - rec.ir_free == XFS_INOBT_ALL_FREE && + if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { struct xfs_perag *pag = agbp->b_pag; @@ -1994,7 +1992,7 @@ xfs_difree_inobt( goto error0; } - /* + /* * Change the inode free counts and log the ag/sb changes. */ be32_add_cpu(&agi->agi_freecount, 1); @@ -2098,9 +2096,8 @@ xfs_difree_finobt( * enough for multiple chunks. Leave the finobt record to remain in sync * with the inobt. */ - if (rec.ir_free == XFS_INOBT_ALL_FREE && - mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK && - !(mp->m_flags & XFS_MOUNT_IKEEP)) { + if (!xfs_has_ikeep(mp) && rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { error = xfs_btree_delete(cur, &i); if (error) goto error; @@ -2189,7 +2186,7 @@ xfs_difree( /* * Fix up the free inode btree. 
*/ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (xfs_has_finobt(mp)) { error = xfs_difree_finobt(mp, tp, agbp, pag, agino, &rec); if (error) goto error0; @@ -2478,7 +2475,7 @@ xfs_agi_verify( struct xfs_agi *agi = bp->b_addr; int i; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; if (!xfs_log_check_lsn(mp, be64_to_cpu(agi->agi_lsn))) @@ -2497,7 +2494,7 @@ xfs_agi_verify( be32_to_cpu(agi->agi_level) > M_IGEO(mp)->inobt_maxlevels) return __this_address; - if (xfs_sb_version_hasfinobt(&mp->m_sb) && + if (xfs_has_finobt(mp) && (be32_to_cpu(agi->agi_free_level) < 1 || be32_to_cpu(agi->agi_free_level) > M_IGEO(mp)->inobt_maxlevels)) return __this_address; @@ -2528,7 +2525,7 @@ xfs_agi_read_verify( struct xfs_mount *mp = bp->b_mount; xfs_failaddr_t fa; - if (xfs_sb_version_hascrc(&mp->m_sb) && + if (xfs_has_crc(mp) && !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF)) xfs_verifier_error(bp, -EFSBADCRC, __this_address); else { @@ -2553,7 +2550,7 @@ xfs_agi_write_verify( return; } - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (bip) @@ -2626,7 +2623,7 @@ xfs_ialloc_read_agi( * we are in the middle of a forced shutdown. */ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) || - XFS_FORCED_SHUTDOWN(mp)); + xfs_is_shutdown(mp)); return 0; } @@ -2716,7 +2713,7 @@ struct xfs_ialloc_count_inodes { STATIC int xfs_ialloc_count_inodes_rec( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { struct xfs_inobt_rec_incore irec; @@ -2773,7 +2770,7 @@ xfs_ialloc_setup_geometry( uint inodes; igeo->new_diflags2 = 0; - if (xfs_sb_version_hasbigtime(&mp->m_sb)) + if (xfs_has_bigtime(mp)) igeo->new_diflags2 |= XFS_DIFLAG2_BIGTIME; /* Compute inode btree geometry. */ @@ -2828,7 +2825,7 @@ xfs_ialloc_setup_geometry( * cannot change the behavior. 
*/ igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE; - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { int new_size = igeo->inode_cluster_size_raw; new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; @@ -2846,7 +2843,7 @@ xfs_ialloc_setup_geometry( igeo->inodes_per_cluster = XFS_FSB_TO_INO(mp, igeo->blocks_per_cluster); /* Calculate inode cluster alignment. */ - if (xfs_sb_version_hasalign(&mp->m_sb) && + if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt >= igeo->blocks_per_cluster) igeo->cluster_align = mp->m_sb.sb_inoalignmt; else @@ -2894,15 +2891,15 @@ xfs_ialloc_calc_rootino( first_bno += xfs_alloc_min_freelist(mp, NULL); /* ...the free inode btree root... */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) first_bno++; /* ...the reverse mapping btree root... */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) first_bno++; /* ...the reference count btree... */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) first_bno++; /* @@ -2920,9 +2917,9 @@ xfs_ialloc_calc_rootino( * Now round first_bno up to whatever allocation alignment is given * by the filesystem or was passed in. 
*/ - if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0) + if (xfs_has_dalign(mp) && igeo->ialloc_align > 0) first_bno = roundup(first_bno, sunit); - else if (xfs_sb_version_hasalign(&mp->m_sb) && + else if (xfs_has_align(mp) && mp->m_sb.sb_inoalignmt > 1) first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt); @@ -2953,7 +2950,7 @@ xfs_ialloc_check_shrink( int has; int error; - if (!xfs_sb_version_hassparseinodes(&mp->m_sb)) + if (!xfs_has_sparseinodes(mp)) return 0; pag = xfs_perag_get(mp, agno); diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 9a2112b4ad5e..8b5c2b709022 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -106,7 +106,8 @@ int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); union xfs_btree_rec; -void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, union xfs_btree_rec *rec, +void xfs_inobt_btrec_to_irec(struct xfs_mount *mp, + const union xfs_btree_rec *rec, struct xfs_inobt_rec_incore *irec); int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 823a038939f8..27190840c5d8 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -40,9 +40,9 @@ xfs_inobt_dup_cursor( STATIC void xfs_inobt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, - int inc) /* level change */ + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *nptr, + int inc) /* level change */ { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agi *agi = agbp->b_addr; @@ -54,9 +54,9 @@ xfs_inobt_set_root( STATIC void xfs_finobt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *nptr, - int inc) /* level change */ + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *nptr, + int inc) /* level change */ { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agi 
*agi = agbp->b_addr; @@ -76,7 +76,7 @@ xfs_inobt_mod_blockcount( struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agi *agi = agbp->b_addr; - if (!xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) + if (!xfs_has_inobtcounts(cur->bc_mp)) return; if (cur->bc_btnum == XFS_BTNUM_FINO) @@ -88,11 +88,11 @@ xfs_inobt_mod_blockcount( STATIC int __xfs_inobt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat, - enum xfs_ag_resv_type resv) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat, + enum xfs_ag_resv_type resv) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -127,20 +127,20 @@ __xfs_inobt_alloc_block( STATIC int xfs_inobt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); } STATIC int xfs_finobt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { if (cur->bc_mp->m_finobt_nores) return xfs_inobt_alloc_block(cur, start, new, stat); @@ -156,7 +156,7 @@ __xfs_inobt_free_block( { xfs_inobt_mod_blockcount(cur, -1); return xfs_free_extent(cur->bc_tp, - XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, + XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)), 1, &XFS_RMAP_OINFO_INOBT, resv); } @@ -188,18 +188,18 @@ xfs_inobt_get_maxrecs( STATIC void xfs_inobt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->inobt.ir_startino = rec->inobt.ir_startino; } STATIC void xfs_inobt_init_high_key_from_rec( - union xfs_btree_key *key, - 
union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - __u32 x; + __u32 x; x = be32_to_cpu(rec->inobt.ir_startino); x += XFS_INODES_PER_CHUNK - 1; @@ -212,7 +212,7 @@ xfs_inobt_init_rec_from_cur( union xfs_btree_rec *rec) { rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); - if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + if (xfs_has_sparseinodes(cur->bc_mp)) { rec->inobt.ir_u.sp.ir_holemask = cpu_to_be16(cur->bc_rec.i.ir_holemask); rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count; @@ -253,8 +253,8 @@ xfs_finobt_init_ptr_from_cur( STATIC int64_t xfs_inobt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { return (int64_t)be32_to_cpu(key->inobt.ir_startino) - cur->bc_rec.i.ir_startino; @@ -262,9 +262,9 @@ xfs_inobt_key_diff( STATIC int64_t xfs_inobt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return (int64_t)be32_to_cpu(k1->inobt.ir_startino) - be32_to_cpu(k2->inobt.ir_startino); @@ -292,7 +292,7 @@ xfs_inobt_verify( * but beware of the landmine (i.e. need to check pag->pagi_init) if we * ever do. 
*/ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) return fa; @@ -360,9 +360,9 @@ const struct xfs_buf_ops xfs_finobt_buf_ops = { STATIC int xfs_inobt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->inobt.ir_startino) < be32_to_cpu(k2->inobt.ir_startino); @@ -370,9 +370,9 @@ xfs_inobt_keys_inorder( STATIC int xfs_inobt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <= be32_to_cpu(r2->inobt.ir_startino); @@ -446,7 +446,7 @@ xfs_inobt_init_common( cur->bc_blocklog = mp->m_sb.sb_blocklog; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; /* take a reference for the cursor */ @@ -511,7 +511,7 @@ xfs_inobt_commit_staged_btree( fields = XFS_AGI_ROOT | XFS_AGI_LEVEL; agi->agi_root = cpu_to_be32(afake->af_root); agi->agi_level = cpu_to_be32(afake->af_levels); - if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + if (xfs_has_inobtcounts(cur->bc_mp)) { agi->agi_iblocks = cpu_to_be32(afake->af_blocks); fields |= XFS_AGI_IBLOCKS; } @@ -521,7 +521,7 @@ xfs_inobt_commit_staged_btree( fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL; agi->agi_free_root = cpu_to_be32(afake->af_root); agi->agi_free_level = cpu_to_be32(afake->af_levels); - if (xfs_sb_version_hasinobtcounts(&cur->bc_mp->m_sb)) { + if (xfs_has_inobtcounts(cur->bc_mp)) { agi->agi_fblocks = cpu_to_be32(afake->af_blocks); fields |= XFS_AGI_IBLOCKS; } @@ -737,10 +737,10 @@ xfs_finobt_calc_reserves( xfs_extlen_t tree_len = 0; int error; - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + if (!xfs_has_finobt(mp)) return 0; - if 
(xfs_sb_version_hasinobtcounts(&mp->m_sb)) + if (xfs_has_inobtcounts(mp)) error = xfs_finobt_read_blocks(mp, tp, pag, &tree_len); else error = xfs_inobt_count_blocks(mp, tp, pag, XFS_BTNUM_FINO, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index e530c82b2217..8a322d402e61 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -19,7 +19,7 @@ struct xfs_perag; * Btree block header size depends on a superblock flag. */ #define XFS_INOBT_BLOCK_LEN(mp) \ - (xfs_sb_version_hascrc(&((mp)->m_sb)) ? \ + (xfs_has_crc(((mp))) ? \ XFS_BTREE_SBLOCK_CRC_LEN : XFS_BTREE_SBLOCK_LEN) /* diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 84ea2e0af9f0..3932b4ebf903 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -48,7 +48,7 @@ xfs_inode_buf_verify( /* * Validate the magic number and version of every inode in the buffer */ - agno = xfs_daddr_to_agno(mp, XFS_BUF_ADDR(bp)); + agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { int di_ok; @@ -58,7 +58,7 @@ xfs_inode_buf_verify( dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); di_ok = xfs_verify_magic16(bp, dip->di_magic) && - xfs_dinode_good_version(&mp->m_sb, dip->di_version) && + xfs_dinode_good_version(mp, dip->di_version) && xfs_verify_agino_or_null(mp, agno, unlinked_ino); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP))) { @@ -71,7 +71,7 @@ xfs_inode_buf_verify( #ifdef DEBUG xfs_alert(mp, "bad inode magic/vsn daddr %lld #%d (magic=%x)", - (unsigned long long)bp->b_bn, i, + (unsigned long long)xfs_buf_daddr(bp), i, be16_to_cpu(dip->di_magic)); #endif xfs_buf_verifier_error(bp, -EFSCORRUPTED, @@ -192,7 +192,7 @@ xfs_inode_from_disk( * inode. If the inode is unused, mode is zero and we shouldn't mess * with the uninitialized part of it. 
*/ - if (!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) + if (!xfs_has_v3inodes(ip->i_mount)) ip->i_flushiter = be16_to_cpu(from->di_flushiter); inode->i_generation = be32_to_cpu(from->di_gen); inode->i_mode = be16_to_cpu(from->di_mode); @@ -235,7 +235,7 @@ xfs_inode_from_disk( if (from->di_dmevmask || from->di_dmstate) xfs_iflags_set(ip, XFS_IPRESERVE_DM_FIELDS); - if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (xfs_has_v3inodes(ip->i_mount)) { inode_set_iversion_queried(inode, be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); @@ -313,7 +313,7 @@ xfs_inode_to_disk( to->di_aformat = xfs_ifork_format(ip->i_afp); to->di_flags = cpu_to_be16(ip->i_diflags); - if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (xfs_has_v3inodes(ip->i_mount)) { to->di_version = 3; to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); @@ -413,7 +413,7 @@ xfs_dinode_verify( /* Verify v3 integrity information first */ if (dip->di_version >= 3) { - if (!xfs_sb_version_has_v3inode(&mp->m_sb)) + if (!xfs_has_v3inodes(mp)) return __this_address; if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) @@ -515,7 +515,7 @@ xfs_dinode_verify( /* don't allow reflink/cowextsize if we don't have reflink */ if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) && - !xfs_sb_version_hasreflink(&mp->m_sb)) + !xfs_has_reflink(mp)) return __this_address; /* only regular files get reflink */ @@ -534,7 +534,7 @@ xfs_dinode_verify( /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && - !xfs_sb_version_hasbigtime(&mp->m_sb)) + !xfs_has_bigtime(mp)) return __this_address; return NULL; @@ -550,7 +550,7 @@ xfs_dinode_calc_crc( if (dip->di_version < 3) return; - ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); + ASSERT(xfs_has_crc(mp)); crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, 
XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); @@ -677,7 +677,7 @@ xfs_inode_validate_cowextsize( hint_flag = (flags2 & XFS_DIFLAG2_COWEXTSIZE); cowextsize_bytes = XFS_FSB_TO_B(mp, cowextsize); - if (hint_flag && !xfs_sb_version_hasreflink(&mp->m_sb)) + if (hint_flag && !xfs_has_reflink(mp)) return __this_address; if (hint_flag && !(S_ISDIR(mode) || S_ISREG(mode))) diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 7f865bb4df84..585ed5a110af 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -21,7 +21,7 @@ struct xfs_imap { int xfs_imap_to_bp(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_imap *imap, struct xfs_buf **bpp); -void xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *); +void xfs_dinode_calc_crc(struct xfs_mount *mp, struct xfs_dinode *dip); void xfs_inode_to_disk(struct xfs_inode *ip, struct xfs_dinode *to, xfs_lsn_t lsn); int xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from); @@ -42,4 +42,13 @@ static inline uint64_t xfs_inode_encode_bigtime(struct timespec64 tv) struct timespec64 xfs_inode_from_disk_ts(struct xfs_dinode *dip, const xfs_timestamp_t ts); +static inline bool +xfs_dinode_good_version(struct xfs_mount *mp, uint8_t version) +{ + if (xfs_has_v3inodes(mp)) + return version == 3; + return version == 1 || version == 2; +} + + #endif /* __XFS_INODE_BUF_H__ */ diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 2c5bcbc19264..b322db523d65 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -41,10 +41,10 @@ typedef uint32_t xlog_tid_t; #define XFS_MIN_LOG_FACTOR 3 #define XLOG_REC_SHIFT(log) \ - BTOBB(1 << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? \ + BTOBB(1 << (xfs_has_logv2(log->l_mp) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) #define XLOG_TOTAL_REC_SHIFT(log) \ - BTOBB(XLOG_MAX_ICLOGS << (xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 
\ + BTOBB(XLOG_MAX_ICLOGS << (xfs_has_logv2(log->l_mp) ? \ XLOG_MAX_RECORD_BSHIFT : XLOG_BIG_RECORD_BSHIFT)) /* get lsn fields */ @@ -434,7 +434,7 @@ struct xfs_log_dinode { }; #define xfs_log_dinode_size(mp) \ - (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? \ + (xfs_has_v3inodes((mp)) ? \ sizeof(struct xfs_log_dinode) : \ offsetof(struct xfs_log_dinode, di_next_unlinked)) diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 3cca2bfe714c..ff69a0000817 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -122,6 +122,8 @@ void xlog_buf_readahead(struct xlog *log, xfs_daddr_t blkno, uint len, const struct xfs_buf_ops *ops); bool xlog_is_buffer_cancelled(struct xlog *log, xfs_daddr_t blkno, uint len); +int xlog_recover_iget(struct xfs_mount *mp, xfs_ino_t ino, + struct xfs_inode **ipp); void xlog_recover_release_intent(struct xlog *log, unsigned short intent_type, uint64_t intent_id); diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index 7f55eb3f3653..67798ff5e14e 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -92,7 +92,7 @@ xfs_log_calc_minimum_size( if (tres.tr_logcount > 1) max_logres *= tres.tr_logcount; - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) lsunit = BTOBB(mp->m_sb.sb_logsunit); /* diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 0f0af4e35032..a02c5062f9b2 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -60,37 +60,15 @@ typedef uint8_t xfs_dqtype_t; #define XFS_DQUOT_LOGRES(mp) \ ((sizeof(struct xfs_dq_logformat) + sizeof(struct xfs_disk_dquot)) * 6) -#define XFS_IS_QUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) -#define XFS_IS_UQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) -#define XFS_IS_PQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) -#define 
XFS_IS_GQUOTA_RUNNING(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) +#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & XFS_ALL_QUOTA_ACCT) +#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACCT) +#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACCT) +#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACCT) #define XFS_IS_UQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_UQUOTA_ENFD) #define XFS_IS_GQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_GQUOTA_ENFD) #define XFS_IS_PQUOTA_ENFORCED(mp) ((mp)->m_qflags & XFS_PQUOTA_ENFD) /* - * Incore only flags for quotaoff - these bits get cleared when quota(s) - * are in the process of getting turned off. These flags are in m_qflags but - * never in sb_qflags. - */ -#define XFS_UQUOTA_ACTIVE 0x1000 /* uquotas are being turned off */ -#define XFS_GQUOTA_ACTIVE 0x2000 /* gquotas are being turned off */ -#define XFS_PQUOTA_ACTIVE 0x4000 /* pquotas are being turned off */ -#define XFS_ALL_QUOTA_ACTIVE \ - (XFS_UQUOTA_ACTIVE | XFS_GQUOTA_ACTIVE | XFS_PQUOTA_ACTIVE) - -/* - * Checking XFS_IS_*QUOTA_ON() while holding any inode lock guarantees - * quota will be not be switched off as long as that inode lock is held. - */ -#define XFS_IS_QUOTA_ON(mp) ((mp)->m_qflags & (XFS_UQUOTA_ACTIVE | \ - XFS_GQUOTA_ACTIVE | \ - XFS_PQUOTA_ACTIVE)) -#define XFS_IS_UQUOTA_ON(mp) ((mp)->m_qflags & XFS_UQUOTA_ACTIVE) -#define XFS_IS_GQUOTA_ON(mp) ((mp)->m_qflags & XFS_GQUOTA_ACTIVE) -#define XFS_IS_PQUOTA_ON(mp) ((mp)->m_qflags & XFS_PQUOTA_ACTIVE) - -/* * Flags to tell various functions what to do. Not all of these are meaningful * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 860a0c9801ba..e5d767a7fc5d 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -91,7 +91,7 @@ xfs_refcount_lookup_eq( /* Convert on-disk record to in-core format. 
*/ void xfs_refcount_btrec_to_irec( - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) { irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock); @@ -1253,7 +1253,7 @@ xfs_refcount_increase_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { - if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) + if (!xfs_has_reflink(tp->t_mountp)) return; __xfs_refcount_add(tp, XFS_REFCOUNT_INCREASE, PREV->br_startblock, @@ -1268,7 +1268,7 @@ xfs_refcount_decrease_extent( struct xfs_trans *tp, struct xfs_bmbt_irec *PREV) { - if (!xfs_sb_version_hasreflink(&tp->t_mountp->m_sb)) + if (!xfs_has_reflink(tp->t_mountp)) return; __xfs_refcount_add(tp, XFS_REFCOUNT_DECREASE, PREV->br_startblock, @@ -1617,7 +1617,7 @@ xfs_refcount_alloc_cow_extent( { struct xfs_mount *mp = tp->t_mountp; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return; __xfs_refcount_add(tp, XFS_REFCOUNT_ALLOC_COW, fsb, len); @@ -1636,7 +1636,7 @@ xfs_refcount_free_cow_extent( { struct xfs_mount *mp = tp->t_mountp; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return; /* Remove rmap entry */ @@ -1654,7 +1654,7 @@ struct xfs_refcount_recovery { STATIC int xfs_refcount_recover_extent( struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, + const union xfs_btree_rec *rec, void *priv) { struct list_head *debris = priv; diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 9f6e9aae4da0..02cb3aa405be 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -78,7 +78,7 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); union xfs_btree_rec; -extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec, +extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_refcount_irec *irec); extern int 
xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 92d336c17e83..1ef9b99962ab 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -31,9 +31,9 @@ xfs_refcountbt_dup_cursor( STATIC void xfs_refcountbt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -51,10 +51,10 @@ xfs_refcountbt_set_root( STATIC int xfs_refcountbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -102,7 +102,7 @@ xfs_refcountbt_free_block( struct xfs_mount *mp = cur->bc_mp; struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; - xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp)); + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp)); int error; trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno, @@ -135,18 +135,18 @@ xfs_refcountbt_get_maxrecs( STATIC void xfs_refcountbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->refc.rc_startblock = rec->refc.rc_startblock; } STATIC void xfs_refcountbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - __u32 x; + __u32 x; x = be32_to_cpu(rec->refc.rc_startblock); x += be32_to_cpu(rec->refc.rc_blockcount) - 1; @@ -177,20 +177,20 @@ xfs_refcountbt_init_ptr_from_cur( STATIC int64_t xfs_refcountbt_key_diff( - struct 
xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { struct xfs_refcount_irec *rec = &cur->bc_rec.rc; - struct xfs_refcount_key *kp = &key->refc; + const struct xfs_refcount_key *kp = &key->refc; return (int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock; } STATIC int64_t xfs_refcountbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return (int64_t)be32_to_cpu(k1->refc.rc_startblock) - be32_to_cpu(k2->refc.rc_startblock); @@ -209,7 +209,7 @@ xfs_refcountbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return __this_address; fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) @@ -269,9 +269,9 @@ const struct xfs_buf_ops xfs_refcountbt_buf_ops = { STATIC int xfs_refcountbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { return be32_to_cpu(k1->refc.rc_startblock) < be32_to_cpu(k2->refc.rc_startblock); @@ -279,9 +279,9 @@ xfs_refcountbt_keys_inorder( STATIC int xfs_refcountbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { return be32_to_cpu(r1->refc.rc_startblock) + be32_to_cpu(r1->refc.rc_blockcount) <= @@ -462,7 +462,7 @@ xfs_refcountbt_calc_reserves( xfs_extlen_t tree_len; int error; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return 0; error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index d1dfad0204e3..f45929b1b94a 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ 
b/fs/xfs/libxfs/xfs_rmap.c @@ -179,8 +179,8 @@ done: /* Convert an internal btree record to an rmap record. */ int xfs_rmap_btrec_to_irec( - union xfs_btree_rec *rec, - struct xfs_rmap_irec *irec) + const union xfs_btree_rec *rec, + struct xfs_rmap_irec *irec) { irec->rm_startblock = be32_to_cpu(rec->rmap.rm_startblock); irec->rm_blockcount = be32_to_cpu(rec->rmap.rm_blockcount); @@ -255,9 +255,9 @@ struct xfs_find_left_neighbor_info { /* For each rmap given, figure out if it matches the key we want. */ STATIC int xfs_rmap_find_left_neighbor_helper( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, - void *priv) + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) { struct xfs_find_left_neighbor_info *info = priv; @@ -331,9 +331,9 @@ xfs_rmap_find_left_neighbor( /* For each rmap given, figure out if it matches the key we want. */ STATIC int xfs_rmap_lookup_le_range_helper( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, - void *priv) + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv) { struct xfs_find_left_neighbor_info *info = priv; @@ -705,7 +705,7 @@ xfs_rmap_free( struct xfs_btree_cur *cur; int error; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); @@ -959,7 +959,7 @@ xfs_rmap_alloc( struct xfs_btree_cur *cur; int error; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag); @@ -2278,9 +2278,9 @@ struct xfs_rmap_query_range_info { /* Format btree record and pass to our callback. 
*/ STATIC int xfs_rmap_query_range_helper( - struct xfs_btree_cur *cur, - union xfs_btree_rec *rec, - void *priv) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *rec, + void *priv) { struct xfs_rmap_query_range_info *query = priv; struct xfs_rmap_irec irec; @@ -2296,8 +2296,8 @@ xfs_rmap_query_range_helper( int xfs_rmap_query_range( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *low_rec, - struct xfs_rmap_irec *high_rec, + const struct xfs_rmap_irec *low_rec, + const struct xfs_rmap_irec *high_rec, xfs_rmap_query_range_fn fn, void *priv) { @@ -2459,7 +2459,7 @@ xfs_rmap_update_is_needed( struct xfs_mount *mp, int whichfork) { - return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK; + return xfs_has_rmapbt(mp) && whichfork != XFS_COW_FORK; } /* @@ -2707,7 +2707,7 @@ struct xfs_rmap_key_state { STATIC int xfs_rmap_has_other_keys_helper( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xfs_rmap_key_state *rks = priv; diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index f2423cf7f1e2..fd67904ed446 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -134,12 +134,13 @@ int xfs_rmap_get_rec(struct xfs_btree_cur *cur, struct xfs_rmap_irec *irec, int *stat); typedef int (*xfs_rmap_query_range_fn)( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, - void *priv); + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *rec, + void *priv); int xfs_rmap_query_range(struct xfs_btree_cur *cur, - struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec, + const struct xfs_rmap_irec *low_rec, + const struct xfs_rmap_irec *high_rec, xfs_rmap_query_range_fn fn, void *priv); int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn, void *priv); @@ -192,7 +193,7 @@ int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_compare(const struct xfs_rmap_irec *a, const struct xfs_rmap_irec *b); 
union xfs_btree_rec; -int xfs_rmap_btrec_to_irec(union xfs_btree_rec *rec, +int xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec, struct xfs_rmap_irec *irec); int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index f29bc71b9950..b7dbbfb3aeed 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -57,9 +57,9 @@ xfs_rmapbt_dup_cursor( STATIC void xfs_rmapbt_set_root( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *ptr, - int inc) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *ptr, + int inc) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -76,10 +76,10 @@ xfs_rmapbt_set_root( STATIC int xfs_rmapbt_alloc_block( - struct xfs_btree_cur *cur, - union xfs_btree_ptr *start, - union xfs_btree_ptr *new, - int *stat) + struct xfs_btree_cur *cur, + const union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) { struct xfs_buf *agbp = cur->bc_ag.agbp; struct xfs_agf *agf = agbp->b_addr; @@ -122,7 +122,7 @@ xfs_rmapbt_free_block( xfs_agblock_t bno; int error; - bno = xfs_daddr_to_agbno(cur->bc_mp, XFS_BUF_ADDR(bp)); + bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp)); trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno, bno, 1); be32_add_cpu(&agf->agf_rmap_blocks, -1); @@ -156,8 +156,8 @@ xfs_rmapbt_get_maxrecs( STATIC void xfs_rmapbt_init_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { key->rmap.rm_startblock = rec->rmap.rm_startblock; key->rmap.rm_owner = rec->rmap.rm_owner; @@ -173,11 +173,11 @@ xfs_rmapbt_init_key_from_rec( */ STATIC void xfs_rmapbt_init_high_key_from_rec( - union xfs_btree_key *key, - union xfs_btree_rec *rec) + union xfs_btree_key *key, + const union xfs_btree_rec *rec) { - uint64_t off; - int adj; + uint64_t off; + int adj; adj = 
be32_to_cpu(rec->rmap.rm_blockcount) - 1; @@ -219,13 +219,13 @@ xfs_rmapbt_init_ptr_from_cur( STATIC int64_t xfs_rmapbt_key_diff( - struct xfs_btree_cur *cur, - union xfs_btree_key *key) + struct xfs_btree_cur *cur, + const union xfs_btree_key *key) { - struct xfs_rmap_irec *rec = &cur->bc_rec.r; - struct xfs_rmap_key *kp = &key->rmap; - __u64 x, y; - int64_t d; + struct xfs_rmap_irec *rec = &cur->bc_rec.r; + const struct xfs_rmap_key *kp = &key->rmap; + __u64 x, y; + int64_t d; d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock; if (d) @@ -249,14 +249,14 @@ xfs_rmapbt_key_diff( STATIC int64_t xfs_rmapbt_diff_two_keys( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { - struct xfs_rmap_key *kp1 = &k1->rmap; - struct xfs_rmap_key *kp2 = &k2->rmap; - int64_t d; - __u64 x, y; + const struct xfs_rmap_key *kp1 = &k1->rmap; + const struct xfs_rmap_key *kp2 = &k2->rmap; + int64_t d; + __u64 x, y; d = (int64_t)be32_to_cpu(kp1->rm_startblock) - be32_to_cpu(kp2->rm_startblock); @@ -304,7 +304,7 @@ xfs_rmapbt_verify( if (!xfs_verify_magic(bp, block->bb_magic)) return __this_address; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return __this_address; fa = xfs_btree_sblock_v5hdr_verify(bp); if (fa) @@ -364,9 +364,9 @@ const struct xfs_buf_ops xfs_rmapbt_buf_ops = { STATIC int xfs_rmapbt_keys_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_key *k1, - union xfs_btree_key *k2) + struct xfs_btree_cur *cur, + const union xfs_btree_key *k1, + const union xfs_btree_key *k2) { uint32_t x; uint32_t y; @@ -394,9 +394,9 @@ xfs_rmapbt_keys_inorder( STATIC int xfs_rmapbt_recs_inorder( - struct xfs_btree_cur *cur, - union xfs_btree_rec *r1, - union xfs_btree_rec *r2) + struct xfs_btree_cur *cur, + const union xfs_btree_rec *r1, + const union xfs_btree_rec *r2) { uint32_t x; uint32_t y; @@ -558,7 +558,7 @@ 
xfs_rmapbt_compute_maxlevels( * disallow reflinking when less than 10% of the per-AG metadata * block reservation since the fallback is a regular file copy. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; else mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( @@ -606,7 +606,7 @@ xfs_rmapbt_calc_reserves( xfs_extlen_t tree_len; int error; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; error = xfs_alloc_read_agf(mp, tp, pag->pag_agno, 0, &agbp); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 88d8d18788a2..f2eee6572af4 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -59,4 +59,4 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp, extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); -#endif /* __XFS_RMAP_BTREE_H__ */ +#endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 483375c6a735..5740ba664867 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1009,8 +1009,8 @@ xfs_rtfree_extent( int xfs_rtalloc_query_range( struct xfs_trans *tp, - struct xfs_rtalloc_rec *low_rec, - struct xfs_rtalloc_rec *high_rec, + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, xfs_rtalloc_query_range_fn fn, void *priv) { @@ -1018,6 +1018,7 @@ xfs_rtalloc_query_range( struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; + xfs_rtblock_t high_key; int is_free; int error = 0; @@ -1026,12 +1027,12 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - high_rec->ar_startext = min(high_rec->ar_startext, - mp->m_sb.sb_rextents - 1); + + high_key = min(high_rec->ar_startext, mp->m_sb.sb_rextents - 
1); /* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; - while (rtstart <= high_rec->ar_startext) { + while (rtstart <= high_key) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, &is_free); @@ -1039,8 +1040,7 @@ xfs_rtalloc_query_range( break; /* How long does the extent go for? */ - error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startext, &rtend); + error = xfs_rtfind_forw(mp, tp, rtstart, high_key, &rtend); if (error) break; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 04f5386446db..e58349be78bd 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -30,13 +30,110 @@ * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ +/* + * We support all XFS versions newer than a v4 superblock with V2 directories. + */ +bool +xfs_sb_good_version( + struct xfs_sb *sbp) +{ + /* all v5 filesystems are supported */ + if (xfs_sb_is_v5(sbp)) + return true; + + /* versions prior to v4 are not supported */ + if (XFS_SB_VERSION_NUM(sbp) < XFS_SB_VERSION_4) + return false; + + /* V4 filesystems need v2 directories and unwritten extents */ + if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) + return false; + if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) + return false; + + /* And must not have any unknown v4 feature bits set */ + if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || + ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && + (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) + return false; + + /* It's a supported v4 filesystem */ + return true; +} + +uint64_t +xfs_sb_version_to_features( + struct xfs_sb *sbp) +{ + uint64_t features = 0; + + /* optional V4 features */ + if (sbp->sb_rblocks > 0) + features |= XFS_FEAT_REALTIME; + if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) + features |= XFS_FEAT_ATTR; + if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) + features |= XFS_FEAT_QUOTA; + if (sbp->sb_versionnum & 
XFS_SB_VERSION_ALIGNBIT) + features |= XFS_FEAT_ALIGN; + if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT) + features |= XFS_FEAT_LOGV2; + if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT) + features |= XFS_FEAT_DALIGN; + if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT) + features |= XFS_FEAT_EXTFLG; + if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) + features |= XFS_FEAT_SECTOR; + if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT) + features |= XFS_FEAT_ASCIICI; + if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { + if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) + features |= XFS_FEAT_LAZYSBCOUNT; + if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) + features |= XFS_FEAT_ATTR2; + if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) + features |= XFS_FEAT_PROJID32; + if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) + features |= XFS_FEAT_FTYPE; + } + + if (!xfs_sb_is_v5(sbp)) + return features; + + /* Always on V5 features */ + features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | + XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | + XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; + + /* Optional V5 features */ + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT) + features |= XFS_FEAT_FINOBT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT) + features |= XFS_FEAT_RMAPBT; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK) + features |= XFS_FEAT_REFLINK; + if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) + features |= XFS_FEAT_INOBTCNT; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) + features |= XFS_FEAT_FTYPE; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) + features |= XFS_FEAT_SPINODES; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) + features |= XFS_FEAT_META_UUID; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME) + features |= XFS_FEAT_BIGTIME; + if (sbp->sb_features_incompat & 
XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) + features |= XFS_FEAT_NEEDSREPAIR; + return features; +} + /* Check all the superblock fields we care about when reading one in. */ STATIC int xfs_validate_sb_read( struct xfs_mount *mp, struct xfs_sb *sbp) { - if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5) + if (!xfs_sb_is_v5(sbp)) return 0; /* @@ -56,7 +153,7 @@ xfs_validate_sb_read( "Superblock has unknown read-only compatible features (0x%x) enabled.", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (!xfs_is_readonly(mp)) { xfs_warn(mp, "Attempted to mount read-only compatible filesystem read-write."); xfs_warn(mp, @@ -95,7 +192,7 @@ xfs_validate_sb_write( * secondary superblocks, so allow this usage to continue because * we never read counters from such superblocks. */ - if (XFS_BUF_ADDR(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && + if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && (sbp->sb_fdblocks > sbp->sb_dblocks || !xfs_verify_icount(mp, sbp->sb_icount) || sbp->sb_ifree > sbp->sb_icount)) { @@ -103,7 +200,7 @@ xfs_validate_sb_write( return -EFSCORRUPTED; } - if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_5) + if (!xfs_sb_is_v5(sbp)) return 0; /* @@ -162,6 +259,7 @@ xfs_validate_sb_common( struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; + bool has_dalign; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "bad magic number"); @@ -173,12 +271,41 @@ xfs_validate_sb_common( return -EWRONGFS; } - if (xfs_sb_version_has_pquotino(sbp)) { + /* + * Validate feature flags and state + */ + if (xfs_sb_is_v5(sbp)) { + if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { + xfs_notice(mp, +"Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)", + sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE); + return -EFSCORRUPTED; + } + + /* V5 has a separate project quota inode */ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { xfs_notice(mp, "Version 5 of Super 
block has XFS_OQUOTA bits."); return -EFSCORRUPTED; } + + /* + * Full inode chunks must be aligned to inode chunk size when + * sparse inodes are enabled to support the sparse chunk + * allocation algorithm and prevent overlapping inode records. + */ + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) { + uint32_t align; + + align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize + >> sbp->sb_blocklog; + if (sbp->sb_inoalignmt != align) { + xfs_warn(mp, +"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", + sbp->sb_inoalignmt, align); + return -EINVAL; + } + } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, @@ -186,24 +313,6 @@ xfs_validate_sb_common( return -EFSCORRUPTED; } - /* - * Full inode chunks must be aligned to inode chunk size when - * sparse inodes are enabled to support the sparse chunk - * allocation algorithm and prevent overlapping inode records. - */ - if (xfs_sb_version_hassparseinodes(sbp)) { - uint32_t align; - - align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize - >> sbp->sb_blocklog; - if (sbp->sb_inoalignmt != align) { - xfs_warn(mp, -"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", - sbp->sb_inoalignmt, align); - return -EINVAL; - } - } - if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -303,7 +412,8 @@ xfs_validate_sb_common( * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign) * would imply the image is corrupted. 
*/ - if (!!sbp->sb_unit ^ xfs_sb_version_hasdalign(sbp)) { + has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT; + if (!!sbp->sb_unit ^ has_dalign) { xfs_notice(mp, "SB stripe alignment sanity check failed"); return -EFSCORRUPTED; } @@ -312,12 +422,6 @@ xfs_validate_sb_common( XFS_FSB_TO_B(mp, sbp->sb_width), 0, false)) return -EFSCORRUPTED; - if (xfs_sb_version_hascrc(&mp->m_sb) && - sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { - xfs_notice(mp, "v5 SB sanity check failed"); - return -EFSCORRUPTED; - } - /* * Currently only very few inode sizes are supported. */ @@ -361,7 +465,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) * We need to do these manipilations only if we are working * with an older version of on-disk superblock. */ - if (xfs_sb_version_has_pquotino(sbp)) + if (xfs_sb_is_v5(sbp)) return; if (sbp->sb_qflags & XFS_OQUOTA_ENFD) @@ -454,7 +558,8 @@ __xfs_sb_from_disk( * sb_meta_uuid is only on disk if it differs from sb_uuid and the * feature flag is set; if not set we keep it only in memory. */ - if (xfs_sb_version_hasmetauuid(to)) + if (xfs_sb_is_v5(to) && + (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); else uuid_copy(&to->sb_meta_uuid, &from->sb_uuid); @@ -479,7 +584,12 @@ xfs_sb_quota_to_disk( uint16_t qflags = from->sb_qflags; to->sb_uquotino = cpu_to_be64(from->sb_uquotino); - if (xfs_sb_version_has_pquotino(from)) { + + /* + * The in-memory superblock quota state matches the v5 on-disk format so + * just write them out and return + */ + if (xfs_sb_is_v5(from)) { to->sb_qflags = cpu_to_be16(from->sb_qflags); to->sb_gquotino = cpu_to_be64(from->sb_gquotino); to->sb_pquotino = cpu_to_be64(from->sb_pquotino); @@ -487,9 +597,9 @@ xfs_sb_quota_to_disk( } /* - * The in-core version of sb_qflags do not have XFS_OQUOTA_* - * flags, whereas the on-disk version does. So, convert incore - * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. 
+ * For older superblocks (v4), the in-core version of sb_qflags do not + * have XFS_OQUOTA_* flags, whereas the on-disk version does. So, + * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. */ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); @@ -589,19 +699,20 @@ xfs_sb_to_disk( to->sb_features2 = cpu_to_be32(from->sb_features2); to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2); - if (xfs_sb_version_hascrc(from)) { - to->sb_features_compat = cpu_to_be32(from->sb_features_compat); - to->sb_features_ro_compat = - cpu_to_be32(from->sb_features_ro_compat); - to->sb_features_incompat = - cpu_to_be32(from->sb_features_incompat); - to->sb_features_log_incompat = - cpu_to_be32(from->sb_features_log_incompat); - to->sb_spino_align = cpu_to_be32(from->sb_spino_align); - to->sb_lsn = cpu_to_be64(from->sb_lsn); - if (xfs_sb_version_hasmetauuid(from)) - uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); - } + if (!xfs_sb_is_v5(from)) + return; + + to->sb_features_compat = cpu_to_be32(from->sb_features_compat); + to->sb_features_ro_compat = + cpu_to_be32(from->sb_features_ro_compat); + to->sb_features_incompat = + cpu_to_be32(from->sb_features_incompat); + to->sb_features_log_incompat = + cpu_to_be32(from->sb_features_log_incompat); + to->sb_spino_align = cpu_to_be32(from->sb_spino_align); + to->sb_lsn = cpu_to_be64(from->sb_lsn); + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) + uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); } /* @@ -636,8 +747,8 @@ xfs_sb_read_verify( if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { /* Only fail bad secondaries on a known V5 filesystem */ - if (bp->b_bn == XFS_SB_DADDR || - xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_buf_daddr(bp) == XFS_SB_DADDR || + xfs_has_crc(mp)) { error = -EFSBADCRC; goto out_error; } @@ -704,7 +815,7 @@ xfs_sb_write_verify( if (error) goto out_error; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_sb_is_v5(&sb)) return; if 
(bip) @@ -801,7 +912,7 @@ xfs_log_sb( * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. */ - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) { + if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); @@ -950,10 +1061,12 @@ out: void xfs_fs_geometry( - struct xfs_sb *sbp, + struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version) { + struct xfs_sb *sbp = &mp->m_sb; + memset(geo, 0, sizeof(struct xfs_fsop_geom)); geo->blocksize = sbp->sb_blocksize; @@ -984,51 +1097,51 @@ xfs_fs_geometry( geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | XFS_FSOP_GEOM_FLAGS_EXTFLG; - if (xfs_sb_version_hasattr(sbp)) + if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; - if (xfs_sb_version_hasquota(sbp)) + if (xfs_has_quota(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA; - if (xfs_sb_version_hasalign(sbp)) + if (xfs_has_align(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; - if (xfs_sb_version_hasdalign(sbp)) + if (xfs_has_dalign(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; - if (xfs_sb_version_hassector(sbp)) - geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; - if (xfs_sb_version_hasasciici(sbp)) + if (xfs_has_asciici(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; - if (xfs_sb_version_haslazysbcount(sbp)) + if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; - if (xfs_sb_version_hasattr2(sbp)) + if (xfs_has_attr2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; - if (xfs_sb_version_hasprojid32bit(sbp)) + if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; - if (xfs_sb_version_hascrc(sbp)) + if (xfs_has_crc(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB; - if (xfs_sb_version_hasftype(sbp)) + if (xfs_has_ftype(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE; - if (xfs_sb_version_hasfinobt(sbp)) + if (xfs_has_finobt(mp)) 
geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT; - if (xfs_sb_version_hassparseinodes(sbp)) + if (xfs_has_sparseinodes(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES; - if (xfs_sb_version_hasrmapbt(sbp)) + if (xfs_has_rmapbt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; - if (xfs_sb_version_hasreflink(sbp)) + if (xfs_has_reflink(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; - if (xfs_sb_version_hasbigtime(sbp)) + if (xfs_has_bigtime(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; - if (xfs_sb_version_hasinobtcounts(sbp)) + if (xfs_has_inobtcounts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; - if (xfs_sb_version_hassector(sbp)) + if (xfs_has_sector(mp)) { + geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; - else + } else { geo->logsectsize = BBSIZE; + } geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); if (struct_version < 4) return; - if (xfs_sb_version_haslogv2(sbp)) + if (xfs_has_logv2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; geo->logsunit = sbp->sb_logsunit; diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 0c1602d9b53d..a5e14740ec9a 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -20,11 +20,13 @@ extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); +extern bool xfs_sb_good_version(struct xfs_sb *sbp); +extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp); extern int xfs_update_secondary_sbs(struct xfs_mount *mp); #define XFS_FS_GEOM_MAX_STRUCT_VER (4) -extern void xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, +extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version); extern int xfs_sb_read_secondary(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, diff 
--git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index 594bc447a7dd..f0b38f4aba80 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -42,7 +42,7 @@ xfs_symlink_hdr_set( { struct xfs_dsymlink_hdr *dsl = bp->b_addr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return 0; memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr)); @@ -51,7 +51,7 @@ xfs_symlink_hdr_set( dsl->sl_bytes = cpu_to_be32(size); uuid_copy(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid); dsl->sl_owner = cpu_to_be64(ino); - dsl->sl_blkno = cpu_to_be64(bp->b_bn); + dsl->sl_blkno = cpu_to_be64(xfs_buf_daddr(bp)); bp->b_ops = &xfs_symlink_buf_ops; return sizeof(struct xfs_dsymlink_hdr); @@ -89,13 +89,13 @@ xfs_symlink_verify( struct xfs_mount *mp = bp->b_mount; struct xfs_dsymlink_hdr *dsl = bp->b_addr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return __this_address; if (!xfs_verify_magic(bp, dsl->sl_magic)) return __this_address; if (!uuid_equal(&dsl->sl_uuid, &mp->m_sb.sb_meta_uuid)) return __this_address; - if (bp->b_bn != be64_to_cpu(dsl->sl_blkno)) + if (xfs_buf_daddr(bp) != be64_to_cpu(dsl->sl_blkno)) return __this_address; if (be32_to_cpu(dsl->sl_offset) + be32_to_cpu(dsl->sl_bytes) >= XFS_SYMLINK_MAXLEN) @@ -116,7 +116,7 @@ xfs_symlink_read_verify( xfs_failaddr_t fa; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; if (!xfs_buf_verify_cksum(bp, XFS_SYMLINK_CRC_OFF)) @@ -137,7 +137,7 @@ xfs_symlink_write_verify( xfs_failaddr_t fa; /* no verification of non-crc buffers */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; fa = xfs_symlink_verify(bp); @@ -173,7 +173,7 @@ xfs_symlink_local_to_remote( xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF); - if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_has_crc(mp)) { bp->b_ops = NULL; memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); xfs_trans_log_buf(tp, 
bp, 0, ifp->if_bytes - 1); diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index 16f723ebe8dd..8b5547073379 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -136,7 +136,7 @@ xfs_trans_log_inode( * to upgrade this inode to bigtime format, do so now. */ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && - xfs_sb_version_hasbigtime(&ip->i_mount->m_sb) && + xfs_has_bigtime(ip->i_mount) && !xfs_inode_has_bigtime(ip)) { ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; flags |= XFS_ILOG_CORE; diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index d1a0848cb52e..5e300daa2559 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -71,9 +71,9 @@ xfs_allocfree_log_count( uint blocks; blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) blocks += num_ops * (2 * mp->m_refc_maxlevels - 1); return blocks; @@ -155,7 +155,7 @@ STATIC uint xfs_calc_finobt_res( struct xfs_mount *mp) { - if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + if (!xfs_has_finobt(mp)) return 0; return xfs_calc_inobt_res(mp); @@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res( XFS_FSB_TO_B(mp, 1)); if (alloc) { /* icreate tx uses ordered buffers */ - if (xfs_sb_version_has_v3inode(&mp->m_sb)) + if (xfs_has_v3inodes(mp)) return res; size = XFS_FSB_TO_B(mp, 1); } @@ -268,7 +268,7 @@ xfs_calc_write_reservation( xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); - if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + if (xfs_has_realtime(mp)) { t2 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) + @@ -317,7 +317,7 @@ xfs_calc_itruncate_reservation( t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + 
xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz); - if (xfs_sb_version_hasrealtime(&mp->m_sb)) { + if (xfs_has_realtime(mp)) { t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) + xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz); @@ -799,29 +799,6 @@ xfs_calc_qm_dqalloc_reservation( } /* - * Turning off quotas. - * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 - * the superblock for the quota flags: sector size - */ -STATIC uint -xfs_calc_qm_quotaoff_reservation( - struct xfs_mount *mp) -{ - return sizeof(struct xfs_qoff_logitem) * 2 + - xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); -} - -/* - * End of turning off quotas. - * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2 - */ -STATIC uint -xfs_calc_qm_quotaoff_end_reservation(void) -{ - return sizeof(struct xfs_qoff_logitem) * 2; -} - -/* * Syncing the incore super block changes to disk. * the super block to reflect the changes: sector size */ @@ -842,14 +819,14 @@ xfs_trans_resv_calc( * require a permanent reservation on space. 
*/ resp->tr_write.tr_logres = xfs_calc_write_reservation(mp); - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; else resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT; resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp); - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT_REFLINK; else @@ -910,7 +887,7 @@ xfs_trans_resv_calc( resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES; resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp); - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK; else resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT; @@ -923,13 +900,6 @@ xfs_trans_resv_calc( resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(); resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT; - resp->tr_qm_quotaoff.tr_logres = xfs_calc_qm_quotaoff_reservation(mp); - resp->tr_qm_quotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; - - resp->tr_qm_equotaoff.tr_logres = - xfs_calc_qm_quotaoff_end_reservation(); - resp->tr_qm_equotaoff.tr_logcount = XFS_DEFAULT_LOG_COUNT; - resp->tr_sb.tr_logres = xfs_calc_sb_reservation(mp); resp->tr_sb.tr_logcount = XFS_DEFAULT_LOG_COUNT; diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 7241ab28cf84..fc4e9b369a3a 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -46,8 +46,6 @@ struct xfs_trans_resv { struct xfs_trans_res tr_growrtfree; /* grow realtime freeing */ struct xfs_trans_res tr_qm_setqlim; /* adjust quota limits */ struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ - struct xfs_trans_res tr_qm_quotaoff; /* turn quota off */ - struct xfs_trans_res tr_qm_equotaoff;/* end of turn quota off */ struct xfs_trans_res tr_sb; /* 
modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ }; diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 7ad3659c5d2a..50332be34388 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -57,8 +57,7 @@ XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK) #define XFS_IALLOC_SPACE_RES(mp) \ (M_IGEO(mp)->ialloc_blks + \ - ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \ - M_IGEO(mp)->inobt_maxlevels)) + ((xfs_has_finobt(mp) ? 2 : 1) * M_IGEO(mp)->inobt_maxlevels)) /* * Space reservation values for various transactions. @@ -94,8 +93,7 @@ #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) #define XFS_IFREE_SPACE_RES(mp) \ - (xfs_sb_version_hasfinobt(&mp->m_sb) ? \ - M_IGEO(mp)->inobt_maxlevels : 0) + (xfs_has_finobt(mp) ? M_IGEO(mp)->inobt_maxlevels : 0) #endif /* __XFS_TRANS_SPACE_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.c b/fs/xfs/libxfs/xfs_types.c index e8f4abee7892..e810d23f2d97 100644 --- a/fs/xfs/libxfs/xfs_types.c +++ b/fs/xfs/libxfs/xfs_types.c @@ -169,7 +169,7 @@ xfs_internal_inum( xfs_ino_t ino) { return ino == mp->m_sb.sb_rbmino || ino == mp->m_sb.sb_rsumino || - (xfs_sb_version_hasquota(&mp->m_sb) && + (xfs_has_quota(mp) && xfs_is_quota_inode(&mp->m_sb, ino)); } diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 0870ef6f933d..b6da06b40989 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -87,6 +87,11 @@ typedef void * xfs_failaddr_t; #define XFS_ATTR_FORK 1 #define XFS_COW_FORK 2 +#define XFS_WHICHFORK_STRINGS \ + { XFS_DATA_FORK, "data" }, \ + { XFS_ATTR_FORK, "attr" }, \ + { XFS_COW_FORK, "cow" } + /* * Min numbers of data/attr fork btree root pointers. 
*/ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index be1a7e1e65f7..ae3c9f6e2c69 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -36,7 +36,7 @@ xchk_superblock_xref( agbno = XFS_SB_BLOCK(mp); - error = xchk_ag_init(sc, agno, &sc->sa); + error = xchk_ag_init_existing(sc, agno, &sc->sa); if (!xchk_xref_process_error(sc, agno, agbno, &error)) return; @@ -63,6 +63,7 @@ xchk_superblock( struct xfs_mount *mp = sc->mp; struct xfs_buf *bp; struct xfs_dsb *sb; + struct xfs_perag *pag; xfs_agnumber_t agno; uint32_t v2_ok; __be32 features_mask; @@ -73,6 +74,15 @@ xchk_superblock( if (agno == 0) return 0; + /* + * Grab an active reference to the perag structure. If we can't get + * it, we're racing with something that's tearing down the AG, so + * signal that the AG no longer exists. + */ + pag = xfs_perag_get(mp, agno); + if (!pag) + return -ENOENT; + error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp); /* * The superblock verifier can return several different error codes @@ -92,7 +102,7 @@ xchk_superblock( break; } if (!xchk_process_error(sc, agno, XFS_SB_BLOCK(mp), &error)) - return error; + goto out_pag; sb = bp->b_addr; @@ -248,7 +258,7 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); } else { v2_ok = XFS_SB_VERSION2_OKBITS; - if (XFS_SB_VERSION_NUM(&mp->m_sb) >= XFS_SB_VERSION_5) + if (xfs_sb_is_v5(&mp->m_sb)) v2_ok |= XFS_SB_VERSION2_CRCBIT; if (!!(sb->sb_features2 & cpu_to_be32(~v2_ok))) @@ -273,7 +283,7 @@ xchk_superblock( (cpu_to_be32(mp->m_sb.sb_features2) & features_mask)) xchk_block_set_corrupt(sc, bp); - if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_has_crc(mp)) { /* all v5 fields must be zero */ if (memchr_inv(&sb->sb_features_compat, 0, sizeof(struct xfs_dsb) - @@ -324,7 +334,7 @@ xchk_superblock( /* Don't care about sb_lsn */ } - if (xfs_sb_version_hasmetauuid(&mp->m_sb)) { + if (xfs_has_metauuid(mp)) { /* The metadata UUID must be the same for all supers */ if (!uuid_equal(&sb->sb_meta_uuid, 
&mp->m_sb.sb_meta_uuid)) xchk_block_set_corrupt(sc, bp); @@ -336,7 +346,8 @@ xchk_superblock( xchk_block_set_corrupt(sc, bp); xchk_superblock_xref(sc, bp); - +out_pag: + xfs_perag_put(pag); return error; } @@ -346,7 +357,7 @@ xchk_superblock( STATIC int xchk_agf_record_bno_lengths( struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *rec, + const struct xfs_alloc_rec_incore *rec, void *priv) { xfs_extlen_t *blocks = priv; @@ -419,7 +430,7 @@ xchk_agf_xref_btreeblks( int error; /* agf_btreeblks didn't exist before lazysbcount */ - if (!xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) + if (!xfs_has_lazysbcount(sc->mp)) return; /* Check agf_rmap_blocks; set up for agf_btreeblks check */ @@ -438,7 +449,7 @@ xchk_agf_xref_btreeblks( * No rmap cursor; we can't xref if we have the rmapbt feature. * We also can't do it if we're missing the free space btree cursors. */ - if ((xfs_sb_version_hasrmapbt(&mp->m_sb) && !sc->sa.rmap_cur) || + if ((xfs_has_rmapbt(mp) && !sc->sa.rmap_cur) || !sc->sa.bno_cur || !sc->sa.cnt_cur) return; @@ -527,6 +538,7 @@ xchk_agf( xchk_buffer_recheck(sc, sc->sa.agf_bp); agf = sc->sa.agf_bp->b_addr; + pag = sc->sa.pag; /* Check the AG length */ eoag = be32_to_cpu(agf->agf_length); @@ -550,7 +562,7 @@ xchk_agf( if (level <= 0 || level > XFS_BTREE_MAXLEVELS) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (xfs_has_rmapbt(mp)) { agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]); if (!xfs_verify_agbno(mp, agno, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); @@ -560,7 +572,7 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); } - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { agbno = be32_to_cpu(agf->agf_refcount_root); if (!xfs_verify_agbno(mp, agno, agbno)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); @@ -582,15 +594,13 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); /* Do the incore counters match? 
*/ - pag = xfs_perag_get(mp, agno); if (pag->pagf_freeblks != be32_to_cpu(agf->agf_freeblks)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (pag->pagf_flcount != be32_to_cpu(agf->agf_flcount)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb) && + if (xfs_has_lazysbcount(sc->mp) && pag->pagf_btreeblks != be32_to_cpu(agf->agf_btreeblks)) xchk_block_set_corrupt(sc, sc->sa.agf_bp); - xfs_perag_put(pag); xchk_agf_xref(sc); out: @@ -630,7 +640,7 @@ xchk_agfl_block( { struct xchk_agfl_info *sai = priv; struct xfs_scrub *sc = sai->sc; - xfs_agnumber_t agno = sc->sa.agno; + xfs_agnumber_t agno = sc->sa.pag->pag_agno; if (xfs_verify_agbno(mp, agno, agbno) && sai->nr_entries < sai->sz_entries) @@ -787,7 +797,7 @@ xchk_agi_xref_fiblocks( xfs_agblock_t blocks; int error = 0; - if (!xfs_sb_version_hasinobtcounts(&sc->mp->m_sb)) + if (!xfs_has_inobtcounts(sc->mp)) return; if (sc->sa.ino_cur) { @@ -857,6 +867,7 @@ xchk_agi( xchk_buffer_recheck(sc, sc->sa.agi_bp); agi = sc->sa.agi_bp->b_addr; + pag = sc->sa.pag; /* Check the AG length */ eoag = be32_to_cpu(agi->agi_length); @@ -872,7 +883,7 @@ xchk_agi( if (level <= 0 || level > XFS_BTREE_MAXLEVELS) xchk_block_set_corrupt(sc, sc->sa.agi_bp); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + if (xfs_has_finobt(mp)) { agbno = be32_to_cpu(agi->agi_free_root); if (!xfs_verify_agbno(mp, agno, agbno)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); @@ -909,12 +920,10 @@ xchk_agi( xchk_block_set_corrupt(sc, sc->sa.agi_bp); /* Do the incore counters match? 
*/ - pag = xfs_perag_get(mp, agno); if (pag->pagi_count != be32_to_cpu(agi->agi_count)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); if (pag->pagi_freecount != be32_to_cpu(agi->agi_freecount)) xchk_block_set_corrupt(sc, sc->sa.agi_bp); - xfs_perag_put(pag); xchk_agi_xref(sc); out: diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index e95f8c98f0f7..0f8deee66f15 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -70,7 +70,7 @@ struct xrep_agf_allocbt { STATIC int xrep_agf_walk_allocbt( struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *rec, + const struct xfs_alloc_rec_incore *rec, void *priv) { struct xrep_agf_allocbt *raa = priv; @@ -94,7 +94,7 @@ xrep_agf_check_agfl_block( { struct xfs_scrub *sc = priv; - if (!xfs_verify_agbno(mp, sc->sa.agno, agbno)) + if (!xfs_verify_agbno(mp, sc->sa.pag->pag_agno, agbno)) return -EFSCORRUPTED; return 0; } @@ -164,7 +164,7 @@ xrep_agf_find_btrees( return -EFSCORRUPTED; /* We must find the refcountbt root if that feature is enabled. */ - if (xfs_sb_version_hasreflink(&sc->mp->m_sb) && + if (xfs_has_reflink(sc->mp) && !xrep_check_btree_root(sc, &fab[XREP_AGF_REFCOUNTBT])) return -EFSCORRUPTED; @@ -188,12 +188,13 @@ xrep_agf_init_header( memset(agf, 0, BBTOB(agf_bp->b_length)); agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(sc->sa.agno); - agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno)); + agf->agf_seqno = cpu_to_be32(sc->sa.pag->pag_agno); + agf->agf_length = cpu_to_be32(xfs_ag_block_count(mp, + sc->sa.pag->pag_agno)); agf->agf_flfirst = old_agf->agf_flfirst; agf->agf_fllast = old_agf->agf_fllast; agf->agf_flcount = old_agf->agf_flcount; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); /* Mark the incore AGF data stale until we're done fixing things. 
*/ @@ -223,7 +224,7 @@ xrep_agf_set_roots( agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(fab[XREP_AGF_RMAPBT].height); - if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) { + if (xfs_has_reflink(sc->mp)) { agf->agf_refcount_root = cpu_to_be32(fab[XREP_AGF_REFCOUNTBT].root); agf->agf_refcount_level = @@ -280,7 +281,7 @@ xrep_agf_calc_from_btrees( agf->agf_btreeblks = cpu_to_be32(btreeblks); /* Update the AGF counters from the refcountbt. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { cur = xfs_refcountbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag); error = xfs_btree_count_blocks(cur, &blocks); @@ -363,16 +364,16 @@ xrep_agf( int error; /* We require the rmapbt to rebuild anything. */ - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; - xchk_perag_get(sc->mp, &sc->sa); /* * Make sure we have the AGF buffer, as scrub might have decided it * was corrupt after xfs_alloc_read_agf failed with -EFSCORRUPTED. */ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGF_DADDR(mp)), + XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AGF_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agf_bp, NULL); if (error) return error; @@ -388,7 +389,7 @@ xrep_agf( * btrees rooted in the AGF. If the AGFL contents are obviously bad * then we'll bail out. 
*/ - error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.agno, &agfl_bp); + error = xfs_alloc_read_agfl(mp, sc->tp, sc->sa.pag->pag_agno, &agfl_bp); if (error) return error; @@ -442,7 +443,7 @@ struct xrep_agfl { STATIC int xrep_agfl_walk_rmap( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xrep_agfl *ra = priv; @@ -586,7 +587,7 @@ xrep_agfl_init_header( agfl = XFS_BUF_TO_AGFL(agfl_bp); memset(agfl, 0xFF, BBTOB(agfl_bp->b_length)); agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); - agfl->agfl_seqno = cpu_to_be32(sc->sa.agno); + agfl->agfl_seqno = cpu_to_be32(sc->sa.pag->pag_agno); uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); /* @@ -599,7 +600,8 @@ xrep_agfl_init_header( for_each_xbitmap_extent(br, n, agfl_extents) { agbno = XFS_FSB_TO_AGBNO(mp, br->start); - trace_xrep_agfl_insert(mp, sc->sa.agno, agbno, br->len); + trace_xrep_agfl_insert(mp, sc->sa.pag->pag_agno, agbno, + br->len); while (br->len > 0 && fl_off < flcount) { agfl_bno[fl_off] = cpu_to_be32(agbno); @@ -638,10 +640,9 @@ xrep_agfl( int error; /* We require the rmapbt to rebuild anything. */ - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; - xchk_perag_get(sc->mp, &sc->sa); xbitmap_init(&agfl_extents); /* @@ -649,7 +650,8 @@ xrep_agfl( * nothing wrong with the AGF, but all the AG header repair functions * have this chicken-and-egg problem. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); + error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, + &agf_bp); if (error) return error; @@ -658,7 +660,8 @@ xrep_agfl( * was corrupt after xfs_alloc_read_agfl failed with -EFSCORRUPTED. 
*/ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGFL_DADDR(mp)), + XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AGFL_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agfl_bp, NULL); if (error) return error; @@ -723,7 +726,8 @@ xrep_agi_find_btrees( int error; /* Read the AGF. */ - error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.agno, 0, &agf_bp); + error = xfs_alloc_read_agf(mp, sc->tp, sc->sa.pag->pag_agno, 0, + &agf_bp); if (error) return error; @@ -737,7 +741,7 @@ xrep_agi_find_btrees( return -EFSCORRUPTED; /* We must find the finobt root if that feature is enabled. */ - if (xfs_sb_version_hasfinobt(&mp->m_sb) && + if (xfs_has_finobt(mp) && !xrep_check_btree_root(sc, &fab[XREP_AGI_FINOBT])) return -EFSCORRUPTED; @@ -761,11 +765,12 @@ xrep_agi_init_header( memset(agi, 0, BBTOB(agi_bp->b_length)); agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(sc->sa.agno); - agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, sc->sa.agno)); + agi->agi_seqno = cpu_to_be32(sc->sa.pag->pag_agno); + agi->agi_length = cpu_to_be32(xfs_ag_block_count(mp, + sc->sa.pag->pag_agno)); agi->agi_newino = cpu_to_be32(NULLAGINO); agi->agi_dirino = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); /* We don't know how to fix the unlinked list yet. 
*/ @@ -787,7 +792,7 @@ xrep_agi_set_roots( agi->agi_root = cpu_to_be32(fab[XREP_AGI_INOBT].root); agi->agi_level = cpu_to_be32(fab[XREP_AGI_INOBT].height); - if (xfs_sb_version_hasfinobt(&sc->mp->m_sb)) { + if (xfs_has_finobt(sc->mp)) { agi->agi_free_root = cpu_to_be32(fab[XREP_AGI_FINOBT].root); agi->agi_free_level = cpu_to_be32(fab[XREP_AGI_FINOBT].height); } @@ -811,7 +816,7 @@ xrep_agi_calc_from_btrees( error = xfs_ialloc_count_inodes(cur, &count, &freecount); if (error) goto err; - if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + if (xfs_has_inobtcounts(mp)) { xfs_agblock_t blocks; error = xfs_btree_count_blocks(cur, &blocks); @@ -824,8 +829,7 @@ xrep_agi_calc_from_btrees( agi->agi_count = cpu_to_be32(count); agi->agi_freecount = cpu_to_be32(freecount); - if (xfs_sb_version_hasfinobt(&mp->m_sb) && - xfs_sb_version_hasinobtcounts(&mp->m_sb)) { + if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) { xfs_agblock_t blocks; cur = xfs_inobt_init_cursor(mp, sc->tp, agi_bp, @@ -893,16 +897,16 @@ xrep_agi( int error; /* We require the rmapbt to rebuild anything. */ - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return -EOPNOTSUPP; - xchk_perag_get(sc->mp, &sc->sa); /* * Make sure we have the AGI buffer, as scrub might have decided it * was corrupt after xfs_ialloc_read_agi failed with -EFSCORRUPTED. 
*/ error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AG_DADDR(mp, sc->sa.agno, XFS_AGI_DADDR(mp)), + XFS_AG_DADDR(mp, sc->sa.pag->pag_agno, + XFS_AGI_DADDR(mp)), XFS_FSS_TO_BB(mp, 1), 0, &agi_bp, NULL); if (error) return error; diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index d5741980094a..87518e1292f8 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -91,7 +91,7 @@ xchk_allocbt_xref( STATIC int xchk_allocbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agnumber_t agno = bs->cur->bc_ag.pag->pag_agno; diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 552af0cf8482..b6f0c9f3f124 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -25,11 +25,11 @@ * reallocating the buffer if necessary. Buffer contents are not preserved * across a reallocation. */ -int +static int xchk_setup_xattr_buf( struct xfs_scrub *sc, size_t value_size, - xfs_km_flags_t flags) + gfp_t flags) { size_t sz; struct xchk_xattr_buf *ab = sc->buf; @@ -57,7 +57,7 @@ xchk_setup_xattr_buf( * Don't zero the buffer upon allocation to avoid runtime overhead. * All users must be careful never to read uninitialized contents. */ - ab = kmem_alloc_large(sizeof(*ab) + sz, flags); + ab = kvmalloc(sizeof(*ab) + sz, flags); if (!ab) return -ENOMEM; @@ -79,7 +79,7 @@ xchk_setup_xattr( * without the inode lock held, which means we can sleep. */ if (sc->flags & XCHK_TRY_HARDER) { - error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, 0); + error = xchk_setup_xattr_buf(sc, XATTR_SIZE_MAX, GFP_KERNEL); if (error) return error; } @@ -138,7 +138,8 @@ xchk_xattr_listent( * doesn't work, we overload the seen_enough variable to convey * the error message back to the main scrub function. 
*/ - error = xchk_setup_xattr_buf(sx->sc, valuelen, KM_MAYFAIL); + error = xchk_setup_xattr_buf(sx->sc, valuelen, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (error == -ENOMEM) error = -EDEADLOCK; if (error) { @@ -323,7 +324,8 @@ xchk_xattr_block( return 0; /* Allocate memory for block usage checking. */ - error = xchk_setup_xattr_buf(ds->sc, 0, KM_MAYFAIL); + error = xchk_setup_xattr_buf(ds->sc, 0, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (error == -ENOMEM) return -EDEADLOCK; if (error) @@ -334,7 +336,7 @@ xchk_xattr_block( bitmap_zero(usedmap, mp->m_attr_geo->blksize); /* Check all the padding. */ - if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb)) { + if (xfs_has_crc(ds->sc->mp)) { struct xfs_attr3_leafblock *leaf = bp->b_addr; if (leaf->hdr.pad1 != 0 || leaf->hdr.pad2 != 0 || diff --git a/fs/xfs/scrub/attr.h b/fs/xfs/scrub/attr.h index 13a1d2e8424d..1719e1c4da59 100644 --- a/fs/xfs/scrub/attr.h +++ b/fs/xfs/scrub/attr.h @@ -65,7 +65,4 @@ xchk_xattr_dstmap( BITS_TO_LONGS(sc->mp->m_attr_geo->blksize); } -int xchk_setup_xattr_buf(struct xfs_scrub *sc, size_t value_size, - xfs_km_flags_t flags); - #endif /* __XFS_SCRUB_ATTR_H__ */ diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index 813b5f219113..d6d24c866bc4 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -260,7 +260,7 @@ xbitmap_set_btcur_path( xfs_btree_get_block(cur, i, &bp); if (!bp) continue; - fsb = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); + fsb = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); error = xbitmap_set(bitmap, fsb, 1); if (error) return error; @@ -284,7 +284,7 @@ xbitmap_collect_btblock( if (!bp) return 0; - fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, bp->b_bn); + fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp)); return xbitmap_set(bitmap, fsbno, 1); } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 1d146c9d9de1..017da9ceaee9 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -260,10 +260,10 @@ xchk_bmap_iextent_xref( agbno = XFS_FSB_TO_AGBNO(mp, 
irec->br_startblock); len = irec->br_blockcount; - error = xchk_ag_init(info->sc, agno, &info->sc->sa); + error = xchk_ag_init_existing(info->sc, agno, &info->sc->sa); if (!xchk_fblock_process_error(info->sc, info->whichfork, irec->br_startoff, &error)) - return; + goto out_free; xchk_xref_is_used_space(info->sc, agbno, len); xchk_xref_is_not_inode_chunk(info->sc, agbno, len); @@ -283,6 +283,7 @@ xchk_bmap_iextent_xref( break; } +out_free: xchk_ag_free(info->sc, &info->sc->sa); } @@ -383,7 +384,7 @@ xchk_bmap_iextent( STATIC int xchk_bmapbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_bmbt_irec irec; struct xfs_bmbt_irec iext_irec; @@ -400,7 +401,7 @@ xchk_bmapbt_rec( * Check the owners of the btree blocks up to the level below * the root since the verifiers don't do that. */ - if (xfs_sb_version_hascrc(&bs->cur->bc_mp->m_sb) && + if (xfs_has_crc(bs->cur->bc_mp) && bs->cur->bc_ptrs[0] == 1) { for (i = 0; i < bs->cur->bc_nlevels - 1; i++) { block = xfs_btree_get_block(bs->cur, i, &bp); @@ -473,10 +474,11 @@ struct xchk_bmap_check_rmap_info { STATIC int xchk_bmap_check_rmap( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xfs_bmbt_irec irec; + struct xfs_rmap_irec check_rec; struct xchk_bmap_check_rmap_info *sbcri = priv; struct xfs_ifork *ifp; struct xfs_scrub *sc = sbcri->sc; @@ -510,28 +512,30 @@ xchk_bmap_check_rmap( * length, so we have to loop through the bmbt to make sure that the * entire rmap is covered by bmbt records. 
*/ + check_rec = *rec; while (have_map) { - if (irec.br_startoff != rec->rm_offset) + if (irec.br_startoff != check_rec.rm_offset) xchk_fblock_set_corrupt(sc, sbcri->whichfork, - rec->rm_offset); + check_rec.rm_offset); if (irec.br_startblock != XFS_AGB_TO_FSB(sc->mp, - cur->bc_ag.pag->pag_agno, rec->rm_startblock)) + cur->bc_ag.pag->pag_agno, + check_rec.rm_startblock)) xchk_fblock_set_corrupt(sc, sbcri->whichfork, - rec->rm_offset); - if (irec.br_blockcount > rec->rm_blockcount) + check_rec.rm_offset); + if (irec.br_blockcount > check_rec.rm_blockcount) xchk_fblock_set_corrupt(sc, sbcri->whichfork, - rec->rm_offset); + check_rec.rm_offset); if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) break; - rec->rm_startblock += irec.br_blockcount; - rec->rm_offset += irec.br_blockcount; - rec->rm_blockcount -= irec.br_blockcount; - if (rec->rm_blockcount == 0) + check_rec.rm_startblock += irec.br_blockcount; + check_rec.rm_offset += irec.br_blockcount; + check_rec.rm_blockcount -= irec.br_blockcount; + if (check_rec.rm_blockcount == 0) break; have_map = xfs_iext_next_extent(ifp, &sbcri->icur, &irec); if (!have_map) xchk_fblock_set_corrupt(sc, sbcri->whichfork, - rec->rm_offset); + check_rec.rm_offset); } out: @@ -581,7 +585,7 @@ xchk_bmap_check_rmaps( bool zero_size; int error; - if (!xfs_sb_version_hasrmapbt(&sc->mp->m_sb) || + if (!xfs_has_rmapbt(sc->mp) || whichfork == XFS_COW_FORK || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) return 0; @@ -659,8 +663,7 @@ xchk_bmap( } break; case XFS_ATTR_FORK: - if (!xfs_sb_version_hasattr(&mp->m_sb) && - !xfs_sb_version_hasattr2(&mp->m_sb)) + if (!xfs_has_attr(mp) && !xfs_has_attr2(mp)) xchk_ino_set_corrupt(sc, sc->ip->i_ino); break; default: diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index bd1172358964..eccb855dc904 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -374,10 +374,10 @@ xchk_btree_check_block_owner( init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS; if (init_sa) { - error = 
xchk_ag_init(bs->sc, agno, &bs->sc->sa); + error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); if (!xchk_btree_xref_process_error(bs->sc, bs->cur, level, &error)) - return error; + goto out_free; } xchk_xref_is_used_space(bs->sc, agbno, 1); @@ -393,6 +393,7 @@ xchk_btree_check_block_owner( if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP) bs->cur = NULL; +out_free: if (init_sa) xchk_ag_free(bs->sc, &bs->sc->sa); @@ -435,12 +436,12 @@ xchk_btree_check_owner( if (!co) return -ENOMEM; co->level = level; - co->daddr = XFS_BUF_ADDR(bp); + co->daddr = xfs_buf_daddr(bp); list_add_tail(&co->list, &bs->to_check); return 0; } - return xchk_btree_check_block_owner(bs, level, XFS_BUF_ADDR(bp)); + return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp)); } /* Decide if we want to check minrecs of a btree block in the inode root. */ diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index 5572e475f8ed..b7d2fc01fbf9 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -26,8 +26,8 @@ void xchk_btree_xref_set_corrupt(struct xfs_scrub *sc, struct xchk_btree; typedef int (*xchk_btree_rec_fn)( - struct xchk_btree *bs, - union xfs_btree_rec *rec); + struct xchk_btree *bs, + const union xfs_btree_rec *rec); struct xchk_btree { /* caller-provided scrub state */ diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 8558ca05e11d..bf1f3607d0b6 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -186,7 +186,7 @@ xchk_block_set_preen( struct xfs_buf *bp) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_PREEN; - trace_xchk_block_preen(sc, bp->b_bn, __return_address); + trace_xchk_block_preen(sc, xfs_buf_daddr(bp), __return_address); } /* @@ -219,7 +219,7 @@ xchk_block_set_corrupt( struct xfs_buf *bp) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; - trace_xchk_block_error(sc, bp->b_bn, __return_address); + trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); } /* Record a corruption while cross-referencing. 
*/ @@ -229,7 +229,7 @@ xchk_block_xref_set_corrupt( struct xfs_buf *bp) { sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XCORRUPT; - trace_xchk_block_error(sc, bp->b_bn, __return_address); + trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address); } /* @@ -324,7 +324,7 @@ struct xchk_rmap_ownedby_info { STATIC int xchk_count_rmap_ownedby_irec( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xchk_rmap_ownedby_info *sroi = priv; @@ -394,11 +394,11 @@ want_ag_read_header_failure( } /* - * Grab all the headers for an AG. + * Grab the perag structure and all the headers for an AG. * - * The headers should be released by xchk_ag_free, but as a fail - * safe we attach all the buffers we grab to the scrub transaction so - * they'll all be freed when we cancel it. + * The headers should be released by xchk_ag_free, but as a fail safe we attach + * all the buffers we grab to the scrub transaction so they'll all be freed + * when we cancel it. Returns ENOENT if we can't grab the perag structure. */ int xchk_ag_read_headers( @@ -409,22 +409,24 @@ xchk_ag_read_headers( struct xfs_mount *mp = sc->mp; int error; - sa->agno = agno; + ASSERT(!sa->pag); + sa->pag = xfs_perag_get(mp, agno); + if (!sa->pag) + return -ENOENT; error = xfs_ialloc_read_agi(mp, sc->tp, agno, &sa->agi_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGI)) - goto out; + return error; error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &sa->agf_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGF)) - goto out; + return error; error = xfs_alloc_read_agfl(mp, sc->tp, agno, &sa->agfl_bp); if (error && want_ag_read_header_failure(sc, XFS_SCRUB_TYPE_AGFL)) - goto out; - error = 0; -out: - return error; + return error; + + return 0; } /* Release all the AG btree cursors. 
*/ @@ -461,7 +463,6 @@ xchk_ag_btcur_init( { struct xfs_mount *mp = sc->mp; - xchk_perag_get(sc->mp, sa); if (sa->agf_bp && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) { /* Set up a bnobt cursor for cross-referencing. */ @@ -484,21 +485,21 @@ xchk_ag_btcur_init( } /* Set up a finobt cursor for cross-referencing. */ - if (sa->agi_bp && xfs_sb_version_hasfinobt(&mp->m_sb) && + if (sa->agi_bp && xfs_has_finobt(mp) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) { sa->fino_cur = xfs_inobt_init_cursor(mp, sc->tp, sa->agi_bp, sa->pag, XFS_BTNUM_FINO); } /* Set up a rmapbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_sb_version_hasrmapbt(&mp->m_sb) && + if (sa->agf_bp && xfs_has_rmapbt(mp) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) { sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp, sa->pag); } /* Set up a refcountbt cursor for cross-referencing. */ - if (sa->agf_bp && xfs_sb_version_hasreflink(&mp->m_sb) && + if (sa->agf_bp && xfs_has_reflink(mp) && xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) { sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp, sa->agf_bp, sa->pag); @@ -528,15 +529,14 @@ xchk_ag_free( xfs_perag_put(sa->pag); sa->pag = NULL; } - sa->agno = NULLAGNUMBER; } /* - * For scrub, grab the AGI and the AGF headers, in that order. Locking - * order requires us to get the AGI before the AGF. We use the - * transaction to avoid deadlocking on crosslinked metadata buffers; - * either the caller passes one in (bmap scrub) or we have to create a - * transaction ourselves. + * For scrub, grab the perag structure, the AGI, and the AGF headers, in that + * order. Locking order requires us to get the AGI before the AGF. We use the + * transaction to avoid deadlocking on crosslinked metadata buffers; either the + * caller passes one in (bmap scrub) or we have to create a transaction + * ourselves. Returns ENOENT if the perag struct cannot be grabbed. 
*/ int xchk_ag_init( @@ -554,19 +554,6 @@ xchk_ag_init( return 0; } -/* - * Grab the per-ag structure if we haven't already gotten it. Teardown of the - * xchk_ag will release it for us. - */ -void -xchk_perag_get( - struct xfs_mount *mp, - struct xchk_ag *sa) -{ - if (!sa->pag) - sa->pag = xfs_perag_get(mp, sa->agno); -} - /* Per-scrubber setup functions */ /* @@ -797,7 +784,7 @@ xchk_buffer_recheck( if (!fa) return; sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; - trace_xchk_block_error(sc, bp->b_bn, fa); + trace_xchk_block_error(sc, xfs_buf_daddr(bp), fa); } /* @@ -842,7 +829,7 @@ xchk_metadata_inode_forks( return error; /* Look for incorrect shared blocks. */ - if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) { + if (xfs_has_reflink(sc->mp)) { error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, &shared); if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, @@ -884,6 +871,7 @@ xchk_stop_reaping( { sc->flags |= XCHK_REAPING_DISABLED; xfs_blockgc_stop(sc->mp); + xfs_inodegc_stop(sc->mp); } /* Restart background reaping of resources. */ @@ -891,6 +879,13 @@ void xchk_start_reaping( struct xfs_scrub *sc) { - xfs_blockgc_start(sc->mp); + /* + * Readonly filesystems do not perform inactivation or speculative + * preallocation, so there's no need to restart the workers. + */ + if (!xfs_is_readonly(sc->mp)) { + xfs_inodegc_start(sc->mp); + xfs_blockgc_start(sc->mp); + } sc->flags &= ~XCHK_REAPING_DISABLED; } diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index 0410faf7d735..454145db10e7 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -107,7 +107,23 @@ int xchk_setup_fscounters(struct xfs_scrub *sc); void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa); int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); -void xchk_perag_get(struct xfs_mount *mp, struct xchk_ag *sa); + +/* + * Grab all AG resources, treating the inability to grab the perag structure as + * a fs corruption. 
This is intended for callers checking an ondisk reference + * to a given AG, which means that the AG must still exist. + */ +static inline int +xchk_ag_init_existing( + struct xfs_scrub *sc, + xfs_agnumber_t agno, + struct xchk_ag *sa) +{ + int error = xchk_ag_init(sc, agno, sa); + + return error == -ENOENT ? -EFSCORRUPTED : error; +} + int xchk_ag_read_headers(struct xfs_scrub *sc, xfs_agnumber_t agno, struct xchk_ag *sa); void xchk_ag_btcur_free(struct xchk_ag *sa); diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 9f0dbb47c82c..8a52514bc1ff 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -367,11 +367,11 @@ xchk_da_btree_block( pmaxrecs = &ds->maxrecs[level]; /* We only started zeroing the header on v5 filesystems. */ - if (xfs_sb_version_hascrc(&ds->sc->mp->m_sb) && hdr3->hdr.pad) + if (xfs_has_crc(ds->sc->mp) && hdr3->hdr.pad) xchk_da_set_corrupt(ds, level); /* Check the owner. */ - if (xfs_sb_version_hascrc(&ip->i_mount->m_sb)) { + if (xfs_has_crc(ip->i_mount)) { owner = be64_to_cpu(hdr3->owner); if (owner != ip->i_ino) xchk_da_set_corrupt(ds, level); diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 28dda391d5df..200a63f58fe7 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -51,7 +51,7 @@ xchk_dir_check_ftype( int ino_dtype; int error = 0; - if (!xfs_sb_version_hasftype(&mp->m_sb)) { + if (!xfs_has_ftype(mp)) { if (dtype != DT_UNKNOWN && dtype != DT_DIR) xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); @@ -140,7 +140,7 @@ xchk_dir_actor( if (!strncmp(".", name, namelen)) { /* If this is "." then check that the inum matches the dir. */ - if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) + if (xfs_has_ftype(mp) && type != DT_DIR) xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); checked_ftype = true; @@ -152,7 +152,7 @@ xchk_dir_actor( * If this is ".." in the root inode, check that the inum * matches this dir. 
*/ - if (xfs_sb_version_hasftype(&mp->m_sb) && type != DT_DIR) + if (xfs_has_ftype(mp) && type != DT_DIR) xchk_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); checked_ftype = true; @@ -526,7 +526,7 @@ xchk_directory_leaf1_bestfree( bestcount = be32_to_cpu(ltp->bestcount); bestp = xfs_dir2_leaf_bests_p(ltp); - if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { + if (xfs_has_crc(sc->mp)) { struct xfs_dir3_leaf_hdr *hdr3 = bp->b_addr; if (hdr3->pad != cpu_to_be32(0)) @@ -623,7 +623,7 @@ xchk_directory_free_bestfree( return error; xchk_buffer_recheck(sc, bp); - if (xfs_sb_version_hascrc(&sc->mp->m_sb)) { + if (xfs_has_crc(sc->mp)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; if (hdr3->pad != cpu_to_be32(0)) diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index fd7941e04ae1..48a6cbdf95d0 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -148,9 +148,9 @@ xchk_fscount_btreeblks( xfs_extlen_t blocks; int error; - error = xchk_ag_init(sc, agno, &sc->sa); + error = xchk_ag_init_existing(sc, agno, &sc->sa); if (error) - return error; + goto out_free; error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); if (error) @@ -207,7 +207,7 @@ retry: /* Add up the free/freelist/bnobt/cntbt blocks */ fsc->fdblocks += pag->pagf_freeblks; fsc->fdblocks += pag->pagf_flcount; - if (xfs_sb_version_haslazysbcount(&sc->mp->m_sb)) { + if (xfs_has_lazysbcount(sc->mp)) { fsc->fdblocks += pag->pagf_btreeblks; } else { error = xchk_fscount_btreeblks(sc, fsc, agno); diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 30e568596b79..00848ee542fb 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -418,7 +418,7 @@ xchk_iallocbt_rec_alignment( STATIC int xchk_iallocbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; struct xchk_iallocbt *iabt = bs->private; @@ -517,7 +517,7 @@ xchk_iallocbt_xref_rmap_btreeblks( int error; if (!sc->sa.ino_cur || 
!sc->sa.rmap_cur || - (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) || + (xfs_has_finobt(sc->mp) && !sc->sa.fino_cur) || xchk_skip_xref(sc->sm)) return; diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 76fbc7ca4cec..2405b09d03d0 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -181,7 +181,7 @@ xchk_inode_flags2( /* reflink flag requires reflink feature */ if ((flags2 & XFS_DIFLAG2_REFLINK) && - !xfs_sb_version_hasreflink(&mp->m_sb)) + !xfs_has_reflink(mp)) goto bad; /* cowextsize flag is checked w.r.t. mode separately */ @@ -199,8 +199,7 @@ xchk_inode_flags2( goto bad; /* no bigtime iflag without the bigtime feature */ - if (xfs_dinode_has_bigtime(dip) && - !xfs_sb_version_hasbigtime(&mp->m_sb)) + if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) goto bad; return; @@ -278,7 +277,7 @@ xchk_dinode( xchk_ino_set_corrupt(sc, ino); if (dip->di_projid_hi != 0 && - !xfs_sb_version_hasprojid32bit(&mp->m_sb)) + !xfs_has_projid32(mp)) xchk_ino_set_corrupt(sc, ino); break; default: @@ -532,9 +531,9 @@ xchk_inode_xref( agno = XFS_INO_TO_AGNO(sc->mp, ino); agbno = XFS_INO_TO_AGBNO(sc->mp, ino); - error = xchk_ag_init(sc, agno, &sc->sa); + error = xchk_ag_init_existing(sc, agno, &sc->sa); if (!xchk_xref_process_error(sc, agno, agbno, &error)) - return; + goto out_free; xchk_xref_is_used_space(sc, agbno, 1); xchk_inode_xref_finobt(sc, ino); @@ -542,6 +541,7 @@ xchk_inode_xref( xchk_xref_is_not_shared(sc, agbno, 1); xchk_inode_xref_bmap(sc, dip); +out_free: xchk_ag_free(sc, &sc->sa); } @@ -560,7 +560,7 @@ xchk_inode_check_reflink_iflag( bool has_shared; int error; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return; error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index acbb9839d42f..d6c1b00a4fc8 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -42,7 +42,7 @@ xchk_setup_quota( xfs_dqtype_t dqtype; int error; - if 
(!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp)) + if (!XFS_IS_QUOTA_ON(sc->mp)) return -ENOENT; dqtype = xchk_quota_to_dqtype(sc); @@ -127,7 +127,7 @@ xchk_quota_item( * a reflink filesystem we're allowed to exceed physical space * if there are no quota limits. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_dblocks < dq->q_blk.count) xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset); diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 7014b7408bad..2744eecdbaf0 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -91,7 +91,7 @@ struct xchk_refcnt_check { STATIC int xchk_refcountbt_rmap_check( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xchk_refcnt_check *refchk = priv; @@ -330,7 +330,7 @@ xchk_refcountbt_xref( STATIC int xchk_refcountbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; xfs_agblock_t *cow_blocks = bs->private; diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index ebb0e245aa72..8f3cba14ada3 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -248,19 +248,19 @@ xrep_calc_ag_resblks( * bnobt/cntbt or inobt/finobt as pairs. */ bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen); - if (xfs_sb_version_hassparseinodes(&mp->m_sb)) + if (xfs_has_sparseinodes(mp)) inobt_sz = xfs_iallocbt_calc_size(mp, icount / XFS_INODES_PER_HOLEMASK_BIT); else inobt_sz = xfs_iallocbt_calc_size(mp, icount / XFS_INODES_PER_CHUNK); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) + if (xfs_has_finobt(mp)) inobt_sz *= 2; - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen); else refcbt_sz = 0; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (xfs_has_rmapbt(mp)) { /* * Guess how many blocks we need to rebuild the rmapbt. 
* For non-reflink filesystems we can't have more records than @@ -269,7 +269,7 @@ xrep_calc_ag_resblks( * many rmaps there could be in the AG, so we start off with * what we hope is an generous over-estimation. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_reflink(mp)) rmapbt_sz = xfs_rmapbt_calc_size(mp, (unsigned long long)aglen * 2); else @@ -306,9 +306,9 @@ xrep_alloc_ag_block( return -ENOSPC; xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false); - *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno); + *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno); if (resv == XFS_AG_RESV_RMAPBT) - xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno); + xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno); return 0; default: break; @@ -317,7 +317,7 @@ xrep_alloc_ag_block( args.tp = sc->tp; args.mp = sc->mp; args.oinfo = *oinfo; - args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0); + args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0); args.minlen = 1; args.maxlen = 1; args.prod = 1; @@ -352,14 +352,14 @@ xrep_init_btblock( trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), XFS_FSB_TO_AGBNO(mp, fsb), btnum); - ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno); + ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0, &bp); if (error) return error; xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno); + xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1); bp->b_ops = ops; @@ -481,7 +481,7 @@ xrep_fix_freelist( args.mp = sc->mp; args.tp = sc->tp; - args.agno = sc->sa.agno; + args.agno = sc->sa.pag->pag_agno; args.alignment = 1; args.pag = sc->sa.pag; @@ -611,11 +611,11 @@ xrep_reap_extents( xfs_fsblock_t fsbno; int error = 0; - ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); + 
ASSERT(xfs_has_rmapbt(sc->mp)); for_each_xbitmap_block(fsbno, bmr, n, bitmap) { ASSERT(sc->ip != NULL || - XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.agno); + XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno); trace_xrep_dispose_btree_extent(sc->mp, XFS_FSB_TO_AGNO(sc->mp, fsbno), XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1); @@ -690,7 +690,7 @@ xrep_findroot_block( int block_level; int error = 0; - daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno); + daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno); /* * Blocks in the AGFL have stale contents that might just happen to @@ -819,7 +819,7 @@ xrep_findroot_block( else fab->root = NULLAGBLOCK; - trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno, + trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno, be32_to_cpu(btblock->bb_magic), fab->height - 1); out: xfs_trans_brelse(ri->sc->tp, bp); @@ -833,7 +833,7 @@ out: STATIC int xrep_findroot_rmap( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xrep_findroot *ri = priv; diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index fc306573f0ac..8dae0345c7df 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -88,7 +88,7 @@ xchk_rmapbt_xref( STATIC int xchk_rmapbt_rec( struct xchk_btree *bs, - union xfs_btree_rec *rec) + const union xfs_btree_rec *rec) { struct xfs_mount *mp = bs->cur->bc_mp; struct xfs_rmap_irec irec; diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 37c0e2266c85..8fa012057405 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -41,7 +41,7 @@ xchk_setup_rt( STATIC int xchk_rtbitmap_rec( struct xfs_trans *tp, - struct xfs_rtalloc_rec *rec, + const struct xfs_rtalloc_rec *rec, void *priv) { struct xfs_scrub *sc = priv; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 0e542636227c..51e4c61916d2 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -239,21 +239,21 @@ static const struct xchk_meta_ops 
meta_scrub_ops[] = { .type = ST_PERAG, .setup = xchk_setup_ag_iallocbt, .scrub = xchk_finobt, - .has = xfs_sb_version_hasfinobt, + .has = xfs_has_finobt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, .setup = xchk_setup_ag_rmapbt, .scrub = xchk_rmapbt, - .has = xfs_sb_version_hasrmapbt, + .has = xfs_has_rmapbt, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ .type = ST_PERAG, .setup = xchk_setup_ag_refcountbt, .scrub = xchk_refcountbt, - .has = xfs_sb_version_hasreflink, + .has = xfs_has_reflink, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ @@ -308,14 +308,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { .type = ST_FS, .setup = xchk_setup_rt, .scrub = xchk_rtbitmap, - .has = xfs_sb_version_hasrealtime, + .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xchk_setup_rt, .scrub = xchk_rtsummary, - .has = xfs_sb_version_hasrealtime, + .has = xfs_has_realtime, .repair = xrep_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ @@ -383,7 +383,7 @@ xchk_validate_inputs( if (ops->setup == NULL || ops->scrub == NULL) goto out; /* Does this fs even support this type of metadata? */ - if (ops->has && !ops->has(&mp->m_sb)) + if (ops->has && !ops->has(mp)) goto out; error = -EINVAL; @@ -415,11 +415,11 @@ xchk_validate_inputs( */ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { error = -EOPNOTSUPP; - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) goto out; error = -EROFS; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) goto out; } @@ -464,9 +464,6 @@ xfs_scrub_metadata( struct xfs_scrub sc = { .file = file, .sm = sm, - .sa = { - .agno = NULLAGNUMBER, - }, }; struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; int error = 0; @@ -480,10 +477,10 @@ xfs_scrub_metadata( /* Forbidden if we are shut down or mounted norecovery. 
*/ error = -ESHUTDOWN; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) goto out; error = -ENOTRECOVERABLE; - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) goto out; error = xchk_validate_inputs(mp, sm); diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 08a483cb46e2..80e5026bba44 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -27,7 +27,7 @@ struct xchk_meta_ops { int (*repair)(struct xfs_scrub *); /* Decide if we even have this piece of metadata. */ - bool (*has)(struct xfs_sb *); + bool (*has)(struct xfs_mount *); /* type describing required/allowed inputs */ enum xchk_type type; @@ -35,7 +35,6 @@ struct xchk_meta_ops { /* Buffer pointers and btree cursors for an entire AG. */ struct xchk_ag { - xfs_agnumber_t agno; struct xfs_perag *pag; /* AG btree roots */ diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index 03882a605a3c..c0ef53fe6611 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -22,11 +22,11 @@ xchk_btree_cur_fsbno( int level) { if (level < cur->bc_nlevels && cur->bc_bufs[level]) - return XFS_DADDR_TO_FSB(cur->bc_mp, cur->bc_bufs[level]->b_bn); - else if (level == cur->bc_nlevels - 1 && - cur->bc_flags & XFS_BTREE_LONG_PTRS) + return XFS_DADDR_TO_FSB(cur->bc_mp, + xfs_buf_daddr(cur->bc_bufs[level])); + if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); - else if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) + if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0); return NULLFSBLOCK; } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index e46f5cef90da..a7bbb84f91a7 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -2,6 +2,10 @@ /* * Copyright (C) 2017 Oracle. All Rights Reserved. * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + * + * NOTE: none of these tracepoints shall be considered a stable kernel ABI + * as they can change at any time. See xfs_trace.h for documentation of + * specific units found in tracepoint output. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xfs_scrub @@ -79,6 +83,16 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); { XFS_SCRUB_TYPE_PQUOTA, "prjquota" }, \ { XFS_SCRUB_TYPE_FSCOUNTERS, "fscounters" } +#define XFS_SCRUB_FLAG_STRINGS \ + { XFS_SCRUB_IFLAG_REPAIR, "repair" }, \ + { XFS_SCRUB_OFLAG_CORRUPT, "corrupt" }, \ + { XFS_SCRUB_OFLAG_PREEN, "preen" }, \ + { XFS_SCRUB_OFLAG_XFAIL, "xfail" }, \ + { XFS_SCRUB_OFLAG_XCORRUPT, "xcorrupt" }, \ + { XFS_SCRUB_OFLAG_INCOMPLETE, "incomplete" }, \ + { XFS_SCRUB_OFLAG_WARNING, "warning" }, \ + { XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED, "norepair" } + DECLARE_EVENT_CLASS(xchk_class, TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm, int error), @@ -103,14 +117,14 @@ DECLARE_EVENT_CLASS(xchk_class, __entry->flags = sm->sm_flags; __entry->error = error; ), - TP_printk("dev %d:%d ino 0x%llx type %s agno %u inum %llu gen %u flags 0x%x error %d", + TP_printk("dev %d:%d ino 0x%llx type %s agno 0x%x inum 0x%llx gen 0x%x flags (%s) error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, __entry->inum, __entry->gen, - __entry->flags, + __print_flags(__entry->flags, "|", XFS_SCRUB_FLAG_STRINGS), __entry->error) ) #define DEFINE_SCRUB_EVENT(name) \ @@ -145,7 +159,7 @@ TRACE_EVENT(xchk_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, @@ -176,10 +190,10 @@ TRACE_EVENT(xchk_file_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - 
TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->whichfork, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->offset, __entry->error, @@ -193,29 +207,21 @@ DECLARE_EVENT_CLASS(xchk_block_error_class, __field(dev_t, dev) __field(unsigned int, type) __field(xfs_agnumber_t, agno) - __field(xfs_agblock_t, bno) + __field(xfs_agblock_t, agbno) __field(void *, ret_ip) ), TP_fast_assign( - xfs_fsblock_t fsbno; - xfs_agnumber_t agno; - xfs_agblock_t bno; - - fsbno = XFS_DADDR_TO_FSB(sc->mp, daddr); - agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); - bno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); - __entry->dev = sc->mp->m_super->s_dev; __entry->type = sc->sm->sm_type; - __entry->agno = agno; - __entry->bno = bno; + __entry->agno = xfs_daddr_to_agno(sc->mp, daddr); + __entry->agbno = xfs_daddr_to_agbno(sc->mp, daddr); __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d type %s agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->agno, - __entry->bno, + __entry->agbno, __entry->ret_ip) ) @@ -281,10 +287,10 @@ DECLARE_EVENT_CLASS(xchk_fblock_error_class, __entry->offset = offset; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %s offset %llu ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s fileoff 0x%llx ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->whichfork, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __entry->offset, __entry->ret_ip) @@ -346,7 +352,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->error = error; __entry->ret_ip = 
ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), @@ -389,10 +395,10 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->error = error; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u error %d ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->whichfork, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, @@ -428,7 +434,7 @@ TRACE_EVENT(xchk_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), @@ -468,10 +474,10 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->ptr = cur->bc_ptrs[level]; __entry->ret_ip = ret_ip; ), - TP_printk("dev %d:%d ino 0x%llx fork %d type %s btree %s level %d ptr %d agno %u agbno %u ret_ip %pS", + TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, - __entry->whichfork, + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, 
XFS_BTNUM_STRINGS), __entry->level, @@ -507,7 +513,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_ptrs[level]; ), - TP_printk("dev %d:%d type %s btree %s agno %u agbno %u level %d nlevels %d ptr %d", + TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), @@ -580,7 +586,7 @@ TRACE_EVENT(xchk_iallocbt_check_cluster, __entry->holemask = holemask; __entry->cluster_ino = cluster_ino; ), - TP_printk("dev %d:%d agno %d startino %u daddr 0x%llx len %d chunkino %u nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino %u", + TP_printk("dev %d:%d agno 0x%x startino 0x%x daddr 0x%llx bbcount 0x%x chunkino 0x%x nr_inodes %u cluster_mask 0x%x holemask 0x%x cluster_ino 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startino, @@ -670,7 +676,7 @@ DECLARE_EVENT_CLASS(xrep_extent_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno %u agbno %u len %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -707,7 +713,7 @@ DECLARE_EVENT_CLASS(xrep_rmap_class, __entry->offset = offset; __entry->flags = flags; ), - TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -745,7 +751,7 @@ TRACE_EVENT(xrep_refcount_extent_fn, __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, 
@@ -769,7 +775,7 @@ TRACE_EVENT(xrep_init_btblock, __entry->agbno = agbno; __entry->btnum = btnum; ), - TP_printk("dev %d:%d agno %u agbno %u btree %s", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x btree %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -793,7 +799,7 @@ TRACE_EVENT(xrep_findroot_block, __entry->magic = magic; __entry->level = level; ), - TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x magic 0x%x level %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -821,7 +827,7 @@ TRACE_EVENT(xrep_calc_ag_resblks, __entry->freelen = freelen; __entry->usedlen = usedlen; ), - TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u", + TP_printk("dev %d:%d agno 0x%x icount %u aglen %u freelen %u usedlen %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->icount, @@ -850,7 +856,7 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize, __entry->rmapbt_sz = rmapbt_sz; __entry->refcbt_sz = refcbt_sz; ), - TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u", + TP_printk("dev %d:%d agno 0x%x bnobt %u inobt %u rmapbt %u refcountbt %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->bnobt_sz, @@ -894,7 +900,7 @@ TRACE_EVENT(xrep_ialloc_insert, __entry->freecount = freecount; __entry->freemask = freemask; ), - TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx", + TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startino, diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index d02bef24b32b..5c52ee869272 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -125,7 +125,7 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl) } struct posix_acl * -xfs_get_acl(struct inode *inode, int type) +xfs_get_acl(struct inode *inode, int type, bool 
rcu) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -137,6 +137,9 @@ xfs_get_acl(struct inode *inode, int type) }; int error; + if (rcu) + return ERR_PTR(-ECHILD); + trace_xfs_get_acl(ip); switch (type) { @@ -232,7 +235,7 @@ xfs_acl_set_mode( inode->i_ctime = current_time(inode); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 7bdb3a4ed798..bb6abdcb265d 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -10,13 +10,13 @@ struct inode; struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL -extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); +extern struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu); extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); #else -static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) +static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type, bool rcu) { return NULL; } diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index cb4e0fcf4c76..34fc6148032a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -97,7 +97,7 @@ xfs_end_ioend( /* * Just clean up the in-memory structures if the fs has been shut down. 
*/ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (xfs_is_shutdown(ip->i_mount)) { error = -EIO; goto done; } @@ -260,7 +260,7 @@ xfs_map_blocks( int retries = 0; int error = 0; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* @@ -440,7 +440,7 @@ xfs_discard_page( xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff); int error; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) goto out_invalidate; xfs_alert_ratelimited(mp, @@ -449,7 +449,7 @@ xfs_discard_page( error = xfs_bmap_punch_delalloc_range(ip, start_fsb, i_blocks_per_page(inode, page) - pageoff_fsb); - if (error && !XFS_FORCED_SHUTDOWN(mp)) + if (error && !xfs_is_shutdown(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff); @@ -462,22 +462,6 @@ static const struct iomap_writeback_ops xfs_writeback_ops = { }; STATIC int -xfs_vm_writepage( - struct page *page, - struct writeback_control *wbc) -{ - struct xfs_writepage_ctx wpc = { }; - - if (WARN_ON_ONCE(current->journal_info)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops); -} - -STATIC int xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) @@ -559,7 +543,6 @@ xfs_iomap_swapfile_activate( const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readahead = xfs_vm_readahead, - .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, .set_page_dirty = __set_page_dirty_nobuffers, .releasepage = iomap_releasepage, diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index aaa7e66c42d7..2b5da6218977 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -151,7 +151,7 @@ xfs_attr3_node_inactive( } xfs_da3_node_hdr_from_disk(dp->i_mount, &ichdr, bp->b_addr); - parent_blkno = bp->b_bn; + parent_blkno = xfs_buf_daddr(bp); if 
(!ichdr.count) { xfs_trans_brelse(*trans, bp); return 0; @@ -177,7 +177,7 @@ xfs_attr3_node_inactive( return error; /* save for re-read later */ - child_blkno = XFS_BUF_ADDR(child_bp); + child_blkno = xfs_buf_daddr(child_bp); /* * Invalidate the subtree, however we have to. @@ -271,7 +271,7 @@ xfs_attr3_root_inactive( error = xfs_da3_node_read(*trans, dp, 0, &bp, XFS_ATTR_FORK); if (error) return error; - blkno = bp->b_bn; + blkno = xfs_buf_daddr(bp); /* * Invalidate the tree, even if the "tree" is only a single leaf block. diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 25dcc98d50e6..2d1e5134cebe 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -529,7 +529,7 @@ xfs_attr_list( XFS_STATS_INC(dp->i_mount, xs_attr_list); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + if (xfs_is_shutdown(dp->i_mount)) return -EIO; lock_mode = xfs_ilock_attr_map_shared(dp); diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index e3a691937e92..03159970133f 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -24,7 +24,6 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" -#include "xfs_quota.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; @@ -487,18 +486,10 @@ xfs_bui_item_recover( XFS_ATTR_FORK : XFS_DATA_FORK; bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - /* Grab the inode. */ - error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip); + error = xlog_recover_iget(mp, bmap->me_owner, &ip); if (error) return error; - error = xfs_qm_dqattach(ip); - if (error) - goto err_rele; - - if (VFS_I(ip)->i_nlink == 0) - xfs_iflags_set(ip, XFS_IRECOVERY); - /* Allocate transaction and do the work. 
*/ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); @@ -522,6 +513,9 @@ xfs_bui_item_recover( error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, whichfork, bmap->me_startoff, bmap->me_startblock, &count, state); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bmap, + sizeof(*bmap)); if (error) goto err_cancel; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1cd3f940fa6a..73a36b7be3bd 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -731,7 +731,7 @@ xfs_free_eofblocks( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); return error; } @@ -789,7 +789,7 @@ xfs_alloc_file_space( trace_xfs_alloc_file_space(ip); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_qm_dqattach(ip); @@ -1282,7 +1282,7 @@ xfs_swap_extents_check_format( * If we have to use the (expensive) rmap swap method, we can * handle any number of extents and any format. */ - if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb)) + if (xfs_has_rmapbt(ip->i_mount)) return 0; /* @@ -1516,7 +1516,7 @@ xfs_swap_extent_forks( * event of a crash. Set the owner change log flags now and leave the * bmbt scan as the last step. 
*/ - if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (xfs_has_v3inodes(ip->i_mount)) { if (ip->i_df.if_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; if (tip->i_df.if_format == XFS_DINODE_FMT_BTREE) @@ -1553,7 +1553,7 @@ xfs_swap_extent_forks( (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: - ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || + ASSERT(!xfs_has_v3inodes(ip->i_mount) || (*src_log_flags & XFS_ILOG_DOWNER)); (*src_log_flags) |= XFS_ILOG_DBROOT; break; @@ -1565,7 +1565,7 @@ xfs_swap_extent_forks( break; case XFS_DINODE_FMT_BTREE: (*target_log_flags) |= XFS_ILOG_DBROOT; - ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) || + ASSERT(!xfs_has_v3inodes(ip->i_mount) || (*target_log_flags & XFS_ILOG_DOWNER)); break; } @@ -1678,7 +1678,7 @@ xfs_swap_extents( * a block reservation because it's really just a remap operation * performed with log redo items! */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + if (xfs_has_rmapbt(mp)) { int w = XFS_DATA_FORK; uint32_t ipnext = ip->i_df.if_nextents; uint32_t tipnext = tip->i_df.if_nextents; @@ -1759,7 +1759,7 @@ xfs_swap_extents( src_log_flags = XFS_ILOG_CORE; target_log_flags = XFS_ILOG_CORE; - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (xfs_has_rmapbt(mp)) error = xfs_swap_extent_rmap(&tp, ip, tip); else error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags, @@ -1778,7 +1778,7 @@ xfs_swap_extents( } /* Swap the cow forks. */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { ASSERT(!ip->i_cowfp || ip->i_cowfp->if_format == XFS_DINODE_FMT_EXTENTS); ASSERT(!tip->i_cowfp || @@ -1820,7 +1820,7 @@ xfs_swap_extents( * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. 
*/ - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 3ab73567a0f5..5fa6cd947dd4 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -251,7 +251,7 @@ _xfs_buf_alloc( return error; } - bp->b_bn = map[0].bm_bn; + bp->b_rhash_key = map[0].bm_bn; bp->b_length = 0; for (i = 0; i < nmaps; i++) { bp->b_maps[i].bm_bn = map[i].bm_bn; @@ -315,7 +315,6 @@ xfs_buf_alloc_kmem( struct xfs_buf *bp, xfs_buf_flags_t flags) { - int align_mask = xfs_buftarg_dma_alignment(bp->b_target); xfs_km_flags_t kmflag_mask = KM_NOFS; size_t size = BBTOB(bp->b_length); @@ -323,7 +322,7 @@ xfs_buf_alloc_kmem( if (!(flags & XBF_READ)) kmflag_mask |= KM_ZERO; - bp->b_addr = kmem_alloc_io(size, align_mask, kmflag_mask); + bp->b_addr = kmem_alloc(size, kmflag_mask); if (!bp->b_addr) return -ENOMEM; @@ -460,7 +459,7 @@ _xfs_buf_obj_cmp( */ BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); - if (bp->b_bn != map->bm_bn) + if (bp->b_rhash_key != map->bm_bn) return 1; if (unlikely(bp->b_length != map->bm_len)) { @@ -482,7 +481,7 @@ static const struct rhashtable_params xfs_buf_hash_params = { .min_size = 32, /* empty AGs have minimal footprint */ .nelem_hint = 16, .key_len = sizeof(xfs_daddr_t), - .key_offset = offsetof(struct xfs_buf, b_bn), + .key_offset = offsetof(struct xfs_buf, b_rhash_key), .head_offset = offsetof(struct xfs_buf, b_rhash_head), .automatic_shrinking = true, .obj_cmpfn = _xfs_buf_obj_cmp, @@ -814,7 +813,7 @@ xfs_buf_read_map( * buffer. */ if (error) { - if (!XFS_FORCED_SHUTDOWN(target->bt_mount)) + if (!xfs_is_shutdown(target->bt_mount)) xfs_buf_ioerror_alert(bp, fa); bp->b_flags &= ~XBF_DONE; @@ -854,7 +853,9 @@ xfs_buf_readahead_map( /* * Read an uncached buffer from disk. Allocates and returns a locked - * buffer containing the disk contents or nothing. + * buffer containing the disk contents or nothing. 
Uncached buffers always have + * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer + * is cached or uncached during fault diagnosis. */ int xfs_buf_read_uncached( @@ -876,7 +877,7 @@ xfs_buf_read_uncached( /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); - bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ + bp->b_rhash_key = XFS_BUF_DADDR_NULL; bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; bp->b_ops = ops; @@ -1145,7 +1146,7 @@ xfs_buf_ioerror_permanent( return true; /* At unmount we may treat errors differently */ - if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) + if (xfs_is_unmounting(mp) && mp->m_fail_unmount) return true; return false; @@ -1179,7 +1180,7 @@ xfs_buf_ioend_handle_error( * If we've already decided to shutdown the filesystem because of I/O * errors, there's no point in giving this a retry. */ - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) goto out_stale; xfs_buf_ioerror_alert_ratelimited(bp); @@ -1336,7 +1337,7 @@ xfs_buf_ioerror_alert( { xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error", "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d", - func, (uint64_t)XFS_BUF_ADDR(bp), + func, (uint64_t)xfs_buf_daddr(bp), bp->b_length, -bp->b_error); } @@ -1514,17 +1515,18 @@ _xfs_buf_ioapply( SHUTDOWN_CORRUPT_INCORE); return; } - } else if (bp->b_bn != XFS_BUF_DADDR_NULL) { + } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) { struct xfs_mount *mp = bp->b_mount; /* * non-crc filesystems don't attach verifiers during * log recovery, so don't warn for such filesystems. 
*/ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { xfs_warn(mp, "%s: no buf ops on daddr 0x%llx len %d", - __func__, bp->b_bn, bp->b_length); + __func__, xfs_buf_daddr(bp), + bp->b_length); xfs_hex_dump(bp->b_addr, XFS_CORRUPTION_DUMP_LEN); dump_stack(); @@ -1592,7 +1594,7 @@ __xfs_buf_submit( ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); /* on shutdown we stale and complete the buffer immediately */ - if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { + if (xfs_is_shutdown(bp->b_mount)) { xfs_buf_ioend_fail(bp); return -EIO; } @@ -1794,7 +1796,7 @@ xfs_buftarg_drain( xfs_buf_alert_ratelimited(bp, "XFS: Corruption Alert", "Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!", - (long long)bp->b_bn); + (long long)xfs_buf_daddr(bp)); } xfs_buf_rele(bp); } @@ -1809,7 +1811,7 @@ xfs_buftarg_drain( * down the fs. */ if (write_fail) { - ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount)); + ASSERT(xfs_is_shutdown(btp->bt_mount)); xfs_alert(btp->bt_mount, "Please run xfs_repair to determine the extent of the problem."); } @@ -2302,7 +2304,7 @@ xfs_verify_magic( struct xfs_mount *mp = bp->b_mount; int idx; - idx = xfs_sb_version_hascrc(&mp->m_sb); + idx = xfs_has_crc(mp); if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; @@ -2320,7 +2322,7 @@ xfs_verify_magic16( struct xfs_mount *mp = bp->b_mount; int idx; - idx = xfs_sb_version_hascrc(&mp->m_sb); + idx = xfs_has_crc(mp); if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 464dc548fa23..6b0200b8007d 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -133,7 +133,8 @@ struct xfs_buf { * fast-path on locking. 
*/ struct rhash_head b_rhash_head; /* pag buffer hash node */ - xfs_daddr_t b_bn; /* block number of buffer */ + + xfs_daddr_t b_rhash_key; /* buffer cache index */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ atomic_t b_lru_ref; /* lru reclaim ref count */ @@ -296,18 +297,10 @@ extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *); extern int xfs_buf_init(void); extern void xfs_buf_terminate(void); -/* - * These macros use the IO block map rather than b_bn. b_bn is now really - * just for the buffer cache index for cached buffers. As IO does not use b_bn - * anymore, uncached buffers do not use b_bn at all and hence must modify the IO - * map directly. Uncached buffers are not allowed to be discontiguous, so this - * is safe to do. - * - * In future, uncached buffers will pass the block number directly to the io - * request function and hence these macros will go away at that point. - */ -#define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) -#define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) +static inline xfs_daddr_t xfs_buf_daddr(struct xfs_buf *bp) +{ + return bp->b_maps[0].bm_bn; +} void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); @@ -355,12 +348,6 @@ extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) -static inline int -xfs_buftarg_dma_alignment(struct xfs_buftarg *bt) -{ - return queue_dma_alignment(bt->bt_bdev->bd_disk->queue); -} - int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic); bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 2828ce45b701..b1ab100c09e1 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -428,7 +428,7 @@ xfs_buf_item_format( * 
occurs during recovery. */ if (bip->bli_flags & XFS_BLI_INODE_BUF) { - if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) || + if (xfs_has_v3inodes(lip->li_mountp) || !((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && xfs_log_item_in_current_chkpt(lip))) bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; @@ -581,7 +581,7 @@ xfs_buf_item_push( if (bp->b_flags & XBF_WRITE_FAIL) { xfs_buf_alert_ratelimited(bp, "XFS: Failing async write", "Failing async write on buffer block 0x%llx. Retrying async write.", - (long long)bp->b_bn); + (long long)xfs_buf_daddr(bp)); } if (!xfs_buf_delwri_queue(bp, buffer_list)) @@ -616,7 +616,7 @@ xfs_buf_item_put( * that case, the bli is freed on buffer writeback completion. */ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) || - XFS_FORCED_SHUTDOWN(lip->li_mountp); + xfs_is_shutdown(lip->li_mountp); dirty = bip->bli_flags & XFS_BLI_DIRTY; if (dirty && !aborted) return false; diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 4775485b4062..a476c7ef5d53 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -219,7 +219,7 @@ xlog_recover_validate_buf_type( * inconsistent state resulting in verification failures. Hence for now * just avoid the verification stage for non-crc filesystems */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) return; magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); @@ -497,7 +497,7 @@ xlog_recover_do_reg_buffer( if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", - fa, bp->b_bn); + fa, xfs_buf_daddr(bp)); goto next; } } @@ -597,7 +597,7 @@ xlog_recover_do_inode_buffer( * Post recovery validation only works properly on CRC enabled * filesystems. 
*/ - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_has_crc(mp)) bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; @@ -710,7 +710,7 @@ xlog_recover_get_buf_lsn( uint16_t blft; /* v4 filesystems always recover immediately */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) + if (!xfs_has_crc(mp)) goto recover_immediately; /* @@ -787,7 +787,7 @@ xlog_recover_get_buf_lsn( * the relevant UUID in the superblock. */ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); - if (xfs_sb_version_hasmetauuid(&mp->m_sb)) + if (xfs_has_metauuid(mp)) uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid; else uuid = &((struct xfs_dsb *)blk)->sb_uuid; diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index da1cc683560c..8310005af00f 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -32,7 +32,7 @@ xfs_dir3_get_dtype( struct xfs_mount *mp, uint8_t filetype) { - if (!xfs_sb_version_hasftype(&mp->m_sb)) + if (!xfs_has_ftype(mp)) return DT_UNKNOWN; if (filetype >= XFS_DIR3_FT_MAX) @@ -512,7 +512,7 @@ xfs_readdir( trace_xfs_readdir(dp); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + if (xfs_is_shutdown(dp->i_mount)) return -EIO; ASSERT(S_ISDIR(VFS_I(dp)->i_mode)); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 736df5660f1f..0191de8ce9ce 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -169,7 +169,7 @@ xfs_ioc_trim( * We haven't recovered the log, so we cannot use our bnobt-guided * storage zapping commands. 
*/ - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) return -EROFS; if (copy_from_user(&range, urange, sizeof(range))) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index ecd5059d6928..c15d61d47a06 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -223,9 +223,9 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_type = type; - if (curid > 0 && xfs_sb_version_hasbigtime(&mp->m_sb)) + if (curid > 0 && xfs_has_bigtime(mp)) d->dd_diskdq.d_type |= XFS_DQTYPE_BIGTIME; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -526,7 +526,7 @@ xfs_dquot_check_type( * expect an exact match for user dquots and for non-root group and * project dquots. */ - if (xfs_sb_version_hascrc(&dqp->q_mount->m_sb) || + if (xfs_has_crc(dqp->q_mount) || dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0) return ddqp_type == dqp_type; @@ -847,9 +847,6 @@ xfs_qm_dqget_checks( struct xfs_mount *mp, xfs_dqtype_t type) { - if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp))) - return -ESRCH; - switch (type) { case XFS_DQTYPE_USER: if (!XFS_IS_UQUOTA_ON(mp)) @@ -1222,7 +1219,7 @@ xfs_qm_dqflush_check( /* bigtime flag should never be set on root dquots */ if (dqp->q_type & XFS_DQTYPE_BIGTIME) { - if (!xfs_sb_version_hasbigtime(&dqp->q_mount->m_sb)) + if (!xfs_has_bigtime(dqp->q_mount)) return __this_address; if (dqp->q_id == 0) return __this_address; @@ -1301,7 +1298,7 @@ xfs_qm_dqflush( * buffer always has a valid CRC. This ensures there is no possibility * of a dquot without an up-to-date CRC getting to disk. 
*/ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { dqblk->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index f642884a6834..6b5e3cf40c8b 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -54,6 +54,16 @@ struct xfs_dquot_res { xfs_qwarncnt_t warnings; }; +static inline bool +xfs_dquot_res_over_limits( + const struct xfs_dquot_res *qres) +{ + if ((qres->softlimit && qres->softlimit < qres->reserved) || + (qres->hardlimit && qres->hardlimit < qres->reserved)) + return true; + return false; +} + /* * The incore dquot structure */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 8ed47b739b6c..6a1aae799cf1 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -218,137 +218,3 @@ xfs_qm_dquot_logitem_init( &xfs_dquot_item_ops); lp->qli_dquot = dqp; } - -/*------------------ QUOTAOFF LOG ITEMS -------------------*/ - -static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip) -{ - return container_of(lip, struct xfs_qoff_logitem, qql_item); -} - - -/* - * This returns the number of iovecs needed to log the given quotaoff item. - * We only need 1 iovec for an quotaoff item. It just logs the - * quotaoff_log_format structure. 
- */ -STATIC void -xfs_qm_qoff_logitem_size( - struct xfs_log_item *lip, - int *nvecs, - int *nbytes) -{ - *nvecs += 1; - *nbytes += sizeof(struct xfs_qoff_logitem); -} - -STATIC void -xfs_qm_qoff_logitem_format( - struct xfs_log_item *lip, - struct xfs_log_vec *lv) -{ - struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip); - struct xfs_log_iovec *vecp = NULL; - struct xfs_qoff_logformat *qlf; - - qlf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_QUOTAOFF); - qlf->qf_type = XFS_LI_QUOTAOFF; - qlf->qf_size = 1; - qlf->qf_flags = qflip->qql_flags; - xlog_finish_iovec(lv, vecp, sizeof(struct xfs_qoff_logitem)); -} - -/* - * There isn't much you can do to push a quotaoff item. It is simply - * stuck waiting for the log to be flushed to disk. - */ -STATIC uint -xfs_qm_qoff_logitem_push( - struct xfs_log_item *lip, - struct list_head *buffer_list) -{ - return XFS_ITEM_LOCKED; -} - -STATIC xfs_lsn_t -xfs_qm_qoffend_logitem_committed( - struct xfs_log_item *lip, - xfs_lsn_t lsn) -{ - struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip); - struct xfs_qoff_logitem *qfs = qfe->qql_start_lip; - - xfs_qm_qoff_logitem_relse(qfs); - - kmem_free(lip->li_lv_shadow); - kmem_free(qfe); - return (xfs_lsn_t)-1; -} - -STATIC void -xfs_qm_qoff_logitem_release( - struct xfs_log_item *lip) -{ - struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip); - - if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { - if (qoff->qql_start_lip) - xfs_qm_qoff_logitem_relse(qoff->qql_start_lip); - xfs_qm_qoff_logitem_relse(qoff); - } -} - -static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = { - .iop_size = xfs_qm_qoff_logitem_size, - .iop_format = xfs_qm_qoff_logitem_format, - .iop_committed = xfs_qm_qoffend_logitem_committed, - .iop_push = xfs_qm_qoff_logitem_push, - .iop_release = xfs_qm_qoff_logitem_release, -}; - -static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = { - .iop_size = xfs_qm_qoff_logitem_size, - .iop_format = xfs_qm_qoff_logitem_format, - .iop_push = xfs_qm_qoff_logitem_push, - .iop_release = 
xfs_qm_qoff_logitem_release, -}; - -/* - * Delete the quotaoff intent from the AIL and free it. On success, - * this should only be called for the start item. It can be used for - * either on shutdown or abort. - */ -void -xfs_qm_qoff_logitem_relse( - struct xfs_qoff_logitem *qoff) -{ - struct xfs_log_item *lip = &qoff->qql_item; - - ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) || - test_bit(XFS_LI_ABORTED, &lip->li_flags) || - XFS_FORCED_SHUTDOWN(lip->li_mountp)); - xfs_trans_ail_delete(lip, 0); - kmem_free(lip->li_lv_shadow); - kmem_free(qoff); -} - -/* - * Allocate and initialize an quotaoff item of the correct quota type(s). - */ -struct xfs_qoff_logitem * -xfs_qm_qoff_logitem_init( - struct xfs_mount *mp, - struct xfs_qoff_logitem *start, - uint flags) -{ - struct xfs_qoff_logitem *qf; - - qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), 0); - - xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ? - &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops); - qf->qql_item.li_mountp = mp; - qf->qql_start_lip = start; - qf->qql_flags = flags; - return qf; -} diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h index 2b86a43d7ce2..794710c24474 100644 --- a/fs/xfs/xfs_dquot_item.h +++ b/fs/xfs/xfs_dquot_item.h @@ -9,7 +9,6 @@ struct xfs_dquot; struct xfs_trans; struct xfs_mount; -struct xfs_qoff_logitem; struct xfs_dq_logitem { struct xfs_log_item qli_item; /* common portion */ @@ -17,22 +16,6 @@ struct xfs_dq_logitem { xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ }; -struct xfs_qoff_logitem { - struct xfs_log_item qql_item; /* common portion */ - struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */ - unsigned int qql_flags; -}; - - void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp); -struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp, - struct xfs_qoff_logitem *start, - uint flags); -void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *); -struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct 
xfs_trans *tp, - struct xfs_qoff_logitem *startqoff, - uint flags); -void xfs_trans_log_quotaoff_item(struct xfs_trans *tp, - struct xfs_qoff_logitem *qlp); #endif /* __XFS_DQUOT_ITEM_H__ */ diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 5875c7e1bd28..8966ba842395 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -136,7 +136,7 @@ xlog_recover_dquot_commit_pass2( * If the dquot has an LSN in it, recover the dquot only if it's less * than the lsn of the transaction we are replaying. */ - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); @@ -146,7 +146,7 @@ xlog_recover_dquot_commit_pass2( } memcpy(ddq, recddq, item->ri_buf[1].i_len); - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index ce3bc1b291a1..81c445e9489b 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -371,7 +371,7 @@ xfs_buf_corruption_error( xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, "Metadata corruption detected at %pS, %s block 0x%llx", - fa, bp->b_ops->name, bp->b_bn); + fa, bp->b_ops->name, xfs_buf_daddr(bp)); xfs_alert(mp, "Unmount and run xfs_repair"); @@ -402,7 +402,7 @@ xfs_buf_verifier_error( xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR, "Metadata %s detected at %pS, %s block 0x%llx %s", bp->b_error == -EFSBADCRC ? 
"CRC error" : "corruption", - fa, bp->b_ops->name, bp->b_bn, name); + fa, bp->b_ops->name, xfs_buf_daddr(bp), name); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 1717b7508356..5735d5ea87ee 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -75,4 +75,16 @@ extern int xfs_errortag_clearall(struct xfs_mount *mp); #define XFS_PTAG_FSBLOCK_ZERO 0x00000080 #define XFS_PTAG_VERIFIER_ERROR 0x00000100 +#define XFS_PTAG_STRINGS \ + { XFS_NO_PTAG, "none" }, \ + { XFS_PTAG_IFLUSH, "iflush" }, \ + { XFS_PTAG_LOGRES, "logres" }, \ + { XFS_PTAG_AILDELETE, "aildelete" }, \ + { XFS_PTAG_ERROR_REPORT , "error_report" }, \ + { XFS_PTAG_SHUTDOWN_CORRUPT, "corrupt" }, \ + { XFS_PTAG_SHUTDOWN_IOERROR, "ioerror" }, \ + { XFS_PTAG_SHUTDOWN_LOGERROR, "logerror" }, \ + { XFS_PTAG_FSBLOCK_ZERO, "fsb_zero" }, \ + { XFS_PTAG_VERIFIER_ERROR, "verifier" } + #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1da59bdff245..1064c2342876 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -44,6 +44,7 @@ xfs_fs_encode_fh( int *max_len, struct inode *parent) { + struct xfs_mount *mp = XFS_M(inode->i_sb); struct fid *fid = (struct fid *)fh; struct xfs_fid64 *fid64 = (struct xfs_fid64 *)fh; int fileid_type; @@ -63,8 +64,7 @@ xfs_fs_encode_fh( * large enough filesystem may contain them, thus the slightly * confusing looking conditional below. 
*/ - if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) || - (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES)) + if (!xfs_has_small_inums(mp) || xfs_is_inode32(mp)) fileid_type |= XFS_FILEID_TYPE_64FLAG; /* diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 2424230ca2c3..3f8a0713573a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -629,6 +629,9 @@ xfs_efi_item_recover( error = xfs_trans_free_extent(tp, efdp, extp->ext_start, extp->ext_len, &XFS_RMAP_OINFO_ANY_OWNER, false); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + extp, sizeof(*extp)); if (error) goto abort_error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3dfbdcdb0d1c..7aa943edfc02 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -185,7 +185,7 @@ xfs_file_fsync( if (error) return error; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; xfs_iflags_clear(ip, XFS_ITRUNCATED); @@ -318,7 +318,7 @@ xfs_file_read_iter( XFS_STATS_INC(mp, xs_read_calls); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (IS_DAX(inode)) @@ -462,7 +462,7 @@ xfs_dio_write_end_io( trace_xfs_end_io_direct_write(ip, offset, size); - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return -EIO; if (error) @@ -814,7 +814,7 @@ xfs_file_write_iter( if (ocount == 0) return 0; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return -EIO; if (IS_DAX(inode)) @@ -1122,7 +1122,7 @@ static inline bool xfs_file_sync_writes(struct file *filp) { struct xfs_inode *ip = XFS_I(file_inode(filp)); - if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(ip->i_mount)) return true; if (filp->f_flags & (__O_SYNC | O_DSYNC)) return true; @@ -1153,10 +1153,10 @@ xfs_file_remap_range( if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return 
-EOPNOTSUPP; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* Prepare and then clone file data. */ @@ -1205,7 +1205,7 @@ xfs_file_open( { if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EFBIG; - if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb))) + if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; return 0; @@ -1277,7 +1277,7 @@ xfs_file_llseek( { struct inode *inode = file->f_mapping->host; - if (XFS_FORCED_SHUTDOWN(XFS_I(inode)->i_mount)) + if (xfs_is_shutdown(XFS_I(inode)->i_mount)) return -EIO; switch (whence) { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index eed6ca5f8f91..6a3ce0f6dc9e 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -295,7 +295,7 @@ xfs_filestream_lookup_ag( * Set the starting AG using the rotor for inode32, otherwise * use the directory inode's AG. */ - if (mp->m_flags & XFS_MOUNT_32BITINODES) { + if (xfs_is_inode32(mp)) { xfs_agnumber_t rotorstep = xfs_rotorstep; startag = (mp->m_agfrotor / rotorstep) % mp->m_sb.sb_agcount; mp->m_agfrotor = (mp->m_agfrotor + 1) % diff --git a/fs/xfs/xfs_filestream.h b/fs/xfs/xfs_filestream.h index 3af963743e4d..403226ebb80b 100644 --- a/fs/xfs/xfs_filestream.h +++ b/fs/xfs/xfs_filestream.h @@ -21,7 +21,7 @@ static inline int xfs_inode_is_filestream( struct xfs_inode *ip) { - return (ip->i_mount->m_flags & XFS_MOUNT_FILESTREAMS) || + return xfs_has_filestreams(ip->i_mount) || (ip->i_diflags & XFS_DIFLAG_FILESTREAM); } diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 7d0b09c1366e..48287caad28b 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -61,7 +61,7 @@ xfs_fsmap_to_internal( static int xfs_fsmap_owner_to_rmap( struct xfs_rmap_irec *dest, - struct xfs_fsmap *src) + const struct xfs_fsmap *src) { if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) { dest->rm_owner = src->fmr_owner; @@ -111,8 +111,8 @@ xfs_fsmap_owner_to_rmap( /* Convert an rmapbt 
owner into an fsmap owner. */ static int xfs_fsmap_owner_from_rmap( - struct xfs_fsmap *dest, - struct xfs_rmap_irec *src) + struct xfs_fsmap *dest, + const struct xfs_rmap_irec *src) { dest->fmr_flags = 0; if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) { @@ -171,7 +171,7 @@ struct xfs_getfsmap_info { struct xfs_getfsmap_dev { u32 dev; int (*fn)(struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info); }; @@ -192,7 +192,7 @@ STATIC int xfs_getfsmap_is_shared( struct xfs_trans *tp, struct xfs_getfsmap_info *info, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, bool *stat) { struct xfs_mount *mp = tp->t_mountp; @@ -202,7 +202,7 @@ xfs_getfsmap_is_shared( int error; *stat = false; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return 0; /* rt files will have no perag structure */ if (!info->pag) @@ -245,7 +245,7 @@ STATIC int xfs_getfsmap_helper( struct xfs_trans *tp, struct xfs_getfsmap_info *info, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, xfs_daddr_t rec_daddr) { struct xfs_fsmap fmr; @@ -347,7 +347,7 @@ out: STATIC int xfs_getfsmap_datadev_helper( struct xfs_btree_cur *cur, - struct xfs_rmap_irec *rec, + const struct xfs_rmap_irec *rec, void *priv) { struct xfs_mount *mp = cur->bc_mp; @@ -365,7 +365,7 @@ xfs_getfsmap_datadev_helper( STATIC int xfs_getfsmap_datadev_bnobt_helper( struct xfs_btree_cur *cur, - struct xfs_alloc_rec_incore *rec, + const struct xfs_alloc_rec_incore *rec, void *priv) { struct xfs_mount *mp = cur->bc_mp; @@ -389,7 +389,7 @@ xfs_getfsmap_datadev_bnobt_helper( static void xfs_getfsmap_set_irec_flags( struct xfs_rmap_irec *irec, - struct xfs_fsmap *fmr) + const struct xfs_fsmap *fmr) { irec->rm_flags = 0; if (fmr->fmr_flags & FMR_OF_ATTR_FORK) @@ -404,7 +404,7 @@ xfs_getfsmap_set_irec_flags( STATIC int xfs_getfsmap_logdev( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info 
*info) { struct xfs_mount *mp = tp->t_mountp; @@ -451,7 +451,7 @@ xfs_getfsmap_logdev( STATIC int xfs_getfsmap_rtdev_rtbitmap_helper( struct xfs_trans *tp, - struct xfs_rtalloc_rec *rec, + const struct xfs_rtalloc_rec *rec, void *priv) { struct xfs_mount *mp = tp->t_mountp; @@ -473,7 +473,7 @@ xfs_getfsmap_rtdev_rtbitmap_helper( STATIC int __xfs_getfsmap_rtdev( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, int (*query_fn)(struct xfs_trans *, struct xfs_getfsmap_info *), struct xfs_getfsmap_info *info) @@ -481,16 +481,14 @@ __xfs_getfsmap_rtdev( struct xfs_mount *mp = tp->t_mountp; xfs_fsblock_t start_fsb; xfs_fsblock_t end_fsb; - xfs_daddr_t eofs; + uint64_t eofs; int error = 0; eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); if (keys[0].fmr_physical >= eofs) return 0; - if (keys[1].fmr_physical >= eofs) - keys[1].fmr_physical = eofs - 1; start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); - end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical); + end_fsb = XFS_BB_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); /* Set up search keys */ info->low.rm_startblock = start_fsb; @@ -523,27 +521,37 @@ xfs_getfsmap_rtdev_rtbitmap_query( { struct xfs_rtalloc_rec alow = { 0 }; struct xfs_rtalloc_rec ahigh = { 0 }; + struct xfs_mount *mp = tp->t_mountp; int error; - xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED); + /* + * Set up query parameters to return free rtextents covering the range + * we want. 
+ */ alow.ar_startext = info->low.rm_startblock; ahigh.ar_startext = info->high.rm_startblock; - do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize); - if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize)) + do_div(alow.ar_startext, mp->m_sb.sb_rextsize); + if (do_div(ahigh.ar_startext, mp->m_sb.sb_rextsize)) ahigh.ar_startext++; error = xfs_rtalloc_query_range(tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) goto err; - /* Report any gaps at the end of the rtbitmap */ + /* + * Report any gaps at the end of the rtbitmap by simulating a null + * rmap starting at the block after the end of the query range. + */ info->last = true; + ahigh.ar_startext = min(mp->m_sb.sb_rextents, ahigh.ar_startext); + error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); if (error) goto err; err: - xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + xfs_iunlock(mp->m_rbmip, XFS_ILOCK_SHARED); return error; } @@ -551,7 +559,7 @@ err: STATIC int xfs_getfsmap_rtdev_rtbitmap( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { info->missing_owner = XFS_FMR_OWN_UNKNOWN; @@ -564,7 +572,7 @@ xfs_getfsmap_rtdev_rtbitmap( STATIC int __xfs_getfsmap_datadev( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info, int (*query_fn)(struct xfs_trans *, struct xfs_getfsmap_info *, @@ -579,16 +587,14 @@ __xfs_getfsmap_datadev( xfs_fsblock_t end_fsb; xfs_agnumber_t start_ag; xfs_agnumber_t end_ag; - xfs_daddr_t eofs; + uint64_t eofs; int error = 0; eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (keys[0].fmr_physical >= eofs) return 0; - if (keys[1].fmr_physical >= eofs) - keys[1].fmr_physical = eofs - 1; start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical); - end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical); + end_fsb = XFS_DADDR_TO_FSB(mp, min(eofs - 1, keys[1].fmr_physical)); /* * Convert the fsmap low/high keys to AG based 
keys. Initialize @@ -716,7 +722,7 @@ xfs_getfsmap_datadev_rmapbt_query( STATIC int xfs_getfsmap_datadev_rmapbt( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { info->missing_owner = XFS_FMR_OWN_FREE; @@ -751,7 +757,7 @@ xfs_getfsmap_datadev_bnobt_query( STATIC int xfs_getfsmap_datadev_bnobt( struct xfs_trans *tp, - struct xfs_fsmap *keys, + const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { struct xfs_alloc_rec_incore akeys[2]; @@ -859,7 +865,7 @@ xfs_getfsmap( return -EINVAL; use_rmap = capable(CAP_SYS_ADMIN) && - xfs_sb_version_hasrmapbt(&mp->m_sb); + xfs_has_rmapbt(mp); head->fmh_entries = 0; /* Set up our device handlers. */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 6ed29b158312..33e26690a8c4 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_trace.h" /* * Write new AG headers to disk. Non-transactional, but need to be @@ -177,7 +178,7 @@ xfs_growfs_data_private( * particularly important for shrink because the write verifier * will fail if sb_fdblocks is ever larger than sb_dblocks. */ - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) xfs_log_sb(tp); xfs_trans_set_sync(tp); @@ -511,6 +512,11 @@ xfs_fs_goingdown( * consistent. We don't do an unmount here; just shutdown the shop, make sure * that absolutely nothing persistent happens to this filesystem after this * point. + * + * The shutdown state change is atomic, resulting in the first and only the + * first shutdown call processing the shutdown. This means we only shutdown the + * log once as it requires, and we don't spam the logs when multiple concurrent + * shutdowns race to set the shutdown flags. 
*/ void xfs_do_force_shutdown( @@ -519,48 +525,37 @@ xfs_do_force_shutdown( char *fname, int lnnum) { - bool logerror = flags & SHUTDOWN_LOG_IO_ERROR; - - /* - * No need to duplicate efforts. - */ - if (XFS_FORCED_SHUTDOWN(mp) && !logerror) - return; - - /* - * This flags XFS_MOUNT_FS_SHUTDOWN, makes sure that we don't - * queue up anybody new on the log reservations, and wakes up - * everybody who's sleeping on log reservations to tell them - * the bad news. - */ - if (xfs_log_force_umount(mp, logerror)) - return; + int tag; + const char *why; - if (flags & SHUTDOWN_FORCE_UMOUNT) { - xfs_alert(mp, -"User initiated shutdown (0x%x) received. Shutting down filesystem", - flags); + if (test_and_set_bit(XFS_OPSTATE_SHUTDOWN, &mp->m_opstate)) return; - } - - if (flags & SHUTDOWN_CORRUPT_INCORE) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT, -"Corruption of in-memory data (0x%x) detected at %pS (%s:%d). Shutting down filesystem", - flags, __return_address, fname, lnnum); - if (XFS_ERRLEVEL_HIGH <= xfs_error_level) - xfs_stack_trace(); - } else if (logerror) { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR, -"Log I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", - flags, __return_address, fname, lnnum); + if (mp->m_sb_bp) + mp->m_sb_bp->b_flags |= XBF_DONE; + + if (flags & SHUTDOWN_FORCE_UMOUNT) + xfs_alert(mp, "User initiated shutdown received."); + + if (xlog_force_shutdown(mp->m_log, flags)) { + tag = XFS_PTAG_SHUTDOWN_LOGERROR; + why = "Log I/O Error"; + } else if (flags & SHUTDOWN_CORRUPT_INCORE) { + tag = XFS_PTAG_SHUTDOWN_CORRUPT; + why = "Corruption of in-memory data"; } else { - xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR, -"I/O error (0x%x) detected at %pS (%s:%d). Shutting down filesystem", - flags, __return_address, fname, lnnum); + tag = XFS_PTAG_SHUTDOWN_IOERROR; + why = "Metadata I/O Error"; } + trace_xfs_force_shutdown(mp, tag, flags, fname, lnnum); + + xfs_alert_tag(mp, tag, +"%s (0x%x) detected at %pS (%s:%d). 
Shutting down filesystem.", + why, flags, __return_address, fname, lnnum); xfs_alert(mp, "Please unmount the filesystem and rectify the problem(s)"); + if (xfs_error_level >= XFS_ERRLEVEL_HIGH) + xfs_stack_trace(); } /* diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c index eb10eacabc8f..72a075bb2c10 100644 --- a/fs/xfs/xfs_health.c +++ b/fs/xfs/xfs_health.c @@ -30,7 +30,7 @@ xfs_health_unmount( unsigned int checked = 0; bool warn = false; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return; /* Measure AG corruption levels. */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 6007683482c6..f2210d927481 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -38,23 +38,11 @@ * radix tree tags when convenient. Avoid existing XFS_IWALK namespace. */ enum xfs_icwalk_goal { - /* Goals that are not related to tags; these must be < 0. */ - XFS_ICWALK_DQRELE = -1, - /* Goals directly associated with tagged inodes. */ XFS_ICWALK_BLOCKGC = XFS_ICI_BLOCKGC_TAG, XFS_ICWALK_RECLAIM = XFS_ICI_RECLAIM_TAG, }; -#define XFS_ICWALK_NULL_TAG (-1U) - -/* Compute the inode radix tree tag for this goal. */ -static inline unsigned int -xfs_icwalk_tag(enum xfs_icwalk_goal goal) -{ - return goal < 0 ? XFS_ICWALK_NULL_TAG : goal; -} - static int xfs_icwalk(struct xfs_mount *mp, enum xfs_icwalk_goal goal, struct xfs_icwalk *icw); static int xfs_icwalk_ag(struct xfs_perag *pag, @@ -64,9 +52,6 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, * Private inode cache walk flags for struct xfs_icwalk. Must not * coincide with XFS_ICWALK_FLAGS_VALID. */ -#define XFS_ICWALK_FLAG_DROP_UDQUOT (1U << 31) -#define XFS_ICWALK_FLAG_DROP_GDQUOT (1U << 30) -#define XFS_ICWALK_FLAG_DROP_PDQUOT (1U << 29) /* Stop scanning after icw_scan_limit inodes. 
*/ #define XFS_ICWALK_FLAG_SCAN_LIMIT (1U << 28) @@ -74,10 +59,7 @@ static int xfs_icwalk_ag(struct xfs_perag *pag, #define XFS_ICWALK_FLAG_RECLAIM_SICK (1U << 27) #define XFS_ICWALK_FLAG_UNION (1U << 26) /* union filter algorithm */ -#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_DROP_UDQUOT | \ - XFS_ICWALK_FLAG_DROP_GDQUOT | \ - XFS_ICWALK_FLAG_DROP_PDQUOT | \ - XFS_ICWALK_FLAG_SCAN_LIMIT | \ +#define XFS_ICWALK_PRIVATE_FLAGS (XFS_ICWALK_FLAG_SCAN_LIMIT | \ XFS_ICWALK_FLAG_RECLAIM_SICK | \ XFS_ICWALK_FLAG_UNION) @@ -102,8 +84,9 @@ xfs_inode_alloc( return NULL; } - /* VFS doesn't initialise i_mode! */ + /* VFS doesn't initialise i_mode or i_state! */ VFS_I(ip)->i_mode = 0; + VFS_I(ip)->i_state = 0; XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -220,9 +203,14 @@ static inline void xfs_blockgc_queue( struct xfs_perag *pag) { + struct xfs_mount *mp = pag->pag_mount; + + if (!xfs_is_blockgc_enabled(mp)) + return; + rcu_read_lock(); if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) - queue_delayed_work(pag->pag_mount->m_gc_workqueue, + queue_delayed_work(pag->pag_mount->m_blockgc_wq, &pag->pag_blockgc_work, msecs_to_jiffies(xfs_blockgc_secs * 1000)); rcu_read_unlock(); @@ -301,31 +289,6 @@ xfs_perag_clear_inode_tag( trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_); } -/* - * We set the inode flag atomically with the radix tree tag. - * Once we get tag lookups on the radix tree, this inode flag - * can go away. 
- */ -void -xfs_inode_mark_reclaimable( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_perag *pag; - - pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); - spin_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - - xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), - XFS_ICI_RECLAIM_TAG); - __xfs_iflags_set(ip, XFS_IRECLAIMABLE); - - spin_unlock(&ip->i_flags_lock); - spin_unlock(&pag->pag_ici_lock); - xfs_perag_put(pag); -} - static inline void xfs_inew_wait( struct xfs_inode *ip) @@ -484,6 +447,21 @@ xfs_iget_check_free_state( return 0; } +/* Make all pending inactivation work start immediately. */ +static void +xfs_inodegc_queue_all( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) + queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + } +} + /* * Check the validity of the inode we just found it the cache */ @@ -516,13 +494,30 @@ xfs_iget_cache_hit( * reclaimable state, wait for the initialisation to complete * before continuing. * + * If we're racing with the inactivation worker we also want to wait. + * If we're creating a new file, it's possible that the worker + * previously marked the inode as free on disk but hasn't finished + * updating the incore state yet. The AGI buffer will be dirty and + * locked to the icreate transaction, so a synchronous push of the + * inodegc workers would result in deadlock. For a regular iget, the + * worker is running already, so we might as well wait. + * * XXX(hch): eventually we should do something equivalent to * wait_on_inode to wait for these flags to be cleared * instead of polling for it. */ - if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM)) + if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING)) goto out_skip; + if (ip->i_flags & XFS_NEED_INACTIVE) { + /* Unlinked inodes cannot be re-grabbed. 
*/ + if (VFS_I(ip)->i_nlink == 0) { + error = -ENOENT; + goto out_error; + } + goto out_inodegc_flush; + } + /* * Check the inode free state is valid. This also detects lookup * racing with unlinks. @@ -570,6 +565,17 @@ out_error: spin_unlock(&ip->i_flags_lock); rcu_read_unlock(); return error; + +out_inodegc_flush: + spin_unlock(&ip->i_flags_lock); + rcu_read_unlock(); + /* + * Do not wait for the workers, because the caller could hold an AGI + * buffer lock. We're just going to sleep in a loop anyway. + */ + if (xfs_is_inodegc_enabled(mp)) + xfs_inodegc_queue_all(mp); + return -EAGAIN; } static int @@ -597,7 +603,7 @@ xfs_iget_cache_miss( /* * For version 5 superblocks, if we are initialising a new inode and we - * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can + * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can * simply build the new inode core with a random generation number. * * For version 4 (and older) superblocks, log recovery is dependent on @@ -605,8 +611,8 @@ xfs_iget_cache_miss( * value and hence we must also read the inode off disk even when * initializing new inodes. */ - if (xfs_sb_version_has_v3inode(&mp->m_sb) && - (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { + if (xfs_has_v3inodes(mp) && + (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) { VFS_I(ip)->i_generation = prandom_u32(); } else { struct xfs_buf *bp; @@ -817,97 +823,6 @@ xfs_icache_inode_is_allocated( return 0; } -#ifdef CONFIG_XFS_QUOTA -/* Decide if we want to grab this inode to drop its dquots. */ -static bool -xfs_dqrele_igrab( - struct xfs_inode *ip) -{ - bool ret = false; - - ASSERT(rcu_read_lock_held()); - - /* Check for stale RCU freed inode */ - spin_lock(&ip->i_flags_lock); - if (!ip->i_ino) - goto out_unlock; - - /* - * Skip inodes that are anywhere in the reclaim machinery because we - * drop dquots before tagging an inode for reclamation. 
- */ - if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE)) - goto out_unlock; - - /* - * The inode looks alive; try to grab a VFS reference so that it won't - * get destroyed. If we got the reference, return true to say that - * we grabbed the inode. - * - * If we can't get the reference, then we know the inode had its VFS - * state torn down and hasn't yet entered the reclaim machinery. Since - * we also know that dquots are detached from an inode before it enters - * reclaim, we can skip the inode. - */ - ret = igrab(VFS_I(ip)) != NULL; - -out_unlock: - spin_unlock(&ip->i_flags_lock); - return ret; -} - -/* Drop this inode's dquots. */ -static void -xfs_dqrele_inode( - struct xfs_inode *ip, - struct xfs_icwalk *icw) -{ - if (xfs_iflags_test(ip, XFS_INEW)) - xfs_inew_wait(ip); - - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) { - xfs_qm_dqrele(ip->i_udquot); - ip->i_udquot = NULL; - } - if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) { - xfs_qm_dqrele(ip->i_gdquot); - ip->i_gdquot = NULL; - } - if (icw->icw_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) { - xfs_qm_dqrele(ip->i_pdquot); - ip->i_pdquot = NULL; - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); -} - -/* - * Detach all dquots from incore inodes if we can. The caller must already - * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will - * not get reattached. - */ -int -xfs_dqrele_all_inodes( - struct xfs_mount *mp, - unsigned int qflags) -{ - struct xfs_icwalk icw = { .icw_flags = 0 }; - - if (qflags & XFS_UQUOTA_ACCT) - icw.icw_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT; - if (qflags & XFS_GQUOTA_ACCT) - icw.icw_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT; - if (qflags & XFS_PQUOTA_ACCT) - icw.icw_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT; - - return xfs_icwalk(mp, XFS_ICWALK_DQRELE, &icw); -} -#else -# define xfs_dqrele_igrab(ip) (false) -# define xfs_dqrele_inode(ip, priv) ((void)0) -#endif /* CONFIG_XFS_QUOTA */ - /* * Grab the inode for reclaim exclusively. 
* @@ -976,7 +891,7 @@ xfs_reclaim_inode( if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) goto out_iunlock; - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (xfs_is_shutdown(ip->i_mount)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip); goto reclaim; @@ -988,6 +903,7 @@ xfs_reclaim_inode( xfs_iflags_clear(ip, XFS_IFLUSHING); reclaim: + trace_xfs_inode_reclaiming(ip); /* * Because we use RCU freeing we need to ensure the inode always appears @@ -1052,9 +968,8 @@ static inline bool xfs_want_reclaim_sick( struct xfs_mount *mp) { - return (mp->m_flags & XFS_MOUNT_UNMOUNTING) || - (mp->m_flags & XFS_MOUNT_NORECOVERY) || - XFS_FORCED_SHUTDOWN(mp); + return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) || + xfs_is_shutdown(mp); } void @@ -1447,8 +1362,12 @@ xfs_blockgc_stop( struct xfs_perag *pag; xfs_agnumber_t agno; - for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + if (!xfs_clear_blockgc_enabled(mp)) + return; + + for_each_perag(mp, agno, pag) cancel_delayed_work_sync(&pag->pag_blockgc_work); + trace_xfs_blockgc_stop(mp, __return_address); } /* Enable post-EOF and CoW block auto-reclamation. */ @@ -1459,12 +1378,18 @@ xfs_blockgc_start( struct xfs_perag *pag; xfs_agnumber_t agno; + if (xfs_set_blockgc_enabled(mp)) + return; + + trace_xfs_blockgc_start(mp, __return_address); for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) xfs_blockgc_queue(pag); } /* Don't try to run block gc on an inode that's in any of these states. */ #define XFS_BLOCKGC_NOGRAB_IFLAGS (XFS_INEW | \ + XFS_NEED_INACTIVE | \ + XFS_INACTIVATING | \ XFS_IRECLAIMABLE | \ XFS_IRECLAIM) /* @@ -1490,7 +1415,7 @@ xfs_blockgc_igrab( spin_unlock(&ip->i_flags_lock); /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (xfs_is_shutdown(ip->i_mount)) return false; /* If we can't grab the inode, it must on it's way to reclaim. 
*/ @@ -1536,27 +1461,62 @@ xfs_blockgc_worker( struct xfs_mount *mp = pag->pag_mount; int error; - if (!sb_start_write_trylock(mp->m_super)) - return; + trace_xfs_blockgc_worker(mp, __return_address); + error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL); if (error) xfs_info(mp, "AG %u preallocation gc worker failed, err=%d", pag->pag_agno, error); - sb_end_write(mp->m_super); xfs_blockgc_queue(pag); } /* - * Try to free space in the filesystem by purging eofblocks and cowblocks. + * Try to free space in the filesystem by purging inactive inodes, eofblocks + * and cowblocks. */ int xfs_blockgc_free_space( struct xfs_mount *mp, struct xfs_icwalk *icw) { + int error; + trace_xfs_blockgc_free_space(mp, icw, _RET_IP_); - return xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); + error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw); + if (error) + return error; + + xfs_inodegc_flush(mp); + return 0; +} + +/* + * Reclaim all the free space that we can by scheduling the background blockgc + * and inodegc workers immediately and waiting for them all to clear. + */ +void +xfs_blockgc_flush_all( + struct xfs_mount *mp) +{ + struct xfs_perag *pag; + xfs_agnumber_t agno; + + trace_xfs_blockgc_flush_all(mp, __return_address); + + /* + * For each blockgc worker, move its queue time up to now. If it + * wasn't queued, it will not be requeued. Then flush whatever's + * left. 
+ */ + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + mod_delayed_work(pag->pag_mount->m_blockgc_wq, + &pag->pag_blockgc_work, 0); + + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + flush_delayed_work(&pag->pag_blockgc_work); + + xfs_inodegc_flush(mp); } /* @@ -1647,8 +1607,6 @@ xfs_icwalk_igrab( struct xfs_icwalk *icw) { switch (goal) { - case XFS_ICWALK_DQRELE: - return xfs_dqrele_igrab(ip); case XFS_ICWALK_BLOCKGC: return xfs_blockgc_igrab(ip); case XFS_ICWALK_RECLAIM: @@ -1672,9 +1630,6 @@ xfs_icwalk_process_inode( int error = 0; switch (goal) { - case XFS_ICWALK_DQRELE: - xfs_dqrele_inode(ip, icw); - break; case XFS_ICWALK_BLOCKGC: error = xfs_blockgc_scan_inode(ip, icw); break; @@ -1712,22 +1667,14 @@ restart: nr_found = 0; do { struct xfs_inode *batch[XFS_LOOKUP_BATCH]; - unsigned int tag = xfs_icwalk_tag(goal); int error = 0; int i; rcu_read_lock(); - if (tag == XFS_ICWALK_NULL_TAG) - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)batch, first_index, - XFS_LOOKUP_BATCH); - else - nr_found = radix_tree_gang_lookup_tag( - &pag->pag_ici_root, - (void **) batch, first_index, - XFS_LOOKUP_BATCH, tag); - + nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, + (void **) batch, first_index, + XFS_LOOKUP_BATCH, goal); if (!nr_found) { done = true; rcu_read_unlock(); @@ -1805,20 +1752,6 @@ restart: return last_error; } -/* Fetch the next (possibly tagged) per-AG structure. */ -static inline struct xfs_perag * -xfs_icwalk_get_perag( - struct xfs_mount *mp, - xfs_agnumber_t agno, - enum xfs_icwalk_goal goal) -{ - unsigned int tag = xfs_icwalk_tag(goal); - - if (tag == XFS_ICWALK_NULL_TAG) - return xfs_perag_get(mp, agno); - return xfs_perag_get_tag(mp, agno, tag); -} - /* Walk all incore inodes to achieve a given goal. 
*/ static int xfs_icwalk( @@ -1829,18 +1762,465 @@ xfs_icwalk( struct xfs_perag *pag; int error = 0; int last_error = 0; - xfs_agnumber_t agno = 0; + xfs_agnumber_t agno; - while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) { - agno = pag->pag_agno + 1; + for_each_perag_tag(mp, agno, pag, goal) { error = xfs_icwalk_ag(pag, goal, icw); - xfs_perag_put(pag); if (error) { last_error = error; - if (error == -EFSCORRUPTED) + if (error == -EFSCORRUPTED) { + xfs_perag_put(pag); break; + } } } return last_error; BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID); } + +#ifdef DEBUG +static void +xfs_check_delalloc( + struct xfs_inode *ip, + int whichfork) +{ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + + if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) + return; + do { + if (isnullstartblock(got.br_startblock)) { + xfs_warn(ip->i_mount, + "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", + ip->i_ino, + whichfork == XFS_DATA_FORK ? "data" : "cow", + got.br_startoff, got.br_blockcount); + } + } while (xfs_iext_next_extent(ifp, &icur, &got)); +} +#else +#define xfs_check_delalloc(ip, whichfork) do { } while (0) +#endif + +/* Schedule the inode for reclaim. 
*/ +static void +xfs_inodegc_set_reclaimable( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_perag *pag; + + if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) { + xfs_check_delalloc(ip, XFS_DATA_FORK); + xfs_check_delalloc(ip, XFS_COW_FORK); + ASSERT(0); + } + + pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); + spin_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + trace_xfs_inode_set_reclaimable(ip); + ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING); + ip->i_flags |= XFS_IRECLAIMABLE; + xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), + XFS_ICI_RECLAIM_TAG); + + spin_unlock(&ip->i_flags_lock); + spin_unlock(&pag->pag_ici_lock); + xfs_perag_put(pag); +} + +/* + * Free all speculative preallocations and possibly even the inode itself. + * This is the last chance to make changes to an otherwise unreferenced file + * before incore reclamation happens. + */ +static void +xfs_inodegc_inactivate( + struct xfs_inode *ip) +{ + trace_xfs_inode_inactivating(ip); + xfs_inactive(ip); + xfs_inodegc_set_reclaimable(ip); +} + +void +xfs_inodegc_worker( + struct work_struct *work) +{ + struct xfs_inodegc *gc = container_of(work, struct xfs_inodegc, + work); + struct llist_node *node = llist_del_all(&gc->list); + struct xfs_inode *ip, *n; + + WRITE_ONCE(gc->items, 0); + + if (!node) + return; + + ip = llist_entry(node, struct xfs_inode, i_gclist); + trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); + + WRITE_ONCE(gc->shrinker_hits, 0); + llist_for_each_entry_safe(ip, n, node, i_gclist) { + xfs_iflags_set(ip, XFS_INACTIVATING); + xfs_inodegc_inactivate(ip); + } +} + +/* + * Force all currently queued inode inactivation work to run immediately, and + * wait for the work to finish. Two pass - queue all the work first pass, wait + * for it in a second pass. 
+ */ +void +xfs_inodegc_flush( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + if (!xfs_is_inodegc_enabled(mp)) + return; + + trace_xfs_inodegc_flush(mp, __return_address); + + xfs_inodegc_queue_all(mp); + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + flush_work(&gc->work); + } +} + +/* + * Flush all the pending work and then disable the inode inactivation background + * workers and wait for them to stop. + */ +void +xfs_inodegc_stop( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + if (!xfs_clear_inodegc_enabled(mp)) + return; + + xfs_inodegc_queue_all(mp); + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + cancel_work_sync(&gc->work); + } + trace_xfs_inodegc_stop(mp, __return_address); +} + +/* + * Enable the inode inactivation background workers and schedule deferred inode + * inactivation work if there is any. + */ +void +xfs_inodegc_start( + struct xfs_mount *mp) +{ + if (xfs_set_inodegc_enabled(mp)) + return; + + trace_xfs_inodegc_start(mp, __return_address); + xfs_inodegc_queue_all(mp); +} + +#ifdef CONFIG_XFS_RT +static inline bool +xfs_inodegc_want_queue_rt_file( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + uint64_t freertx; + + if (!XFS_IS_REALTIME_INODE(ip)) + return false; + + freertx = READ_ONCE(mp->m_sb.sb_frextents); + return freertx < mp->m_low_rtexts[XFS_LOWSP_5_PCNT]; +} +#else +# define xfs_inodegc_want_queue_rt_file(ip) (false) +#endif /* CONFIG_XFS_RT */ + +/* + * Schedule the inactivation worker when: + * + * - We've accumulated more than one inode cluster buffer's worth of inodes. + * - There is less than 5% free space left. + * - Any of the quotas for this inode are near an enforcement limit. 
+ */ +static inline bool +xfs_inodegc_want_queue_work( + struct xfs_inode *ip, + unsigned int items) +{ + struct xfs_mount *mp = ip->i_mount; + + if (items > mp->m_ino_geo.inodes_per_cluster) + return true; + + if (__percpu_counter_compare(&mp->m_fdblocks, + mp->m_low_space[XFS_LOWSP_5_PCNT], + XFS_FDBLOCKS_BATCH) < 0) + return true; + + if (xfs_inodegc_want_queue_rt_file(ip)) + return true; + + if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER)) + return true; + + if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP)) + return true; + + if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ)) + return true; + + return false; +} + +/* + * Upper bound on the number of inodes in each AG that can be queued for + * inactivation at any given time, to avoid monopolizing the workqueue. + */ +#define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK) + +/* + * Make the frontend wait for inactivations when: + * + * - Memory shrinkers queued the inactivation worker and it hasn't finished. + * - The queue depth exceeds the maximum allowable percpu backlog. + * + * Note: If the current thread is running a transaction, we don't ever want to + * wait for other transactions because that could introduce a deadlock. + */ +static inline bool +xfs_inodegc_want_flush_work( + struct xfs_inode *ip, + unsigned int items, + unsigned int shrinker_hits) +{ + if (current->journal_info) + return false; + + if (shrinker_hits > 0) + return true; + + if (items > XFS_INODEGC_MAX_BACKLOG) + return true; + + return false; +} + +/* + * Queue a background inactivation worker if there are inodes that need to be + * inactivated and higher level xfs code hasn't disabled the background + * workers. 
+ */ +static void +xfs_inodegc_queue( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_inodegc *gc; + int items; + unsigned int shrinker_hits; + + trace_xfs_inode_set_need_inactive(ip); + spin_lock(&ip->i_flags_lock); + ip->i_flags |= XFS_NEED_INACTIVE; + spin_unlock(&ip->i_flags_lock); + + gc = get_cpu_ptr(mp->m_inodegc); + llist_add(&ip->i_gclist, &gc->list); + items = READ_ONCE(gc->items); + WRITE_ONCE(gc->items, items + 1); + shrinker_hits = READ_ONCE(gc->shrinker_hits); + put_cpu_ptr(gc); + + if (!xfs_is_inodegc_enabled(mp)) + return; + + if (xfs_inodegc_want_queue_work(ip, items)) { + trace_xfs_inodegc_queue(mp, __return_address); + queue_work(mp->m_inodegc_wq, &gc->work); + } + + if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { + trace_xfs_inodegc_throttle(mp, __return_address); + flush_work(&gc->work); + } +} + +/* + * Fold the dead CPU inodegc queue into the current CPUs queue. + */ +void +xfs_inodegc_cpu_dead( + struct xfs_mount *mp, + unsigned int dead_cpu) +{ + struct xfs_inodegc *dead_gc, *gc; + struct llist_node *first, *last; + unsigned int count = 0; + + dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); + cancel_work_sync(&dead_gc->work); + + if (llist_empty(&dead_gc->list)) + return; + + first = dead_gc->list.first; + last = first; + while (last->next) { + last = last->next; + count++; + } + dead_gc->list.first = NULL; + dead_gc->items = 0; + + /* Add pending work to current CPU */ + gc = get_cpu_ptr(mp->m_inodegc); + llist_add_batch(first, last, &gc->list); + count += READ_ONCE(gc->items); + WRITE_ONCE(gc->items, count); + put_cpu_ptr(gc); + + if (xfs_is_inodegc_enabled(mp)) { + trace_xfs_inodegc_queue(mp, __return_address); + queue_work(mp->m_inodegc_wq, &gc->work); + } +} + +/* + * We set the inode flag atomically with the radix tree tag. Once we get tag + * lookups on the radix tree, this inode flag can go away. 
+ * + * We always use background reclaim here because even if the inode is clean, it + * still may be under IO and hence we have wait for IO completion to occur + * before we can reclaim the inode. The background reclaim path handles this + * more efficiently than we can here, so simply let background reclaim tear down + * all inodes. + */ +void +xfs_inode_mark_reclaimable( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + bool need_inactive; + + XFS_STATS_INC(mp, vn_reclaim); + + /* + * We should never get here with any of the reclaim flags already set. + */ + ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS)); + + need_inactive = xfs_inode_needs_inactive(ip); + if (need_inactive) { + xfs_inodegc_queue(ip); + return; + } + + /* Going straight to reclaim, so drop the dquots. */ + xfs_qm_dqdetach(ip); + xfs_inodegc_set_reclaimable(ip); +} + +/* + * Register a phony shrinker so that we can run background inodegc sooner when + * there's memory pressure. Inactivation does not itself free any memory but + * it does make inodes reclaimable, which eventually frees memory. + * + * The count function, seek value, and batch value are crafted to trigger the + * scan function during the second round of scanning. Hopefully this means + * that we reclaimed enough memory that initiating metadata transactions won't + * make things worse. 
+ */ +#define XFS_INODEGC_SHRINKER_COUNT (1UL << DEF_PRIORITY) +#define XFS_INODEGC_SHRINKER_BATCH ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1) + +static unsigned long +xfs_inodegc_shrinker_count( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_mount *mp = container_of(shrink, struct xfs_mount, + m_inodegc_shrinker); + struct xfs_inodegc *gc; + int cpu; + + if (!xfs_is_inodegc_enabled(mp)) + return 0; + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) + return XFS_INODEGC_SHRINKER_COUNT; + } + + return 0; +} + +static unsigned long +xfs_inodegc_shrinker_scan( + struct shrinker *shrink, + struct shrink_control *sc) +{ + struct xfs_mount *mp = container_of(shrink, struct xfs_mount, + m_inodegc_shrinker); + struct xfs_inodegc *gc; + int cpu; + bool no_items = true; + + if (!xfs_is_inodegc_enabled(mp)) + return SHRINK_STOP; + + trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address); + + for_each_online_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + if (!llist_empty(&gc->list)) { + unsigned int h = READ_ONCE(gc->shrinker_hits); + + WRITE_ONCE(gc->shrinker_hits, h + 1); + queue_work_on(cpu, mp->m_inodegc_wq, &gc->work); + no_items = false; + } + } + + /* + * If there are no inodes to inactivate, we don't want the shrinker + * to think there's deferred work to call us back about. + */ + if (no_items) + return LONG_MAX; + + return SHRINK_STOP; +} + +/* Register a shrinker so we can accelerate inodegc and throttle queuing. 
*/ +int +xfs_inodegc_register_shrinker( + struct xfs_mount *mp) +{ + struct shrinker *shrink = &mp->m_inodegc_shrinker; + + shrink->count_objects = xfs_inodegc_shrinker_count; + shrink->scan_objects = xfs_inodegc_shrinker_scan; + shrink->seeks = 0; + shrink->flags = SHRINKER_NONSLAB; + shrink->batch = XFS_INODEGC_SHRINKER_BATCH; + + return register_shrinker(shrink); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index c751cc32dc46..2e4cfddf8b8e 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -59,6 +59,7 @@ int xfs_blockgc_free_dquots(struct xfs_mount *mp, struct xfs_dquot *udqp, unsigned int iwalk_flags); int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm); +void xfs_blockgc_flush_all(struct xfs_mount *mp); void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); @@ -68,16 +69,17 @@ void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip); void xfs_blockgc_worker(struct work_struct *work); -#ifdef CONFIG_XFS_QUOTA -int xfs_dqrele_all_inodes(struct xfs_mount *mp, unsigned int qflags); -#else -# define xfs_dqrele_all_inodes(mp, qflags) (0) -#endif - int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); void xfs_blockgc_stop(struct xfs_mount *mp); void xfs_blockgc_start(struct xfs_mount *mp); +void xfs_inodegc_worker(struct work_struct *work); +void xfs_inodegc_flush(struct xfs_mount *mp); +void xfs_inodegc_stop(struct xfs_mount *mp); +void xfs_inodegc_start(struct xfs_mount *mp); +void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); +int xfs_inodegc_register_shrinker(struct xfs_mount *mp); + #endif diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 9b3994b9c716..017904a34c02 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -201,7 +201,7 @@ xlog_recover_icreate_commit_pass2( if 
(length != igeo->ialloc_blks && length != igeo->ialloc_min_blks) { xfs_warn(log->l_mp, - "%s: unsupported chunk length", __FUNCTION__); + "%s: unsupported chunk length", __func__); return -EINVAL; } @@ -209,7 +209,7 @@ xlog_recover_icreate_commit_pass2( if ((count >> mp->m_sb.sb_inopblog) != length) { xfs_warn(log->l_mp, "%s: inconsistent inode count and chunk length", - __FUNCTION__); + __func__); return -EINVAL; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index f00145e1a976..a4f6f034fb81 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -674,7 +674,7 @@ xfs_lookup( trace_xfs_lookup(dp, name); - if (XFS_FORCED_SHUTDOWN(dp->i_mount)) + if (xfs_is_shutdown(dp->i_mount)) return -EIO; error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); @@ -716,7 +716,7 @@ xfs_inode_inherit_flags( di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(mode)) { if ((pip->i_diflags & XFS_DIFLAG_RTINHERIT) && - xfs_sb_version_hasrealtime(&ip->i_mount->m_sb)) + xfs_has_realtime(ip->i_mount)) di_flags |= XFS_DIFLAG_REALTIME; if (pip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) { di_flags |= XFS_DIFLAG_EXTSIZE; @@ -837,8 +837,7 @@ xfs_init_new_inode( inode->i_rdev = rdev; ip->i_projid = prid; - if (dir && !(dir->i_mode & S_ISGID) && - (mp->m_flags & XFS_MOUNT_GRPID)) { + if (dir && !(dir->i_mode & S_ISGID) && xfs_has_grpid(mp)) { inode_fsuid_set(inode, mnt_userns); inode->i_gid = dir->i_gid; inode->i_mode = mode; @@ -868,7 +867,7 @@ xfs_init_new_inode( ip->i_extsize = 0; ip->i_diflags = 0; - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { inode_set_iversion(inode, 1); ip->i_cowextsize = 0; ip->i_crtime = tv; @@ -908,7 +907,7 @@ xfs_init_new_inode( * this saves us from needing to run a separate transaction to set the * fork offset in the immediate future. 
*/ - if (init_xattrs && xfs_sb_version_hasattr(&mp->m_sb)) { + if (init_xattrs && xfs_has_attr(mp)) { ip->i_forkoff = xfs_default_attroffset(ip) >> 3; ip->i_afp = xfs_ifork_alloc(XFS_DINODE_FMT_EXTENTS, 0); } @@ -987,7 +986,7 @@ xfs_create( trace_xfs_create(dp, name); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; prid = xfs_get_initial_prid(dp); @@ -1079,7 +1078,7 @@ xfs_create( * create transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); /* @@ -1141,7 +1140,7 @@ xfs_create_tmpfile( uint resblks; xfs_ino_t ino; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; prid = xfs_get_initial_prid(dp); @@ -1171,7 +1170,7 @@ xfs_create_tmpfile( if (error) goto out_trans_cancel; - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); /* @@ -1231,7 +1230,7 @@ xfs_link( ASSERT(!S_ISDIR(VFS_I(sip)->i_mode)); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_qm_dqattach(sip); @@ -1305,7 +1304,7 @@ xfs_link( * link transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); @@ -1446,10 +1445,10 @@ xfs_release( return 0; /* If this is a read-only mount, don't do this (would generate I/O) */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) return 0; - if (!XFS_FORCED_SHUTDOWN(mp)) { + if (!xfs_is_shutdown(mp)) { int truncated; /* @@ -1532,7 +1531,7 @@ xfs_inactive_truncate( error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1603,7 +1602,7 @@ xfs_inactive_ifree( "Failed to remove inode(s) from unlinked list. 
" "Please free space, unmount and run xfs_repair."); } else { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); } return error; } @@ -1639,7 +1638,7 @@ xfs_inactive_ifree( * might do that, we need to make sure. Otherwise the * inode might be lost for a long time or forever. */ - if (!XFS_FORCED_SHUTDOWN(mp)) { + if (!xfs_is_shutdown(mp)) { xfs_notice(mp, "%s: xfs_ifree returned error %d", __func__, error); xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); @@ -1666,6 +1665,59 @@ xfs_inactive_ifree( } /* + * Returns true if we need to update the on-disk metadata before we can free + * the memory used by this inode. Updates include freeing post-eof + * preallocations; freeing COW staging extents; and marking the inode free in + * the inobt if it is on the unlinked list. + */ +bool +xfs_inode_needs_inactive( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *cow_ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + + /* + * If the inode is already free, then there can be nothing + * to clean up here. + */ + if (VFS_I(ip)->i_mode == 0) + return false; + + /* If this is a read-only mount, don't do this (would generate I/O) */ + if (xfs_is_readonly(mp)) + return false; + + /* If the log isn't running, push inodes straight to reclaim. */ + if (xfs_is_shutdown(mp) || xfs_has_norecovery(mp)) + return false; + + /* Metadata inodes require explicit resource cleanup. */ + if (xfs_is_metadata_inode(ip)) + return false; + + /* Want to clean out the cow blocks if there are any. */ + if (cow_ifp && cow_ifp->if_bytes > 0) + return true; + + /* Unlinked files must be freed. */ + if (VFS_I(ip)->i_nlink == 0) + return true; + + /* + * This file isn't being freed, so check if there are post-eof blocks + * to free. @force is true because we are evicting an inode from the + * cache. Post-eof blocks must be freed, lest we end up with broken + * free space accounting. 
+ * + * Note: don't bother with iolock here since lockdep complains about + * acquiring it in reclaim context. We have the only reference to the + * inode at this point anyways. + */ + return xfs_can_free_eofblocks(ip, true); +} + +/* * xfs_inactive * * This is called when the vnode reference count for the vnode @@ -1694,7 +1746,7 @@ xfs_inactive( ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); /* If this is a read-only mount, don't do this (would generate I/O) */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) goto out; /* Metadata inodes require explicit resource cleanup. */ @@ -1969,7 +2021,7 @@ xfs_iunlink_destroy( rhashtable_free_and_destroy(&pag->pagi_unlinked_hash, xfs_iunlink_free_item, &freed_anything); - ASSERT(freed_anything == false || XFS_FORCED_SHUTDOWN(pag->pag_mount)); + ASSERT(freed_anything == false || xfs_is_shutdown(pag->pag_mount)); } /* @@ -2714,7 +2766,7 @@ xfs_remove( trace_xfs_remove(dp, name); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_qm_dqattach(dp); @@ -2813,7 +2865,7 @@ xfs_remove( * remove transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); @@ -2890,7 +2942,7 @@ xfs_finish_rename( * If this is a synchronous mount, make sure that the rename transaction * goes to disk before returning to the user. */ - if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + if (xfs_has_wsync(tp->t_mountp) || xfs_has_dirsync(tp->t_mountp)) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); @@ -3473,7 +3525,7 @@ xfs_iflush( * happen but we need to still do it to ensure backwards compatibility * with old kernels that predate logging all inode changes. 
*/ - if (!xfs_sb_version_has_v3inode(&mp->m_sb)) + if (!xfs_has_v3inodes(mp)) ip->i_flushiter++; /* @@ -3495,7 +3547,7 @@ xfs_iflush( xfs_inode_to_disk(ip, dip, iip->ili_item.li_lsn); /* Wrap, we never let the log put out DI_MAX_FLUSH */ - if (!xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (!xfs_has_v3inodes(mp)) { if (ip->i_flushiter == DI_MAX_FLUSH) ip->i_flushiter = 0; } @@ -3614,7 +3666,7 @@ xfs_iflush_cluster( * AIL, leaving a dirty/unpinned inode attached to the buffer * that otherwise looks like it should be flushed. */ - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { xfs_iunpin_wait(ip); xfs_iflush_abort(ip); xfs_iunlock(ip, XFS_ILOCK_SHARED); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index e0ae905554e2..b21b177832d1 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -41,6 +41,7 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ atomic_t i_pincount; /* inode pin count */ + struct llist_node i_gclist; /* deferred inactivation list */ /* * Bitsets of inode metadata that have been checked and/or are sick. @@ -239,6 +240,7 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define __XFS_IPINNED_BIT 8 /* wakeup key for zero pin count */ #define XFS_IPINNED (1 << __XFS_IPINNED_BIT) #define XFS_IEOFBLOCKS (1 << 9) /* has the preallocblocks tag set */ +#define XFS_NEED_INACTIVE (1 << 10) /* see XFS_INACTIVATING below */ /* * If this unlinked inode is in the middle of recovery, don't let drop_inode * truncate and free the inode. This can happen if we iget the inode during @@ -248,13 +250,29 @@ static inline bool xfs_inode_has_bigtime(struct xfs_inode *ip) #define XFS_ICOWBLOCKS (1 << 12)/* has the cowblocks tag set */ /* + * If we need to update on-disk metadata before this IRECLAIMABLE inode can be + * freed, then NEED_INACTIVE will be set. Once we start the updates, the + * INACTIVATING bit will be set to keep iget away from this inode. 
After the + * inactivation completes, both flags will be cleared and the inode is a + * plain old IRECLAIMABLE inode. + */ +#define XFS_INACTIVATING (1 << 13) + +/* All inode state flags related to inode reclaim. */ +#define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ + XFS_IRECLAIM | \ + XFS_NEED_INACTIVE | \ + XFS_INACTIVATING) + +/* * Per-lifetime flags need to be reset when re-using a reclaimable inode during * inode lookup. This prevents unintended behaviour on the new inode from * ocurring. */ #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ - XFS_IDIRTY_RELEASE | XFS_ITRUNCATED) + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ + XFS_INACTIVATING) /* * Flags for inode locking. @@ -381,8 +399,7 @@ enum layout_break_reason { * new subdirectory gets S_ISGID bit from parent. */ #define XFS_INHERIT_GID(pip) \ - (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ - (VFS_I(pip)->i_mode & S_ISGID)) + (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID)) int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); @@ -492,6 +509,8 @@ extern struct kmem_zone *xfs_inode_zone; /* The default CoW extent size hint. 
*/ #define XFS_DEFAULT_COWEXTSZ_HINT 32 +bool xfs_inode_needs_inactive(struct xfs_inode *ip); + int xfs_iunlink_init(struct xfs_perag *pag); void xfs_iunlink_destroy(struct xfs_perag *pag); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 35de30849fcc..0659d19c211e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -396,7 +396,7 @@ xfs_inode_to_log_dinode( /* log a dummy value to ensure log structure is fully initialised */ to->di_next_unlinked = NULLAGINO; - if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) { + if (xfs_has_v3inodes(ip->i_mount)) { to->di_version = 3; to->di_changecount = inode_peek_iversion(inode); to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index e0072a6cd2d3..239dd2e3384e 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -295,7 +295,7 @@ xlog_recover_inode_commit_pass2( * superblock flag to determine whether we need to look at di_flushiter * to skip replay when the on disk inode is newer than the log one */ - if (!xfs_sb_version_has_v3inode(&mp->m_sb) && + if (!xfs_has_v3inodes(mp) && ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 16039ea10ac9..0c795dc093ef 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -756,7 +756,7 @@ xfs_ioc_fsbulkstat( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (copy_from_user(&bulkreq, arg, sizeof(struct xfs_fsop_bulkreq))) @@ -927,7 +927,7 @@ xfs_ioc_bulkstat( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) @@ -977,7 +977,7 @@ xfs_ioc_inumbers( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (XFS_FORCED_SHUTDOWN(mp)) + if 
(xfs_is_shutdown(mp)) return -EIO; if (copy_from_user(&hdr, &arg->hdr, sizeof(hdr))) @@ -1010,7 +1010,7 @@ xfs_ioc_fsgeometry( struct xfs_fsop_geom fsgeo; size_t len; - xfs_fs_geometry(&mp->m_sb, &fsgeo, struct_version); + xfs_fs_geometry(mp, &fsgeo, struct_version); if (struct_version <= 3) len = sizeof(struct xfs_fsop_geom_v1); @@ -1213,7 +1213,7 @@ xfs_ioctl_setattr_xflags( /* diflags2 only valid for v3 inodes. */ i_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); - if (i_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb)) + if (i_flags2 && !xfs_has_v3inodes(mp)) return -EINVAL; ip->i_diflags = xfs_flags2diflags(ip, fa->fsx_xflags); @@ -1237,8 +1237,7 @@ xfs_ioctl_setattr_prepare_dax( if (S_ISDIR(inode->i_mode)) return; - if ((mp->m_flags & XFS_MOUNT_DAX_ALWAYS) || - (mp->m_flags & XFS_MOUNT_DAX_NEVER)) + if (xfs_has_dax_always(mp) || xfs_has_dax_never(mp)) return; if (((fa->fsx_xflags & FS_XFLAG_DAX) && @@ -1263,10 +1262,10 @@ xfs_ioctl_setattr_get_trans( struct xfs_trans *tp; int error = -EROFS; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) goto out_error; error = -EIO; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) goto out_error; error = xfs_trans_alloc_ichange(ip, NULL, NULL, pdqp, @@ -1274,7 +1273,7 @@ xfs_ioctl_setattr_get_trans( if (error) goto out_error; - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); return tp; @@ -1362,9 +1361,9 @@ xfs_ioctl_setattr_check_projid( if (!fa->fsx_valid) return 0; - /* Disallow 32bit project ids if projid32bit feature is not enabled. */ + /* Disallow 32bit project ids if 32bit IDs are not enabled. 
*/ if (fa->fsx_projid > (uint16_t)-1 && - !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) + !xfs_has_projid32(ip->i_mount)) return -EINVAL; return 0; } @@ -1450,7 +1449,7 @@ xfs_fileattr_set( /* Change the ownerships and register project quota modifications */ if (ip->i_projid != fa->fsx_projid) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { + if (XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_pdquot, pdqp); } @@ -1467,7 +1466,7 @@ xfs_fileattr_set( else ip->i_extsize = 0; - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) ip->i_cowextsize = XFS_B_TO_FSB(mp, fa->fsx_cowextsize); else @@ -1792,7 +1791,7 @@ xfs_ioc_swapext( goto out_put_tmp_file; } - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (xfs_is_shutdown(ip->i_mount)) { error = -EIO; goto out_put_tmp_file; } @@ -2081,7 +2080,7 @@ xfs_file_ioctl( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) return -EROFS; if (copy_from_user(&inout, arg, sizeof(inout))) @@ -2198,7 +2197,7 @@ xfs_file_ioctl( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) return -EROFS; if (copy_from_user(&eofb, arg, sizeof(eofb))) diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index e6506773ba55..8783af203cfc 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -50,7 +50,7 @@ xfs_compat_ioc_fsgeometry_v1( { struct xfs_fsop_geom fsgeo; - xfs_fs_geometry(&mp->m_sb, &fsgeo, 3); + xfs_fs_geometry(mp, &fsgeo, 3); /* The 32-bit variant simply has some padding at the end */ if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1))) return -EFAULT; @@ -254,7 +254,7 @@ xfs_compat_ioc_fsbulkstat( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; if (get_user(addr, &p32->lastip)) diff --git a/fs/xfs/xfs_iomap.c 
b/fs/xfs/xfs_iomap.c index d8cd2583dedb..093758440ad5 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -132,7 +132,7 @@ xfs_eof_alignment( * If mounted with the "-o swalloc" option the alignment is * increased from the strip unit size to the stripe width. */ - if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC)) + if (mp->m_swidth && xfs_has_swalloc(mp)) align = mp->m_swidth; else if (mp->m_dalign) align = mp->m_dalign; @@ -734,7 +734,7 @@ xfs_direct_write_iomap_begin( ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* @@ -874,7 +874,7 @@ xfs_buffered_write_iomap_begin( int allocfork = XFS_DATA_FORK; int error = 0; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* we can't use delayed allocations when using extent size hints */ @@ -994,7 +994,7 @@ xfs_buffered_write_iomap_begin( * Determine the initial size of the preallocation. * We clean up any extra preallocation when the file is closed. 
*/ - if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + if (xfs_has_allocsize(mp)) prealloc_blocks = mp->m_allocsize_blocks; else prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, @@ -1064,11 +1064,11 @@ found_cow: error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0); if (error) return error; - } else { - xfs_trim_extent(&cmap, offset_fsb, - imap.br_startoff - offset_fsb); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); } - return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED); + + xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1127,7 +1127,7 @@ xfs_buffered_write_iomap_end( error = xfs_bmap_punch_delalloc_range(ip, start_fsb, end_fsb - start_fsb); - if (error && !XFS_FORCED_SHUTDOWN(mp)) { + if (error && !xfs_is_shutdown(mp)) { xfs_alert(mp, "%s: unable to clean up ino %lld", __func__, ip->i_ino); return error; @@ -1162,7 +1162,7 @@ xfs_read_iomap_begin( ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; error = xfs_ilock_for_iomap(ip, flags, &lockmode); @@ -1203,7 +1203,7 @@ xfs_seek_iomap_begin( int error = 0; unsigned lockmode; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; lockmode = xfs_ilock_data_map_shared(ip); @@ -1285,7 +1285,7 @@ xfs_xattr_iomap_begin( int nimaps = 1, error = 0; unsigned lockmode; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; lockmode = xfs_ilock_attr_map_shared(ip); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 93c082db04b7..a607d6aca5c4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -393,7 +393,7 @@ xfs_vn_unlink( * but still hashed. This is incompatible with case-insensitive * mode, so invalidate (unhash) the dentry in CI-mode. 
*/ - if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb)) + if (xfs_has_asciici(XFS_M(dir->i_sb))) d_invalidate(dentry); return 0; } @@ -558,10 +558,10 @@ xfs_stat_blksize( * default buffered I/O size, return that, otherwise return the compat * default. */ - if (mp->m_flags & XFS_MOUNT_LARGEIO) { + if (xfs_has_large_iosize(mp)) { if (mp->m_swidth) return XFS_FSB_TO_B(mp, mp->m_swidth); - if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + if (xfs_has_allocsize(mp)) return 1U << mp->m_allocsize_log; } @@ -582,7 +582,7 @@ xfs_vn_getattr( trace_xfs_getattr(ip); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; stat->size = XFS_ISIZE(ip); @@ -597,7 +597,7 @@ xfs_vn_getattr( stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; stat->btime = ip->i_crtime; @@ -673,10 +673,10 @@ xfs_vn_change_ok( { struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount; - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (xfs_is_readonly(mp)) return -EROFS; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; return setattr_prepare(mnt_userns, dentry, iattr); @@ -778,7 +778,7 @@ xfs_setattr_nonsize( * in the transaction. 
*/ if (!uid_eq(iuid, uid)) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) { + if (XFS_IS_UQUOTA_ON(mp)) { ASSERT(mask & ATTR_UID); ASSERT(udqp); olddquot1 = xfs_qm_vop_chown(tp, ip, @@ -787,8 +787,8 @@ xfs_setattr_nonsize( inode->i_uid = uid; } if (!gid_eq(igid, gid)) { - if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) { - ASSERT(xfs_sb_version_has_pquotino(&mp->m_sb) || + if (XFS_IS_GQUOTA_ON(mp)) { + ASSERT(xfs_has_pquotino(mp) || !XFS_IS_PQUOTA_ON(mp)); ASSERT(mask & ATTR_GID); ASSERT(gdqp); @@ -808,7 +808,7 @@ xfs_setattr_nonsize( XFS_STATS_INC(mp, xs_ig_attrchg); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); @@ -1037,7 +1037,7 @@ xfs_setattr_size( XFS_STATS_INC(mp, xs_ig_attrchg); - if (mp->m_flags & XFS_MOUNT_WSYNC) + if (xfs_has_wsync(mp)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); @@ -1287,11 +1287,11 @@ xfs_inode_should_enable_dax( { if (!IS_ENABLED(CONFIG_FS_DAX)) return false; - if (ip->i_mount->m_flags & XFS_MOUNT_DAX_NEVER) + if (xfs_has_dax_never(ip->i_mount)) return false; if (!xfs_inode_supports_dax(ip)) return false; - if (ip->i_mount->m_flags & XFS_MOUNT_DAX_ALWAYS) + if (xfs_has_dax_always(ip->i_mount)) return true; if (ip->i_diflags2 & XFS_DIFLAG2_DAX) return true; @@ -1344,7 +1344,7 @@ xfs_setup_inode( gfp_t gfp_mask; inode->i_ino = ip->i_ino; - inode->i_state = I_NEW; + inode->i_state |= I_NEW; inode_sb_list_add(inode); /* make the inode look hashed for the writeback code */ @@ -1401,7 +1401,7 @@ xfs_setup_iops( inode->i_mapping->a_ops = &xfs_address_space_operations; break; case S_IFDIR: - if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) + if (xfs_has_asciici(XFS_M(inode->i_sb))) inode->i_op = &xfs_dir_ci_inode_operations; else inode->i_op = &xfs_dir_inode_operations; diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f331975a16de..c08c79d9e311 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -19,6 +19,7 @@ 
#include "xfs_error.h" #include "xfs_icache.h" #include "xfs_health.h" +#include "xfs_trans.h" /* * Bulk Stat @@ -107,7 +108,7 @@ xfs_bulkstat_one_int( buf->bs_forkoff = XFS_IFORK_BOFF(ip); buf->bs_version = XFS_BULKSTAT_VERSION_V5; - if (xfs_sb_version_has_v3inode(&mp->m_sb)) { + if (xfs_has_v3inodes(mp)) { buf->bs_btime = ip->i_crtime.tv_sec; buf->bs_btime_nsec = ip->i_crtime.tv_nsec; if (ip->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) @@ -163,6 +164,7 @@ xfs_bulkstat_one( .formatter = formatter, .breq = breq, }; + struct xfs_trans *tp; int error; if (breq->mnt_userns != &init_user_ns) { @@ -178,9 +180,18 @@ xfs_bulkstat_one( if (!bc.buf) return -ENOMEM; - error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, NULL, - breq->startino, &bc); + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(breq->mp, &tp); + if (error) + goto out; + error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, tp, + breq->startino, &bc); + xfs_trans_cancel(tp); +out: kmem_free(bc.buf); /* @@ -244,6 +255,7 @@ xfs_bulkstat( .formatter = formatter, .breq = breq, }; + struct xfs_trans *tp; int error; if (breq->mnt_userns != &init_user_ns) { @@ -259,9 +271,18 @@ xfs_bulkstat( if (!bc.buf) return -ENOMEM; - error = xfs_iwalk(breq->mp, NULL, breq->startino, breq->flags, - xfs_bulkstat_iwalk, breq->icount, &bc); + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. 
+ */ + error = xfs_trans_alloc_empty(breq->mp, &tp); + if (error) + goto out; + error = xfs_iwalk(breq->mp, tp, breq->startino, breq->flags, + xfs_bulkstat_iwalk, breq->icount, &bc); + xfs_trans_cancel(tp); +out: kmem_free(bc.buf); /* @@ -374,13 +395,24 @@ xfs_inumbers( .formatter = formatter, .breq = breq, }; + struct xfs_trans *tp; int error = 0; if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; - error = xfs_inobt_walk(breq->mp, NULL, breq->startino, breq->flags, + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. + */ + error = xfs_trans_alloc_empty(breq->mp, &tp); + if (error) + goto out; + + error = xfs_inobt_walk(breq->mp, tp, breq->startino, breq->flags, xfs_inumbers_walk, breq->icount, &ic); + xfs_trans_cancel(tp); +out: /* * We found some inode groups, so clear the error status and return diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c index 917d51eefee3..7558486f4937 100644 --- a/fs/xfs/xfs_iwalk.c +++ b/fs/xfs/xfs_iwalk.c @@ -83,6 +83,9 @@ struct xfs_iwalk_ag { /* Skip empty inobt records? */ unsigned int skip_empty:1; + + /* Drop the (hopefully empty) transaction when calling iwalk_fn. */ + unsigned int drop_trans:1; }; /* @@ -352,7 +355,6 @@ xfs_iwalk_run_callbacks( int *has_more) { struct xfs_mount *mp = iwag->mp; - struct xfs_trans *tp = iwag->tp; struct xfs_inobt_rec_incore *irec; xfs_agino_t next_agino; int error; @@ -362,10 +364,15 @@ xfs_iwalk_run_callbacks( ASSERT(iwag->nr_recs > 0); /* Delete cursor but remember the last record we cached... 
*/ - xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0); + xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0); irec = &iwag->recs[iwag->nr_recs - 1]; ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK); + if (iwag->drop_trans) { + xfs_trans_cancel(iwag->tp); + iwag->tp = NULL; + } + error = xfs_iwalk_ag_recs(iwag); if (error) return error; @@ -376,8 +383,15 @@ xfs_iwalk_run_callbacks( if (!has_more) return 0; + if (iwag->drop_trans) { + error = xfs_trans_alloc_empty(mp, &iwag->tp); + if (error) + return error; + } + /* ...and recreate the cursor just past where we left off. */ - error = xfs_inobt_cur(mp, tp, iwag->pag, XFS_BTNUM_INO, curpp, agi_bpp); + error = xfs_inobt_cur(mp, iwag->tp, iwag->pag, XFS_BTNUM_INO, curpp, + agi_bpp); if (error) return error; @@ -390,7 +404,6 @@ xfs_iwalk_ag( struct xfs_iwalk_ag *iwag) { struct xfs_mount *mp = iwag->mp; - struct xfs_trans *tp = iwag->tp; struct xfs_perag *pag = iwag->pag; struct xfs_buf *agi_bp = NULL; struct xfs_btree_cur *cur = NULL; @@ -469,7 +482,7 @@ xfs_iwalk_ag( error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more); out: - xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error); + xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error); return error; } @@ -599,8 +612,18 @@ xfs_iwalk_ag_work( error = xfs_iwalk_alloc(iwag); if (error) goto out; + /* + * Grab an empty transaction so that we can use its recursive buffer + * locking abilities to detect cycles in the inobt without deadlocking. 
+ */ + error = xfs_trans_alloc_empty(mp, &iwag->tp); + if (error) + goto out; + iwag->drop_trans = 1; error = xfs_iwalk_ag(iwag); + if (iwag->tp) + xfs_trans_cancel(iwag->tp); xfs_iwalk_free(iwag); out: xfs_perag_put(iwag->pag); diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 60ac5fd63f1e..f6cd2d4aa770 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -41,6 +41,8 @@ xlog_dealloc_log( /* local state machine functions */ STATIC void xlog_state_done_syncing( struct xlog_in_core *iclog); +STATIC void xlog_state_do_callback( + struct xlog *log); STATIC int xlog_state_get_iclog_space( struct xlog *log, @@ -50,11 +52,6 @@ xlog_state_get_iclog_space( int *continued_write, int *logoffsetp); STATIC void -xlog_state_switch_iclogs( - struct xlog *log, - struct xlog_in_core *iclog, - int eventual_size); -STATIC void xlog_grant_push_ail( struct xlog *log, int need_bytes); @@ -246,7 +243,7 @@ xlog_grant_head_wait( list_add_tail(&tic->t_queue, &head->waiters); do { - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto shutdown; xlog_grant_push_ail(log, need_bytes); @@ -260,7 +257,7 @@ xlog_grant_head_wait( trace_xfs_log_grant_wake(log, tic); spin_lock(&head->lock); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto shutdown; } while (xlog_space_left(log, &head->grant) < need_bytes); @@ -298,7 +295,7 @@ xlog_grant_head_check( int free_bytes; int error = 0; - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); /* * If there are other waiters on the queue then give them a chance at @@ -359,13 +356,13 @@ xfs_log_writable( * mounts allow internal writes for log recovery and unmount purposes, * so don't restrict that case. 
*/ - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) return false; if (xfs_readonly_buftarg(mp->m_ddev_targp)) return false; if (xfs_readonly_buftarg(mp->m_log->l_targ)) return false; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xlog_is_shutdown(mp->m_log)) return false; return true; } @@ -382,7 +379,7 @@ xfs_log_regrant( int need_bytes; int error = 0; - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); @@ -450,7 +447,7 @@ xfs_log_reserve( ASSERT(client == XFS_TRANSACTION || client == XFS_LOG); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; XFS_STATS_INC(mp, xs_try_logspace); @@ -487,6 +484,42 @@ out_error: } /* + * Run all the pending iclog callbacks and wake log force waiters and iclog + * space waiters so they can process the newly set shutdown state. We really + * don't care what order we process callbacks here because the log is shut down + * and so state cannot change on disk anymore. + * + * We avoid processing actively referenced iclogs so that we don't run callbacks + * while the iclog owner might still be preparing the iclog for IO submssion. + * These will be caught by xlog_state_iclog_release() and call this function + * again to process any callbacks that may have been added to that iclog. + */ +static void +xlog_state_shutdown_callbacks( + struct xlog *log) +{ + struct xlog_in_core *iclog; + LIST_HEAD(cb_list); + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + do { + if (atomic_read(&iclog->ic_refcnt)) { + /* Reference holder will re-run iclog callbacks. 
*/ + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); + wake_up_all(&iclog->ic_write_wait); + wake_up_all(&iclog->ic_force_wait); + } while ((iclog = iclog->ic_next) != log->l_iclog); + + wake_up_all(&log->l_flush_wait); + spin_unlock(&log->l_icloglock); + + xlog_cil_process_committed(&cb_list); +} + +/* * Flush iclog to disk if this is the last reference to the given iclog and the * it is in the WANT_SYNC state. * @@ -520,12 +553,11 @@ xlog_state_release_iclog( xfs_lsn_t old_tail_lsn) { xfs_lsn_t tail_lsn; + bool last_ref; + lockdep_assert_held(&log->l_icloglock); trace_xlog_iclog_release(iclog, _RET_IP_); - if (iclog->ic_state == XLOG_STATE_IOERROR) - return -EIO; - /* * Grabbing the current log tail needs to be atomic w.r.t. the writing * of the tail LSN into the iclog so we guarantee that the log tail does @@ -543,7 +575,23 @@ xlog_state_release_iclog( iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); } - if (!atomic_dec_and_test(&iclog->ic_refcnt)) + last_ref = atomic_dec_and_test(&iclog->ic_refcnt); + + if (xlog_is_shutdown(log)) { + /* + * If there are no more references to this iclog, process the + * pending iclog callbacks that were waiting on the release of + * this iclog. + */ + if (last_ref) { + spin_unlock(&log->l_icloglock); + xlog_state_shutdown_callbacks(log); + spin_lock(&log->l_icloglock); + } + return -EIO; + } + + if (!last_ref) return 0; if (iclog->ic_state != XLOG_STATE_WANT_SYNC) { @@ -580,25 +628,27 @@ xfs_log_mount( xfs_daddr_t blk_offset, int num_bblks) { - bool fatal = xfs_sb_version_hascrc(&mp->m_sb); + struct xlog *log; + bool fatal = xfs_has_crc(mp); int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + if (!xfs_has_norecovery(mp)) { xfs_notice(mp, "Mounting V%d Filesystem", XFS_SB_VERSION_NUM(&mp->m_sb)); } else { xfs_notice(mp, "Mounting V%d filesystem in no-recovery mode. 
Filesystem will be inconsistent.", XFS_SB_VERSION_NUM(&mp->m_sb)); - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + ASSERT(xfs_is_readonly(mp)); } - mp->m_log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); - if (IS_ERR(mp->m_log)) { - error = PTR_ERR(mp->m_log); + log = xlog_alloc_log(mp, log_target, blk_offset, num_bblks); + if (IS_ERR(log)) { + error = PTR_ERR(log); goto out; } + mp->m_log = log; /* * Validate the given log space and drop a critical message via syslog @@ -663,51 +713,51 @@ xfs_log_mount( xfs_warn(mp, "AIL initialisation failed: error %d", error); goto out_free_log; } - mp->m_log->l_ailp = mp->m_ail; + log->l_ailp = mp->m_ail; /* * skip log recovery on a norecovery mount. pretend it all * just worked. */ - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { - int readonly = (mp->m_flags & XFS_MOUNT_RDONLY); - - if (readonly) - mp->m_flags &= ~XFS_MOUNT_RDONLY; - - error = xlog_recover(mp->m_log); - + if (!xfs_has_norecovery(mp)) { + /* + * log recovery ignores readonly state and so we need to clear + * mount-based read only state so it can write to disk. + */ + bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, + &mp->m_opstate); + error = xlog_recover(log); if (readonly) - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); - xlog_recover_cancel(mp->m_log); + xlog_recover_cancel(log); goto out_destroy_ail; } } - error = xfs_sysfs_init(&mp->m_log->l_kobj, &xfs_log_ktype, &mp->m_kobj, + error = xfs_sysfs_init(&log->l_kobj, &xfs_log_ktype, &mp->m_kobj, "log"); if (error) goto out_destroy_ail; /* Normal transactions can now occur */ - mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); /* * Now the log has been fully initialised and we know were our * space grant counters are, we can initialise the permanent ticket * needed for delayed logging to work. 
*/ - xlog_cil_init_post_recovery(mp->m_log); + xlog_cil_init_post_recovery(log); return 0; out_destroy_ail: xfs_trans_ail_destroy(mp); out_free_log: - xlog_dealloc_log(mp->m_log); + xlog_dealloc_log(log); out: return error; } @@ -726,19 +776,22 @@ int xfs_log_mount_finish( struct xfs_mount *mp) { - int error = 0; - bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); - bool recovered = mp->m_log->l_flags & XLOG_RECOVERY_NEEDED; + struct xlog *log = mp->m_log; + bool readonly; + int error = 0; - if (mp->m_flags & XFS_MOUNT_NORECOVERY) { - ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); + if (xfs_has_norecovery(mp)) { + ASSERT(xfs_is_readonly(mp)); return 0; - } else if (readonly) { - /* Allow unlinked processing to proceed */ - mp->m_flags &= ~XFS_MOUNT_RDONLY; } /* + * log recovery ignores readonly state and so we need to clear + * mount-based read only state so it can write to disk. + */ + readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + + /* * During the second phase of log recovery, we need iget and * iput to behave like they do for an active filesystem. * xfs_fs_drop_inode needs to be able to prevent the deletion @@ -759,7 +812,8 @@ xfs_log_mount_finish( * mount failure occurs. */ mp->m_super->s_flags |= SB_ACTIVE; - error = xlog_recover_finish(mp->m_log); + if (xlog_recovery_needed(log)) + error = xlog_recover_finish(log); if (!error) xfs_log_work_queue(mp); mp->m_super->s_flags &= ~SB_ACTIVE; @@ -774,17 +828,24 @@ xfs_log_mount_finish( * Don't push in the error case because the AIL may have pending intents * that aren't removed until recovery is cancelled. */ - if (!error && recovered) { - xfs_log_force(mp, XFS_LOG_SYNC); - xfs_ail_push_all_sync(mp->m_ail); + if (xlog_recovery_needed(log)) { + if (!error) { + xfs_log_force(mp, XFS_LOG_SYNC); + xfs_ail_push_all_sync(mp->m_ail); + } + xfs_notice(mp, "Ending recovery (logdev: %s)", + mp->m_logname ? 
mp->m_logname : "internal"); + } else { + xfs_info(mp, "Ending clean mount"); } xfs_buftarg_drain(mp->m_ddev_targp); + clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); if (readonly) - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); /* Make sure the log is dead if we're returning failure. */ - ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR)); + ASSERT(!error || xlog_is_shutdown(log)); return error; } @@ -830,7 +891,7 @@ xlog_wait_on_iclog( struct xlog *log = iclog->ic_log; trace_xlog_iclog_wait_on(iclog, _RET_IP_); - if (!XLOG_FORCED_SHUTDOWN(log) && + if (!xlog_is_shutdown(log) && iclog->ic_state != XLOG_STATE_ACTIVE && iclog->ic_state != XLOG_STATE_DIRTY) { XFS_STATS_INC(log->l_mp, xs_log_force_sleep); @@ -839,7 +900,7 @@ xlog_wait_on_iclog( spin_unlock(&log->l_icloglock); } - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return -EIO; return 0; } @@ -870,7 +931,7 @@ xlog_write_unmount_record( /* account for space used by record data */ ticket->t_curr_res -= sizeof(ulf); - return xlog_write(log, &vec, ticket, NULL, NULL, XLOG_UNMOUNT_TRANS); + return xlog_write(log, NULL, &vec, ticket, XLOG_UNMOUNT_TRANS); } /* @@ -893,7 +954,7 @@ xlog_unmount_write( error = xlog_write_unmount_record(log, tic); /* * At this point, we're umounting anyway, so there's no point in - * transitioning log state to IOERROR. Just continue... + * transitioning log state to shutdown. Just continue... */ out_err: if (error) @@ -940,7 +1001,7 @@ xfs_log_unmount_write( xfs_log_force(mp, XFS_LOG_SYNC); - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return; /* @@ -972,6 +1033,20 @@ int xfs_log_quiesce( struct xfs_mount *mp) { + /* + * Clear log incompat features since we're quiescing the log. Report + * failures, though it's not fatal to have a higher log feature + * protection level than the log contents actually require. 
+ */ + if (xfs_clear_incompat_log_features(mp)) { + int error; + + error = xfs_sync_sb(mp, false); + if (error) + xfs_warn(mp, + "Failed to clear log incompat features on quiesce"); + } + cancel_delayed_work_sync(&mp->m_log->l_work); xfs_log_force(mp, XFS_LOG_SYNC); @@ -1049,11 +1124,11 @@ xfs_log_space_wake( struct xlog *log = mp->m_log; int free_bytes; - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) return; if (!list_empty_careful(&log->l_write_head.waiters)) { - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_write_head.lock); free_bytes = xlog_space_left(log, &log->l_write_head.grant); @@ -1062,7 +1137,7 @@ xfs_log_space_wake( } if (!list_empty_careful(&log->l_reserve_head.waiters)) { - ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY)); + ASSERT(!xlog_in_recovery(log)); spin_lock(&log->l_reserve_head.lock); free_bytes = xlog_space_left(log, &log->l_reserve_head.grant); @@ -1140,7 +1215,7 @@ xfs_log_cover( ASSERT((xlog_cil_empty(mp->m_log) && xlog_iclogs_empty(mp->m_log) && !xfs_ail_min_lsn(mp->m_log->l_ailp)) || - XFS_FORCED_SHUTDOWN(mp)); + xlog_is_shutdown(mp->m_log)); if (!xfs_log_writable(mp)) return 0; @@ -1157,7 +1232,7 @@ xfs_log_cover( * handles this for us. */ need_covered = xfs_log_need_covered(mp); - if (!need_covered && !xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (!need_covered && !xfs_has_lazysbcount(mp)) return 0; /* @@ -1230,16 +1305,18 @@ xlog_assign_tail_lsn( * wrap the tail, we should blow up. Rather than catch this case here, * we depend on other ASSERTions in other parts of the code. XXXmiken * - * This code also handles the case where the reservation head is behind - * the tail. The details of this case are described below, but the end - * result is that we return the size of the log as the amount of space left. + * If reservation head is behind the tail, we have a problem. Warn about it, + * but then treat it as if the log is empty. 
+ * + * If the log is shut down, the head and tail may be invalid or out of whack, so + * shortcut invalidity asserts in this case so that we don't trigger them + * falsely. */ STATIC int xlog_space_left( struct xlog *log, atomic64_t *head) { - int free_bytes; int tail_bytes; int tail_cycle; int head_cycle; @@ -1249,29 +1326,30 @@ xlog_space_left( xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_bytes); tail_bytes = BBTOB(tail_bytes); if (tail_cycle == head_cycle && head_bytes >= tail_bytes) - free_bytes = log->l_logsize - (head_bytes - tail_bytes); - else if (tail_cycle + 1 < head_cycle) + return log->l_logsize - (head_bytes - tail_bytes); + if (tail_cycle + 1 < head_cycle) return 0; - else if (tail_cycle < head_cycle) { + + /* Ignore potential inconsistency when shutdown. */ + if (xlog_is_shutdown(log)) + return log->l_logsize; + + if (tail_cycle < head_cycle) { ASSERT(tail_cycle == (head_cycle - 1)); - free_bytes = tail_bytes - head_bytes; - } else { - /* - * The reservation head is behind the tail. - * In this case we just want to return the size of the - * log as the amount of space left. - */ - xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); - xfs_alert(log->l_mp, - " tail_cycle = %d, tail_bytes = %d", - tail_cycle, tail_bytes); - xfs_alert(log->l_mp, - " GH cycle = %d, GH bytes = %d", - head_cycle, head_bytes); - ASSERT(0); - free_bytes = log->l_logsize; + return tail_bytes - head_bytes; } - return free_bytes; + + /* + * The reservation head is behind the tail. In this case we just want to + * return the size of the log as the amount of space left. + */ + xfs_alert(log->l_mp, "xlog_space_left: head behind tail"); + xfs_alert(log->l_mp, " tail_cycle = %d, tail_bytes = %d", + tail_cycle, tail_bytes); + xfs_alert(log->l_mp, " GH cycle = %d, GH bytes = %d", + head_cycle, head_bytes); + ASSERT(0); + return log->l_logsize; } @@ -1349,6 +1427,32 @@ xfs_log_work_queue( } /* + * Clear the log incompat flags if we have the opportunity. 
+ * + * This only happens if we're about to log the second dummy transaction as part + * of covering the log and we can get the log incompat feature usage lock. + */ +static inline void +xlog_clear_incompat( + struct xlog *log) +{ + struct xfs_mount *mp = log->l_mp; + + if (!xfs_sb_has_incompat_log_feature(&mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_ALL)) + return; + + if (log->l_covered_state != XLOG_STATE_COVER_DONE2) + return; + + if (!down_write_trylock(&log->l_incompat_users)) + return; + + xfs_clear_incompat_log_features(mp); + up_write(&log->l_incompat_users); +} + +/* * Every sync period we need to unpin all items in the AIL and push them to * disk. If there is nothing dirty, then we might need to cover the log to * indicate that the filesystem is idle. @@ -1374,6 +1478,7 @@ xfs_log_worker( * synchronously log the superblock instead to ensure the * superblock is immediately unpinned and can be written back. */ + xlog_clear_incompat(log); xfs_sync_sb(mp, true); } else xfs_log_force(mp, 0); @@ -1417,7 +1522,7 @@ xlog_alloc_log( log->l_logBBstart = blk_offset; log->l_logBBsize = num_bblks; log->l_covered_state = XLOG_STATE_COVER_IDLE; - log->l_flags |= XLOG_ACTIVE_RECOVERY; + set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); log->l_prev_block = -1; @@ -1426,7 +1531,7 @@ xlog_alloc_log( xlog_assign_atomic_lsn(&log->l_last_sync_lsn, 1, 0); log->l_curr_cycle = 1; /* 0 is bad since this is initial value */ - if (xfs_sb_version_haslogv2(&mp->m_sb) && mp->m_sb.sb_logsunit > 1) + if (xfs_has_logv2(mp) && mp->m_sb.sb_logsunit > 1) log->l_iclog_roundoff = mp->m_sb.sb_logsunit; else log->l_iclog_roundoff = BBSIZE; @@ -1435,7 +1540,7 @@ xlog_alloc_log( xlog_grant_head_init(&log->l_write_head); error = -EFSCORRUPTED; - if (xfs_sb_version_hassector(&mp->m_sb)) { + if (xfs_has_sector(mp)) { log2_size = mp->m_sb.sb_logsectlog; if (log2_size < BBSHIFT) { xfs_warn(mp, "Log sector size too small (0x%x < 0x%x)", @@ -1452,7 +1557,7 @@ 
xlog_alloc_log( /* for larger sector sizes, must have v2 or external log */ if (log2_size && log->l_logBBstart > 0 && - !xfs_sb_version_haslogv2(&mp->m_sb)) { + !xfs_has_logv2(mp)) { xfs_warn(mp, "log sector size (0x%x) invalid for configuration.", log2_size); @@ -1461,6 +1566,8 @@ xlog_alloc_log( } log->l_sectBBsize = 1 << log2_size; + init_rwsem(&log->l_incompat_users); + xlog_get_iclog_buffer_size(mp, log); spin_lock_init(&log->l_icloglock); @@ -1476,7 +1583,6 @@ xlog_alloc_log( */ ASSERT(log->l_iclog_size >= 4096); for (i = 0; i < log->l_iclog_bufs; i++) { - int align_mask = xfs_buftarg_dma_alignment(mp->m_logdev_targp); size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) * sizeof(struct bio_vec); @@ -1488,8 +1594,8 @@ xlog_alloc_log( iclog->ic_prev = prev_iclog; prev_iclog = iclog; - iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, - KM_MAYFAIL | KM_ZERO); + iclog->ic_data = kvzalloc(log->l_iclog_size, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG @@ -1499,7 +1605,7 @@ xlog_alloc_log( memset(head, 0, sizeof(xlog_rec_header_t)); head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); head->h_version = cpu_to_be32( - xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + xfs_has_logv2(log->l_mp) ? 2 : 1); head->h_size = cpu_to_be32(log->l_iclog_size); /* new fields */ head->h_fmt = cpu_to_be32(XLOG_FMT); @@ -1551,37 +1657,6 @@ out: } /* xlog_alloc_log */ /* - * Write out the commit record of a transaction associated with the given - * ticket to close off a running log write. Return the lsn of the commit record. 
- */ -int -xlog_commit_record( - struct xlog *log, - struct xlog_ticket *ticket, - struct xlog_in_core **iclog, - xfs_lsn_t *lsn) -{ - struct xfs_log_iovec reg = { - .i_addr = NULL, - .i_len = 0, - .i_type = XLOG_REG_TYPE_COMMIT, - }; - struct xfs_log_vec vec = { - .lv_niovecs = 1, - .lv_iovecp = &reg, - }; - int error; - - if (XLOG_FORCED_SHUTDOWN(log)) - return -EIO; - - error = xlog_write(log, &vec, ticket, lsn, iclog, XLOG_COMMIT_TRANS); - if (error) - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - return error; -} - -/* * Compute the LSN that we'd need to push the log tail towards in order to have * (a) enough on-disk log space to log the number of bytes specified, (b) at * least 25% of the log space free, and (c) at least 256 blocks free. If the @@ -1653,7 +1728,7 @@ xlog_grant_push_ail( xfs_lsn_t threshold_lsn; threshold_lsn = xlog_grant_push_threshold(log, need_bytes); - if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) + if (threshold_lsn == NULLCOMMITLSN || xlog_is_shutdown(log)) return; /* @@ -1689,7 +1764,7 @@ xlog_pack_data( dp += BBSIZE; } - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { xlog_in_core_2_t *xhdr = iclog->ic_data; for ( ; i < BTOBB(size); i++) { @@ -1726,7 +1801,7 @@ xlog_cksum( offsetof(struct xlog_rec_header, h_crc)); /* ... then for additional cycle data for v2 logs ... */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; int i; int xheads; @@ -1795,7 +1870,7 @@ xlog_write_iclog( * across the log IO to archieve that. 
*/ down(&iclog->ic_sema); - if (unlikely(iclog->ic_state == XLOG_STATE_IOERROR)) { + if (xlog_is_shutdown(log)) { /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of @@ -1953,7 +2028,7 @@ xlog_sync( /* real byte length */ size = iclog->ic_offset; - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) + if (xfs_has_logv2(log->l_mp)) size += roundoff; iclog->ic_header.h_len = cpu_to_be32(size); @@ -2303,8 +2378,7 @@ xlog_write_copy_finish( int *data_cnt, int *partial_copy, int *partial_copy_len, - int log_offset, - struct xlog_in_core **commit_iclog) + int log_offset) { int error; @@ -2323,27 +2397,20 @@ xlog_write_copy_finish( *partial_copy = 0; *partial_copy_len = 0; - if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) { - /* no more space in this iclog - push it. */ - spin_lock(&log->l_icloglock); - xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); - *record_cnt = 0; - *data_cnt = 0; - - if (iclog->ic_state == XLOG_STATE_ACTIVE) - xlog_state_switch_iclogs(log, iclog, 0); - else - ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_IOERROR); - if (!commit_iclog) - goto release_iclog; - spin_unlock(&log->l_icloglock); - ASSERT(flags & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } + if (iclog->ic_size - log_offset > sizeof(xlog_op_header_t)) + return 0; - return 0; + /* no more space in this iclog - push it. 
*/ + spin_lock(&log->l_icloglock); + xlog_state_finish_copy(log, iclog, *record_cnt, *data_cnt); + *record_cnt = 0; + *data_cnt = 0; + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, iclog, 0); + else + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + xlog_is_shutdown(log)); release_iclog: error = xlog_state_release_iclog(log, iclog, 0); spin_unlock(&log->l_icloglock); @@ -2393,10 +2460,9 @@ release_iclog: int xlog_write( struct xlog *log, + struct xfs_cil_ctx *ctx, struct xfs_log_vec *log_vector, struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, uint optype) { struct xlog_in_core *iclog = NULL; @@ -2426,8 +2492,6 @@ xlog_write( } len = xlog_write_calc_vec_length(ticket, log_vector, optype); - if (start_lsn) - *start_lsn = 0; while (lv && (!lv->lv_niovecs || index < lv->lv_niovecs)) { void *ptr; int log_offset; @@ -2440,9 +2504,15 @@ xlog_write( ASSERT(log_offset <= iclog->ic_size - 1); ptr = iclog->ic_datap + log_offset; - /* Start_lsn is the first lsn written to. */ - if (start_lsn && !*start_lsn) - *start_lsn = be64_to_cpu(iclog->ic_header.h_lsn); + /* + * If we have a context pointer, pass it the first iclog we are + * writing to so it can record state needed for iclog write + * ordering. 
+ */ + if (ctx) { + xlog_cil_set_ctx_write_state(ctx, iclog); + ctx = NULL; + } /* * This loop writes out as many regions as can fit in the amount @@ -2521,8 +2591,7 @@ xlog_write( &record_cnt, &data_cnt, &partial_copy, &partial_copy_len, - log_offset, - commit_iclog); + log_offset); if (error) return error; @@ -2560,12 +2629,7 @@ next_lv: spin_lock(&log->l_icloglock); xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); - if (commit_iclog) { - ASSERT(optype & XLOG_COMMIT_TRANS); - *commit_iclog = iclog; - } else { - error = xlog_state_release_iclog(log, iclog, 0); - } + error = xlog_state_release_iclog(log, iclog, 0); spin_unlock(&log->l_icloglock); return error; @@ -2751,8 +2815,7 @@ xlog_state_set_callback( static bool xlog_state_iodone_process_iclog( struct xlog *log, - struct xlog_in_core *iclog, - bool *ioerror) + struct xlog_in_core *iclog) { xfs_lsn_t lowest_lsn; xfs_lsn_t header_lsn; @@ -2764,15 +2827,6 @@ xlog_state_iodone_process_iclog( * Skip all iclogs in the ACTIVE & DIRTY states: */ return false; - case XLOG_STATE_IOERROR: - /* - * Between marking a filesystem SHUTDOWN and stopping the log, - * we do flush all iclogs to disk (if there wasn't a log I/O - * error). So, we do want things to go smoothly in case of just - * a SHUTDOWN w/o a LOG_IO_ERROR. - */ - *ioerror = true; - return false; case XLOG_STATE_DONE_SYNC: /* * Now that we have an iclog that is in the DONE_SYNC state, do @@ -2796,72 +2850,75 @@ xlog_state_iodone_process_iclog( } } -STATIC void -xlog_state_do_callback( +/* + * Loop over all the iclogs, running attached callbacks on them. Return true if + * we ran any callbacks, indicating that we dropped the icloglock. We don't need + * to handle transient shutdown state here at all because + * xlog_state_shutdown_callbacks() will be run to do the necessary shutdown + * cleanup of the callbacks. 
+ */ +static bool +xlog_state_do_iclog_callbacks( struct xlog *log) + __releases(&log->l_icloglock) + __acquires(&log->l_icloglock) { - struct xlog_in_core *iclog; - struct xlog_in_core *first_iclog; - bool cycled_icloglock; - bool ioerror; - int flushcnt = 0; - int repeats = 0; + struct xlog_in_core *first_iclog = log->l_iclog; + struct xlog_in_core *iclog = first_iclog; + bool ran_callback = false; - spin_lock(&log->l_icloglock); do { - /* - * Scan all iclogs starting with the one pointed to by the - * log. Reset this starting point each time the log is - * unlocked (during callbacks). - * - * Keep looping through iclogs until one full pass is made - * without running any callbacks. - */ - first_iclog = log->l_iclog; - iclog = log->l_iclog; - cycled_icloglock = false; - ioerror = false; - repeats++; + LIST_HEAD(cb_list); - do { - LIST_HEAD(cb_list); + if (xlog_state_iodone_process_iclog(log, iclog)) + break; + if (iclog->ic_state != XLOG_STATE_CALLBACK) { + iclog = iclog->ic_next; + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); + spin_unlock(&log->l_icloglock); - if (xlog_state_iodone_process_iclog(log, iclog, - &ioerror)) - break; + trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); + xlog_cil_process_committed(&cb_list); + trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); + ran_callback = true; - if (iclog->ic_state != XLOG_STATE_CALLBACK && - iclog->ic_state != XLOG_STATE_IOERROR) { - iclog = iclog->ic_next; - continue; - } - list_splice_init(&iclog->ic_callbacks, &cb_list); - spin_unlock(&log->l_icloglock); + spin_lock(&log->l_icloglock); + xlog_state_clean_iclog(log, iclog); + iclog = iclog->ic_next; + } while (iclog != first_iclog); + + return ran_callback; +} - trace_xlog_iclog_callbacks_start(iclog, _RET_IP_); - xlog_cil_process_committed(&cb_list); - trace_xlog_iclog_callbacks_done(iclog, _RET_IP_); - cycled_icloglock = true; - spin_lock(&log->l_icloglock); - if (XLOG_FORCED_SHUTDOWN(log)) - wake_up_all(&iclog->ic_force_wait); - else 
- xlog_state_clean_iclog(log, iclog); - iclog = iclog->ic_next; - } while (first_iclog != iclog); +/* + * Loop running iclog completion callbacks until there are no more iclogs in a + * state that can run callbacks. + */ +STATIC void +xlog_state_do_callback( + struct xlog *log) +{ + int flushcnt = 0; + int repeats = 0; + + spin_lock(&log->l_icloglock); + while (xlog_state_do_iclog_callbacks(log)) { + if (xlog_is_shutdown(log)) + break; - if (repeats > 5000) { + if (++repeats > 5000) { flushcnt += repeats; repeats = 0; xfs_warn(log->l_mp, "%s: possible infinite loop (%d iterations)", __func__, flushcnt); } - } while (!ioerror && cycled_icloglock); + } - if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE || - log->l_iclog->ic_state == XLOG_STATE_IOERROR) + if (log->l_iclog->ic_state == XLOG_STATE_ACTIVE) wake_up_all(&log->l_flush_wait); spin_unlock(&log->l_icloglock); @@ -2871,13 +2928,6 @@ xlog_state_do_callback( /* * Finish transitioning this iclog to the dirty state. * - * Make sure that we completely execute this routine only when this is - * the last call to the iclog. There is a good chance that iclog flushes, - * when we reach the end of the physical log, get turned into 2 separate - * calls to bwrite. Hence, one iclog flush could generate two calls to this - * routine. By using the reference count bwritecnt, we guarantee that only - * the second completion goes through. - * * Callbacks could take time, so they are done outside the scope of the * global state machine log lock. */ @@ -2896,7 +2946,7 @@ xlog_state_done_syncing( * split log writes, on the second, we shut down the file system and * no iclogs should ever be attempted to be written to disk again. 
*/ - if (!XLOG_FORCED_SHUTDOWN(log)) { + if (!xlog_is_shutdown(log)) { ASSERT(iclog->ic_state == XLOG_STATE_SYNCING); iclog->ic_state = XLOG_STATE_DONE_SYNC; } @@ -2944,7 +2994,7 @@ xlog_state_get_iclog_space( restart: spin_lock(&log->l_icloglock); - if (XLOG_FORCED_SHUTDOWN(log)) { + if (xlog_is_shutdown(log)) { spin_unlock(&log->l_icloglock); return -EIO; } @@ -3122,7 +3172,7 @@ xfs_log_ticket_ungrant( * This routine will mark the current iclog in the ring as WANT_SYNC and move * the current iclog pointer to the next iclog in the ring. */ -STATIC void +void xlog_state_switch_iclogs( struct xlog *log, struct xlog_in_core *iclog, @@ -3237,10 +3287,10 @@ xfs_log_force( xlog_cil_force(log); spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state == XLOG_STATE_IOERROR) + if (xlog_is_shutdown(log)) goto out_error; + iclog = log->l_iclog; trace_xlog_iclog_force(iclog, _RET_IP_); if (iclog->ic_state == XLOG_STATE_DIRTY || @@ -3294,6 +3344,20 @@ out_error: return -EIO; } +/* + * Force the log to a specific LSN. + * + * If an iclog with that lsn can be found: + * If it is in the DIRTY state, just return. + * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC + * state and go to sleep or return. + * If it is in any other state, go to sleep or return. + * + * Synchronous forces are implemented with a wait queue. All callers trying + * to force a given lsn to disk must wait on the queue attached to the + * specific in-core log. When given in-core log finally completes its write + * to disk, that thread will wake up all threads waiting on the queue. 
+ */ static int xlog_force_lsn( struct xlog *log, @@ -3306,10 +3370,10 @@ xlog_force_lsn( bool completed; spin_lock(&log->l_icloglock); - iclog = log->l_iclog; - if (iclog->ic_state == XLOG_STATE_IOERROR) + if (xlog_is_shutdown(log)) goto out_error; + iclog = log->l_iclog; while (be64_to_cpu(iclog->ic_header.h_lsn) != lsn) { trace_xlog_iclog_force_lsn(iclog, _RET_IP_); iclog = iclog->ic_next; @@ -3379,18 +3443,13 @@ out_error: } /* - * Force the in-core log to disk for a specific LSN. - * - * Find in-core log with lsn. - * If it is in the DIRTY state, just return. - * If it is in the ACTIVE state, move the in-core log into the WANT_SYNC - * state and go to sleep or return. - * If it is in any other state, go to sleep or return. + * Force the log to a specific checkpoint sequence. * - * Synchronous forces are implemented with a wait queue. All callers trying - * to force a given lsn to disk must wait on the queue attached to the - * specific in-core log. When given in-core log finally completes its write - * to disk, that thread will wake up all threads waiting on the queue. + * First force the CIL so that all the required changes have been flushed to the + * iclogs. If the CIL force completed it will return a commit LSN that indicates + * the iclog that needs to be flushed to stable storage. If the caller needs + * a synchronous log force, we will wait on the iclog with the LSN returned by + * xlog_cil_force_seq() to be completed. 
*/ int xfs_log_force_seq( @@ -3619,17 +3678,15 @@ xlog_verify_grant_tail( xlog_crack_atomic_lsn(&log->l_tail_lsn, &tail_cycle, &tail_blocks); if (tail_cycle != cycle) { if (cycle - 1 != tail_cycle && - !(log->l_flags & XLOG_TAIL_WARN)) { + !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "%s: cycle - 1 != tail_cycle", __func__); - log->l_flags |= XLOG_TAIL_WARN; } if (space > BBTOB(tail_blocks) && - !(log->l_flags & XLOG_TAIL_WARN)) { + !test_and_set_bit(XLOG_TAIL_WARN, &log->l_opstate)) { xfs_alert_tag(log->l_mp, XFS_PTAG_LOGRES, "%s: space > BBTOB(tail_blocks)", __func__); - log->l_flags |= XLOG_TAIL_WARN; } } } @@ -3765,105 +3822,66 @@ xlog_verify_iclog( #endif /* - * Mark all iclogs IOERROR. l_icloglock is held by the caller. - */ -STATIC int -xlog_state_ioerror( - struct xlog *log) -{ - xlog_in_core_t *iclog, *ic; - - iclog = log->l_iclog; - if (iclog->ic_state != XLOG_STATE_IOERROR) { - /* - * Mark all the incore logs IOERROR. - * From now on, no log flushes will result. - */ - ic = iclog; - do { - ic->ic_state = XLOG_STATE_IOERROR; - ic = ic->ic_next; - } while (ic != iclog); - return 0; - } - /* - * Return non-zero, if state transition has already happened. - */ - return 1; -} - -/* - * This is called from xfs_force_shutdown, when we're forcibly - * shutting down the filesystem, typically because of an IO error. + * Perform a forced shutdown on the log. This should be called once and once + * only by the high level filesystem shutdown code to shut the log subsystem + * down cleanly. + * * Our main objectives here are to make sure that: - * a. if !logerror, flush the logs to disk. Anything modified - * after this is ignored. - * b. the filesystem gets marked 'SHUTDOWN' for all interested - * parties to find out, 'atomically'. - * c. those who're sleeping on log reservations, pinned objects and - * other resources get woken up, and be told the bad news. - * d. 
nothing new gets queued up after (b) and (c) are done. + * a. if the shutdown was not due to a log IO error, flush the logs to + * disk. Anything modified after this is ignored. + * b. the log gets atomically marked 'XLOG_IO_ERROR' for all interested + * parties to find out. Nothing new gets queued after this is done. + * c. Tasks sleeping on log reservations, pinned objects and + * other resources get woken up. * - * Note: for the !logerror case we need to flush the regions held in memory out - * to disk first. This needs to be done before the log is marked as shutdown, - * otherwise the iclog writes will fail. + * Return true if the shutdown cause was a log IO error and we actually shut the + * log down. */ -int -xfs_log_force_umount( - struct xfs_mount *mp, - int logerror) +bool +xlog_force_shutdown( + struct xlog *log, + int shutdown_flags) { - struct xlog *log; - int retval; - - log = mp->m_log; + bool log_error = (shutdown_flags & SHUTDOWN_LOG_IO_ERROR); /* - * If this happens during log recovery, don't worry about - * locking; the log isn't open for business yet. + * If this happens during log recovery then we aren't using the runtime + * log mechanisms yet so there's nothing to shut down. */ - if (!log || - log->l_flags & XLOG_ACTIVE_RECOVERY) { - mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - if (mp->m_sb_bp) - mp->m_sb_bp->b_flags |= XBF_DONE; - return 0; - } + if (!log || xlog_in_recovery(log)) + return false; - /* - * Somebody could've already done the hard work for us. - * No need to get locks for this. - */ - if (logerror && log->l_iclog->ic_state == XLOG_STATE_IOERROR) { - ASSERT(XLOG_FORCED_SHUTDOWN(log)); - return 1; - } + ASSERT(!xlog_is_shutdown(log)); /* * Flush all the completed transactions to disk before marking the log - * being shut down. 
We need to do it in this order to ensure that - * completed operations are safely on disk before we shut down, and that - * we don't have to issue any buffer IO after the shutdown flags are set - * to guarantee this. + * being shut down. We need to do this first as shutting down the log + * before the force will prevent the log force from flushing the iclogs + * to disk. + * + * Re-entry due to a log IO error shutdown during the log force is + * prevented by the atomicity of higher level shutdown code. */ - if (!logerror) - xfs_log_force(mp, XFS_LOG_SYNC); + if (!log_error) + xfs_log_force(log->l_mp, XFS_LOG_SYNC); /* - * mark the filesystem and the as in a shutdown state and wake - * everybody up to tell them the bad news. + * Atomically set the shutdown state. If the shutdown state is already + * set, there someone else is performing the shutdown and so we are done + * here. This should never happen because we should only ever get called + * once by the first shutdown caller. + * + * Much of the log state machine transitions assume that shutdown state + * cannot change once they hold the log->l_icloglock. Hence we need to + * hold that lock here, even though we use the atomic test_and_set_bit() + * operation to set the shutdown state. */ spin_lock(&log->l_icloglock); - mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; - if (mp->m_sb_bp) - mp->m_sb_bp->b_flags |= XBF_DONE; - - /* - * Mark the log and the iclogs with IO error flags to prevent any - * further log IO from being issued or completed. - */ - log->l_flags |= XLOG_IO_ERROR; - retval = xlog_state_ioerror(log); + if (test_and_set_bit(XLOG_IO_ERROR, &log->l_opstate)) { + spin_unlock(&log->l_icloglock); + ASSERT(0); + return false; + } spin_unlock(&log->l_icloglock); /* @@ -3883,12 +3901,12 @@ xfs_log_force_umount( * avoid races. 
*/ spin_lock(&log->l_cilp->xc_push_lock); + wake_up_all(&log->l_cilp->xc_start_wait); wake_up_all(&log->l_cilp->xc_commit_wait); spin_unlock(&log->l_cilp->xc_push_lock); - xlog_state_do_callback(log); + xlog_state_shutdown_callbacks(log); - /* return non-zero if log IOERROR transition had already happened */ - return retval; + return log_error; } STATIC int @@ -3926,7 +3944,7 @@ xfs_log_check_lsn( * resets the in-core LSN. We can't validate in this mode, but * modifications are not allowed anyways so just return true. */ - if (mp->m_flags & XFS_MOUNT_NORECOVERY) + if (xfs_has_norecovery(mp)) return true; /* @@ -3952,11 +3970,22 @@ xfs_log_check_lsn( return valid; } -bool -xfs_log_in_recovery( - struct xfs_mount *mp) +/* + * Notify the log that we're about to start using a feature that is protected + * by a log incompat feature flag. This will prevent log covering from + * clearing those flags. + */ +void +xlog_use_incompat_feat( + struct xlog *log) { - struct xlog *log = mp->m_log; + down_read(&log->l_incompat_users); +} - return log->l_flags & XLOG_ACTIVE_RECOVERY; +/* Notify the log that we've finished using log incompat features. 
*/ +void +xlog_drop_incompat_feat( + struct xlog *log) +{ + up_read(&log->l_incompat_users); } diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 813b972e9788..dc1b77b92fc1 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -104,6 +104,7 @@ struct xlog_ticket; struct xfs_log_item; struct xfs_item_ops; struct xfs_trans; +struct xlog; int xfs_log_force(struct xfs_mount *mp, uint flags); int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags, @@ -125,7 +126,6 @@ int xfs_log_reserve(struct xfs_mount *mp, bool permanent); int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic); void xfs_log_unmount(struct xfs_mount *mp); -int xfs_log_force_umount(struct xfs_mount *mp, int logerror); bool xfs_log_writable(struct xfs_mount *mp); struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); @@ -138,8 +138,11 @@ void xfs_log_work_queue(struct xfs_mount *mp); int xfs_log_quiesce(struct xfs_mount *mp); void xfs_log_clean(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); -bool xfs_log_in_recovery(struct xfs_mount *); xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); +bool xlog_force_shutdown(struct xlog *log, int shutdown_flags); + +void xlog_use_incompat_feat(struct xlog *log); +void xlog_drop_incompat_feat(struct xlog *log); #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4c44bc3786c0..6c93c8ada6f3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -48,6 +48,34 @@ xlog_cil_ticket_alloc( } /* + * Unavoidable forward declaration - xlog_cil_push_work() calls + * xlog_cil_ctx_alloc() itself. 
+ */ +static void xlog_cil_push_work(struct work_struct *work); + +static struct xfs_cil_ctx * +xlog_cil_ctx_alloc(void) +{ + struct xfs_cil_ctx *ctx; + + ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); + INIT_LIST_HEAD(&ctx->committing); + INIT_LIST_HEAD(&ctx->busy_extents); + INIT_WORK(&ctx->push_work, xlog_cil_push_work); + return ctx; +} + +static void +xlog_cil_ctx_switch( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) +{ + ctx->sequence = ++cil->xc_current_sequence; + ctx->cil = cil; + cil->xc_ctx = ctx; +} + +/* * After the first stage of log recovery is done, we know where the head and * tail of the log are. We need this log initialisation done before we can * initialise the first CIL checkpoint context. @@ -185,7 +213,15 @@ xlog_cil_alloc_shadow_bufs( */ kmem_free(lip->li_lv_shadow); - lv = kmem_alloc_large(buf_size, KM_NOFS); + /* + * We are in transaction context, which means this + * allocation will pick up GFP_NOFS from the + * memalloc_nofs_save/restore context the transaction + * holds. This means we can use GFP_KERNEL here so the + * generic kvmalloc() code will run vmalloc on + * contiguous page allocation failure as we require. + */ + lv = kvmalloc(buf_size, GFP_KERNEL); memset(lv, 0, xlog_cil_iovec_space(niovecs)); lv->lv_item = lip; @@ -535,7 +571,7 @@ xlog_discard_busy_extents( struct blk_plug plug; int error = 0; - ASSERT(mp->m_flags & XFS_MOUNT_DISCARD); + ASSERT(xfs_has_discard(mp)); blk_start_plug(&plug); list_for_each_entry(busyp, list, list) { @@ -576,7 +612,7 @@ xlog_cil_committed( struct xfs_cil_ctx *ctx) { struct xfs_mount *mp = ctx->cil->xc_log->l_mp; - bool abort = XLOG_FORCED_SHUTDOWN(ctx->cil->xc_log); + bool abort = xlog_is_shutdown(ctx->cil->xc_log); /* * If the I/O failed, we're aborting the commit and already shutdown. 
@@ -587,6 +623,7 @@ xlog_cil_committed( */ if (abort) { spin_lock(&ctx->cil->xc_push_lock); + wake_up_all(&ctx->cil->xc_start_wait); wake_up_all(&ctx->cil->xc_commit_wait); spin_unlock(&ctx->cil->xc_push_lock); } @@ -596,7 +633,7 @@ xlog_cil_committed( xfs_extent_busy_sort(&ctx->busy_extents); xfs_extent_busy_clear(mp, &ctx->busy_extents, - (mp->m_flags & XFS_MOUNT_DISCARD) && !abort); + xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); list_del(&ctx->committing); @@ -624,6 +661,180 @@ xlog_cil_process_committed( } /* +* Record the LSN of the iclog we were just granted space to start writing into. +* If the context doesn't have a start_lsn recorded, then this iclog will +* contain the start record for the checkpoint. Otherwise this write contains +* the commit record for the checkpoint. +*/ +void +xlog_cil_set_ctx_write_state( + struct xfs_cil_ctx *ctx, + struct xlog_in_core *iclog) +{ + struct xfs_cil *cil = ctx->cil; + xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); + + ASSERT(!ctx->commit_lsn); + if (!ctx->start_lsn) { + spin_lock(&cil->xc_push_lock); + /* + * The LSN we need to pass to the log items on transaction + * commit is the LSN reported by the first log vector write, not + * the commit lsn. If we use the commit record lsn then we can + * move the tail beyond the grant write head. + */ + ctx->start_lsn = lsn; + wake_up_all(&cil->xc_start_wait); + spin_unlock(&cil->xc_push_lock); + return; + } + + /* + * Take a reference to the iclog for the context so that we still hold + * it when xlog_write is done and has released it. This means the + * context controls when the iclog is released for IO. + */ + atomic_inc(&iclog->ic_refcnt); + + /* + * xlog_state_get_iclog_space() guarantees there is enough space in the + * iclog for an entire commit record, so we can attach the context + * callbacks now. 
This needs to be done before we make the commit_lsn + * visible to waiters so that checkpoints with commit records in the + * same iclog order their IO completion callbacks in the same order that + * the commit records appear in the iclog. + */ + spin_lock(&cil->xc_log->l_icloglock); + list_add_tail(&ctx->iclog_entry, &iclog->ic_callbacks); + spin_unlock(&cil->xc_log->l_icloglock); + + /* + * Now we can record the commit LSN and wake anyone waiting for this + * sequence to have the ordered commit record assigned to a physical + * location in the log. + */ + spin_lock(&cil->xc_push_lock); + ctx->commit_iclog = iclog; + ctx->commit_lsn = lsn; + wake_up_all(&cil->xc_commit_wait); + spin_unlock(&cil->xc_push_lock); +} + + +/* + * Ensure that the order of log writes follows checkpoint sequence order. This + * relies on the context LSN being zero until the log write has guaranteed the + * LSN that the log write will start at via xlog_state_get_iclog_space(). + */ +enum _record_type { + _START_RECORD, + _COMMIT_RECORD, +}; + +static int +xlog_cil_order_write( + struct xfs_cil *cil, + xfs_csn_t sequence, + enum _record_type record) +{ + struct xfs_cil_ctx *ctx; + +restart: + spin_lock(&cil->xc_push_lock); + list_for_each_entry(ctx, &cil->xc_committing, committing) { + /* + * Avoid getting stuck in this loop because we were woken by the + * shutdown, but then went back to sleep once already in the + * shutdown state. + */ + if (xlog_is_shutdown(cil->xc_log)) { + spin_unlock(&cil->xc_push_lock); + return -EIO; + } + + /* + * Higher sequences will wait for this one so skip them. + * Don't wait for our own sequence, either. + */ + if (ctx->sequence >= sequence) + continue; + + /* Wait until the LSN for the record has been recorded. 
*/ + switch (record) { + case _START_RECORD: + if (!ctx->start_lsn) { + xlog_wait(&cil->xc_start_wait, &cil->xc_push_lock); + goto restart; + } + break; + case _COMMIT_RECORD: + if (!ctx->commit_lsn) { + xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); + goto restart; + } + break; + } + } + spin_unlock(&cil->xc_push_lock); + return 0; +} + +/* + * Write out the log vector change now attached to the CIL context. This will + * write a start record that needs to be strictly ordered in ascending CIL + * sequence order so that log recovery will always use in-order start LSNs when + * replaying checkpoints. + */ +static int +xlog_cil_write_chain( + struct xfs_cil_ctx *ctx, + struct xfs_log_vec *chain) +{ + struct xlog *log = ctx->cil->xc_log; + int error; + + error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); + if (error) + return error; + return xlog_write(log, ctx, chain, ctx->ticket, XLOG_START_TRANS); +} + +/* + * Write out the commit record of a checkpoint transaction to close off a + * running log write. These commit records are strictly ordered in ascending CIL + * sequence order so that log recovery will always replay the checkpoints in the + * correct order. + */ +static int +xlog_cil_write_commit_record( + struct xfs_cil_ctx *ctx) +{ + struct xlog *log = ctx->cil->xc_log; + struct xfs_log_iovec reg = { + .i_addr = NULL, + .i_len = 0, + .i_type = XLOG_REG_TYPE_COMMIT, + }; + struct xfs_log_vec vec = { + .lv_niovecs = 1, + .lv_iovecp = &reg, + }; + int error; + + if (xlog_is_shutdown(log)) + return -EIO; + + error = xlog_cil_order_write(ctx->cil, ctx->sequence, _COMMIT_RECORD); + if (error) + return error; + + error = xlog_write(log, ctx, &vec, ctx->ticket, XLOG_COMMIT_TRANS); + if (error) + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + return error; +} + +/* * Push the Committed Item List to the log. * * If the current sequence is the same as xc_push_seq we need to do a flush.
If @@ -641,13 +852,12 @@ static void xlog_cil_push_work( struct work_struct *work) { - struct xfs_cil *cil = - container_of(work, struct xfs_cil, xc_push_work); + struct xfs_cil_ctx *ctx = + container_of(work, struct xfs_cil_ctx, push_work); + struct xfs_cil *cil = ctx->cil; struct xlog *log = cil->xc_log; struct xfs_log_vec *lv; - struct xfs_cil_ctx *ctx; struct xfs_cil_ctx *new_ctx; - struct xlog_in_core *commit_iclog; struct xlog_ticket *tic; int num_iovecs; int error = 0; @@ -655,20 +865,21 @@ xlog_cil_push_work( struct xfs_log_iovec lhdr; struct xfs_log_vec lvhdr = { NULL }; xfs_lsn_t preflush_tail_lsn; - xfs_lsn_t commit_lsn; xfs_csn_t push_seq; struct bio bio; DECLARE_COMPLETION_ONSTACK(bdev_flush); + bool push_commit_stable; - new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_NOFS); + new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log); down_write(&cil->xc_ctx_lock); - ctx = cil->xc_ctx; spin_lock(&cil->xc_push_lock); push_seq = cil->xc_push_seq; ASSERT(push_seq <= ctx->sequence); + push_commit_stable = cil->xc_push_commit_stable; + cil->xc_push_commit_stable = false; /* * As we are about to switch to a new, empty CIL context, we no longer @@ -694,7 +905,7 @@ xlog_cil_push_work( /* check for a previously pushed sequence */ - if (push_seq < cil->xc_ctx->sequence) { + if (push_seq < ctx->sequence) { spin_unlock(&cil->xc_push_lock); goto out_skip; } @@ -767,19 +978,7 @@ xlog_cil_push_work( } /* - * initialise the new context and attach it to the CIL. Then attach - * the current context to the CIL committing list so it can be found - * during log forces to extract the commit lsn of the sequence that - * needs to be forced. 
- */ - INIT_LIST_HEAD(&new_ctx->committing); - INIT_LIST_HEAD(&new_ctx->busy_extents); - new_ctx->sequence = ctx->sequence + 1; - new_ctx->cil = cil; - cil->xc_ctx = new_ctx; - - /* - * The switch is now done, so we can drop the context lock and move out + * Switch the contexts so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, * though - we need to synchronise with previous and future commits so * that the commit records are correctly ordered in the log to ensure @@ -804,7 +1003,7 @@ xlog_cil_push_work( * deferencing a freed context pointer. */ spin_lock(&cil->xc_push_lock); - cil->xc_current_sequence = new_ctx->sequence; + xlog_cil_ctx_switch(cil, new_ctx); spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); @@ -837,78 +1036,17 @@ xlog_cil_push_work( */ wait_for_completion(&bdev_flush); - error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, - XLOG_START_TRANS); + error = xlog_cil_write_chain(ctx, &lvhdr); if (error) goto out_abort_free_ticket; - /* - * now that we've written the checkpoint into the log, strictly - * order the commit records so replay will get them in the right order. - */ -restart: - spin_lock(&cil->xc_push_lock); - list_for_each_entry(new_ctx, &cil->xc_committing, committing) { - /* - * Avoid getting stuck in this loop because we were woken by the - * shutdown, but then went back to sleep once already in the - * shutdown state. - */ - if (XLOG_FORCED_SHUTDOWN(log)) { - spin_unlock(&cil->xc_push_lock); - goto out_abort_free_ticket; - } - - /* - * Higher sequences will wait for this one so skip them. - * Don't wait for our own sequence, either. - */ - if (new_ctx->sequence >= ctx->sequence) - continue; - if (!new_ctx->commit_lsn) { - /* - * It is still being pushed! Wait for the push to - * complete, then start again from the beginning. 
- */ - xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); - goto restart; - } - } - spin_unlock(&cil->xc_push_lock); - - error = xlog_commit_record(log, tic, &commit_iclog, &commit_lsn); + error = xlog_cil_write_commit_record(ctx); if (error) goto out_abort_free_ticket; xfs_log_ticket_ungrant(log, tic); /* - * Once we attach the ctx to the iclog, a shutdown can process the - * iclog, run the callbacks and free the ctx. The only thing preventing - * this potential UAF situation here is that we are holding the - * icloglock. Hence we cannot access the ctx once we have attached the - * callbacks and dropped the icloglock. - */ - spin_lock(&log->l_icloglock); - if (commit_iclog->ic_state == XLOG_STATE_IOERROR) { - spin_unlock(&log->l_icloglock); - goto out_abort; - } - ASSERT_ALWAYS(commit_iclog->ic_state == XLOG_STATE_ACTIVE || - commit_iclog->ic_state == XLOG_STATE_WANT_SYNC); - list_add_tail(&ctx->iclog_entry, &commit_iclog->ic_callbacks); - - /* - * now the checkpoint commit is complete and we've attached the - * callbacks to the iclog we can assign the commit LSN to the context - * and wake up anyone who is waiting for the commit to complete. - */ - spin_lock(&cil->xc_push_lock); - ctx->commit_lsn = commit_lsn; - wake_up_all(&cil->xc_commit_wait); - spin_unlock(&cil->xc_push_lock); - - /* * If the checkpoint spans multiple iclogs, wait for all previous iclogs * to complete before we submit the commit_iclog. We can't use state * checks for this - ACTIVE can be either a past completed iclog or a @@ -919,21 +1057,19 @@ restart: * wakeup until this commit_iclog is written to disk. Hence we use the * iclog header lsn and compare it to the commit lsn to determine if we * need to wait on iclogs or not. - * - * NOTE: It is not safe to reference the ctx after this check as we drop - * the icloglock if we have to wait for completion of other iclogs. 
*/ - if (ctx->start_lsn != commit_lsn) { + spin_lock(&log->l_icloglock); + if (ctx->start_lsn != ctx->commit_lsn) { xfs_lsn_t plsn; - plsn = be64_to_cpu(commit_iclog->ic_prev->ic_header.h_lsn); - if (plsn && XFS_LSN_CMP(plsn, commit_lsn) < 0) { + plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn); + if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) { /* * Waiting on ic_force_wait orders the completion of * iclogs older than ic_prev. Hence we only need to wait * on the most recent older iclog here. */ - xlog_wait_on_iclog(commit_iclog->ic_prev); + xlog_wait_on_iclog(ctx->commit_iclog->ic_prev); spin_lock(&log->l_icloglock); } @@ -941,16 +1077,27 @@ restart: * We need to issue a pre-flush so that the ordering for this * checkpoint is correctly preserved down to stable storage. */ - commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; + ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; } /* * The commit iclog must be written to stable storage to guarantee * journal IO vs metadata writeback IO is correctly ordered on stable * storage. + * + * If the push caller needs the commit to be immediately stable and the + * commit_iclog is not yet marked as XLOG_STATE_WANT_SYNC to indicate it + * will be written when released, switch it's state to WANT_SYNC right + * now. */ - commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; - xlog_state_release_iclog(log, commit_iclog, preflush_tail_lsn); + ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; + if (push_commit_stable && + ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); + xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn); + + /* Not safe to reference ctx now! 
*/ + spin_unlock(&log->l_icloglock); return; @@ -962,9 +1109,15 @@ out_skip: out_abort_free_ticket: xfs_log_ticket_ungrant(log, tic); -out_abort: - ASSERT(XLOG_FORCED_SHUTDOWN(log)); - xlog_cil_committed(ctx); + ASSERT(xlog_is_shutdown(log)); + if (!ctx->commit_iclog) { + xlog_cil_committed(ctx); + return; + } + spin_lock(&log->l_icloglock); + xlog_state_release_iclog(log, ctx->commit_iclog, 0); + /* Not safe to reference ctx now! */ + spin_unlock(&log->l_icloglock); } /* @@ -998,7 +1151,7 @@ xlog_cil_push_background( spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; - queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); } /* @@ -1034,13 +1187,26 @@ xlog_cil_push_background( /* * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence * number that is passed. When it returns, the work will be queued for - * @push_seq, but it won't be completed. The caller is expected to do any - * waiting for push_seq to complete if it is required. + * @push_seq, but it won't be completed. + * + * If the caller is performing a synchronous force, we will flush the workqueue + * to get previously queued work moving to minimise the wait time they will + * undergo waiting for all outstanding pushes to complete. The caller is + * expected to do the required waiting for push_seq to complete. + * + * If the caller is performing an async push, we need to ensure that the + * checkpoint is fully flushed out of the iclogs when we finish the push. If we + * don't do this, then the commit record may remain sitting in memory in an + * ACTIVE iclog. This then requires another full log force to push to disk, + * which defeats the purpose of having an async, non-blocking CIL force + * mechanism. Hence in this case we need to pass a flag to the push work to + * indicate it needs to flush the commit record itself. 
*/ static void xlog_cil_push_now( struct xlog *log, - xfs_lsn_t push_seq) + xfs_lsn_t push_seq, + bool async) { struct xfs_cil *cil = log->l_cilp; @@ -1050,7 +1216,8 @@ xlog_cil_push_now( ASSERT(push_seq && push_seq <= cil->xc_current_sequence); /* start on any pending background push to minimise wait time on it */ - flush_work(&cil->xc_push_work); + if (!async) + flush_workqueue(cil->xc_push_wq); /* * If the CIL is empty or we've already pushed the sequence then @@ -1063,7 +1230,8 @@ xlog_cil_push_now( } cil->xc_push_seq = push_seq; - queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work); + cil->xc_push_commit_stable = async; + queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); spin_unlock(&cil->xc_push_lock); } @@ -1116,7 +1284,7 @@ xlog_cil_commit( xlog_cil_insert_items(log, tp); - if (regrant && !XLOG_FORCED_SHUTDOWN(log)) + if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); else xfs_log_ticket_ungrant(log, tp->t_ticket); @@ -1148,11 +1316,26 @@ xlog_cil_commit( } /* + * Flush the CIL to stable storage but don't wait for it to complete. This + * requires the CIL push to ensure the commit record for the push hits the disk, + * but otherwise is no different to a push done from a log force. + */ +void +xlog_cil_flush( + struct xlog *log) +{ + xfs_csn_t seq = log->l_cilp->xc_current_sequence; + + trace_xfs_log_force(log->l_mp, seq, _RET_IP_); + xlog_cil_push_now(log, seq, true); +} + +/* * Conditionally push the CIL based on the sequence passed in. * - * We only need to push if we haven't already pushed the sequence - * number given. Hence the only time we will trigger a push here is - * if the push sequence is the same as the current context. + * We only need to push if we haven't already pushed the sequence number given. + * Hence the only time we will trigger a push here is if the push sequence is + * the same as the current context. 
* * We return the current commit lsn to allow the callers to determine if a * iclog flush is necessary following this call. @@ -1168,13 +1351,17 @@ xlog_cil_force_seq( ASSERT(sequence <= cil->xc_current_sequence); + if (!sequence) + sequence = cil->xc_current_sequence; + trace_xfs_log_force(log->l_mp, sequence, _RET_IP_); + /* * check to see if we need to force out the current context. * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ restart: - xlog_cil_push_now(log, sequence); + xlog_cil_push_now(log, sequence, false); /* * See if we can find a previous sequence still committing. @@ -1189,7 +1376,7 @@ restart: * shutdown, but then went back to sleep once already in the * shutdown state. */ - if (XLOG_FORCED_SHUTDOWN(log)) + if (xlog_is_shutdown(log)) goto out_shutdown; if (ctx->sequence > sequence) continue; @@ -1198,6 +1385,7 @@ restart: * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ + XFS_STATS_INC(log->l_mp, xs_log_force_sleep); xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); goto restart; } @@ -1282,32 +1470,35 @@ xlog_cil_init( cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL); if (!cil) return -ENOMEM; + /* + * Limit the CIL pipeline depth to 4 concurrent works to bound the + * concurrency the log spinlocks will be exposed to. 
+ */ + cil->xc_push_wq = alloc_workqueue("xfs-cil/%s", + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), + 4, log->l_mp->m_super->s_id); + if (!cil->xc_push_wq) + goto out_destroy_cil; - ctx = kmem_zalloc(sizeof(*ctx), KM_MAYFAIL); - if (!ctx) { - kmem_free(cil); - return -ENOMEM; - } - - INIT_WORK(&cil->xc_push_work, xlog_cil_push_work); INIT_LIST_HEAD(&cil->xc_cil); INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_cil_lock); spin_lock_init(&cil->xc_push_lock); init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); + init_waitqueue_head(&cil->xc_start_wait); init_waitqueue_head(&cil->xc_commit_wait); - - INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); - ctx->sequence = 1; - ctx->cil = cil; - cil->xc_ctx = ctx; - cil->xc_current_sequence = ctx->sequence; - cil->xc_log = log; log->l_cilp = cil; + + ctx = xlog_cil_ctx_alloc(); + xlog_cil_ctx_switch(cil, ctx); + return 0; + +out_destroy_cil: + kmem_free(cil); + return -ENOMEM; } void @@ -1321,6 +1512,7 @@ xlog_cil_destroy( } ASSERT(list_empty(&log->l_cilp->xc_cil)); + destroy_workqueue(log->l_cilp->xc_push_wq); kmem_free(log->l_cilp); } diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f3e79a45d60a..844fbeec3545 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -12,15 +12,6 @@ struct xlog_ticket; struct xfs_mount; /* - * Flags for log structure - */ -#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ -#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ -#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being - shutdown */ -#define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ - -/* * get client id from packed copy. 
* * this hack is here because the xlog_pack code copies four bytes @@ -47,7 +38,6 @@ enum xlog_iclog_state { XLOG_STATE_DONE_SYNC, /* Done syncing to disk */ XLOG_STATE_CALLBACK, /* Callback functions now */ XLOG_STATE_DIRTY, /* Dirty IC log, not ready for ACTIVE status */ - XLOG_STATE_IOERROR, /* IO error happened in sync'ing log */ }; #define XLOG_STATE_STRINGS \ @@ -56,8 +46,7 @@ enum xlog_iclog_state { { XLOG_STATE_SYNCING, "XLOG_STATE_SYNCING" }, \ { XLOG_STATE_DONE_SYNC, "XLOG_STATE_DONE_SYNC" }, \ { XLOG_STATE_CALLBACK, "XLOG_STATE_CALLBACK" }, \ - { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" }, \ - { XLOG_STATE_IOERROR, "XLOG_STATE_IOERROR" } + { XLOG_STATE_DIRTY, "XLOG_STATE_DIRTY" } /* * In core log flags @@ -251,6 +240,7 @@ struct xfs_cil_ctx { xfs_csn_t sequence; /* chkpt sequence # */ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ + struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ int nvecs; /* number of regions */ int space_used; /* aggregate size of regions */ @@ -259,6 +249,7 @@ struct xfs_cil_ctx { struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ struct work_struct discard_endio_work; + struct work_struct push_work; }; /* @@ -281,16 +272,18 @@ struct xfs_cil { struct xlog *xc_log; struct list_head xc_cil; spinlock_t xc_cil_lock; + struct workqueue_struct *xc_push_wq; struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; struct xfs_cil_ctx *xc_ctx; spinlock_t xc_push_lock ____cacheline_aligned_in_smp; xfs_csn_t xc_push_seq; + bool xc_push_commit_stable; struct list_head xc_committing; wait_queue_head_t xc_commit_wait; + wait_queue_head_t xc_start_wait; xfs_csn_t xc_current_sequence; - struct work_struct xc_push_work; wait_queue_head_t xc_push_wait; /* background push throttle */ } ____cacheline_aligned_in_smp; @@ -407,7 +400,7 @@ struct xlog { struct xfs_buftarg *l_targ; /* buftarg of log */ struct workqueue_struct 
*l_ioend_workqueue; /* for I/O completions */ struct delayed_work l_work; /* background flush work */ - uint l_flags; + long l_opstate; /* operational state */ uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ @@ -456,13 +449,40 @@ struct xlog { xfs_lsn_t l_recovery_lsn; uint32_t l_iclog_roundoff;/* padding roundoff */ + + /* Users of log incompat features should take a read lock. */ + struct rw_semaphore l_incompat_users; }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) -#define XLOG_FORCED_SHUTDOWN(log) \ - (unlikely((log)->l_flags & XLOG_IO_ERROR)) +/* + * Bits for operational state + */ +#define XLOG_ACTIVE_RECOVERY 0 /* in the middle of recovery */ +#define XLOG_RECOVERY_NEEDED 1 /* log was recovered */ +#define XLOG_IO_ERROR 2 /* log hit an I/O error, and being + shutdown */ +#define XLOG_TAIL_WARN 3 /* log tail verify warning issued */ + +static inline bool +xlog_recovery_needed(struct xlog *log) +{ + return test_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); +} + +static inline bool +xlog_in_recovery(struct xlog *log) +{ + return test_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); +} + +static inline bool +xlog_is_shutdown(struct xlog *log) +{ + return test_bit(XLOG_IO_ERROR, &log->l_opstate); +} /* common routines */ extern int @@ -496,14 +516,14 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); -int xlog_write(struct xlog *log, struct xfs_log_vec *log_vector, - struct xlog_ticket *tic, xfs_lsn_t *start_lsn, - struct xlog_in_core **commit_iclog, uint optype); -int xlog_commit_record(struct xlog *log, struct xlog_ticket *ticket, - struct xlog_in_core **iclog, xfs_lsn_t *lsn); +int xlog_write(struct xlog *log, struct xfs_cil_ctx *ctx, + struct xfs_log_vec *log_vector, struct 
xlog_ticket *tic, + uint optype); void xfs_log_ticket_ungrant(struct xlog *log, struct xlog_ticket *ticket); void xfs_log_ticket_regrant(struct xlog *log, struct xlog_ticket *ticket); +void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, + int eventual_size); int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, xfs_lsn_t log_tail_lsn); @@ -571,10 +591,14 @@ void xlog_cil_destroy(struct xlog *log); bool xlog_cil_empty(struct xlog *log); void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp, xfs_csn_t *commit_seq, bool regrant); +void xlog_cil_set_ctx_write_state(struct xfs_cil_ctx *ctx, + struct xlog_in_core *iclog); + /* * CIL force routines */ +void xlog_cil_flush(struct xlog *log); xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence); static inline void diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1721fce2ec94..10562ecbd9ea 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -26,6 +26,8 @@ #include "xfs_error.h" #include "xfs_buf_item.h" #include "xfs_ag.h" +#include "xfs_quota.h" + #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) @@ -79,8 +81,6 @@ xlog_alloc_buffer( struct xlog *log, int nbblks) { - int align_mask = xfs_buftarg_dma_alignment(log->l_targ); - /* * Pass log block 0 since we don't have an addr yet, buffer will be * verified on read. 
@@ -108,7 +108,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO); + return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL); } /* @@ -146,7 +146,7 @@ xlog_do_io( error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no, BBTOB(nbblks), data, op); - if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) { + if (error && !xlog_is_shutdown(log)) { xfs_alert(log->l_mp, "log recovery %s I/O error at daddr 0x%llx len %d error %d", op == REQ_OP_WRITE ? "write" : "read", @@ -375,7 +375,7 @@ out: static inline int xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh) { - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { int h_size = be32_to_cpu(rh->h_size); if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) && @@ -1347,7 +1347,7 @@ xlog_find_tail( * headers if we have a filesystem using non-persistent counters. */ if (clean) - log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate); /* * Make sure that there are no blocks in front of the head @@ -1504,7 +1504,7 @@ xlog_add_record( recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); recp->h_cycle = cpu_to_be32(cycle); recp->h_version = cpu_to_be32( - xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1); + xfs_has_logv2(log->l_mp) ? 
2 : 1); recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block)); recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block)); recp->h_fmt = cpu_to_be32(XLOG_FMT); @@ -1756,6 +1756,30 @@ xlog_recover_release_intent( spin_unlock(&ailp->ail_lock); } +int +xlog_recover_iget( + struct xfs_mount *mp, + xfs_ino_t ino, + struct xfs_inode **ipp) +{ + int error; + + error = xfs_iget(mp, NULL, ino, 0, 0, ipp); + if (error) + return error; + + error = xfs_qm_dqattach(*ipp); + if (error) { + xfs_irele(*ipp); + return error; + } + + if (VFS_I(*ipp)->i_nlink == 0) + xfs_iflags_set(*ipp, XFS_IRECOVERY); + + return 0; +} + /****************************************************************************** * * Log recover routines @@ -2062,7 +2086,9 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL); + ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL); + if (!ptr) + return -ENOMEM; memcpy(&ptr[old_len], dp, len); item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -2786,6 +2812,13 @@ xlog_recover_process_iunlinks( } xfs_buf_rele(agibp); } + + /* + * Flush the pending unlinked inodes to ensure that the inactivations + * are fully completed on disk and the incore inodes can be reclaimed + * before we signal that recovery is complete. + */ + xfs_inodegc_flush(mp); } STATIC void @@ -2802,7 +2835,7 @@ xlog_unpack_data( dp += BBSIZE; } - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -2850,7 +2883,7 @@ xlog_recover_process( * the kernel from one that does not add CRCs by default. 
*/ if (crc != old_crc) { - if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (old_crc || xfs_has_crc(log->l_mp)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", le32_to_cpu(old_crc), @@ -2862,7 +2895,7 @@ xlog_recover_process( * If the filesystem is CRC enabled, this mismatch becomes a * fatal log corruption failure. */ - if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (xfs_has_crc(log->l_mp)) { XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp); return -EFSCORRUPTED; } @@ -2948,7 +2981,7 @@ xlog_do_recovery_pass( * Read the header of the tail block and get the iclog buffer size from * h_size. Use this to tell how many sectors make up the log header. */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + if (xfs_has_logv2(log->l_mp)) { /* * When using variable length iclogs, read first sector of * iclog header and extract the header size from it. Get a @@ -3280,10 +3313,7 @@ xlog_do_recover( if (error) return error; - /* - * If IO errors happened during recovery, bail out. - */ - if (XFS_FORCED_SHUTDOWN(mp)) + if (xlog_is_shutdown(log)) return -EIO; /* @@ -3305,7 +3335,7 @@ xlog_do_recover( xfs_buf_hold(bp); error = _xfs_buf_read(bp, XBF_READ); if (error) { - if (!XFS_FORCED_SHUTDOWN(mp)) { + if (!xlog_is_shutdown(log)) { xfs_buf_ioerror_alert(bp, __this_address); ASSERT(0); } @@ -3318,6 +3348,7 @@ xlog_do_recover( xfs_buf_relse(bp); /* re-initialise in-core superblock and geometry structures */ + mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); if (error) { @@ -3329,7 +3360,7 @@ xlog_do_recover( xlog_recover_check_summary(log); /* Normal transactions can now occur */ - log->l_flags &= ~XLOG_ACTIVE_RECOVERY; + clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); return 0; } @@ -3355,7 +3386,7 @@ xlog_recover( * could not be verified. Check the superblock LSN against the current * LSN now that it's known. 
*/ - if (xfs_sb_version_hascrc(&log->l_mp->m_sb) && + if (xfs_has_crc(log->l_mp) && !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn)) return -EINVAL; @@ -3382,7 +3413,7 @@ xlog_recover( * (e.g. unsupported transactions, then simply reject the * attempt at recovery before touching anything. */ - if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + if (xfs_sb_is_v5(&log->l_mp->m_sb) && xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { xfs_warn(log->l_mp, @@ -3413,68 +3444,64 @@ xlog_recover( : "internal"); error = xlog_do_recover(log, head_blk, tail_blk); - log->l_flags |= XLOG_RECOVERY_NEEDED; + set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); } return error; } /* - * In the first part of recovery we replay inodes and buffers and build - * up the list of extent free items which need to be processed. Here - * we process the extent free items and clean up the on disk unlinked - * inode lists. This is separated from the first part of recovery so - * that the root and real-time bitmap inodes can be read in from disk in - * between the two stages. This is necessary so that we can free space - * in the real-time portion of the file system. + * In the first part of recovery we replay inodes and buffers and build up the + * list of intents which need to be processed. Here we process the intents and + * clean up the on disk unlinked inode lists. This is separated from the first + * part of recovery so that the root and real-time bitmap inodes can be read in + * from disk in between the two stages. This is necessary so that we can free + * space in the real-time portion of the file system. */ int xlog_recover_finish( struct xlog *log) { - /* - * Now we're ready to do the transactions needed for the - * rest of recovery. Start with completing all the extent - * free intent records and then process the unlinked inode - * lists. 
At this point, we essentially run in normal mode - * except that we're still performing recovery actions - * rather than accepting new requests. - */ - if (log->l_flags & XLOG_RECOVERY_NEEDED) { - int error; - error = xlog_recover_process_intents(log); - if (error) { - /* - * Cancel all the unprocessed intent items now so that - * we don't leave them pinned in the AIL. This can - * cause the AIL to livelock on the pinned item if - * anyone tries to push the AIL (inode reclaim does - * this) before we get around to xfs_log_mount_cancel. - */ - xlog_recover_cancel_intents(log); - xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); - xfs_alert(log->l_mp, "Failed to recover intents"); - return error; - } + int error; + error = xlog_recover_process_intents(log); + if (error) { /* - * Sync the log to get all the intents out of the AIL. - * This isn't absolutely necessary, but it helps in - * case the unlink transactions would have problems - * pushing the intents out of the way. + * Cancel all the unprocessed intent items now so that we don't + * leave them pinned in the AIL. This can cause the AIL to + * livelock on the pinned item if anyone tries to push the AIL + * (inode reclaim does this) before we get around to + * xfs_log_mount_cancel. */ - xfs_log_force(log->l_mp, XFS_LOG_SYNC); - - xlog_recover_process_iunlinks(log); + xlog_recover_cancel_intents(log); + xfs_alert(log->l_mp, "Failed to recover intents"); + xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); + return error; + } - xlog_recover_check_summary(log); + /* + * Sync the log to get all the intents out of the AIL. This isn't + * absolutely necessary, but it helps in case the unlink transactions + * would have problems pushing the intents out of the way. + */ + xfs_log_force(log->l_mp, XFS_LOG_SYNC); - xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", - log->l_mp->m_logname ? 
log->l_mp->m_logname - : "internal"); - log->l_flags &= ~XLOG_RECOVERY_NEEDED; - } else { - xfs_info(log->l_mp, "Ending clean mount"); + /* + * Now that we've recovered the log and all the intents, we can clear + * the log incompat feature bits in the superblock because there's no + * longer anything to protect. We rely on the AIL push to write out the + * updated superblock after everything else. + */ + if (xfs_clear_incompat_log_features(log->l_mp)) { + error = xfs_sync_sb(log->l_mp, false); + if (error < 0) { + xfs_alert(log->l_mp, + "Failed to clear log incompat features on recovery"); + return error; + } } + + xlog_recover_process_iunlinks(log); + xlog_recover_check_summary(log); return 0; } @@ -3482,7 +3509,7 @@ void xlog_recover_cancel( struct xlog *log) { - if (log->l_flags & XLOG_RECOVERY_NEEDED) + if (xlog_recovery_needed(log)) xlog_recover_cancel_intents(log); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d0755494597f..06dac09eddbd 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -62,7 +62,7 @@ xfs_uuid_mount( /* Publish UUID in struct super_block */ uuid_copy(&mp->m_super->s_uuid, uuid); - if (mp->m_flags & XFS_MOUNT_NOUUID) + if (xfs_has_nouuid(mp)) return 0; if (uuid_is_null(uuid)) { @@ -104,7 +104,7 @@ xfs_uuid_unmount( uuid_t *uuid = &mp->m_sb.sb_uuid; int i; - if (mp->m_flags & XFS_MOUNT_NOUUID) + if (xfs_has_nouuid(mp)) return; mutex_lock(&xfs_uuid_table_mutex); @@ -225,6 +225,7 @@ reread: goto reread; } + mp->m_features |= xfs_sb_version_to_features(sbp); xfs_reinit_percpu_counters(mp); /* no need to be quiet anymore, so reset the buf ops */ @@ -318,7 +319,7 @@ xfs_validate_new_dalign( } } - if (!xfs_sb_version_hasdalign(&mp->m_sb)) { + if (!xfs_has_dalign(mp)) { xfs_warn(mp, "cannot change alignment: superblock does not support data alignment"); return -EINVAL; @@ -349,8 +350,7 @@ xfs_update_alignment( sbp->sb_unit = mp->m_dalign; sbp->sb_width = mp->m_swidth; mp->m_update_sb = true; - } else if ((mp->m_flags & 
XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN && - xfs_sb_version_hasdalign(&mp->m_sb)) { + } else if (!xfs_has_noalign(mp) && xfs_has_dalign(mp)) { mp->m_dalign = sbp->sb_unit; mp->m_swidth = sbp->sb_width; } @@ -365,13 +365,16 @@ void xfs_set_low_space_thresholds( struct xfs_mount *mp) { - int i; + uint64_t dblocks = mp->m_sb.sb_dblocks; + uint64_t rtexts = mp->m_sb.sb_rextents; + int i; - for (i = 0; i < XFS_LOWSP_MAX; i++) { - uint64_t space = mp->m_sb.sb_dblocks; + do_div(dblocks, 100); + do_div(rtexts, 100); - do_div(space, 100); - mp->m_low_space[i] = space * (i + 1); + for (i = 0; i < XFS_LOWSP_MAX; i++) { + mp->m_low_space[i] = dblocks * (i + 1); + mp->m_low_rtexts[i] = rtexts * (i + 1); } } @@ -485,7 +488,7 @@ xfs_check_summary_counts( * counters. If any of them are obviously incorrect, we can recompute * them from the AGF headers in the next step. */ - if (XFS_LAST_UNMOUNT_WAS_CLEAN(mp) && + if (xfs_is_clean(mp) && (mp->m_sb.sb_fdblocks > mp->m_sb.sb_dblocks || !xfs_verify_icount(mp, mp->m_sb.sb_icount) || mp->m_sb.sb_ifree > mp->m_sb.sb_icount)) @@ -502,8 +505,7 @@ xfs_check_summary_counts( * superblock to be correct and we don't need to do anything here. * Otherwise, recalculate the summary counters. */ - if ((!xfs_sb_version_haslazysbcount(&mp->m_sb) || - XFS_LAST_UNMOUNT_WAS_CLEAN(mp)) && + if ((!xfs_has_lazysbcount(mp) || xfs_is_clean(mp)) && !xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS)) return 0; @@ -514,7 +516,8 @@ xfs_check_summary_counts( * Flush and reclaim dirty inodes in preparation for unmount. Inodes and * internal inode structures can be sitting in the CIL and AIL at this point, * so we need to unpin them, write them back and/or reclaim them before unmount - * can proceed. + * can proceed. In other words, callers are required to have inactivated all + * inodes. * * An inode cluster that has been freed can have its buffer still pinned in * memory because the transaction is still sitting in a iclog. 
The stale inodes @@ -543,9 +546,10 @@ xfs_unmount_flush_inodes( xfs_extent_busy_wait_all(mp); flush_workqueue(xfs_discard_wq); - mp->m_flags |= XFS_MOUNT_UNMOUNTING; + set_bit(XFS_OPSTATE_UNMOUNTING, &mp->m_opstate); xfs_ail_push_all_sync(mp->m_ail); + xfs_inodegc_stop(mp); cancel_delayed_work_sync(&mp->m_reclaim_work); xfs_reclaim_inodes(mp); xfs_health_unmount(mp); @@ -607,29 +611,13 @@ xfs_mountfs( xfs_warn(mp, "correcting sb_features alignment problem"); sbp->sb_features2 |= sbp->sb_bad_features2; mp->m_update_sb = true; - - /* - * Re-check for ATTR2 in case it was found in bad_features2 - * slot. - */ - if (xfs_sb_version_hasattr2(&mp->m_sb) && - !(mp->m_flags & XFS_MOUNT_NOATTR2)) - mp->m_flags |= XFS_MOUNT_ATTR2; } - if (xfs_sb_version_hasattr2(&mp->m_sb) && - (mp->m_flags & XFS_MOUNT_NOATTR2)) { - xfs_sb_version_removeattr2(&mp->m_sb); - mp->m_update_sb = true; - - /* update sb_versionnum for the clearing of the morebits */ - if (!sbp->sb_features2) - mp->m_update_sb = true; - } /* always use v2 inodes by default now */ if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; + mp->m_features |= XFS_FEAT_NLINK; mp->m_update_sb = true; } @@ -702,7 +690,7 @@ xfs_mountfs( * cluster size. Full inode chunk alignment must match the chunk size, * but that is checked on sb read verification... */ - if (xfs_sb_version_hassparseinodes(&mp->m_sb) && + if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align != XFS_B_TO_FSBT(mp, igeo->inode_cluster_size_raw)) { xfs_warn(mp, @@ -764,6 +752,10 @@ xfs_mountfs( goto out_free_perag; } + error = xfs_inodegc_register_shrinker(mp); + if (error) + goto out_fail_wait; + /* * Log's mount-time initialization. 
The first part of recovery can place * some items on the AIL, to be handled when recovery is finished or @@ -774,7 +766,7 @@ xfs_mountfs( XFS_FSB_TO_BB(mp, sbp->sb_logblocks)); if (error) { xfs_warn(mp, "log mount failed"); - goto out_fail_wait; + goto out_inodegc_shrinker; } /* Make sure the summary counts are ok. */ @@ -782,6 +774,23 @@ xfs_mountfs( if (error) goto out_log_dealloc; + /* Enable background inode inactivation workers. */ + xfs_inodegc_start(mp); + xfs_blockgc_start(mp); + + /* + * Now that we've recovered any pending superblock feature bit + * additions, we can finish setting up the attr2 behaviour for the + * mount. The noattr2 option overrides the superblock flag, so only + * check the superblock feature flag if the mount option is not set. + */ + if (xfs_has_noattr2(mp)) { + mp->m_features &= ~XFS_FEAT_ATTR2; + } else if (!xfs_has_attr2(mp) && + (mp->m_sb.sb_features2 & XFS_SB_VERSION2_ATTR2BIT)) { + mp->m_features |= XFS_FEAT_ATTR2; + } + /* * Get and sanity-check the root inode. * Save the pointer to it in the mount structure. @@ -825,7 +834,7 @@ xfs_mountfs( * the next remount into writeable mode. Otherwise we would never * perform the update e.g. for the root filesystem. */ - if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (mp->m_update_sb && !xfs_is_readonly(mp)) { error = xfs_sync_sb(mp, false); if (error) { xfs_warn(mp, "failed to write sb changes"); @@ -836,13 +845,11 @@ xfs_mountfs( /* * Initialise the XFS quota management subsystem for this mount */ - if (XFS_IS_QUOTA_RUNNING(mp)) { + if (XFS_IS_QUOTA_ON(mp)) { error = xfs_qm_newmount(mp, "amount, "aflags); if (error) goto out_rtunmount; } else { - ASSERT(!XFS_IS_QUOTA_ON(mp)); - /* * If a file system had quotas running earlier, but decided to * mount without -o uquota/pquota/gquota options, revoke the @@ -884,10 +891,8 @@ xfs_mountfs( * We use the same quiesce mechanism as the rw->ro remount, as they are * semantically identical operations. 
*/ - if ((mp->m_flags & (XFS_MOUNT_RDONLY|XFS_MOUNT_NORECOVERY)) == - XFS_MOUNT_RDONLY) { + if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); - } /* * Complete the quota initialisation, post-log-replay component. @@ -910,7 +915,7 @@ xfs_mountfs( * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + if (!xfs_is_readonly(mp)) { resblks = xfs_default_resblks(mp); error = xfs_reserve_blocks(mp, &resblks, NULL); if (error) @@ -944,6 +949,15 @@ xfs_mountfs( xfs_irele(rip); /* Clean out dquots that might be in memory after quotacheck. */ xfs_qm_unmount(mp); + + /* + * Inactivate all inodes that might still be in memory after a log + * intent recovery failure so that reclaim can free them. Metadata + * inodes and the root directory shouldn't need inactivation, but the + * mount failed for some reason, so pull down all the state and flee. + */ + xfs_inodegc_flush(mp); + /* * Flush all inode reclamation work and flush the log. * We have to do this /after/ rtunmount and qm_unmount because those @@ -958,6 +972,8 @@ xfs_mountfs( xfs_unmount_flush_inodes(mp); out_log_dealloc: xfs_log_mount_cancel(mp); + out_inodegc_shrinker: + unregister_shrinker(&mp->m_inodegc_shrinker); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_buftarg_drain(mp->m_logdev_targp); @@ -991,6 +1007,16 @@ xfs_unmountfs( uint64_t resblks; int error; + /* + * Perform all on-disk metadata updates required to inactivate inodes + * that the VFS evicted earlier in the unmount process. Freeing inodes + * and discarding CoW fork preallocations can cause shape changes to + * the free inode and refcount btrees, respectively, so we must finish + * this before we discard the metadata space reservations. Metadata + * inodes and the root directory do not require inactivation. 
+ */ + xfs_inodegc_flush(mp); + xfs_blockgc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); @@ -1028,6 +1054,7 @@ xfs_unmountfs( #if defined(DEBUG) xfs_errortag_clearall(mp); #endif + unregister_shrinker(&mp->m_inodegc_shrinker); xfs_free_perag(mp); xfs_errortag_del(mp); @@ -1049,20 +1076,12 @@ xfs_fs_writable( { ASSERT(level > SB_UNFROZEN); if ((mp->m_super->s_writers.frozen >= level) || - XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY)) + xfs_is_shutdown(mp) || xfs_is_readonly(mp)) return false; return true; } -/* - * Deltas for the block count can vary from 1 to very large, but lock contention - * only occurs on frequent small block count updates such as in the delayed - * allocation path for buffered writes (page a time updates). Hence we set - * a large batch count (1024) to minimise global counter updates except when - * we get near to ENOSPC and we have to be very accurate with our updates. - */ -#define XFS_FDBLOCKS_BATCH 1024 int xfs_mod_fdblocks( struct xfs_mount *mp, @@ -1210,13 +1229,123 @@ void xfs_force_summary_recalc( struct xfs_mount *mp) { - if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (!xfs_has_lazysbcount(mp)) return; xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); } /* + * Enable a log incompat feature flag in the primary superblock. The caller + * cannot have any other transactions in progress. + */ +int +xfs_add_incompat_log_feature( + struct xfs_mount *mp, + uint32_t feature) +{ + struct xfs_dsb *dsb; + int error; + + ASSERT(hweight32(feature) == 1); + ASSERT(!(feature & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + + /* + * Force the log to disk and kick the background AIL thread to reduce + * the chances that the bwrite will stall waiting for the AIL to unpin + * the primary superblock buffer. This isn't a data integrity + * operation, so we don't need a synchronous push. 
+ */ + error = xfs_log_force(mp, XFS_LOG_SYNC); + if (error) + return error; + xfs_ail_push_all(mp->m_ail); + + /* + * Lock the primary superblock buffer to serialize all callers that + * are trying to set feature bits. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_hold(mp->m_sb_bp); + + if (xfs_is_shutdown(mp)) { + error = -EIO; + goto rele; + } + + if (xfs_sb_has_incompat_log_feature(&mp->m_sb, feature)) + goto rele; + + /* + * Write the primary superblock to disk immediately, because we need + * the log_incompat bit to be set in the primary super now to protect + * the log items that we're going to commit later. + */ + dsb = mp->m_sb_bp->b_addr; + xfs_sb_to_disk(dsb, &mp->m_sb); + dsb->sb_features_log_incompat |= cpu_to_be32(feature); + error = xfs_bwrite(mp->m_sb_bp); + if (error) + goto shutdown; + + /* + * Add the feature bits to the incore superblock before we unlock the + * buffer. + */ + xfs_sb_add_incompat_log_features(&mp->m_sb, feature); + xfs_buf_relse(mp->m_sb_bp); + + /* Log the superblock to disk. */ + return xfs_sync_sb(mp, false); +shutdown: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); +rele: + xfs_buf_relse(mp->m_sb_bp); + return error; +} + +/* + * Clear all the log incompat flags from the superblock. + * + * The caller cannot be in a transaction, must ensure that the log does not + * contain any log items protected by any log incompat bit, and must ensure + * that there are no other threads that depend on the state of the log incompat + * feature flags in the primary super. + * + * Returns true if the superblock is dirty. + */ +bool +xfs_clear_incompat_log_features( + struct xfs_mount *mp) +{ + bool ret = false; + + if (!xfs_has_crc(mp) || + !xfs_sb_has_incompat_log_feature(&mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_ALL) || + xfs_is_shutdown(mp)) + return false; + + /* + * Update the incore superblock. 
We synchronize on the primary super + * buffer lock to be consistent with the add function, though at least + * in theory this shouldn't be necessary. + */ + xfs_buf_lock(mp->m_sb_bp); + xfs_buf_hold(mp->m_sb_bp); + + if (xfs_sb_has_incompat_log_feature(&mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_ALL)) { + xfs_info(mp, "Clearing log incompat feature flags."); + xfs_sb_remove_incompat_log_features(&mp->m_sb); + ret = true; + } + + xfs_buf_relse(mp->m_sb_bp); + return ret; +} + +/* * Update the in-core delayed block counter. * * We prefer to update the counter without having to take a spinlock for every diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index c78b63fe779a..e091f3b3fa15 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -57,6 +57,18 @@ struct xfs_error_cfg { }; /* + * Per-cpu deferred inode inactivation GC lists. + */ +struct xfs_inodegc { + struct llist_head list; + struct work_struct work; + + /* approximate count of inodes in the list */ + unsigned int items; + unsigned int shrinker_hits; +}; + +/* * The struct xfsmount layout is optimised to separate read-mostly variables * from variables that are frequently modified. We put the read-mostly variables * first, then place all the other variables at the end. 
@@ -82,6 +94,9 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ + struct list_head m_mount_list; /* global mount list */ + void __percpu *m_inodegc; /* percpu inodegc structures */ + /* * Optional cache of rt summary level per bitmap block with the * invariant that m_rsum_cache[bbno] <= the minimum i for which @@ -92,10 +107,10 @@ typedef struct xfs_mount { struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct workqueue_struct *m_buf_workqueue; struct workqueue_struct *m_unwritten_workqueue; - struct workqueue_struct *m_cil_workqueue; struct workqueue_struct *m_reclaim_workqueue; - struct workqueue_struct *m_gc_workqueue; struct workqueue_struct *m_sync_workqueue; + struct workqueue_struct *m_blockgc_wq; + struct workqueue_struct *m_inodegc_wq; int m_bsize; /* fs logical block size */ uint8_t m_blkbit_log; /* blocklog + NBBY */ @@ -131,11 +146,13 @@ typedef struct xfs_mount { uint m_rsumsize; /* size of rt summary, bytes */ int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_qflags; /* quota status flags */ - uint64_t m_flags; /* global mount flags */ - int64_t m_low_space[XFS_LOWSP_MAX]; + uint64_t m_features; /* active filesystem features */ + uint64_t m_low_space[XFS_LOWSP_MAX]; + uint64_t m_low_rtexts[XFS_LOWSP_MAX]; struct xfs_ino_geometry m_ino_geo; /* inode geometry */ struct xfs_trans_resv m_resv; /* precomputed res values */ /* low free space thresholds */ + unsigned long m_opstate; /* dynamic state flags */ bool m_always_cow; bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ @@ -193,6 +210,8 @@ typedef struct xfs_mount { xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */ spinlock_t m_agirotor_lock;/* .. 
and lock protecting it */ + /* Memory shrinker to throttle and reprioritize inodegc */ + struct shrinker m_inodegc_shrinker; /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. @@ -225,38 +244,178 @@ typedef struct xfs_mount { #define M_IGEO(mp) (&(mp)->m_ino_geo) /* - * Flags for m_flags. + * Flags for m_features. + * + * These are all the active features in the filesystem, regardless of how + * they are configured. */ -#define XFS_MOUNT_WSYNC (1ULL << 0) /* for nfs - all metadata ops - must be synchronous except - for space allocations */ -#define XFS_MOUNT_UNMOUNTING (1ULL << 1) /* filesystem is unmounting */ -#define XFS_MOUNT_WAS_CLEAN (1ULL << 3) -#define XFS_MOUNT_FS_SHUTDOWN (1ULL << 4) /* atomic stop of all filesystem - operations, typically for - disk errors in metadata */ -#define XFS_MOUNT_DISCARD (1ULL << 5) /* discard unused blocks */ -#define XFS_MOUNT_NOALIGN (1ULL << 7) /* turn off stripe alignment - allocations */ -#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */ -#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */ -#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */ -#define XFS_MOUNT_ALLOCSIZE (1ULL << 12) /* specified allocation size */ -#define XFS_MOUNT_SMALL_INUMS (1ULL << 14) /* user wants 32bit inodes */ -#define XFS_MOUNT_32BITINODES (1ULL << 15) /* inode32 allocator active */ -#define XFS_MOUNT_NOUUID (1ULL << 16) /* ignore uuid during mount */ -#define XFS_MOUNT_IKEEP (1ULL << 18) /* keep empty inode clusters*/ -#define XFS_MOUNT_SWALLOC (1ULL << 19) /* turn on stripe width - * allocation */ -#define XFS_MOUNT_RDONLY (1ULL << 20) /* read-only fs */ -#define XFS_MOUNT_DIRSYNC (1ULL << 21) /* synchronous directory ops */ -#define XFS_MOUNT_LARGEIO (1ULL << 22) /* report large preferred +#define XFS_FEAT_ATTR (1ULL << 0) /* xattrs present in fs */ +#define XFS_FEAT_NLINK (1ULL << 1) /* 32 bit link counts */ +#define XFS_FEAT_QUOTA 
(1ULL << 2) /* quota active */ +#define XFS_FEAT_ALIGN (1ULL << 3) /* inode alignment */ +#define XFS_FEAT_DALIGN (1ULL << 4) /* data alignment */ +#define XFS_FEAT_LOGV2 (1ULL << 5) /* version 2 logs */ +#define XFS_FEAT_SECTOR (1ULL << 6) /* sector size > 512 bytes */ +#define XFS_FEAT_EXTFLG (1ULL << 7) /* unwritten extents */ +#define XFS_FEAT_ASCIICI (1ULL << 8) /* ASCII only case-insens. */ +#define XFS_FEAT_LAZYSBCOUNT (1ULL << 9) /* Superblk counters */ +#define XFS_FEAT_ATTR2 (1ULL << 10) /* dynamic attr fork */ +#define XFS_FEAT_PARENT (1ULL << 11) /* parent pointers */ +#define XFS_FEAT_PROJID32 (1ULL << 12) /* 32 bit project id */ +#define XFS_FEAT_CRC (1ULL << 13) /* metadata CRCs */ +#define XFS_FEAT_V3INODES (1ULL << 14) /* Version 3 inodes */ +#define XFS_FEAT_PQUOTINO (1ULL << 15) /* non-shared proj/grp quotas */ +#define XFS_FEAT_FTYPE (1ULL << 16) /* inode type in dir */ +#define XFS_FEAT_FINOBT (1ULL << 17) /* free inode btree */ +#define XFS_FEAT_RMAPBT (1ULL << 18) /* reverse map btree */ +#define XFS_FEAT_REFLINK (1ULL << 19) /* reflinked files */ +#define XFS_FEAT_SPINODES (1ULL << 20) /* sparse inode chunks */ +#define XFS_FEAT_META_UUID (1ULL << 21) /* metadata UUID */ +#define XFS_FEAT_REALTIME (1ULL << 22) /* realtime device present */ +#define XFS_FEAT_INOBTCNT (1ULL << 23) /* inobt block counts */ +#define XFS_FEAT_BIGTIME (1ULL << 24) /* large timestamps */ +#define XFS_FEAT_NEEDSREPAIR (1ULL << 25) /* needs xfs_repair */ + +/* Mount features */ +#define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ +#define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ +#define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ +#define XFS_FEAT_LARGE_IOSIZE (1ULL << 51) /* report large preferred * I/O size in stat() */ -#define XFS_MOUNT_FILESTREAMS (1ULL << 24) /* enable the filestreams - allocator */ -#define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ -#define XFS_MOUNT_DAX_ALWAYS (1ULL 
<< 26) -#define XFS_MOUNT_DAX_NEVER (1ULL << 27) +#define XFS_FEAT_WSYNC (1ULL << 52) /* synchronous metadata ops */ +#define XFS_FEAT_DIRSYNC (1ULL << 53) /* synchronous directory ops */ +#define XFS_FEAT_DISCARD (1ULL << 54) /* discard unused blocks */ +#define XFS_FEAT_GRPID (1ULL << 55) /* group-ID assigned from directory */ +#define XFS_FEAT_SMALL_INUMS (1ULL << 56) /* user wants 32bit inodes */ +#define XFS_FEAT_IKEEP (1ULL << 57) /* keep empty inode clusters*/ +#define XFS_FEAT_SWALLOC (1ULL << 58) /* stripe width allocation */ +#define XFS_FEAT_FILESTREAMS (1ULL << 59) /* use filestreams allocator */ +#define XFS_FEAT_DAX_ALWAYS (1ULL << 60) /* DAX always enabled */ +#define XFS_FEAT_DAX_NEVER (1ULL << 61) /* DAX never enabled */ +#define XFS_FEAT_NORECOVERY (1ULL << 62) /* no recovery - dirty fs */ +#define XFS_FEAT_NOUUID (1ULL << 63) /* ignore uuid during mount */ + +#define __XFS_HAS_FEAT(name, NAME) \ +static inline bool xfs_has_ ## name (struct xfs_mount *mp) \ +{ \ + return mp->m_features & XFS_FEAT_ ## NAME; \ +} + +/* Some features can be added dynamically so they need a set wrapper, too. 
*/ +#define __XFS_ADD_FEAT(name, NAME) \ + __XFS_HAS_FEAT(name, NAME); \ +static inline void xfs_add_ ## name (struct xfs_mount *mp) \ +{ \ + mp->m_features |= XFS_FEAT_ ## NAME; \ + xfs_sb_version_add ## name(&mp->m_sb); \ +} + +/* Superblock features */ +__XFS_ADD_FEAT(attr, ATTR) +__XFS_HAS_FEAT(nlink, NLINK) +__XFS_ADD_FEAT(quota, QUOTA) +__XFS_HAS_FEAT(align, ALIGN) +__XFS_HAS_FEAT(dalign, DALIGN) +__XFS_HAS_FEAT(logv2, LOGV2) +__XFS_HAS_FEAT(sector, SECTOR) +__XFS_HAS_FEAT(extflg, EXTFLG) +__XFS_HAS_FEAT(asciici, ASCIICI) +__XFS_HAS_FEAT(lazysbcount, LAZYSBCOUNT) +__XFS_ADD_FEAT(attr2, ATTR2) +__XFS_HAS_FEAT(parent, PARENT) +__XFS_ADD_FEAT(projid32, PROJID32) +__XFS_HAS_FEAT(crc, CRC) +__XFS_HAS_FEAT(v3inodes, V3INODES) +__XFS_HAS_FEAT(pquotino, PQUOTINO) +__XFS_HAS_FEAT(ftype, FTYPE) +__XFS_HAS_FEAT(finobt, FINOBT) +__XFS_HAS_FEAT(rmapbt, RMAPBT) +__XFS_HAS_FEAT(reflink, REFLINK) +__XFS_HAS_FEAT(sparseinodes, SPINODES) +__XFS_HAS_FEAT(metauuid, META_UUID) +__XFS_HAS_FEAT(realtime, REALTIME) +__XFS_HAS_FEAT(inobtcounts, INOBTCNT) +__XFS_HAS_FEAT(bigtime, BIGTIME) +__XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) + +/* + * Mount features + * + * These do not change dynamically - features that can come and go, such as 32 + * bit inodes and read-only state, are kept as operational state rather than + * features. + */ +__XFS_HAS_FEAT(noattr2, NOATTR2) +__XFS_HAS_FEAT(noalign, NOALIGN) +__XFS_HAS_FEAT(allocsize, ALLOCSIZE) +__XFS_HAS_FEAT(large_iosize, LARGE_IOSIZE) +__XFS_HAS_FEAT(wsync, WSYNC) +__XFS_HAS_FEAT(dirsync, DIRSYNC) +__XFS_HAS_FEAT(discard, DISCARD) +__XFS_HAS_FEAT(grpid, GRPID) +__XFS_HAS_FEAT(small_inums, SMALL_INUMS) +__XFS_HAS_FEAT(ikeep, IKEEP) +__XFS_HAS_FEAT(swalloc, SWALLOC) +__XFS_HAS_FEAT(filestreams, FILESTREAMS) +__XFS_HAS_FEAT(dax_always, DAX_ALWAYS) +__XFS_HAS_FEAT(dax_never, DAX_NEVER) +__XFS_HAS_FEAT(norecovery, NORECOVERY) +__XFS_HAS_FEAT(nouuid, NOUUID) + +/* + * Operational mount state flags + * + * Use these with atomic bit ops only! 
+ */ +#define XFS_OPSTATE_UNMOUNTING 0 /* filesystem is unmounting */ +#define XFS_OPSTATE_CLEAN 1 /* mount was clean */ +#define XFS_OPSTATE_SHUTDOWN 2 /* stop all fs operations */ +#define XFS_OPSTATE_INODE32 3 /* inode32 allocator active */ +#define XFS_OPSTATE_READONLY 4 /* read-only fs */ + +/* + * If set, inactivation worker threads will be scheduled to process queued + * inodegc work. If not, queued inodes remain in memory waiting to be + * processed. + */ +#define XFS_OPSTATE_INODEGC_ENABLED 5 +/* + * If set, background speculative prealloc gc worker threads will be scheduled + * to process queued blockgc work. If not, inodes retain their preallocations + * until explicitly deleted. + */ +#define XFS_OPSTATE_BLOCKGC_ENABLED 6 + +#define __XFS_IS_OPSTATE(name, NAME) \ +static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ +{ \ + return test_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ +} \ +static inline bool xfs_clear_ ## name (struct xfs_mount *mp) \ +{ \ + return test_and_clear_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ +} \ +static inline bool xfs_set_ ## name (struct xfs_mount *mp) \ +{ \ + return test_and_set_bit(XFS_OPSTATE_ ## NAME, &mp->m_opstate); \ +} + +__XFS_IS_OPSTATE(unmounting, UNMOUNTING) +__XFS_IS_OPSTATE(clean, CLEAN) +__XFS_IS_OPSTATE(shutdown, SHUTDOWN) +__XFS_IS_OPSTATE(inode32, INODE32) +__XFS_IS_OPSTATE(readonly, READONLY) +__XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) +__XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) + +#define XFS_OPSTATE_STRINGS \ + { (1UL << XFS_OPSTATE_UNMOUNTING), "unmounting" }, \ + { (1UL << XFS_OPSTATE_CLEAN), "clean" }, \ + { (1UL << XFS_OPSTATE_SHUTDOWN), "shutdown" }, \ + { (1UL << XFS_OPSTATE_INODE32), "inode32" }, \ + { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ + { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ + { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" } /* * Max and min values for mount-option defined I/O @@ -265,9 +424,7 @@ typedef struct xfs_mount { #define 
XFS_MAX_IO_LOG 30 /* 1G */ #define XFS_MIN_IO_LOG PAGE_SHIFT -#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \ - ((mp)->m_flags & XFS_MOUNT_WAS_CLEAN) -#define XFS_FORCED_SHUTDOWN(mp) ((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN) +#define xfs_is_shutdown(mp) xfs_is_shutdown(mp) void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, int lnnum); #define xfs_force_shutdown(m,f) \ @@ -278,6 +435,12 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, #define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */ #define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */ +#define XFS_SHUTDOWN_STRINGS \ + { SHUTDOWN_META_IO_ERROR, "metadata_io" }, \ + { SHUTDOWN_LOG_IO_ERROR, "log_io" }, \ + { SHUTDOWN_FORCE_UMOUNT, "force_umount" }, \ + { SHUTDOWN_CORRUPT_INCORE, "corruption" } + /* * Flags for xfs_mountfs */ @@ -306,6 +469,15 @@ extern uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); +/* + * Deltas for the block count can vary from 1 to very large, but lock contention + * only occurs on frequent small block count updates such as in the delayed + * allocation path for buffered writes (page a time updates). Hence we set + * a large batch count (1024) to minimise global counter updates except when + * we get near to ENOSPC and we have to be very accurate with our updates. 
+ */ +#define XFS_FDBLOCKS_BATCH 1024 + extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, bool reserved); extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); @@ -325,6 +497,8 @@ int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb, struct xfs_error_cfg * xfs_error_get_cfg(struct xfs_mount *mp, int error_class, int error); void xfs_force_summary_recalc(struct xfs_mount *mp); +int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); +bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_mount *mp, int64_t delta); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 956cca24e67f..5e1d29d8b2e7 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -92,7 +92,7 @@ xfs_fs_map_blocks( uint lock_flags; int error = 0; - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index fe341f3fd419..5608066d6e53 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -157,7 +157,7 @@ xfs_qm_dqpurge( } ASSERT(atomic_read(&dqp->q_pincount) == 0); - ASSERT(XFS_FORCED_SHUTDOWN(mp) || + ASSERT(xfs_is_shutdown(mp) || !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); @@ -185,17 +185,13 @@ out_unlock: /* * Purge the dquot cache. 
*/ -void +static void xfs_qm_dqpurge_all( - struct xfs_mount *mp, - uint flags) + struct xfs_mount *mp) { - if (flags & XFS_QMOPT_UQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL); - if (flags & XFS_QMOPT_GQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL); - if (flags & XFS_QMOPT_PQUOTA) - xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_USER, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_GROUP, xfs_qm_dqpurge, NULL); + xfs_qm_dquot_walk(mp, XFS_DQTYPE_PROJ, xfs_qm_dqpurge, NULL); } /* @@ -206,7 +202,7 @@ xfs_qm_unmount( struct xfs_mount *mp) { if (mp->m_quotainfo) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_dqpurge_all(mp); xfs_qm_destroy_quotainfo(mp); } } @@ -299,8 +295,6 @@ xfs_qm_need_dqattach( { struct xfs_mount *mp = ip->i_mount; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return false; if (!XFS_IS_QUOTA_ON(mp)) return false; if (!XFS_NOT_DQATTACHED(mp, ip)) @@ -635,7 +629,7 @@ xfs_qm_init_quotainfo( struct xfs_quotainfo *qinf; int error; - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp)); qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0); @@ -662,7 +656,7 @@ xfs_qm_init_quotainfo( /* Precalc some constants */ qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); qinf->qi_dqperchunk = xfs_calc_dquots_per_chunk(qinf->qi_dqchunklen); - if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + if (xfs_has_bigtime(mp)) { qinf->qi_expiry_min = xfs_dq_bigtime_to_unix(XFS_DQ_BIGTIME_EXPIRY_MIN); qinf->qi_expiry_max = @@ -680,11 +674,11 @@ xfs_qm_init_quotainfo( xfs_qm_init_timelimits(mp, XFS_DQTYPE_GROUP); xfs_qm_init_timelimits(mp, XFS_DQTYPE_PROJ); - if (XFS_IS_UQUOTA_RUNNING(mp)) + if (XFS_IS_UQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_USER, qinf); - if (XFS_IS_GQUOTA_RUNNING(mp)) + if (XFS_IS_GQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_GROUP, qinf); - if (XFS_IS_PQUOTA_RUNNING(mp)) + if (XFS_IS_PQUOTA_ON(mp)) 
xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf); qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; @@ -755,7 +749,7 @@ xfs_qm_qino_alloc( * with PQUOTA, just use sb_gquotino for sb_pquotino and * vice-versa. */ - if (!xfs_sb_version_has_pquotino(&mp->m_sb) && + if (!xfs_has_pquotino(mp) && (flags & (XFS_QMOPT_PQUOTA|XFS_QMOPT_GQUOTA))) { xfs_ino_t ino = NULLFSINO; @@ -808,9 +802,9 @@ xfs_qm_qino_alloc( */ spin_lock(&mp->m_sb_lock); if (flags & XFS_QMOPT_SBVERSION) { - ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); + ASSERT(!xfs_has_quota(mp)); - xfs_sb_version_addquota(&mp->m_sb); + xfs_add_quota(mp); mp->m_sb.sb_uquotino = NULLFSINO; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; @@ -829,7 +823,7 @@ xfs_qm_qino_alloc( error = xfs_trans_commit(tp); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); } if (need_alloc) @@ -896,11 +890,11 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; - if (xfs_sb_version_hasbigtime(&mp->m_sb)) + if (xfs_has_bigtime(mp)) ddq->d_type |= XFS_DQTYPE_BIGTIME; } - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { xfs_update_cksum((char *)&dqb[j], sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -1147,7 +1141,7 @@ xfs_qm_dqusage_adjust( xfs_filblks_t rtblks = 0; /* total rt blks */ int error; - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp)); /* * rootino must have its resources accounted for, not so with the quota @@ -1288,7 +1282,7 @@ xfs_qm_quotacheck( flags = 0; ASSERT(uip || gip || pip); - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp)); xfs_notice(mp, "Quotacheck needed: Please wait."); @@ -1359,7 +1353,7 @@ xfs_qm_quotacheck( * at this point (because we intentionally didn't in dqget_noattach). 
*/ if (error) { - xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL); + xfs_qm_dqpurge_all(mp); goto error_return; } @@ -1418,7 +1412,7 @@ xfs_qm_mount_quotas( goto write_changes; } - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); + ASSERT(XFS_IS_QUOTA_ON(mp)); /* * Allocate the quotainfo structure inside the mount struct, and @@ -1473,7 +1467,7 @@ xfs_qm_mount_quotas( * the incore structures are convinced that quotas are * off, but the on disk superblock doesn't know that ! */ - ASSERT(!(XFS_IS_QUOTA_RUNNING(mp))); + ASSERT(!(XFS_IS_QUOTA_ON(mp))); xfs_alert(mp, "%s: Superblock update failed!", __func__); } @@ -1504,7 +1498,7 @@ xfs_qm_init_quotainos( /* * Get the uquota and gquota inodes */ - if (xfs_sb_version_hasquota(&mp->m_sb)) { + if (xfs_has_quota(mp)) { if (XFS_IS_UQUOTA_ON(mp) && mp->m_sb.sb_uquotino != NULLFSINO) { ASSERT(mp->m_sb.sb_uquotino > 0); @@ -1645,7 +1639,7 @@ xfs_qm_vop_dqalloc( int error; uint lockflags; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; lockflags = XFS_ILOCK_EXCL; @@ -1776,7 +1770,7 @@ xfs_qm_vop_chown( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); + ASSERT(XFS_IS_QUOTA_ON(ip->i_mount)); /* old dquot */ prevdq = *IO_olddq; @@ -1829,7 +1823,7 @@ xfs_qm_vop_rename_dqattach( struct xfs_mount *mp = i_tab[0]->i_mount; int i; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; for (i = 0; (i < 4 && i_tab[i]); i++) { @@ -1860,7 +1854,7 @@ xfs_qm_vop_create_dqattach( { struct xfs_mount *mp = tp->t_mountp; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); @@ -1888,3 +1882,37 @@ xfs_qm_vop_create_dqattach( } } +/* Decide if this inode's dquot is near an enforcement boundary. 
*/ +bool +xfs_inode_near_dquot_enforcement( + struct xfs_inode *ip, + xfs_dqtype_t type) +{ + struct xfs_dquot *dqp; + int64_t freesp; + + /* We only care for quotas that are enabled and enforced. */ + dqp = xfs_inode_dquot(ip, type); + if (!dqp || !xfs_dquot_is_enforced(dqp)) + return false; + + if (xfs_dquot_res_over_limits(&dqp->q_ino) || + xfs_dquot_res_over_limits(&dqp->q_rtb)) + return true; + + /* For space on the data device, check the various thresholds. */ + if (!dqp->q_prealloc_hi_wmark) + return false; + + if (dqp->q_blk.reserved < dqp->q_prealloc_lo_wmark) + return false; + + if (dqp->q_blk.reserved >= dqp->q_prealloc_hi_wmark) + return true; + + freesp = dqp->q_prealloc_hi_wmark - dqp->q_blk.reserved; + if (freesp < dqp->q_low_space[XFS_QLOWSP_5_PCNT]) + return true; + + return false; +} diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index ebbb484c49dc..442a0f97a9d4 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -140,9 +140,6 @@ struct xfs_dquot_acct { extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); -/* dquot stuff */ -extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); - /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); extern int xfs_qm_scall_getquota(struct xfs_mount *mp, diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index df00dfbf5c9d..b77673dd0558 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -75,7 +75,7 @@ xfs_qm_newmount( uint quotaondisk; uint uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0; - quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) && + quotaondisk = xfs_has_quota(mp) && (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT); if (quotaondisk) { diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 13a56e1ea15c..47fe60e1a887 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -19,91 +19,11 @@ #include "xfs_qm.h" #include "xfs_icache.h" -STATIC int -xfs_qm_log_quotaoff( - struct xfs_mount *mp, - struct xfs_qoff_logitem 
**qoffstartp, - uint flags) -{ - struct xfs_trans *tp; - int error; - struct xfs_qoff_logitem *qoffi; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp); - if (error) - goto out; - - qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; - spin_unlock(&mp->m_sb_lock); - - xfs_log_sb(tp); - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp); - if (error) - goto out; - - *qoffstartp = qoffi; -out: - return error; -} - -STATIC int -xfs_qm_log_quotaoff_end( - struct xfs_mount *mp, - struct xfs_qoff_logitem **startqoff, - uint flags) -{ - struct xfs_trans *tp; - int error; - struct xfs_qoff_logitem *qoffi; - - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp); - if (error) - return error; - - qoffi = xfs_trans_get_qoff_item(tp, *startqoff, - flags & XFS_ALL_QUOTA_ACCT); - xfs_trans_log_quotaoff_item(tp, qoffi); - *startqoff = NULL; - - /* - * We have to make sure that the transaction is secure on disk before we - * return and actually stop quota accounting. So, make it synchronous. - * We don't care about quotoff's performance. - */ - xfs_trans_set_sync(tp); - return xfs_trans_commit(tp); -} - -/* - * Turn off quota accounting and/or enforcement for all udquots and/or - * gdquots. Called only at unmount time. - * - * This assumes that there are no dquots of this file system cached - * incore, and modifies the ondisk dquot directly. Therefore, for example, - * it is an error to call this twice, without purging the cache. 
- */ int xfs_qm_scall_quotaoff( xfs_mount_t *mp, uint flags) { - struct xfs_quotainfo *q = mp->m_quotainfo; - uint dqtype; - int error; - uint inactivate_flags; - struct xfs_qoff_logitem *qoffstart = NULL; - /* * No file system can have quotas enabled on disk but not in core. * Note that quota utilities (like quotaoff) _expect_ @@ -111,160 +31,23 @@ xfs_qm_scall_quotaoff( */ if ((mp->m_qflags & flags) == 0) return -EEXIST; - error = 0; - - flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); - - /* - * We don't want to deal with two quotaoffs messing up each other, - * so we're going to serialize it. quotaoff isn't exactly a performance - * critical thing. - * If quotaoff, then we must be dealing with the root filesystem. - */ - ASSERT(q); - mutex_lock(&q->qi_quotaofflock); /* - * If we're just turning off quota enforcement, change mp and go. + * We do not support actually turning off quota accounting any more. + * Just log a warning and ignore the accounting related flags. */ - if ((flags & XFS_ALL_QUOTA_ACCT) == 0) { - mp->m_qflags &= ~(flags); + if (flags & XFS_ALL_QUOTA_ACCT) + xfs_info(mp, "disabling of quota accounting not supported."); - spin_lock(&mp->m_sb_lock); - mp->m_sb.sb_qflags = mp->m_qflags; - spin_unlock(&mp->m_sb_lock); - mutex_unlock(&q->qi_quotaofflock); - - /* XXX what to do if error ? Revert back to old vals incore ? */ - return xfs_sync_sb(mp, false); - } - - dqtype = 0; - inactivate_flags = 0; - /* - * If accounting is off, we must turn enforcement off, clear the - * quota 'CHKD' certificate to make it known that we have to - * do a quotacheck the next time this quota is turned on. 
- */ - if (flags & XFS_UQUOTA_ACCT) { - dqtype |= XFS_QMOPT_UQUOTA; - flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD); - inactivate_flags |= XFS_UQUOTA_ACTIVE; - } - if (flags & XFS_GQUOTA_ACCT) { - dqtype |= XFS_QMOPT_GQUOTA; - flags |= (XFS_GQUOTA_CHKD | XFS_GQUOTA_ENFD); - inactivate_flags |= XFS_GQUOTA_ACTIVE; - } - if (flags & XFS_PQUOTA_ACCT) { - dqtype |= XFS_QMOPT_PQUOTA; - flags |= (XFS_PQUOTA_CHKD | XFS_PQUOTA_ENFD); - inactivate_flags |= XFS_PQUOTA_ACTIVE; - } - - /* - * Nothing to do? Don't complain. This happens when we're just - * turning off quota enforcement. - */ - if ((mp->m_qflags & flags) == 0) - goto out_unlock; - - /* - * Write the LI_QUOTAOFF log record, and do SB changes atomically, - * and synchronously. If we fail to write, we should abort the - * operation as it cannot be recovered safely if we crash. - */ - error = xfs_qm_log_quotaoff(mp, &qoffstart, flags); - if (error) - goto out_unlock; - - /* - * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct - * to take care of the race between dqget and quotaoff. We don't take - * any special locks to reset these bits. All processes need to check - * these bits *after* taking inode lock(s) to see if the particular - * quota type is in the process of being turned off. If *ACTIVE, it is - * guaranteed that all dquot structures and all quotainode ptrs will all - * stay valid as long as that inode is kept locked. - * - * There is no turning back after this. - */ - mp->m_qflags &= ~inactivate_flags; - - /* - * Give back all the dquot reference(s) held by inodes. - * Here we go thru every single incore inode in this file system, and - * do a dqrele on the i_udquot/i_gdquot that it may have. - * Essentially, as long as somebody has an inode locked, this guarantees - * that quotas will not be turned off. 
This is handy because in a - * transaction once we lock the inode(s) and check for quotaon, we can - * depend on the quota inodes (and other things) being valid as long as - * we keep the lock(s). - */ - error = xfs_dqrele_all_inodes(mp, flags); - ASSERT(!error); - - /* - * Next we make the changes in the quota flag in the mount struct. - * This isn't protected by a particular lock directly, because we - * don't want to take a mrlock every time we depend on quotas being on. - */ - mp->m_qflags &= ~flags; - - /* - * Go through all the dquots of this file system and purge them, - * according to what was turned off. - */ - xfs_qm_dqpurge_all(mp, dqtype); - - /* - * Transactions that had started before ACTIVE state bit was cleared - * could have logged many dquots, so they'd have higher LSNs than - * the first QUOTAOFF log record does. If we happen to crash when - * the tail of the log has gone past the QUOTAOFF record, but - * before the last dquot modification, those dquots __will__ - * recover, and that's not good. - * - * So, we have QUOTAOFF start and end logitems; the start - * logitem won't get overwritten until the end logitem appears... - */ - error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags); - if (error) { - /* We're screwed now. Shutdown is the only option. */ - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - goto out_unlock; - } - - /* - * If all quotas are completely turned off, close shop. - */ - if (mp->m_qflags == 0) { - mutex_unlock(&q->qi_quotaofflock); - xfs_qm_destroy_quotainfo(mp); - return 0; - } - - /* - * Release our quotainode references if we don't need them anymore. 
- */ - if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) { - xfs_irele(q->qi_uquotaip); - q->qi_uquotaip = NULL; - } - if ((dqtype & XFS_QMOPT_GQUOTA) && q->qi_gquotaip) { - xfs_irele(q->qi_gquotaip); - q->qi_gquotaip = NULL; - } - if ((dqtype & XFS_QMOPT_PQUOTA) && q->qi_pquotaip) { - xfs_irele(q->qi_pquotaip); - q->qi_pquotaip = NULL; - } + mutex_lock(&mp->m_quotainfo->qi_quotaofflock); + mp->m_qflags &= ~(flags & XFS_ALL_QUOTA_ENFD); + spin_lock(&mp->m_sb_lock); + mp->m_sb.sb_qflags = mp->m_qflags; + spin_unlock(&mp->m_sb_lock); + mutex_unlock(&mp->m_quotainfo->qi_quotaofflock); -out_unlock: - if (error && qoffstart) - xfs_qm_qoff_logitem_relse(qoffstart); - mutex_unlock(&q->qi_quotaofflock); - return error; + /* XXX what to do if error ? Revert back to old vals incore ? */ + return xfs_sync_sb(mp, false); } STATIC int @@ -322,7 +105,7 @@ xfs_qm_scall_trunc_qfiles( { int error = -EINVAL; - if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0 || + if (!xfs_has_quota(mp) || flags == 0 || (flags & ~XFS_QMOPT_QUOTALL)) { xfs_debug(mp, "%s: flags=%x m_qflags=%x", __func__, flags, mp->m_qflags); @@ -421,7 +204,7 @@ xfs_qm_scall_quotaon( (mp->m_qflags & XFS_GQUOTA_ACCT))) return 0; - if (! XFS_IS_QUOTA_RUNNING(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return -ESRCH; /* @@ -698,6 +481,10 @@ xfs_qm_scall_getquota( struct xfs_dquot *dqp; int error; + /* Flush inodegc work at the start of a quota reporting scan. */ + if (id == 0) + xfs_inodegc_flush(mp); + /* * Try to get the dquot. We don't want it allocated on disk, so don't * set doalloc. If it doesn't exist, we'll get ENOENT back. @@ -736,6 +523,10 @@ xfs_qm_scall_getquota_next( struct xfs_dquot *dqp; int error; + /* Flush inodegc work at the start of a quota reporting scan. 
*/ + if (*id == 0) + xfs_inodegc_flush(mp); + error = xfs_qm_dqget_next(mp, *id, type, &dqp); if (error) return error; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index d00d01302545..dcc785fdd345 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -113,6 +113,7 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks) { return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false); } +bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type); #else static inline int xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid, @@ -168,6 +169,7 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp, #define xfs_qm_mount_quotas(mp) #define xfs_qm_unmount(mp) #define xfs_qm_unmount_quotas(mp) +#define xfs_inode_near_dquot_enforcement(ip, type) (false) #endif /* CONFIG_XFS_QUOTA */ static inline int diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 88d70c236a54..07989bd67728 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -60,18 +60,18 @@ xfs_fs_get_quota_state( struct xfs_quotainfo *q = mp->m_quotainfo; memset(state, 0, sizeof(*state)); - if (!XFS_IS_QUOTA_RUNNING(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; state->s_incoredqs = q->qi_dquots; - if (XFS_IS_UQUOTA_RUNNING(mp)) + if (XFS_IS_UQUOTA_ON(mp)) state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_UQUOTA_ENFORCED(mp)) state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; - if (XFS_IS_GQUOTA_RUNNING(mp)) + if (XFS_IS_GQUOTA_ON(mp)) state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_GQUOTA_ENFORCED(mp)) state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; - if (XFS_IS_PQUOTA_RUNNING(mp)) + if (XFS_IS_PQUOTA_ON(mp)) state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED; if (XFS_IS_PQUOTA_ENFORCED(mp)) state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; @@ -114,10 +114,8 @@ xfs_fs_set_info( if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if 
(!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS; if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK) return -EINVAL; if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0) @@ -164,7 +162,7 @@ xfs_quota_enable( if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return -ENOSYS; return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags)); @@ -179,10 +177,8 @@ xfs_quota_disable( if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -EINVAL; + return -ENOSYS; return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags)); } @@ -223,10 +219,8 @@ xfs_fs_get_dqblk( struct xfs_mount *mp = XFS_M(sb); xfs_dqid_t id; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS; id = from_kqid(&init_user_ns, qid); return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq); @@ -243,10 +237,8 @@ xfs_fs_get_nextdqblk( struct xfs_mount *mp = XFS_M(sb); xfs_dqid_t id; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS; id = from_kqid(&init_user_ns, *qid); ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type), @@ -269,10 +261,8 @@ xfs_fs_set_dqblk( if (sb_rdonly(sb)) return -EROFS; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; if (!XFS_IS_QUOTA_ON(mp)) - return -ESRCH; + return -ENOSYS; return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), xfs_quota_type(qid.type), qdq); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 746f4eda724c..46904b793bd4 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -423,7 +423,7 @@ xfs_cui_validate_phys( struct xfs_mount *mp, struct xfs_phys_extent *refc) { - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return false; if (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS) @@ -522,6 +522,9 @@ xfs_cui_item_recover( error = 
xfs_trans_log_finish_refcount_update(tp, cudp, type, refc->pe_startblock, refc->pe_len, &new_fsb, &new_len, &rcur); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + refc, sizeof(*refc)); if (error) goto abort_error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index c256104772cb..76355f293488 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -759,7 +759,7 @@ xfs_reflink_recover_cow( xfs_agnumber_t agno; int error = 0; - if (!xfs_sb_version_hasreflink(&mp->m_sb)) + if (!xfs_has_reflink(mp)) return 0; for_each_perag(mp, agno, pag) { @@ -967,7 +967,7 @@ xfs_reflink_ag_has_free_space( struct xfs_perag *pag; int error = 0; - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return 0; pag = xfs_perag_get(mp, agno); diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index 487b00434b96..bea65f2fe657 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -8,8 +8,7 @@ static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip) { - return ip->i_mount->m_always_cow && - xfs_sb_version_hasreflink(&ip->i_mount->m_sb); + return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); } static inline bool xfs_is_cow_inode(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index dc4f0c9f0897..5f0695980467 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -466,7 +466,7 @@ xfs_rui_validate_map( struct xfs_mount *mp, struct xfs_map_extent *rmap) { - if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) + if (!xfs_has_rmapbt(mp)) return false; if (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS) @@ -578,6 +578,9 @@ xfs_rui_item_recover( rmap->me_owner, whichfork, rmap->me_startoff, rmap->me_startblock, rmap->me_len, state, &rcur); + if (error == -EFSCORRUPTED) + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + rmap, sizeof(*rmap)); if (error) goto abort_error; diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 
699066fb9052..b8c79ee791af 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -951,8 +951,7 @@ xfs_growfs_rt( return -EINVAL; /* Unsupported realtime features. */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb) || - xfs_sb_version_hasreflink(&mp->m_sb)) + if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)) return -EOPNOTSUPP; nrblocks = in->newblocks; @@ -1131,6 +1130,9 @@ error_cancel: error = xfs_trans_commit(tp); if (error) break; + + /* Ensure the mount RT feature flag is now set. */ + mp->m_features |= XFS_FEAT_REALTIME; } if (error) goto out_free; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index ed885620589c..91b00289509b 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -22,9 +22,9 @@ struct xfs_rtalloc_rec { }; typedef int (*xfs_rtalloc_query_range_fn)( - struct xfs_trans *tp, - struct xfs_rtalloc_rec *rec, - void *priv); + struct xfs_trans *tp, + const struct xfs_rtalloc_rec *rec, + void *priv); #ifdef CONFIG_XFS_RT /* @@ -124,10 +124,9 @@ int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); int xfs_rtalloc_query_range(struct xfs_trans *tp, - struct xfs_rtalloc_rec *low_rec, - struct xfs_rtalloc_rec *high_rec, - xfs_rtalloc_query_range_fn fn, - void *priv); + const struct xfs_rtalloc_rec *low_rec, + const struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, void *priv); int xfs_rtalloc_query_all(struct xfs_trans *tp, xfs_rtalloc_query_range_fn fn, void *priv); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 102cbd606633..c4e0cd1c1c8c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -49,6 +49,28 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif +#ifdef CONFIG_HOTPLUG_CPU +static LIST_HEAD(xfs_mount_list); +static DEFINE_SPINLOCK(xfs_mount_list_lock); + +static inline void xfs_mount_list_add(struct xfs_mount *mp) 
+{ + spin_lock(&xfs_mount_list_lock); + list_add(&mp->m_mount_list, &xfs_mount_list); + spin_unlock(&xfs_mount_list_lock); +} + +static inline void xfs_mount_list_del(struct xfs_mount *mp) +{ + spin_lock(&xfs_mount_list_lock); + list_del(&mp->m_mount_list); + spin_unlock(&xfs_mount_list_lock); +} +#else /* !CONFIG_HOTPLUG_CPU */ +static inline void xfs_mount_list_add(struct xfs_mount *mp) {} +static inline void xfs_mount_list_del(struct xfs_mount *mp) {} +#endif + enum xfs_dax_mode { XFS_DAX_INODE = 0, XFS_DAX_ALWAYS = 1, @@ -62,15 +84,15 @@ xfs_mount_set_dax_mode( { switch (mode) { case XFS_DAX_INODE: - mp->m_flags &= ~(XFS_MOUNT_DAX_ALWAYS | XFS_MOUNT_DAX_NEVER); + mp->m_features &= ~(XFS_FEAT_DAX_ALWAYS | XFS_FEAT_DAX_NEVER); break; case XFS_DAX_ALWAYS: - mp->m_flags |= XFS_MOUNT_DAX_ALWAYS; - mp->m_flags &= ~XFS_MOUNT_DAX_NEVER; + mp->m_features |= XFS_FEAT_DAX_ALWAYS; + mp->m_features &= ~XFS_FEAT_DAX_NEVER; break; case XFS_DAX_NEVER: - mp->m_flags |= XFS_MOUNT_DAX_NEVER; - mp->m_flags &= ~XFS_MOUNT_DAX_ALWAYS; + mp->m_features |= XFS_FEAT_DAX_NEVER; + mp->m_features &= ~XFS_FEAT_DAX_ALWAYS; break; } } @@ -154,33 +176,32 @@ xfs_fs_show_options( { static struct proc_xfs_info xfs_info_set[] = { /* the few simple ones we can get from the mount struct */ - { XFS_MOUNT_IKEEP, ",ikeep" }, - { XFS_MOUNT_WSYNC, ",wsync" }, - { XFS_MOUNT_NOALIGN, ",noalign" }, - { XFS_MOUNT_SWALLOC, ",swalloc" }, - { XFS_MOUNT_NOUUID, ",nouuid" }, - { XFS_MOUNT_NORECOVERY, ",norecovery" }, - { XFS_MOUNT_ATTR2, ",attr2" }, - { XFS_MOUNT_FILESTREAMS, ",filestreams" }, - { XFS_MOUNT_GRPID, ",grpid" }, - { XFS_MOUNT_DISCARD, ",discard" }, - { XFS_MOUNT_LARGEIO, ",largeio" }, - { XFS_MOUNT_DAX_ALWAYS, ",dax=always" }, - { XFS_MOUNT_DAX_NEVER, ",dax=never" }, + { XFS_FEAT_IKEEP, ",ikeep" }, + { XFS_FEAT_WSYNC, ",wsync" }, + { XFS_FEAT_NOALIGN, ",noalign" }, + { XFS_FEAT_SWALLOC, ",swalloc" }, + { XFS_FEAT_NOUUID, ",nouuid" }, + { XFS_FEAT_NORECOVERY, ",norecovery" }, + { XFS_FEAT_ATTR2, 
",attr2" }, + { XFS_FEAT_FILESTREAMS, ",filestreams" }, + { XFS_FEAT_GRPID, ",grpid" }, + { XFS_FEAT_DISCARD, ",discard" }, + { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, + { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, + { XFS_FEAT_DAX_NEVER, ",dax=never" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); struct proc_xfs_info *xfs_infop; for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) { - if (mp->m_flags & xfs_infop->flag) + if (mp->m_features & xfs_infop->flag) seq_puts(m, xfs_infop->str); } - seq_printf(m, ",inode%d", - (mp->m_flags & XFS_MOUNT_SMALL_INUMS) ? 32 : 64); + seq_printf(m, ",inode%d", xfs_has_small_inums(mp) ? 32 : 64); - if (mp->m_flags & XFS_MOUNT_ALLOCSIZE) + if (xfs_has_allocsize(mp)) seq_printf(m, ",allocsize=%dk", (1 << mp->m_allocsize_log) >> 10); @@ -201,25 +222,20 @@ xfs_fs_show_options( seq_printf(m, ",swidth=%d", (int)XFS_FSB_TO_BB(mp, mp->m_swidth)); - if (mp->m_qflags & XFS_UQUOTA_ACCT) { - if (mp->m_qflags & XFS_UQUOTA_ENFD) - seq_puts(m, ",usrquota"); - else - seq_puts(m, ",uqnoenforce"); - } + if (mp->m_qflags & XFS_UQUOTA_ENFD) + seq_puts(m, ",usrquota"); + else if (mp->m_qflags & XFS_UQUOTA_ACCT) + seq_puts(m, ",uqnoenforce"); - if (mp->m_qflags & XFS_PQUOTA_ACCT) { - if (mp->m_qflags & XFS_PQUOTA_ENFD) - seq_puts(m, ",prjquota"); - else - seq_puts(m, ",pqnoenforce"); - } - if (mp->m_qflags & XFS_GQUOTA_ACCT) { - if (mp->m_qflags & XFS_GQUOTA_ENFD) - seq_puts(m, ",grpquota"); - else - seq_puts(m, ",gqnoenforce"); - } + if (mp->m_qflags & XFS_PQUOTA_ENFD) + seq_puts(m, ",prjquota"); + else if (mp->m_qflags & XFS_PQUOTA_ACCT) + seq_puts(m, ",pqnoenforce"); + + if (mp->m_qflags & XFS_GQUOTA_ENFD) + seq_puts(m, ",grpquota"); + else if (mp->m_qflags & XFS_GQUOTA_ACCT) + seq_puts(m, ",gqnoenforce"); if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); @@ -230,11 +246,11 @@ xfs_fs_show_options( /* * Set parameters for inode allocation heuristics, taking into account * filesystem size and inode32/inode64 mount 
options; i.e. specifically - * whether or not XFS_MOUNT_SMALL_INUMS is set. + * whether or not XFS_FEAT_SMALL_INUMS is set. * * Inode allocation patterns are altered only if inode32 is requested - * (XFS_MOUNT_SMALL_INUMS), and the filesystem is sufficiently large. - * If altered, XFS_MOUNT_32BITINODES is set as well. + * (XFS_FEAT_SMALL_INUMS), and the filesystem is sufficiently large. + * If altered, XFS_OPSTATE_INODE32 is set as well. * * An agcount independent of that in the mount structure is provided * because in the growfs case, mp->m_sb.sb_agcount is not yet updated @@ -276,13 +292,13 @@ xfs_set_inode_alloc( /* * If user asked for no more than 32-bit inodes, and the fs is - * sufficiently large, set XFS_MOUNT_32BITINODES if we must alter + * sufficiently large, set XFS_OPSTATE_INODE32 if we must alter * the allocator to accommodate the request. */ - if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32) - mp->m_flags |= XFS_MOUNT_32BITINODES; + if (xfs_has_small_inums(mp) && ino > XFS_MAXINUMBER_32) + set_bit(XFS_OPSTATE_INODE32, &mp->m_opstate); else - mp->m_flags &= ~XFS_MOUNT_32BITINODES; + clear_bit(XFS_OPSTATE_INODE32, &mp->m_opstate); for (index = 0; index < agcount; index++) { struct xfs_perag *pag; @@ -291,7 +307,7 @@ xfs_set_inode_alloc( pag = xfs_perag_get(mp, index); - if (mp->m_flags & XFS_MOUNT_32BITINODES) { + if (xfs_is_inode32(mp)) { if (ino > XFS_MAXINUMBER_32) { pag->pagi_inodeok = 0; pag->pagf_metadata = 0; @@ -311,7 +327,16 @@ xfs_set_inode_alloc( xfs_perag_put(pag); } - return (mp->m_flags & XFS_MOUNT_32BITINODES) ? maxagi : agcount; + return xfs_is_inode32(mp) ? 
maxagi : agcount; +} + +static bool +xfs_buftarg_is_dax( + struct super_block *sb, + struct xfs_buftarg *bt) +{ + return dax_supported(bt->bt_daxdev, bt->bt_bdev, sb->s_blocksize, 0, + bdev_nr_sectors(bt->bt_bdev)); } STATIC int @@ -468,7 +493,7 @@ xfs_setup_devices( if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { unsigned int log_sector_size = BBSIZE; - if (xfs_sb_version_hassector(&mp->m_sb)) + if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; error = xfs_setsize_buftarg(mp->m_logdev_targp, log_sector_size); @@ -501,37 +526,37 @@ xfs_init_mount_workqueues( if (!mp->m_unwritten_workqueue) goto out_destroy_buf; - mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), - 0, mp->m_super->s_id); - if (!mp->m_cil_workqueue) - goto out_destroy_unwritten; - mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); if (!mp->m_reclaim_workqueue) - goto out_destroy_cil; + goto out_destroy_unwritten; - mp->m_gc_workqueue = alloc_workqueue("xfs-gc/%s", - WQ_SYSFS | WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM, + mp->m_blockgc_wq = alloc_workqueue("xfs-blockgc/%s", + XFS_WQFLAGS(WQ_UNBOUND | WQ_FREEZABLE | WQ_MEM_RECLAIM), 0, mp->m_super->s_id); - if (!mp->m_gc_workqueue) + if (!mp->m_blockgc_wq) goto out_destroy_reclaim; + mp->m_inodegc_wq = alloc_workqueue("xfs-inodegc/%s", + XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM), + 1, mp->m_super->s_id); + if (!mp->m_inodegc_wq) + goto out_destroy_blockgc; + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", XFS_WQFLAGS(WQ_FREEZABLE), 0, mp->m_super->s_id); if (!mp->m_sync_workqueue) - goto out_destroy_eofb; + goto out_destroy_inodegc; return 0; -out_destroy_eofb: - destroy_workqueue(mp->m_gc_workqueue); +out_destroy_inodegc: + destroy_workqueue(mp->m_inodegc_wq); +out_destroy_blockgc: + destroy_workqueue(mp->m_blockgc_wq); out_destroy_reclaim: 
destroy_workqueue(mp->m_reclaim_workqueue); -out_destroy_cil: - destroy_workqueue(mp->m_cil_workqueue); out_destroy_unwritten: destroy_workqueue(mp->m_unwritten_workqueue); out_destroy_buf: @@ -545,9 +570,9 @@ xfs_destroy_mount_workqueues( struct xfs_mount *mp) { destroy_workqueue(mp->m_sync_workqueue); - destroy_workqueue(mp->m_gc_workqueue); + destroy_workqueue(mp->m_blockgc_wq); + destroy_workqueue(mp->m_inodegc_wq); destroy_workqueue(mp->m_reclaim_workqueue); - destroy_workqueue(mp->m_cil_workqueue); destroy_workqueue(mp->m_unwritten_workqueue); destroy_workqueue(mp->m_buf_workqueue); } @@ -596,32 +621,6 @@ xfs_fs_alloc_inode( return NULL; } -#ifdef DEBUG -static void -xfs_check_delalloc( - struct xfs_inode *ip, - int whichfork) -{ - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); - struct xfs_bmbt_irec got; - struct xfs_iext_cursor icur; - - if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got)) - return; - do { - if (isnullstartblock(got.br_startblock)) { - xfs_warn(ip->i_mount, - "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]", - ip->i_ino, - whichfork == XFS_DATA_FORK ? "data" : "cow", - got.br_startoff, got.br_blockcount); - } - } while (xfs_iext_next_extent(ifp, &icur, &got)); -} -#else -#define xfs_check_delalloc(ip, whichfork) do { } while (0) -#endif - /* * Now that the generic code is guaranteed not to be accessing * the linux inode, we can inactivate and reclaim the inode. @@ -637,30 +636,6 @@ xfs_fs_destroy_inode( ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); - - xfs_inactive(ip); - - if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) { - xfs_check_delalloc(ip, XFS_DATA_FORK); - xfs_check_delalloc(ip, XFS_COW_FORK); - ASSERT(0); - } - - XFS_STATS_INC(ip->i_mount, vn_reclaim); - - /* - * We should never get here with one of the reclaim flags already set. 
- */ - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); - - /* - * We always use background reclaim here because even if the inode is - * clean, it still may be under IO and hence we have wait for IO - * completion to occur before we can reclaim the inode. The background - * reclaim path handles this more efficiently than we can here, so - * simply let background reclaim tear down all inodes. - */ xfs_inode_mark_reclaimable(ip); } @@ -732,7 +707,7 @@ xfs_fs_drop_inode( * that. See the comment for this inode flag. */ if (ip->i_flags & XFS_IRECOVERY) { - ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED); + ASSERT(xlog_recovery_needed(ip->i_mount->m_log)); return 0; } @@ -755,6 +730,8 @@ xfs_fs_sync_fs( { struct xfs_mount *mp = XFS_M(sb); + trace_xfs_fs_sync_fs(mp, __return_address); + /* * Doing anything during the async pass would be counterproductive. */ @@ -771,6 +748,25 @@ xfs_fs_sync_fs( flush_delayed_work(&mp->m_log->l_work); } + /* + * If we are called with page faults frozen out, it means we are about + * to freeze the transaction subsystem. Take the opportunity to shut + * down inodegc because once SB_FREEZE_FS is set it's too late to + * prevent inactivation races with freeze. The fs doesn't get called + * again by the freezing process until after SB_FREEZE_FS has been set, + * so it's now or never. Same logic applies to speculative allocation + * garbage collection. + * + * We don't care if this is a normal syncfs call that does this or + * freeze that does this - we can run this multiple times without issue + * and we won't race with a restart because a restart can only occur + * when the state is either SB_FREEZE_FS or SB_FREEZE_COMPLETE. + */ + if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { + xfs_inodegc_stop(mp); + xfs_blockgc_stop(mp); + } + return 0; } @@ -789,6 +785,9 @@ xfs_fs_statfs( xfs_extlen_t lsize; int64_t ffree; + /* Wait for whatever inactivations are in progress. 
*/ + xfs_inodegc_flush(mp); + statp->f_type = XFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -884,10 +883,22 @@ xfs_fs_freeze( * set a GFP_NOFS context here to avoid recursion deadlocks. */ flags = memalloc_nofs_save(); - xfs_blockgc_stop(mp); xfs_save_resvblks(mp); ret = xfs_log_quiesce(mp); memalloc_nofs_restore(flags); + + /* + * For read-write filesystems, we need to restart the inodegc on error + * because we stopped it at SB_FREEZE_PAGEFAULT level and a thaw is not + * going to be run to restart it now. We are at SB_FREEZE_FS level + * here, so we can restart safely without racing with a stop in + * xfs_fs_sync_fs(). + */ + if (ret && !xfs_is_readonly(mp)) { + xfs_blockgc_start(mp); + xfs_inodegc_start(mp); + } + return ret; } @@ -899,7 +910,18 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_blockgc_start(mp); + + /* + * Don't reactivate the inodegc worker on a readonly filesystem because + * inodes are sent directly to reclaim. Don't reactivate the blockgc + * worker because there are no speculative preallocations on a readonly + * filesystem. + */ + if (!xfs_is_readonly(mp)) { + xfs_blockgc_start(mp); + xfs_inodegc_start(mp); + } + return 0; } @@ -911,10 +933,8 @@ STATIC int xfs_finish_flags( struct xfs_mount *mp) { - int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); - /* Fail a mount where the logbuf is smaller than the log stripe */ - if (xfs_sb_version_haslogv2(&mp->m_sb)) { + if (xfs_has_logv2(mp)) { if (mp->m_logbsize <= 0 && mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { mp->m_logbsize = mp->m_sb.sb_logsunit; @@ -936,33 +956,24 @@ xfs_finish_flags( /* * V5 filesystems always use attr2 format for attributes. */ - if (xfs_sb_version_hascrc(&mp->m_sb) && - (mp->m_flags & XFS_MOUNT_NOATTR2)) { + if (xfs_has_crc(mp) && xfs_has_noattr2(mp)) { xfs_warn(mp, "Cannot mount a V5 filesystem as noattr2. 
" "attr2 is always enabled for V5 filesystems."); return -EINVAL; } /* - * mkfs'ed attr2 will turn on attr2 mount unless explicitly - * told by noattr2 to turn it off - */ - if (xfs_sb_version_hasattr2(&mp->m_sb) && - !(mp->m_flags & XFS_MOUNT_NOATTR2)) - mp->m_flags |= XFS_MOUNT_ATTR2; - - /* * prohibit r/w mounts of read-only filesystems */ - if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { + if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !xfs_is_readonly(mp)) { xfs_warn(mp, "cannot mount a read-only filesystem as read-write"); return -EROFS; } - if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && - (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE)) && - !xfs_sb_version_has_pquotino(&mp->m_sb)) { + if ((mp->m_qflags & XFS_GQUOTA_ACCT) && + (mp->m_qflags & XFS_PQUOTA_ACCT) && + !xfs_has_pquotino(mp)) { xfs_warn(mp, "Super block does not support project and group quota together"); return -EINVAL; @@ -1020,11 +1031,40 @@ xfs_destroy_percpu_counters( percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); percpu_counter_destroy(&mp->m_fdblocks); - ASSERT(XFS_FORCED_SHUTDOWN(mp) || + ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); } +static int +xfs_inodegc_init_percpu( + struct xfs_mount *mp) +{ + struct xfs_inodegc *gc; + int cpu; + + mp->m_inodegc = alloc_percpu(struct xfs_inodegc); + if (!mp->m_inodegc) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + gc = per_cpu_ptr(mp->m_inodegc, cpu); + init_llist_head(&gc->list); + gc->items = 0; + INIT_WORK(&gc->work, xfs_inodegc_worker); + } + return 0; +} + +static void +xfs_inodegc_free_percpu( + struct xfs_mount *mp) +{ + if (!mp->m_inodegc) + return; + free_percpu(mp->m_inodegc); +} + static void xfs_fs_put_super( struct super_block *sb) @@ -1041,6 +1081,8 @@ xfs_fs_put_super( xfs_freesb(mp); free_percpu(mp->m_stats.xs_stats); + xfs_mount_list_del(mp); + xfs_inodegc_free_percpu(mp); 
xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); @@ -1129,7 +1171,7 @@ xfs_fs_warn_deprecated( * already had the flag set */ if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) && - !!(XFS_M(fc->root->d_sb)->m_flags & flag) == value) + !!(XFS_M(fc->root->d_sb)->m_features & flag) == value) return; xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key); } @@ -1177,27 +1219,27 @@ xfs_fs_parse_param( if (suffix_kstrtoint(param->string, 10, &size)) return -EINVAL; parsing_mp->m_allocsize_log = ffs(size) - 1; - parsing_mp->m_flags |= XFS_MOUNT_ALLOCSIZE; + parsing_mp->m_features |= XFS_FEAT_ALLOCSIZE; return 0; case Opt_grpid: case Opt_bsdgroups: - parsing_mp->m_flags |= XFS_MOUNT_GRPID; + parsing_mp->m_features |= XFS_FEAT_GRPID; return 0; case Opt_nogrpid: case Opt_sysvgroups: - parsing_mp->m_flags &= ~XFS_MOUNT_GRPID; + parsing_mp->m_features &= ~XFS_FEAT_GRPID; return 0; case Opt_wsync: - parsing_mp->m_flags |= XFS_MOUNT_WSYNC; + parsing_mp->m_features |= XFS_FEAT_WSYNC; return 0; case Opt_norecovery: - parsing_mp->m_flags |= XFS_MOUNT_NORECOVERY; + parsing_mp->m_features |= XFS_FEAT_NORECOVERY; return 0; case Opt_noalign: - parsing_mp->m_flags |= XFS_MOUNT_NOALIGN; + parsing_mp->m_features |= XFS_FEAT_NOALIGN; return 0; case Opt_swalloc: - parsing_mp->m_flags |= XFS_MOUNT_SWALLOC; + parsing_mp->m_features |= XFS_FEAT_SWALLOC; return 0; case Opt_sunit: parsing_mp->m_dalign = result.uint_32; @@ -1206,62 +1248,58 @@ xfs_fs_parse_param( parsing_mp->m_swidth = result.uint_32; return 0; case Opt_inode32: - parsing_mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + parsing_mp->m_features |= XFS_FEAT_SMALL_INUMS; return 0; case Opt_inode64: - parsing_mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; + parsing_mp->m_features &= ~XFS_FEAT_SMALL_INUMS; return 0; case Opt_nouuid: - parsing_mp->m_flags |= XFS_MOUNT_NOUUID; + parsing_mp->m_features |= XFS_FEAT_NOUUID; return 0; case Opt_largeio: - parsing_mp->m_flags |= XFS_MOUNT_LARGEIO; + 
parsing_mp->m_features |= XFS_FEAT_LARGE_IOSIZE; return 0; case Opt_nolargeio: - parsing_mp->m_flags &= ~XFS_MOUNT_LARGEIO; + parsing_mp->m_features &= ~XFS_FEAT_LARGE_IOSIZE; return 0; case Opt_filestreams: - parsing_mp->m_flags |= XFS_MOUNT_FILESTREAMS; + parsing_mp->m_features |= XFS_FEAT_FILESTREAMS; return 0; case Opt_noquota: parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD; - parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE; return 0; case Opt_quota: case Opt_uquota: case Opt_usrquota: - parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE | - XFS_UQUOTA_ENFD); + parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ENFD); return 0; case Opt_qnoenforce: case Opt_uqnoenforce: - parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE); + parsing_mp->m_qflags |= XFS_UQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD; return 0; case Opt_pquota: case Opt_prjquota: - parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE | - XFS_PQUOTA_ENFD); + parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ENFD); return 0; case Opt_pqnoenforce: - parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE); + parsing_mp->m_qflags |= XFS_PQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD; return 0; case Opt_gquota: case Opt_grpquota: - parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE | - XFS_GQUOTA_ENFD); + parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ENFD); return 0; case Opt_gqnoenforce: - parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); + parsing_mp->m_qflags |= XFS_GQUOTA_ACCT; parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD; return 0; case Opt_discard: - parsing_mp->m_flags |= XFS_MOUNT_DISCARD; + parsing_mp->m_features |= XFS_FEAT_DISCARD; return 0; case Opt_nodiscard: - parsing_mp->m_flags &= ~XFS_MOUNT_DISCARD; + parsing_mp->m_features &= ~XFS_FEAT_DISCARD; return 0; #ifdef CONFIG_FS_DAX case Opt_dax: @@ -1273,21 +1311,20 @@ xfs_fs_parse_param( #endif /* Following mount 
options will be removed in September 2025 */ case Opt_ikeep: - xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, true); - parsing_mp->m_flags |= XFS_MOUNT_IKEEP; + xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, true); + parsing_mp->m_features |= XFS_FEAT_IKEEP; return 0; case Opt_noikeep: - xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, false); - parsing_mp->m_flags &= ~XFS_MOUNT_IKEEP; + xfs_fs_warn_deprecated(fc, param, XFS_FEAT_IKEEP, false); + parsing_mp->m_features &= ~XFS_FEAT_IKEEP; return 0; case Opt_attr2: - xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_ATTR2, true); - parsing_mp->m_flags |= XFS_MOUNT_ATTR2; + xfs_fs_warn_deprecated(fc, param, XFS_FEAT_ATTR2, true); + parsing_mp->m_features |= XFS_FEAT_ATTR2; return 0; case Opt_noattr2: - xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_NOATTR2, true); - parsing_mp->m_flags &= ~XFS_MOUNT_ATTR2; - parsing_mp->m_flags |= XFS_MOUNT_NOATTR2; + xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); + parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); @@ -1301,17 +1338,23 @@ static int xfs_fs_validate_params( struct xfs_mount *mp) { + /* No recovery flag requires a read-only mount */ + if (xfs_has_norecovery(mp) && !xfs_is_readonly(mp)) { + xfs_warn(mp, "no-recovery mounts must be read-only."); + return -EINVAL; + } + /* - * no recovery flag requires a read-only mount + * We have not read the superblock at this point, so only the attr2 + * mount option can set the attr2 feature by this stage. 
*/ - if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && - !(mp->m_flags & XFS_MOUNT_RDONLY)) { - xfs_warn(mp, "no-recovery mounts must be read-only."); + if (xfs_has_attr2(mp) && xfs_has_noattr2(mp)) { + xfs_warn(mp, "attr2 and noattr2 cannot both be specified."); return -EINVAL; } - if ((mp->m_flags & XFS_MOUNT_NOALIGN) && - (mp->m_dalign || mp->m_swidth)) { + + if (xfs_has_noalign(mp) && (mp->m_dalign || mp->m_swidth)) { xfs_warn(mp, "sunit and swidth options incompatible with the noalign option"); return -EINVAL; @@ -1355,7 +1398,7 @@ xfs_fs_validate_params( return -EINVAL; } - if ((mp->m_flags & XFS_MOUNT_ALLOCSIZE) && + if (xfs_has_allocsize(mp) && (mp->m_allocsize_log > XFS_MAX_IO_LOG || mp->m_allocsize_log < XFS_MIN_IO_LOG)) { xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", @@ -1416,11 +1459,22 @@ xfs_fs_fill_super( if (error) goto out_destroy_workqueues; + error = xfs_inodegc_init_percpu(mp); + if (error) + goto out_destroy_counters; + + /* + * All percpu data structures requiring cleanup when a cpu goes offline + * must be allocated before adding this @mp to the cpu-dead handler's + * mount list. + */ + xfs_mount_list_add(mp); + /* Allocate stats memory before we do operations that might use it */ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { error = -ENOMEM; - goto out_destroy_counters; + goto out_destroy_inodegc; } error = xfs_readsb(mp, flags); @@ -1436,7 +1490,7 @@ xfs_fs_fill_super( goto out_free_sb; /* V4 support is undergoing deprecation. */ - if (!xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_has_crc(mp)) { #ifdef CONFIG_XFS_SUPPORT_V4 xfs_warn_once(mp, "Deprecated V4 format (crc=0) will not be supported after September 2030."); @@ -1449,7 +1503,7 @@ xfs_fs_fill_super( } /* Filesystem claims it needs repair, so refuse the mount. */ - if (xfs_sb_version_needsrepair(&mp->m_sb)) { + if (xfs_has_needsrepair(mp)) { xfs_warn(mp, "Filesystem needs repair. 
Please run xfs_repair."); error = -EFSCORRUPTED; goto out_free_sb; @@ -1521,7 +1575,7 @@ xfs_fs_fill_super( sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_max_links = XFS_MAXLINK; sb->s_time_gran = 1; - if (xfs_sb_version_hasbigtime(&mp->m_sb)) { + if (xfs_has_bigtime(mp)) { sb->s_time_min = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MIN); sb->s_time_max = xfs_bigtime_to_unix(XFS_BIGTIME_TIME_MAX); } else { @@ -1534,30 +1588,25 @@ xfs_fs_fill_super( set_posix_acl_flag(sb); /* version 5 superblocks support inode version counters. */ - if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + if (xfs_has_crc(mp)) sb->s_flags |= SB_I_VERSION; - if (xfs_sb_version_hasbigtime(&mp->m_sb)) - xfs_warn(mp, - "EXPERIMENTAL big timestamp feature in use. Use at your own risk!"); - - if (mp->m_flags & XFS_MOUNT_DAX_ALWAYS) { + if (xfs_has_dax_always(mp)) { bool rtdev_is_dax = false, datadev_is_dax; xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev, - sb->s_blocksize); + datadev_is_dax = xfs_buftarg_is_dax(sb, mp->m_ddev_targp); if (mp->m_rtdev_targp) - rtdev_is_dax = bdev_dax_supported( - mp->m_rtdev_targp->bt_bdev, sb->s_blocksize); + rtdev_is_dax = xfs_buftarg_is_dax(sb, + mp->m_rtdev_targp); if (!rtdev_is_dax && !datadev_is_dax) { xfs_alert(mp, "DAX unsupported by block device. 
Turning off DAX."); xfs_mount_set_dax_mode(mp, XFS_DAX_NEVER); } - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { xfs_alert(mp, "DAX and reflink cannot be used together!"); error = -EINVAL; @@ -1565,17 +1614,17 @@ xfs_fs_fill_super( } } - if (mp->m_flags & XFS_MOUNT_DISCARD) { + if (xfs_has_discard(mp)) { struct request_queue *q = bdev_get_queue(sb->s_bdev); if (!blk_queue_discard(q)) { xfs_warn(mp, "mounting with \"discard\" option, but " "the device does not support discard"); - mp->m_flags &= ~XFS_MOUNT_DISCARD; + mp->m_features &= ~XFS_FEAT_DISCARD; } } - if (xfs_sb_version_hasreflink(&mp->m_sb)) { + if (xfs_has_reflink(mp)) { if (mp->m_sb.sb_rblocks) { xfs_alert(mp, "reflink not compatible with realtime device!"); @@ -1589,17 +1638,13 @@ xfs_fs_fill_super( } } - if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) { + if (xfs_has_rmapbt(mp) && mp->m_sb.sb_rblocks) { xfs_alert(mp, "reverse mapping btree not compatible with realtime device!"); error = -EINVAL; goto out_filestream_unmount; } - if (xfs_sb_version_hasinobtcounts(&mp->m_sb)) - xfs_warn(mp, - "EXPERIMENTAL inode btree counters feature in use. 
Use at your own risk!"); - error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1623,6 +1668,9 @@ xfs_fs_fill_super( xfs_freesb(mp); out_free_stats: free_percpu(mp->m_stats.xs_stats); + out_destroy_inodegc: + xfs_mount_list_del(mp); + xfs_inodegc_free_percpu(mp); out_destroy_counters: xfs_destroy_percpu_counters(mp); out_destroy_workqueues: @@ -1654,13 +1702,13 @@ xfs_remount_rw( struct xfs_sb *sbp = &mp->m_sb; int error; - if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + if (xfs_has_norecovery(mp)) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); return -EINVAL; } - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + if (xfs_sb_is_v5(sbp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_warn(mp, "ro->rw transition prohibited on unknown (0x%x) ro-compat filesystem", @@ -1669,7 +1717,7 @@ xfs_remount_rw( return -EINVAL; } - mp->m_flags &= ~XFS_MOUNT_RDONLY; + clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); /* * If this is the first remount to writeable state we might have some @@ -1706,6 +1754,9 @@ xfs_remount_rw( if (error && error != -ENOSPC) return error; + /* Re-enable the background inode inactivation worker. */ + xfs_inodegc_start(mp); + return 0; } @@ -1728,6 +1779,15 @@ xfs_remount_ro( return error; } + /* + * Stop the inodegc background worker. xfs_fs_reconfigure already + * flushed all pending inodegc work when it sync'd the filesystem. + * The VFS holds s_umount, so we know that inodes cannot enter + * xfs_fs_destroy_inode during a remount operation. In readonly mode + * we send inodes straight to reclaim, so no inodes will be queued. + */ + xfs_inodegc_stop(mp); + /* Free the per-AG metadata reservation pool. 
*/ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { @@ -1745,7 +1805,7 @@ xfs_remount_ro( xfs_save_resvblks(mp); xfs_log_clean(mp); - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); return 0; } @@ -1768,12 +1828,11 @@ xfs_fs_reconfigure( { struct xfs_mount *mp = XFS_M(fc->root->d_sb); struct xfs_mount *new_mp = fc->s_fs_info; - xfs_sb_t *sbp = &mp->m_sb; int flags = fc->sb_flags; int error; /* version 5 superblocks always support version counters. */ - if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) + if (xfs_has_crc(mp)) fc->sb_flags |= SB_I_VERSION; error = xfs_fs_validate_params(new_mp); @@ -1783,28 +1842,26 @@ xfs_fs_reconfigure( sync_filesystem(mp->m_super); /* inode32 -> inode64 */ - if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && - !(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) { - mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); + if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { + mp->m_features &= ~XFS_FEAT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); } /* inode64 -> inode32 */ - if (!(mp->m_flags & XFS_MOUNT_SMALL_INUMS) && - (new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) { - mp->m_flags |= XFS_MOUNT_SMALL_INUMS; - mp->m_maxagi = xfs_set_inode_alloc(mp, sbp->sb_agcount); + if (!xfs_has_small_inums(mp) && xfs_has_small_inums(new_mp)) { + mp->m_features |= XFS_FEAT_SMALL_INUMS; + mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); } /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(flags & SB_RDONLY)) { + if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) { error = xfs_remount_rw(mp); if (error) return error; } /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (flags & SB_RDONLY)) { + if (!xfs_is_readonly(mp) && (flags & SB_RDONLY)) { error = xfs_remount_ro(mp); if (error) return error; @@ -1871,11 +1928,11 @@ static int xfs_init_fs_context( * Copy binary VFS mount flags we are interested in. 
*/ if (fc->sb_flags & SB_RDONLY) - mp->m_flags |= XFS_MOUNT_RDONLY; + set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (fc->sb_flags & SB_DIRSYNC) - mp->m_flags |= XFS_MOUNT_DIRSYNC; + mp->m_features |= XFS_FEAT_DIRSYNC; if (fc->sb_flags & SB_SYNCHRONOUS) - mp->m_flags |= XFS_MOUNT_WSYNC; + mp->m_features |= XFS_FEAT_WSYNC; fc->s_fs_info = mp; fc->ops = &xfs_context_ops; @@ -2118,6 +2175,48 @@ xfs_destroy_workqueues(void) destroy_workqueue(xfs_alloc_wq); } +#ifdef CONFIG_HOTPLUG_CPU +static int +xfs_cpu_dead( + unsigned int cpu) +{ + struct xfs_mount *mp, *n; + + spin_lock(&xfs_mount_list_lock); + list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { + spin_unlock(&xfs_mount_list_lock); + xfs_inodegc_cpu_dead(mp, cpu); + spin_lock(&xfs_mount_list_lock); + } + spin_unlock(&xfs_mount_list_lock); + return 0; +} + +static int __init +xfs_cpu_hotplug_init(void) +{ + int error; + + error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL, + xfs_cpu_dead); + if (error < 0) + xfs_alert(NULL, +"Failed to initialise CPU hotplug, error %d. 
XFS is non-functional.", + error); + return error; +} + +static void +xfs_cpu_hotplug_destroy(void) +{ + cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD); +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static inline int xfs_cpu_hotplug_init(void) { return 0; } +static inline void xfs_cpu_hotplug_destroy(void) {} +#endif + STATIC int __init init_xfs_fs(void) { @@ -2130,10 +2229,14 @@ init_xfs_fs(void) xfs_dir_startup(); - error = xfs_init_zones(); + error = xfs_cpu_hotplug_init(); if (error) goto out; + error = xfs_init_zones(); + if (error) + goto out_destroy_hp; + error = xfs_init_workqueues(); if (error) goto out_destroy_zones; @@ -2213,6 +2316,8 @@ init_xfs_fs(void) xfs_destroy_workqueues(); out_destroy_zones: xfs_destroy_zones(); + out_destroy_hp: + xfs_cpu_hotplug_destroy(); out: return error; } @@ -2235,6 +2340,7 @@ exit_xfs_fs(void) xfs_destroy_workqueues(); xfs_destroy_zones(); xfs_uuid_table_free(); + xfs_cpu_hotplug_destroy(); } module_init(init_xfs_fs); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 1525636f4065..fc2c6a404647 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -63,7 +63,7 @@ xfs_readlink_bmap_ilocked( byte_cnt = pathlen; cur_chunk = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (xfs_has_crc(mp)) { if (!xfs_symlink_hdr_ok(ip->i_ino, offset, byte_cnt, bp)) { error = -EFSCORRUPTED; @@ -107,7 +107,7 @@ xfs_readlink( ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_LOCAL); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; xfs_ilock(ip, XFS_ILOCK_SHARED); @@ -168,7 +168,7 @@ xfs_symlink( trace_xfs_symlink(dp, link_name); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; /* @@ -321,9 +321,8 @@ xfs_symlink( * symlink transaction goes to disk before returning to * the user. 
*/ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (xfs_has_wsync(mp) || xfs_has_dirsync(mp)) xfs_trans_set_sync(tp); - } error = xfs_trans_commit(tp); if (error) @@ -445,7 +444,7 @@ xfs_inactive_symlink_rmt( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp); if (error) { - ASSERT(XFS_FORCED_SHUTDOWN(mp)); + ASSERT(xfs_is_shutdown(mp)); goto error_unlock; } @@ -478,7 +477,7 @@ xfs_inactive_symlink( trace_xfs_inactive_symlink(ip); - if (XFS_FORCED_SHUTDOWN(mp)) + if (xfs_is_shutdown(mp)) return -EIO; xfs_ilock(ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index f1bc88f4367c..18dc5eca6c04 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -10,6 +10,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sysfs.h" +#include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 7e01e00550ac..d269ef57ff01 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -20,6 +20,7 @@ #include "xfs_bmap.h" #include "xfs_attr.h" #include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_buf_item.h" #include "xfs_quota.h" @@ -32,6 +33,7 @@ #include "xfs_icache.h" #include "xfs_ag.h" #include "xfs_ag_resv.h" +#include "xfs_error.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 19260291ff8b..1033a95fbf8e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2,6 +2,41 @@ /* * Copyright (c) 2009, Christoph Hellwig * All Rights Reserved. + * + * NOTE: none of these tracepoints shall be considered a stable kernel ABI + * as they can change at any time. 
+ * + * Current conventions for printing numbers measuring specific units: + * + * agno: allocation group number + * + * agino: per-AG inode number + * ino: filesystem inode number + * + * agbno: per-AG block number in fs blocks + * startblock: physical block number for file mappings. This is either a + * segmented fsblock for data device mappings, or a rfsblock + * for realtime device mappings + * fsbcount: number of blocks in an extent, in fs blocks + * + * daddr: physical block number in 512b blocks + * bbcount: number of blocks in a physical extent, in 512b blocks + * + * owner: reverse-mapping owner, usually inodes + * + * fileoff: file offset, in fs blocks + * pos: file offset, in bytes + * bytecount: number of bytes + * + * disize: ondisk file size, in bytes + * isize: incore file size, in bytes + * + * forkoff: inode fork offset, in bytes + * + * ireccount: number of inode records + * + * Numbers describing space allocations (blocks, extents, inodes) should be + * formatted in hexadecimal. 
*/ #undef TRACE_SYSTEM #define TRACE_SYSTEM xfs @@ -139,7 +174,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __entry->refcount = refcount; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u refcount %d caller %pS", + TP_printk("dev %d:%d agno 0x%x refcount %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, @@ -157,6 +192,84 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag); +TRACE_EVENT(xfs_inodegc_worker, + TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), + TP_ARGS(mp, shrinker_hits), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, shrinker_hits) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->shrinker_hits = shrinker_hits; + ), + TP_printk("dev %d:%d shrinker_hits %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->shrinker_hits) +); + +DECLARE_EVENT_CLASS(xfs_fs_class, + TP_PROTO(struct xfs_mount *mp, void *caller_ip), + TP_ARGS(mp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long long, mflags) + __field(unsigned long, opstate) + __field(unsigned long, sbflags) + __field(void *, caller_ip) + ), + TP_fast_assign( + if (mp) { + __entry->dev = mp->m_super->s_dev; + __entry->mflags = mp->m_features; + __entry->opstate = mp->m_opstate; + __entry->sbflags = mp->m_super->s_flags; + } + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d m_features 0x%llx opstate (%s) s_flags 0x%lx caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->mflags, + __print_flags(__entry->opstate, "|", XFS_OPSTATE_STRINGS), + __entry->sbflags, + __entry->caller_ip) +); + +#define DEFINE_FS_EVENT(name) \ +DEFINE_EVENT(xfs_fs_class, name, \ + TP_PROTO(struct xfs_mount *mp, void *caller_ip), \ + TP_ARGS(mp, caller_ip)) +DEFINE_FS_EVENT(xfs_inodegc_flush); +DEFINE_FS_EVENT(xfs_inodegc_start); +DEFINE_FS_EVENT(xfs_inodegc_stop); 
+DEFINE_FS_EVENT(xfs_inodegc_queue); +DEFINE_FS_EVENT(xfs_inodegc_throttle); +DEFINE_FS_EVENT(xfs_fs_sync_fs); +DEFINE_FS_EVENT(xfs_blockgc_start); +DEFINE_FS_EVENT(xfs_blockgc_stop); +DEFINE_FS_EVENT(xfs_blockgc_worker); +DEFINE_FS_EVENT(xfs_blockgc_flush_all); + +TRACE_EVENT(xfs_inodegc_shrinker_scan, + TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc, + void *caller_ip), + TP_ARGS(mp, sc, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned long, nr_to_scan) + __field(void *, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->nr_to_scan = sc->nr_to_scan; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d nr_to_scan %lu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan, + __entry->caller_ip) +); + DECLARE_EVENT_CLASS(xfs_ag_class, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno), TP_ARGS(mp, agno), @@ -168,7 +281,7 @@ DECLARE_EVENT_CLASS(xfs_ag_class, __entry->dev = mp->m_super->s_dev; __entry->agno = agno; ), - TP_printk("dev %d:%d agno %u", + TP_printk("dev %d:%d agno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno) ); @@ -268,7 +381,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s cur %p/%d " - "offset %lld block %lld count %lld flag %d caller %pS", + "fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx flag %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -308,10 +421,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - if (bp->b_bn == XFS_BUF_DADDR_NULL) - __entry->bno = bp->b_maps[0].bm_bn; - else - __entry->bno = bp->b_bn; + __entry->bno = xfs_buf_daddr(bp); __entry->nblks = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); @@ -319,7 +429,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->flags 
= bp->b_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " + TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, @@ -370,7 +480,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) - __field(size_t, buffer_length) + __field(unsigned int, length) __field(int, hold) __field(int, pincount) __field(unsigned, lockval) @@ -379,19 +489,19 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; - __entry->buffer_length = BBTOB(bp->b_length); + __entry->bno = xfs_buf_daddr(bp); + __entry->length = bp->b_length; __entry->flags = flags; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, - __entry->buffer_length, + __entry->length, __entry->hold, __entry->pincount, __entry->lockval, @@ -413,7 +523,7 @@ TRACE_EVENT(xfs_buf_ioerror, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, bno) - __field(size_t, buffer_length) + __field(unsigned int, length) __field(unsigned, flags) __field(int, hold) __field(int, pincount) @@ -423,8 +533,8 @@ TRACE_EVENT(xfs_buf_ioerror, ), TP_fast_assign( __entry->dev = bp->b_target->bt_dev; - __entry->bno = bp->b_bn; - __entry->buffer_length = BBTOB(bp->b_length); + __entry->bno = xfs_buf_daddr(bp); + __entry->length = bp->b_length; __entry->hold = atomic_read(&bp->b_hold); __entry->pincount = atomic_read(&bp->b_pin_count); __entry->lockval = bp->b_sema.count; @@ -432,11 +542,11 @@ 
TRACE_EVENT(xfs_buf_ioerror, __entry->flags = bp->b_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d error %d flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, - __entry->buffer_length, + __entry->length, __entry->hold, __entry->pincount, __entry->lockval, @@ -451,7 +561,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_daddr_t, buf_bno) - __field(size_t, buf_len) + __field(unsigned int, buf_len) __field(int, buf_hold) __field(int, buf_pincount) __field(int, buf_lockval) @@ -466,15 +576,15 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_flags = bip->bli_flags; __entry->bli_recur = bip->bli_recur; __entry->bli_refcount = atomic_read(&bip->bli_refcount); - __entry->buf_bno = bip->bli_buf->b_bn; - __entry->buf_len = BBTOB(bip->bli_buf->b_length); + __entry->buf_bno = xfs_buf_daddr(bip->bli_buf); + __entry->buf_len = bip->bli_buf->b_length; __entry->buf_flags = bip->bli_buf->b_flags; __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); __entry->buf_lockval = bip->bli_buf->b_sema.count; __entry->li_flags = bip->bli_item.li_flags; ), - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + TP_printk("dev %d:%d daddr 0x%llx bbcount 0x%x hold %d pincount %d " "lock %d flags %s recur %d refcount %d bliflags %s " "liflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -534,7 +644,7 @@ DECLARE_EVENT_CLASS(xfs_filestream_class, __entry->agno = agno; __entry->streams = xfs_filestream_peek_ag(mp, agno); ), - TP_printk("dev %d:%d ino 0x%llx agno %u streams %d", + TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->agno, @@ -568,7 +678,7 @@ TRACE_EVENT(xfs_filestream_pick, __entry->free = free; 
__entry->nscan = nscan; ), - TP_printk("dev %d:%d ino 0x%llx agno %u streams %d free %d nscan %d", + TP_printk("dev %d:%d ino 0x%llx agno 0x%x streams %d free %d nscan %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->agno, @@ -616,14 +726,17 @@ DECLARE_EVENT_CLASS(xfs_inode_class, TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) + __field(unsigned long, iflags) ), TP_fast_assign( __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; + __entry->iflags = ip->i_flags; ), - TP_printk("dev %d:%d ino 0x%llx", + TP_printk("dev %d:%d ino 0x%llx iflags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino) + __entry->ino, + __entry->iflags) ) #define DEFINE_INODE_EVENT(name) \ @@ -667,6 +780,10 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); +DEFINE_INODE_EVENT(xfs_inode_set_reclaimable); +DEFINE_INODE_EVENT(xfs_inode_reclaiming); +DEFINE_INODE_EVENT(xfs_inode_set_need_inactive); +DEFINE_INODE_EVENT(xfs_inode_inactivating); /* * ftrace's __print_symbolic requires that all enum values be wrapped in the @@ -773,9 +890,12 @@ TRACE_EVENT(xfs_irec_merge_pre, __entry->nagino = nagino; __entry->nholemask = holemask; ), - TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, - __entry->agino, __entry->holemask, __entry->nagino, + TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x new_agino 0x%x new_holemask 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->holemask, + __entry->nagino, __entry->nholemask) ) @@ -795,8 +915,11 @@ TRACE_EVENT(xfs_irec_merge_post, __entry->agino = agino; __entry->holemask = holemask; ), - TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev), - MINOR(__entry->dev), __entry->agno, __entry->agino, + 
TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->agno, + __entry->agino, __entry->holemask) ) @@ -1301,7 +1424,7 @@ DECLARE_EVENT_CLASS(xfs_file_class, __entry->offset = iocb->ki_pos; __entry->count = iov_iter_count(iter); ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count 0x%zx", + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1348,14 +1471,14 @@ DECLARE_EVENT_CLASS(xfs_imap_class, __entry->startblock = irec ? irec->br_startblock : 0; __entry->blockcount = irec ? irec->br_blockcount : 0; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset 0x%llx count %zd " - "fork %s startoff 0x%llx startblock %lld blockcount 0x%llx", + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx pos 0x%llx bytecount 0x%zx " + "fork %s startoff 0x%llx startblock 0x%llx fsbcount 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->count, - __entry->whichfork == XFS_COW_FORK ? 
"cow" : "data", + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->startoff, (int64_t)__entry->startblock, __entry->blockcount) @@ -1391,7 +1514,7 @@ DECLARE_EVENT_CLASS(xfs_simple_io_class, __entry->count = count; ), TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx " - "offset 0x%llx count %zd", + "pos 0x%llx bytecount 0x%zx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->isize, @@ -1427,7 +1550,7 @@ DECLARE_EVENT_CLASS(xfs_itrunc_class, __entry->size = ip->i_disk_size; __entry->new_size = new_size; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx new_size 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1458,7 +1581,7 @@ TRACE_EVENT(xfs_pagecache_inval, __entry->start = start; __entry->finish = finish; ), - TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx", + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx start 0x%llx finish 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1467,14 +1590,14 @@ TRACE_EVENT(xfs_pagecache_inval, ); TRACE_EVENT(xfs_bunmap, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, + TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t fileoff, xfs_filblks_t len, int flags, unsigned long caller_ip), - TP_ARGS(ip, bno, len, flags, caller_ip), + TP_ARGS(ip, fileoff, len, flags, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_ino_t, ino) __field(xfs_fsize_t, size) - __field(xfs_fileoff_t, bno) + __field(xfs_fileoff_t, fileoff) __field(xfs_filblks_t, len) __field(unsigned long, caller_ip) __field(int, flags) @@ -1483,17 +1606,17 @@ TRACE_EVENT(xfs_bunmap, __entry->dev = VFS_I(ip)->i_sb->s_dev; __entry->ino = ip->i_ino; __entry->size = ip->i_disk_size; - __entry->bno = bno; + __entry->fileoff = fileoff; __entry->len = len; __entry->caller_ip = caller_ip; __entry->flags = flags; ), - TP_printk("dev %d:%d ino 0x%llx size 
0x%llx bno 0x%llx len 0x%llx" + TP_printk("dev %d:%d ino 0x%llx disize 0x%llx fileoff 0x%llx fsbcount 0x%llx" "flags %s caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, - __entry->bno, + __entry->fileoff, __entry->len, __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS), (void *)__entry->caller_ip) @@ -1516,7 +1639,7 @@ DECLARE_EVENT_CLASS(xfs_extent_busy_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno %u agbno %u len %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1554,7 +1677,7 @@ TRACE_EVENT(xfs_extent_busy_trim, __entry->tbno = tbno; __entry->tlen = tlen; ), - TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x found_agbno 0x%x found_fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1601,7 +1724,7 @@ DECLARE_EVENT_CLASS(xfs_agf_class, __entry->longest = be32_to_cpu(agf->agf_longest); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " + TP_printk("dev %d:%d agno 0x%x flags %s length %u roots b %u c %u " "levels b %u c %u flfirst %u fllast %u flcount %u " "freeblks %u longest %u caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1650,7 +1773,7 @@ TRACE_EVENT(xfs_free_extent, __entry->haveleft = haveleft; __entry->haveright = haveright; ), - TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x resv %d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -1707,7 +1830,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->datatype = args->datatype; __entry->firstblock = args->tp->t_firstblock; ), - TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + TP_printk("dev %d:%d agno 0x%x agbno 0x%x minlen %u maxlen %u mod %u " "prod %u 
minleft %u total %u alignment %u minalignslop %u " "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " "datatype 0x%x firstblock 0x%llx", @@ -1785,7 +1908,7 @@ TRACE_EVENT(xfs_alloc_cur_check, __entry->diff = diff; __entry->new = new; ), - TP_printk("dev %d:%d btree %s bno 0x%x len 0x%x diff 0x%x new %d", + TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->bno, __entry->len, __entry->diff, __entry->new) @@ -2060,7 +2183,7 @@ DECLARE_EVENT_CLASS(xfs_swap_extent_class, __entry->fork_off = XFS_IFORK_BOFF(ip); ), TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, " - "broot size %d, fork offset %d", + "broot size %d, forkoff 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_symbolic(__entry->which, XFS_SWAPEXT_INODES), @@ -2186,7 +2309,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class, __entry->size = buf_f->blf_size; __entry->map_size = buf_f->blf_map_size; ), - TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, " + TP_printk("dev %d:%d daddr 0x%llx, bbcount 0x%x, flags 0x%x, size %d, " "map_size %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blkno, @@ -2237,7 +2360,7 @@ DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class, __entry->boffset = in_f->ilf_boffset; ), TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, " - "dsize %d, blkno 0x%llx, len %d, boffset %d", + "dsize %d, daddr 0x%llx, bbcount 0x%x, boffset %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -2278,10 +2401,14 @@ DECLARE_EVENT_CLASS(xfs_log_recover_icreate_item_class, __entry->length = be32_to_cpu(in_f->icl_length); __entry->gen = be32_to_cpu(in_f->icl_gen); ), - TP_printk("dev %d:%d agno %u agbno %u count %u isize %u length %u " - "gen %u", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->agno, __entry->agbno, __entry->count, __entry->isize, - __entry->length, 
__entry->gen) + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x ireccount %u isize %u gen 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->length, + __entry->count, + __entry->isize, + __entry->gen) ) #define DEFINE_LOG_RECOVER_ICREATE_ITEM(name) \ DEFINE_EVENT(xfs_log_recover_icreate_item_class, name, \ @@ -2307,7 +2434,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno %u agbno %u len %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2350,7 +2477,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, __entry->level = level; __entry->nlevels = cur->bc_nlevels; __entry->ptr = cur->bc_ptrs[level]; - __entry->daddr = bp ? bp->b_bn : -1; + __entry->daddr = bp ? xfs_buf_daddr(bp) : -1; ), TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), @@ -2466,7 +2593,7 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d op %d agno %u agbno %u len %u", + TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, @@ -2513,13 +2640,13 @@ DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class, __entry->l_state = state; __entry->op = op; ), - TP_printk("dev %d:%d op %d agno %u agbno %u owner %lld %s offset %llu len %llu state %d", + TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->op, __entry->agno, __entry->agbno, __entry->ino, - __entry->whichfork == XFS_ATTR_FORK ? 
"attr" : "data", + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->l_loff, __entry->l_len, __entry->l_state) @@ -2583,7 +2710,7 @@ DECLARE_EVENT_CLASS(xfs_rmap_class, if (unwritten) __entry->flags |= XFS_RMAP_UNWRITTEN; ), - TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%lx", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%lx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2616,7 +2743,7 @@ DECLARE_EVENT_CLASS(xfs_ag_error_class, __entry->error = error; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u error %d caller %pS", + TP_printk("dev %d:%d agno 0x%x error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->error, @@ -2663,7 +2790,7 @@ DECLARE_EVENT_CLASS(xfs_rmapbt_class, __entry->offset = offset; __entry->flags = flags; ), - TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2732,7 +2859,7 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class, __entry->asked = r ? 
r->ar_asked : 0; __entry->len = len; ), - TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u " + TP_printk("dev %d:%d agno 0x%x resv %d freeblks %u flcount %u " "resv %u ask %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, @@ -2785,7 +2912,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, __entry->agbno = agbno; __entry->dir = dir; ), - TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x cmp %s(%d)", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2818,7 +2945,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2853,7 +2980,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, __entry->refcount = irec->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2893,8 +3020,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, __entry->i2_blockcount = i2->rc_blockcount; __entry->i2_refcount = i2->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " + "agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2939,8 +3066,8 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->i2_refcount = i2->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u @ agbno 
%u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " + "agbno 0x%x fsbcount 0x%x refcount %u @ agbno 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2991,9 +3118,9 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, __entry->i3_blockcount = i3->rc_blockcount; __entry->i3_refcount = i3->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u", + TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u -- " + "agbno 0x%x fsbcount 0x%x refcount %u -- " + "agbno 0x%x fsbcount 0x%x refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -3080,7 +3207,7 @@ TRACE_EVENT(xfs_refcount_finish_one_leftover, __entry->new_agbno = new_agbno; __entry->new_len = new_len; ), - TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u", + TP_printk("dev %d:%d type %d agno 0x%x agbno 0x%x fsbcount 0x%x new_agbno 0x%x new_fsbcount 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->agno, @@ -3106,7 +3233,7 @@ DECLARE_EVENT_CLASS(xfs_inode_error_class, __entry->error = error; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino %llx error %d caller %pS", + TP_printk("dev %d:%d ino 0x%llx error %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->error, @@ -3132,7 +3259,7 @@ DECLARE_EVENT_CLASS(xfs_double_io_class, __field(loff_t, src_isize) __field(loff_t, src_disize) __field(loff_t, src_offset) - __field(size_t, len) + __field(long long, len) __field(xfs_ino_t, dest_ino) __field(loff_t, dest_isize) __field(loff_t, dest_disize) @@ -3150,9 +3277,9 @@ DECLARE_EVENT_CLASS(xfs_double_io_class, __entry->dest_disize = dest->i_disk_size; __entry->dest_offset = doffset; ), - TP_printk("dev %d:%d count %zd " - "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> " - "ino 0x%llx isize 0x%llx disize 
0x%llx offset 0x%llx", + TP_printk("dev %d:%d bytecount 0x%llx " + "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx -> " + "ino 0x%llx isize 0x%llx disize 0x%llx pos 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->len, __entry->src_ino, @@ -3191,7 +3318,7 @@ DECLARE_EVENT_CLASS(xfs_inode_irec_class, __entry->pblk = irec->br_startblock; __entry->state = irec->br_state; ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu st %d", + TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx fsbcount 0x%x startblock 0x%llx st %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->lblk, @@ -3231,9 +3358,8 @@ TRACE_EVENT(xfs_reflink_remap_blocks, __entry->dest_ino = dest->i_ino; __entry->dest_lblk = doffset; ), - TP_printk("dev %d:%d len 0x%llx " - "ino 0x%llx offset 0x%llx blocks -> " - "ino 0x%llx offset 0x%llx blocks", + TP_printk("dev %d:%d fsbcount 0x%llx " + "ino 0x%llx fileoff 0x%llx -> ino 0x%llx fileoff 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->len, __entry->src_ino, @@ -3272,9 +3398,7 @@ TRACE_EVENT(xfs_ioctl_clone, __entry->dest_ino = dest->i_ino; __entry->dest_isize = i_size_read(dest); ), - TP_printk("dev %d:%d " - "ino 0x%lx isize 0x%llx -> " - "ino 0x%lx isize 0x%llx", + TP_printk("dev %d:%d ino 0x%lx isize 0x%llx -> ino 0x%lx isize 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->src_ino, __entry->src_isize, @@ -3310,7 +3434,7 @@ DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); /* fsmap traces */ DECLARE_EVENT_CLASS(xfs_fsmap_class, TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, - struct xfs_rmap_irec *rmap), + const struct xfs_rmap_irec *rmap), TP_ARGS(mp, keydev, agno, rmap), TP_STRUCT__entry( __field(dev_t, dev) @@ -3332,7 +3456,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, __entry->offset = rmap->rm_offset; __entry->flags = rmap->rm_flags; ), - TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x", + TP_printk("dev %d:%d keydev %d:%d 
agno 0x%x startblock 0x%llx fsbcount 0x%llx owner 0x%llx fileoff 0x%llx flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, @@ -3345,7 +3469,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class, #define DEFINE_FSMAP_EVENT(name) \ DEFINE_EVENT(xfs_fsmap_class, name, \ TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \ - struct xfs_rmap_irec *rmap), \ + const struct xfs_rmap_irec *rmap), \ TP_ARGS(mp, keydev, agno, rmap)) DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); @@ -3372,7 +3496,7 @@ DECLARE_EVENT_CLASS(xfs_getfsmap_class, __entry->offset = fsmap->fmr_offset; __entry->flags = fsmap->fmr_flags; ), - TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx", + TP_printk("dev %d:%d keydev %d:%d daddr 0x%llx bbcount 0x%llx owner 0x%llx fileoff_daddr 0x%llx flags 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->block, @@ -3471,7 +3595,7 @@ TRACE_EVENT(xfs_iunlink_update_bucket, __entry->old_ptr = old_ptr; __entry->new_ptr = new_ptr; ), - TP_printk("dev %d:%d agno %u bucket %u old 0x%x new 0x%x", + TP_printk("dev %d:%d agno 0x%x bucket %u old 0x%x new 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->bucket, @@ -3497,7 +3621,7 @@ TRACE_EVENT(xfs_iunlink_update_dinode, __entry->old_ptr = old_ptr; __entry->new_ptr = new_ptr; ), - TP_printk("dev %d:%d agno %u agino 0x%x old 0x%x new 0x%x", + TP_printk("dev %d:%d agno 0x%x agino 0x%x old 0x%x new 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agino, @@ -3518,7 +3642,7 @@ DECLARE_EVENT_CLASS(xfs_ag_inode_class, __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); ), - TP_printk("dev %d:%d agno %u agino %u", + TP_printk("dev %d:%d agno 0x%x agino 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, 
__entry->agino) ) @@ -3570,7 +3694,7 @@ DECLARE_EVENT_CLASS(xfs_ag_corrupt_class, __entry->agno = agno; __entry->flags = flags; ), - TP_printk("dev %d:%d agno %u flags 0x%x", + TP_printk("dev %d:%d agno 0x%x flags 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->flags) ); @@ -3621,7 +3745,7 @@ TRACE_EVENT(xfs_iwalk_ag, __entry->agno = agno; __entry->startino = startino; ), - TP_printk("dev %d:%d agno %d startino %u", + TP_printk("dev %d:%d agno 0x%x startino 0x%x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startino) ) @@ -3642,7 +3766,7 @@ TRACE_EVENT(xfs_iwalk_ag_rec, __entry->startino = irec->ir_startino; __entry->freemask = irec->ir_free; ), - TP_printk("dev %d:%d agno %d startino %u freemask 0x%llx", + TP_printk("dev %d:%d agno 0x%x startino 0x%x freemask 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startino, __entry->freemask) ) @@ -3689,8 +3813,6 @@ DEFINE_EVENT(xfs_kmem_class, name, \ TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \ TP_ARGS(size, flags, caller_ip)) DEFINE_KMEM_EVENT(kmem_alloc); -DEFINE_KMEM_EVENT(kmem_alloc_io); -DEFINE_KMEM_EVENT(kmem_alloc_large); TRACE_EVENT(xfs_check_new_dalign, TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino), @@ -3707,7 +3829,7 @@ TRACE_EVENT(xfs_check_new_dalign, __entry->sb_rootino = mp->m_sb.sb_rootino; __entry->calc_rootino = calc_rootino; ), - TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu", + TP_printk("dev %d:%d new_dalign %d sb_rootino 0x%llx calc_rootino 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->new_dalign, __entry->sb_rootino, __entry->calc_rootino) @@ -3732,7 +3854,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot, __entry->levels = cur->bc_ag.afake->af_levels; __entry->blocks = cur->bc_ag.afake->af_blocks; ), - TP_printk("dev %d:%d btree %s ag %u levels %u blocks %u root %u", + TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u", 
MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->agno, @@ -3764,12 +3886,12 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot, __entry->blocks = cur->bc_ino.ifake->if_blocks; __entry->whichfork = cur->bc_ino.whichfork; ), - TP_printk("dev %d:%d btree %s ag %u agino %u whichfork %s levels %u blocks %u", + TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->agno, __entry->agino, - __entry->whichfork == XFS_ATTR_FORK ? "attr" : "data", + __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS), __entry->levels, __entry->blocks) ) @@ -3847,7 +3969,7 @@ TRACE_EVENT(xfs_btree_bload_block, } __entry->nr_records = nr_records; ), - TP_printk("dev %d:%d btree %s level %u block %llu/%llu fsb (%u/%u) recs %u", + TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u", MAJOR(__entry->dev), MINOR(__entry->dev), __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS), __entry->level, @@ -3934,7 +4056,6 @@ TRACE_DEFINE_ENUM(XLOG_STATE_SYNCING); TRACE_DEFINE_ENUM(XLOG_STATE_DONE_SYNC); TRACE_DEFINE_ENUM(XLOG_STATE_CALLBACK); TRACE_DEFINE_ENUM(XLOG_STATE_DIRTY); -TRACE_DEFINE_ENUM(XLOG_STATE_IOERROR); DECLARE_EVENT_CLASS(xlog_iclog_class, TP_PROTO(struct xlog_in_core *iclog, unsigned long caller_ip), @@ -3990,6 +4111,57 @@ DEFINE_ICLOG_EVENT(xlog_iclog_want_sync); DEFINE_ICLOG_EVENT(xlog_iclog_wait_on); DEFINE_ICLOG_EVENT(xlog_iclog_write); +DECLARE_EVENT_CLASS(xfs_das_state_class, + TP_PROTO(int das, struct xfs_inode *ip), + TP_ARGS(das, ip), + TP_STRUCT__entry( + __field(int, das) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->das = das; + __entry->ino = ip->i_ino; + ), + TP_printk("state change %d ino 0x%llx", + __entry->das, __entry->ino) +) + +#define DEFINE_DAS_STATE_EVENT(name) \ +DEFINE_EVENT(xfs_das_state_class, name, \ + TP_PROTO(int das, struct 
xfs_inode *ip), \ + TP_ARGS(das, ip)) +DEFINE_DAS_STATE_EVENT(xfs_attr_sf_addname_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_set_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_node_addname_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return); +DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return); + +TRACE_EVENT(xfs_force_shutdown, + TP_PROTO(struct xfs_mount *mp, int ptag, int flags, const char *fname, + int line_num), + TP_ARGS(mp, ptag, flags, fname, line_num), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ptag) + __field(int, flags) + __string(fname, fname) + __field(int, line_num) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ptag = ptag; + __entry->flags = flags; + __assign_str(fname, fname); + __entry->line_num = line_num; + ), + TP_printk("dev %d:%d tag %s flags %s file %s line_num %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_flags(__entry->ptag, "|", XFS_PTAG_STRINGS), + __print_flags(__entry->flags, "|", XFS_SHUTDOWN_STRINGS), + __get_str(fname), + __entry->line_num) +); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 87bffd12c20c..67dec11e34c7 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -9,7 +9,6 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" -#include "xfs_log_priv.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_extent_busy.h" @@ -17,6 +16,7 @@ #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #include "xfs_trace.h" #include "xfs_error.h" #include "xfs_defer.h" @@ -275,7 +275,7 @@ retry: WARN_ON(resp->tr_logres > 0 && mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE); ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) || - xfs_sb_version_haslazysbcount(&mp->m_sb)); + xfs_has_lazysbcount(mp)); tp->t_magic = XFS_TRANS_HEADER_MAGIC; tp->t_flags = flags; @@ -295,10 +295,7 @@ retry: * Do not perform a synchronous scan because 
callers can hold * other locks. */ - error = xfs_blockgc_free_space(mp, NULL); - if (error) - return error; - + xfs_blockgc_flush_all(mp); want_retry = false; goto retry; } @@ -367,12 +364,12 @@ xfs_trans_mod_sb( switch (field) { case XFS_TRANS_SB_ICOUNT: tp->t_icount_delta += delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_IFREE: tp->t_ifree_delta += delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FDBLOCKS: @@ -401,7 +398,7 @@ xfs_trans_mod_sb( delta -= blkres_delta; } tp->t_fdblocks_delta += delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_RES_FDBLOCKS: @@ -411,7 +408,7 @@ xfs_trans_mod_sb( * be applied to the on-disk superblock. */ tp->t_res_fdblocks_delta += delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb)) + if (xfs_has_lazysbcount(mp)) flags &= ~XFS_TRANS_SB_DIRTY; break; case XFS_TRANS_SB_FREXTENTS: @@ -490,7 +487,7 @@ xfs_trans_apply_sb_deltas( /* * Only update the superblock counters if we are logging them */ - if (!xfs_sb_version_haslazysbcount(&(tp->t_mountp->m_sb))) { + if (!xfs_has_lazysbcount((tp->t_mountp))) { if (tp->t_icount_delta) be64_add_cpu(&sbp->sb_icount, tp->t_icount_delta); if (tp->t_ifree_delta) @@ -588,7 +585,7 @@ xfs_trans_unreserve_and_mod_sb( if (tp->t_blk_res > 0) blkdelta = tp->t_blk_res; if ((tp->t_fdblocks_delta != 0) && - (xfs_sb_version_haslazysbcount(&mp->m_sb) || + (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY))) blkdelta += tp->t_fdblocks_delta; @@ -598,7 +595,7 @@ xfs_trans_unreserve_and_mod_sb( (tp->t_flags & XFS_TRANS_SB_DIRTY)) rtxdelta += tp->t_frextents_delta; - if (xfs_sb_version_haslazysbcount(&mp->m_sb) || + if (xfs_has_lazysbcount(mp) || (tp->t_flags & XFS_TRANS_SB_DIRTY)) { idelta = tp->t_icount_delta; ifreedelta = 
tp->t_ifree_delta; @@ -778,7 +775,7 @@ xfs_trans_committed_bulk( * object into the AIL as we are in a shutdown situation. */ if (aborted) { - ASSERT(XFS_FORCED_SHUTDOWN(ailp->ail_mount)); + ASSERT(xfs_is_shutdown(ailp->ail_mount)); if (lip->li_ops->iop_unpin) lip->li_ops->iop_unpin(lip, 1); continue; @@ -867,7 +864,7 @@ __xfs_trans_commit( if (!(tp->t_flags & XFS_TRANS_DIRTY)) goto out_unreserve; - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { error = -EIO; goto out_unreserve; } @@ -908,7 +905,7 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - if (regrant && !XLOG_FORCED_SHUTDOWN(mp->m_log)) + if (regrant && !xlog_is_shutdown(mp->m_log)) xfs_log_ticket_regrant(mp->m_log, tp->t_ticket); else xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); @@ -953,12 +950,12 @@ xfs_trans_cancel( * filesystem. This happens in paths where we detect * corruption and decide to give up. */ - if (dirty && !XFS_FORCED_SHUTDOWN(mp)) { + if (dirty && !xfs_is_shutdown(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { + if (!dirty && !xfs_is_shutdown(mp)) { struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index dbb69b4bf3ed..2a8c8dc54c95 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -17,6 +17,7 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_log.h" +#include "xfs_log_priv.h" #ifdef DEBUG /* @@ -429,8 +430,12 @@ xfsaild_push( /* * If we encountered pinned items or did not finish writing out all - * buffers the last time we ran, force the log first and wait for it - * before pushing again. + * buffers the last time we ran, force a background CIL push to get the + * items unpinned in the near future. 
We do not wait on the CIL push as + * that could stall us for seconds if there is enough background IO + * load. Stalling for that long when the tail of the log is pinned and + * needs flushing will hard stop the transaction subsystem when log + * space runs out. */ if (ailp->ail_log_flush && ailp->ail_last_pushed_lsn == 0 && (!list_empty_careful(&ailp->ail_buf_list) || @@ -438,7 +443,7 @@ xfsaild_push( ailp->ail_log_flush = 0; XFS_STATS_INC(mp, xs_push_ail_flush); - xfs_log_force(mp, XFS_LOG_SYNC); + xlog_cil_flush(mp->m_log); } spin_lock(&ailp->ail_lock); @@ -615,7 +620,7 @@ xfsaild( * opportunity to release such buffers from the queue. */ ASSERT(list_empty(&ailp->ail_buf_list) || - XFS_FORCED_SHUTDOWN(ailp->ail_mount)); + xfs_is_shutdown(ailp->ail_mount)); xfs_buf_delwri_cancel(&ailp->ail_buf_list); break; } @@ -678,7 +683,7 @@ xfs_ail_push( struct xfs_log_item *lip; lip = xfs_ail_min(ailp); - if (!lip || XFS_FORCED_SHUTDOWN(ailp->ail_mount) || + if (!lip || xfs_is_shutdown(ailp->ail_mount) || XFS_LSN_CMP(threshold_lsn, ailp->ail_target) <= 0) return; @@ -743,7 +748,7 @@ xfs_ail_update_finish( return; } - if (!XFS_FORCED_SHUTDOWN(mp)) + if (!xfs_is_shutdown(mp)) xlog_assign_tail_lsn_locked(mp); if (list_empty(&ailp->ail_head)) @@ -863,7 +868,7 @@ xfs_trans_ail_delete( spin_lock(&ailp->ail_lock); if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); - if (shutdown_type && !XFS_FORCED_SHUTDOWN(mp)) { + if (shutdown_type && !xfs_is_shutdown(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, "%s: attempting to delete a log item that is not in the AIL", __func__); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index d11d032da0b4..6549e50d852c 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -38,7 +38,7 @@ xfs_trans_buf_item_match( blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && - XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && + 
xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn && blip->bli_buf->b_length == len) { ASSERT(blip->bli_buf->b_map_count == nmaps); return blip->bli_buf; @@ -138,7 +138,7 @@ xfs_trans_get_buf_map( bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); - if (XFS_FORCED_SHUTDOWN(tp->t_mountp)) { + if (xfs_is_shutdown(tp->t_mountp)) { xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; } @@ -244,7 +244,7 @@ xfs_trans_read_buf_map( * We never locked this buf ourselves, so we shouldn't * brelse it either. Just get out. */ - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { trace_xfs_trans_read_buf_shut(bp, _RET_IP_); return -EIO; } @@ -300,7 +300,7 @@ xfs_trans_read_buf_map( return error; } - if (XFS_FORCED_SHUTDOWN(mp)) { + if (xfs_is_shutdown(mp)) { xfs_buf_relse(bp); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); return -EIO; diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 48e09ea30ee5..3872ce671411 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -58,7 +58,7 @@ xfs_trans_log_dquot( /* Upgrade the dquot to bigtime format if possible. 
*/ if (dqp->q_id != 0 && - xfs_sb_version_hasbigtime(&tp->t_mountp->m_sb) && + xfs_has_bigtime(tp->t_mountp) && !(dqp->q_type & XFS_DQTYPE_BIGTIME)) dqp->q_type |= XFS_DQTYPE_BIGTIME; @@ -132,8 +132,7 @@ xfs_trans_mod_dquot_byino( { xfs_mount_t *mp = tp->t_mountp; - if (!XFS_IS_QUOTA_RUNNING(mp) || - !XFS_IS_QUOTA_ON(mp) || + if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino)) return; @@ -192,7 +191,7 @@ xfs_trans_mod_dquot( struct xfs_dqtrx *qtrx; ASSERT(tp); - ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); + ASSERT(XFS_IS_QUOTA_ON(tp->t_mountp)); qtrx = NULL; if (!delta) @@ -738,7 +737,7 @@ xfs_trans_reserve_quota_bydquots( { int error; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; ASSERT(flags & XFS_QMOPT_RESBLK_MASK); @@ -795,7 +794,7 @@ xfs_trans_reserve_quota_nblks( unsigned int qflags = 0; int error; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino)); @@ -836,51 +835,13 @@ xfs_trans_reserve_quota_icreate( { struct xfs_mount *mp = tp->t_mountp; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) + if (!XFS_IS_QUOTA_ON(mp)) return 0; return xfs_trans_reserve_quota_bydquots(tp, mp, udqp, gdqp, pdqp, dblocks, 1, XFS_QMOPT_RES_REGBLKS); } -/* - * This routine is called to allocate a quotaoff log item. - */ -struct xfs_qoff_logitem * -xfs_trans_get_qoff_item( - struct xfs_trans *tp, - struct xfs_qoff_logitem *startqoff, - uint flags) -{ - struct xfs_qoff_logitem *q; - - ASSERT(tp != NULL); - - q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags); - ASSERT(q != NULL); - - /* - * Get a log_item_desc to point at the new item. - */ - xfs_trans_add_item(tp, &q->qql_item); - return q; -} - - -/* - * This is called to mark the quotaoff logitem as needing - * to be logged when the transaction is committed. The logitem must - * already be associated with the given transaction. 
- */ -void -xfs_trans_log_quotaoff_item( - struct xfs_trans *tp, - struct xfs_qoff_logitem *qlp) -{ - tp->t_flags |= XFS_TRANS_DIRTY; - set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags); -} - STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp) |