diff options
Diffstat (limited to 'fs')
67 files changed, 3898 insertions, 1044 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index e7e9d0cde51a..b6b3d052ca86 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -310,7 +310,10 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ + if (mmap_read_lock_killable(mm)) + return -EINTR; vma = find_extend_vma(mm, bprm->p); + mmap_read_unlock(mm); if (!vma) return -EFAULT; diff --git a/fs/buffer.c b/fs/buffer.c index 5a28a6aa7f16..23f645657488 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -842,13 +842,13 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, struct buffer_head *bh, *head; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; - struct mem_cgroup *memcg; + struct mem_cgroup *memcg, *old_memcg; if (retry) gfp |= __GFP_NOFAIL; memcg = get_mem_cgroup_from_page(page); - memalloc_use_memcg(memcg); + old_memcg = set_active_memcg(memcg); head = NULL; offset = PAGE_SIZE; @@ -867,7 +867,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, set_bh_page(bh, page, offset); } out: - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); mem_cgroup_put(memcg); return head; /* @@ -559,8 +559,11 @@ fallback: } /** - * dax_layout_busy_page - find first pinned page in @mapping + * dax_layout_busy_page_range - find first pinned page in @mapping * @mapping: address space to scan for a page with ref count > 1 + * @start: Starting offset. Page containing 'start' is included. + * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX, + * pages from 'start' till the end of file are included. * * DAX requires ZONE_DEVICE mapped pages. These pages are never * 'onlined' to the page allocator so they are considered idle when @@ -573,12 +576,15 @@ fallback: * to be able to run unmap_mapping_range() and subsequently not race * mapping_mapped() becoming true. 
*/ -struct page *dax_layout_busy_page(struct address_space *mapping) +struct page *dax_layout_busy_page_range(struct address_space *mapping, + loff_t start, loff_t end) { - XA_STATE(xas, &mapping->i_pages, 0); void *entry; unsigned int scanned = 0; struct page *page = NULL; + pgoff_t start_idx = start >> PAGE_SHIFT; + pgoff_t end_idx; + XA_STATE(xas, &mapping->i_pages, start_idx); /* * In the 'limited' case get_user_pages() for dax is disabled. @@ -589,6 +595,11 @@ struct page *dax_layout_busy_page(struct address_space *mapping) if (!dax_mapping(mapping) || !mapping_mapped(mapping)) return NULL; + /* If end == LLONG_MAX, all pages from start to till end of file */ + if (end == LLONG_MAX) + end_idx = ULONG_MAX; + else + end_idx = end >> PAGE_SHIFT; /* * If we race get_user_pages_fast() here either we'll see the * elevated page count in the iteration and wait, or @@ -596,15 +607,15 @@ struct page *dax_layout_busy_page(struct address_space *mapping) * against is no longer mapped in the page tables and bail to the * get_user_pages() slow path. The slow path is protected by * pte_lock() and pmd_lock(). New references are not taken without - * holding those locks, and unmap_mapping_range() will not zero the + * holding those locks, and unmap_mapping_pages() will not zero the * pte or pmd without holding the respective lock, so we are * guaranteed to either see new references or prevent new * references from being established. 
*/ - unmap_mapping_range(mapping, 0, 0, 0); + unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0); xas_lock_irq(&xas); - xas_for_each(&xas, entry, ULONG_MAX) { + xas_for_each(&xas, entry, end_idx) { if (WARN_ON_ONCE(!xa_is_value(entry))) continue; if (unlikely(dax_is_locked(entry))) @@ -625,6 +636,12 @@ struct page *dax_layout_busy_page(struct address_space *mapping) xas_unlock_irq(&xas); return page; } +EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); + +struct page *dax_layout_busy_page(struct address_space *mapping) +{ + return dax_layout_busy_page_range(mapping, 0, LLONG_MAX); +} EXPORT_SYMBOL_GPL(dax_layout_busy_page); static int __dax_invalidate_entry(struct address_space *mapping, diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 774b2618018a..40ce9a1c12e5 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -8,7 +8,7 @@ config FUSE_FS There's also a companion library: libfuse2. This library is available from the FUSE homepage: - <http://fuse.sourceforge.net/> + <https://github.com/libfuse/> although chances are your distribution already has that library installed if you've installed the "fuse" package itself. @@ -38,3 +38,17 @@ config VIRTIO_FS If you want to share files between guests or with the host, answer Y or M. + +config FUSE_DAX + bool "Virtio Filesystem Direct Host Memory Access support" + default y + select INTERVAL_TREE + depends on VIRTIO_FS + depends on FS_DAX + depends on DAX_DRIVER + help + This allows bypassing guest page cache and allows mapping host page + cache directly in guest address space. + + If you want to allow mounting a Virtio Filesystem with the "dax" + option, answer Y. 
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 3e8cebfb59b7..8c7021fb2cd4 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -7,5 +7,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o -virtiofs-y += virtio_fs.o +fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o +fuse-$(CONFIG_FUSE_DAX) += dax.o + +virtiofs-y := virtio_fs.o diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a1303ad303ba..cc7e94d73c6c 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -164,6 +164,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, { unsigned val; struct fuse_conn *fc; + struct fuse_mount *fm; ssize_t ret; ret = fuse_conn_limit_write(file, buf, count, ppos, &val, @@ -174,18 +175,27 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file, if (!fc) goto out; + down_read(&fc->killsb); spin_lock(&fc->bg_lock); fc->congestion_threshold = val; - if (fc->sb) { + + /* + * Get any fuse_mount belonging to this fuse_conn; s_bdi is + * shared between all of them + */ + + if (!list_empty(&fc->mounts)) { + fm = list_first_entry(&fc->mounts, struct fuse_mount, fc_entry); if (fc->num_background < fc->congestion_threshold) { - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } else { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } } spin_unlock(&fc->bg_lock); + up_read(&fc->killsb); fuse_conn_put(fc); out: return ret; diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 2cc17816d7b1..45082269e698 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -57,6 +57,7 @@ struct cuse_conn { struct 
list_head list; /* linked on cuse_conntbl */ + struct fuse_mount fm; /* Dummy mount referencing fc */ struct fuse_conn fc; /* fuse connection */ struct cdev *cdev; /* associated character device */ struct device *dev; /* device representing @cdev */ @@ -134,7 +135,7 @@ static int cuse_open(struct inode *inode, struct file *file) * Generic permission check is already done against the chrdev * file, proceed to open. */ - rc = fuse_do_open(&cc->fc, 0, file, 0); + rc = fuse_do_open(&cc->fm, 0, file, 0); if (rc) fuse_conn_put(&cc->fc); return rc; @@ -143,10 +144,10 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; fuse_sync_release(NULL, ff, file->f_flags); - fuse_conn_put(fc); + fuse_conn_put(fm->fc); return 0; } @@ -155,7 +156,7 @@ static long cuse_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); unsigned int flags = 0; if (cc->unrestricted_ioctl) @@ -168,7 +169,7 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_file *ff = file->private_data; - struct cuse_conn *cc = fc_to_cc(ff->fc); + struct cuse_conn *cc = fc_to_cc(ff->fm->fc); unsigned int flags = FUSE_IOCTL_COMPAT; if (cc->unrestricted_ioctl) @@ -313,9 +314,10 @@ struct cuse_init_args { * required data structures for it. Please read the comment at the * top of this file for high level overview. 
*/ -static void cuse_process_init_reply(struct fuse_conn *fc, +static void cuse_process_init_reply(struct fuse_mount *fm, struct fuse_args *args, int error) { + struct fuse_conn *fc = fm->fc; struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args); struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; @@ -424,7 +426,7 @@ static int cuse_send_init(struct cuse_conn *cc) { int rc; struct page *page; - struct fuse_conn *fc = &cc->fc; + struct fuse_mount *fm = &cc->fm; struct cuse_init_args *ia; struct fuse_args_pages *ap; @@ -460,7 +462,7 @@ static int cuse_send_init(struct cuse_conn *cc) ia->desc.length = ap->args.out_args[1].size; ap->args.end = cuse_process_init_reply; - rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + rc = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (rc) { kfree(ia); err_free_page: @@ -506,7 +508,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file) * Limit the cuse channel to requests that can * be represented in file->f_cred->user_ns. */ - fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); + fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns, + &fuse_dev_fiq_ops, NULL); fud = fuse_dev_alloc_install(&cc->fc); if (!fud) { diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c new file mode 100644 index 000000000000..ff99ab2a3c43 --- /dev/null +++ b/fs/fuse/dax.c @@ -0,0 +1,1365 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * dax: direct host memory access + * Copyright (C) 2020 Red Hat, Inc. + */ + +#include "fuse_i.h" + +#include <linux/delay.h> +#include <linux/dax.h> +#include <linux/uio.h> +#include <linux/pfn_t.h> +#include <linux/iomap.h> +#include <linux/interval_tree.h> + +/* + * Default memory range size. A power of 2 so it agrees with common FUSE_INIT + * map_alignment values 4KB and 64KB. 
+ */ +#define FUSE_DAX_SHIFT 21 +#define FUSE_DAX_SZ (1 << FUSE_DAX_SHIFT) +#define FUSE_DAX_PAGES (FUSE_DAX_SZ / PAGE_SIZE) + +/* Number of ranges reclaimer will try to free in one invocation */ +#define FUSE_DAX_RECLAIM_CHUNK (10) + +/* + * Dax memory reclaim threshold in percetage of total ranges. When free + * number of free ranges drops below this threshold, reclaim can trigger + * Default is 20% + */ +#define FUSE_DAX_RECLAIM_THRESHOLD (20) + +/** Translation information for file offsets to DAX window offsets */ +struct fuse_dax_mapping { + /* Pointer to inode where this memory range is mapped */ + struct inode *inode; + + /* Will connect in fcd->free_ranges to keep track of free memory */ + struct list_head list; + + /* For interval tree in file/inode */ + struct interval_tree_node itn; + + /* Will connect in fc->busy_ranges to keep track busy memory */ + struct list_head busy_list; + + /** Position in DAX window */ + u64 window_offset; + + /** Length of mapping, in bytes */ + loff_t length; + + /* Is this mapping read-only or read-write */ + bool writable; + + /* reference count when the mapping is used by dax iomap. 
*/ + refcount_t refcnt; +}; + +/* Per-inode dax map */ +struct fuse_inode_dax { + /* Semaphore to protect modifications to the dmap tree */ + struct rw_semaphore sem; + + /* Sorted rb tree of struct fuse_dax_mapping elements */ + struct rb_root_cached tree; + unsigned long nr; +}; + +struct fuse_conn_dax { + /* DAX device */ + struct dax_device *dev; + + /* Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /* List of memory ranges which are busy */ + unsigned long nr_busy_ranges; + struct list_head busy_ranges; + + /* Worker to free up memory ranges */ + struct delayed_work free_work; + + /* Wait queue for a dax range to become free */ + wait_queue_head_t range_waitq; + + /* DAX Window Free Ranges */ + long nr_free_ranges; + struct list_head free_ranges; + + unsigned long nr_ranges; +}; + +static inline struct fuse_dax_mapping * +node_to_dmap(struct interval_tree_node *node) +{ + if (!node) + return NULL; + + return container_of(node, struct fuse_dax_mapping, itn); +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode); + +static void +__kick_dmap_free_worker(struct fuse_conn_dax *fcd, unsigned long delay_ms) +{ + unsigned long free_threshold; + + /* If number of free ranges are below threshold, start reclaim */ + free_threshold = max_t(unsigned long, fcd->nr_ranges * FUSE_DAX_RECLAIM_THRESHOLD / 100, + 1); + if (fcd->nr_free_ranges < free_threshold) + queue_delayed_work(system_long_wq, &fcd->free_work, + msecs_to_jiffies(delay_ms)); +} + +static void kick_dmap_free_worker(struct fuse_conn_dax *fcd, + unsigned long delay_ms) +{ + spin_lock(&fcd->lock); + __kick_dmap_free_worker(fcd, delay_ms); + spin_unlock(&fcd->lock); +} + +static struct fuse_dax_mapping *alloc_dax_mapping(struct fuse_conn_dax *fcd) +{ + struct fuse_dax_mapping *dmap; + + spin_lock(&fcd->lock); + dmap = list_first_entry_or_null(&fcd->free_ranges, + struct fuse_dax_mapping, list); + if (dmap) { + 
list_del_init(&dmap->list); + WARN_ON(fcd->nr_free_ranges <= 0); + fcd->nr_free_ranges--; + } + spin_unlock(&fcd->lock); + + kick_dmap_free_worker(fcd, 0); + return dmap; +} + +/* This assumes fcd->lock is held */ +static void __dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_del_init(&dmap->busy_list); + WARN_ON(fcd->nr_busy_ranges == 0); + fcd->nr_busy_ranges--; +} + +static void dmap_remove_busy_list(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + spin_lock(&fcd->lock); + __dmap_remove_busy_list(fcd, dmap); + spin_unlock(&fcd->lock); +} + +/* This assumes fcd->lock is held */ +static void __dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + list_add_tail(&dmap->list, &fcd->free_ranges); + fcd->nr_free_ranges++; + wake_up(&fcd->range_waitq); +} + +static void dmap_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + /* Return fuse_dax_mapping to free list */ + spin_lock(&fcd->lock); + __dmap_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); +} + +static int fuse_setup_one_mapping(struct inode *inode, unsigned long start_idx, + struct fuse_dax_mapping *dmap, bool writable, + bool upgrade) +{ + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn_dax *fcd = fm->fc->dax; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_setupmapping_in inarg; + loff_t offset = start_idx << FUSE_DAX_SHIFT; + FUSE_ARGS(args); + ssize_t err; + + WARN_ON(fcd->nr_free_ranges < 0); + + /* Ask fuse daemon to setup mapping */ + memset(&inarg, 0, sizeof(inarg)); + inarg.foffset = offset; + inarg.fh = -1; + inarg.moffset = dmap->window_offset; + inarg.len = FUSE_DAX_SZ; + inarg.flags |= FUSE_SETUPMAPPING_FLAG_READ; + if (writable) + inarg.flags |= FUSE_SETUPMAPPING_FLAG_WRITE; + args.opcode = FUSE_SETUPMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + 
err = fuse_simple_request(fm, &args); + if (err < 0) + return err; + dmap->writable = writable; + if (!upgrade) { + /* + * We don't take a refernce on inode. inode is valid right now + * and when inode is going away, cleanup logic should first + * cleanup dmap entries. + */ + dmap->inode = inode; + dmap->itn.start = dmap->itn.last = start_idx; + /* Protected by fi->dax->sem */ + interval_tree_insert(&dmap->itn, &fi->dax->tree); + fi->dax->nr++; + spin_lock(&fcd->lock); + list_add_tail(&dmap->busy_list, &fcd->busy_ranges); + fcd->nr_busy_ranges++; + spin_unlock(&fcd->lock); + } + return 0; +} + +static int fuse_send_removemapping(struct inode *inode, + struct fuse_removemapping_in *inargp, + struct fuse_removemapping_one *remove_one) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + FUSE_ARGS(args); + + args.opcode = FUSE_REMOVEMAPPING; + args.nodeid = fi->nodeid; + args.in_numargs = 2; + args.in_args[0].size = sizeof(*inargp); + args.in_args[0].value = inargp; + args.in_args[1].size = inargp->count * sizeof(*remove_one); + args.in_args[1].value = remove_one; + return fuse_simple_request(fm, &args); +} + +static int dmap_removemapping_list(struct inode *inode, unsigned int num, + struct list_head *to_remove) +{ + struct fuse_removemapping_one *remove_one, *ptr; + struct fuse_removemapping_in inarg; + struct fuse_dax_mapping *dmap; + int ret, i = 0, nr_alloc; + + nr_alloc = min_t(unsigned int, num, FUSE_REMOVEMAPPING_MAX_ENTRY); + remove_one = kmalloc_array(nr_alloc, sizeof(*remove_one), GFP_NOFS); + if (!remove_one) + return -ENOMEM; + + ptr = remove_one; + list_for_each_entry(dmap, to_remove, list) { + ptr->moffset = dmap->window_offset; + ptr->len = dmap->length; + ptr++; + i++; + num--; + if (i >= nr_alloc || num == 0) { + memset(&inarg, 0, sizeof(inarg)); + inarg.count = i; + ret = fuse_send_removemapping(inode, &inarg, + remove_one); + if (ret) + goto out; + ptr = remove_one; + i = 0; + } + } +out: + 
kfree(remove_one); + return ret; +} + +/* + * Cleanup dmap entry and add back to free list. This should be called with + * fcd->lock held. + */ +static void dmap_reinit_add_to_free_pool(struct fuse_conn_dax *fcd, + struct fuse_dax_mapping *dmap) +{ + pr_debug("fuse: freeing memory range start_idx=0x%lx end_idx=0x%lx window_offset=0x%llx length=0x%llx\n", + dmap->itn.start, dmap->itn.last, dmap->window_offset, + dmap->length); + __dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + __dmap_add_to_free_pool(fcd, dmap); +} + +/* + * Free inode dmap entries whose range falls inside [start, end]. + * Does not take any locks. At this point of time it should only be + * called from evict_inode() path where we know all dmap entries can be + * reclaimed. + */ +static void inode_reclaim_dmap_range(struct fuse_conn_dax *fcd, + struct inode *inode, + loff_t start, loff_t end) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap, *n; + int err, num = 0; + LIST_HEAD(to_remove); + unsigned long start_idx = start >> FUSE_DAX_SHIFT; + unsigned long end_idx = end >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + while (1) { + node = interval_tree_iter_first(&fi->dax->tree, start_idx, + end_idx); + if (!node) + break; + dmap = node_to_dmap(node); + /* inode is going away. There should not be any users of dmap */ + WARN_ON(refcount_read(&dmap->refcnt) > 1); + interval_tree_remove(&dmap->itn, &fi->dax->tree); + num++; + list_add(&dmap->list, &to_remove); + } + + /* Nothing to remove */ + if (list_empty(&to_remove)) + return; + + WARN_ON(fi->dax->nr < num); + fi->dax->nr -= num; + err = dmap_removemapping_list(inode, num, &to_remove); + if (err && err != -ENOTCONN) { + pr_warn("Failed to removemappings. 
start=0x%llx end=0x%llx\n", + start, end); + } + spin_lock(&fcd->lock); + list_for_each_entry_safe(dmap, n, &to_remove, list) { + list_del_init(&dmap->list); + dmap_reinit_add_to_free_pool(fcd, dmap); + } + spin_unlock(&fcd->lock); +} + +static int dmap_removemapping_one(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + struct fuse_removemapping_one forget_one; + struct fuse_removemapping_in inarg; + + memset(&inarg, 0, sizeof(inarg)); + inarg.count = 1; + memset(&forget_one, 0, sizeof(forget_one)); + forget_one.moffset = dmap->window_offset; + forget_one.len = dmap->length; + + return fuse_send_removemapping(inode, &inarg, &forget_one); +} + +/* + * It is called from evict_inode() and by that time inode is going away. So + * this function does not take any locks like fi->dax->sem for traversing + * that fuse inode interval tree. If that lock is taken then lock validator + * complains of deadlock situation w.r.t fs_reclaim lock. + */ +void fuse_dax_inode_cleanup(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * fuse_evict_inode() has already called truncate_inode_pages_final() + * before we arrive here. So we should not have to worry about any + * pages/exception entries still associated with inode. 
+ */ + inode_reclaim_dmap_range(fc->dax, inode, 0, -1); + WARN_ON(fi->dax->nr); +} + +static void fuse_fill_iomap_hole(struct iomap *iomap, loff_t length) +{ + iomap->addr = IOMAP_NULL_ADDR; + iomap->length = length; + iomap->type = IOMAP_HOLE; +} + +static void fuse_fill_iomap(struct inode *inode, loff_t pos, loff_t length, + struct iomap *iomap, struct fuse_dax_mapping *dmap, + unsigned int flags) +{ + loff_t offset, len; + loff_t i_size = i_size_read(inode); + + offset = pos - (dmap->itn.start << FUSE_DAX_SHIFT); + len = min(length, dmap->length - offset); + + /* If length is beyond end of file, truncate further */ + if (pos + len > i_size) + len = i_size - pos; + + if (len > 0) { + iomap->addr = dmap->window_offset + offset; + iomap->length = len; + if (flags & IOMAP_FAULT) + iomap->length = ALIGN(len, PAGE_SIZE); + iomap->type = IOMAP_MAPPED; + /* + * increace refcnt so that reclaim code knows this dmap is in + * use. This assumes fi->dax->sem mutex is held either + * shared/exclusive. + */ + refcount_inc(&dmap->refcnt); + + /* iomap->private should be NULL */ + WARN_ON_ONCE(iomap->private); + iomap->private = dmap; + } else { + /* Mapping beyond end of file is hole */ + fuse_fill_iomap_hole(iomap, length); + } +} + +static int fuse_setup_new_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + struct fuse_dax_mapping *dmap, *alloc_dmap = NULL; + int ret; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* + * Can't do inline reclaim in fault path. We call + * dax_layout_busy_page() before we free a range. And + * fuse_wait_dax_page() drops fi->i_mmap_sem lock and requires it. + * In fault path we enter with fi->i_mmap_sem held and can't drop + * it. 
Also in fault path we hold fi->i_mmap_sem shared and not + * exclusive, so that creates further issues with fuse_wait_dax_page(). + * Hence return -EAGAIN and fuse_dax_fault() will wait for a memory + * range to become free and retry. + */ + if (flags & IOMAP_FAULT) { + alloc_dmap = alloc_dax_mapping(fcd); + if (!alloc_dmap) + return -EAGAIN; + } else { + alloc_dmap = alloc_dax_mapping_reclaim(fcd, inode); + if (IS_ERR(alloc_dmap)) + return PTR_ERR(alloc_dmap); + } + + /* If we are here, we should have memory allocated */ + if (WARN_ON(!alloc_dmap)) + return -EIO; + + /* + * Take write lock so that only one caller can try to setup mapping + * and other waits. + */ + down_write(&fi->dax->sem); + /* + * We dropped lock. Check again if somebody else setup + * mapping already. + */ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return 0; + } + + /* Setup one mapping */ + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, alloc_dmap, + writable, false); + if (ret < 0) { + dmap_add_to_free_pool(fcd, alloc_dmap); + up_write(&fi->dax->sem); + return ret; + } + fuse_fill_iomap(inode, pos, length, iomap, alloc_dmap, flags); + up_write(&fi->dax->sem); + return 0; +} + +static int fuse_upgrade_dax_mapping(struct inode *inode, loff_t pos, + loff_t length, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + int ret; + unsigned long idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* + * Take exclusive lock so that only one caller can try to setup + * mapping and others wait. + */ + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, idx, idx); + + /* We are holding either inode lock or i_mmap_sem, and that should + * ensure that dmap can't be truncated. 
We are holding a reference + * on dmap and that should make sure it can't be reclaimed. So dmap + * should still be there in tree despite the fact we dropped and + * re-acquired the fi->dax->sem lock. + */ + ret = -EIO; + if (WARN_ON(!node)) + goto out_err; + + dmap = node_to_dmap(node); + + /* We took an extra reference on dmap to make sure its not reclaimd. + * Now we hold fi->dax->sem lock and that reference is not needed + * anymore. Drop it. + */ + if (refcount_dec_and_test(&dmap->refcnt)) { + /* refcount should not hit 0. This object only goes + * away when fuse connection goes away + */ + WARN_ON_ONCE(1); + } + + /* Maybe another thread already upgraded mapping while we were not + * holding lock. + */ + if (dmap->writable) { + ret = 0; + goto out_fill_iomap; + } + + ret = fuse_setup_one_mapping(inode, pos >> FUSE_DAX_SHIFT, dmap, true, + true); + if (ret < 0) + goto out_err; +out_fill_iomap: + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); +out_err: + up_write(&fi->dax->sem); + return ret; +} + +/* This is just for DAX and the mapping is ephemeral, do not use it for other + * purposes since there is no block device with a permanent mapping. + */ +static int fuse_iomap_begin(struct inode *inode, loff_t pos, loff_t length, + unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_dax_mapping *dmap; + bool writable = flags & IOMAP_WRITE; + unsigned long start_idx = pos >> FUSE_DAX_SHIFT; + struct interval_tree_node *node; + + /* We don't support FIEMAP */ + if (WARN_ON(flags & IOMAP_REPORT)) + return -EIO; + + iomap->offset = pos; + iomap->flags = 0; + iomap->bdev = NULL; + iomap->dax_dev = fc->dax->dev; + + /* + * Both read/write and mmap path can race here. So we need something + * to make sure if we are setting up mapping, then other path waits + * + * For now, use a semaphore for this. 
It probably needs to be + * optimized later. + */ + down_read(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + if (node) { + dmap = node_to_dmap(node); + if (writable && !dmap->writable) { + /* Upgrade read-only mapping to read-write. This will + * require exclusive fi->dax->sem lock as we don't want + * two threads to be trying to this simultaneously + * for same dmap. So drop shared lock and acquire + * exclusive lock. + * + * Before dropping fi->dax->sem lock, take reference + * on dmap so that its not freed by range reclaim. + */ + refcount_inc(&dmap->refcnt); + up_read(&fi->dax->sem); + pr_debug("%s: Upgrading mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + return fuse_upgrade_dax_mapping(inode, pos, length, + flags, iomap); + } else { + fuse_fill_iomap(inode, pos, length, iomap, dmap, flags); + up_read(&fi->dax->sem); + return 0; + } + } else { + up_read(&fi->dax->sem); + pr_debug("%s: no mapping at offset 0x%llx length 0x%llx\n", + __func__, pos, length); + if (pos >= i_size_read(inode)) + goto iomap_hole; + + return fuse_setup_new_dax_mapping(inode, pos, length, flags, + iomap); + } + + /* + * If read beyond end of file happnes, fs code seems to return + * it as hole + */ +iomap_hole: + fuse_fill_iomap_hole(iomap, length); + pr_debug("%s returning hole mapping. pos=0x%llx length_asked=0x%llx length_returned=0x%llx\n", + __func__, pos, length, iomap->length); + return 0; +} + +static int fuse_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, + struct iomap *iomap) +{ + struct fuse_dax_mapping *dmap = iomap->private; + + if (dmap) { + if (refcount_dec_and_test(&dmap->refcnt)) { + /* refcount should not hit 0. This object only goes + * away when fuse connection goes away + */ + WARN_ON_ONCE(1); + } + } + + /* DAX writes beyond end-of-file aren't handled using iomap, so the + * file size is unchanged and there is nothing to do here. 
+ */ + return 0; +} + +static const struct iomap_ops fuse_iomap_ops = { + .iomap_begin = fuse_iomap_begin, + .iomap_end = fuse_iomap_end, +}; + +static void fuse_wait_dax_page(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + up_write(&fi->i_mmap_sem); + schedule(); + down_write(&fi->i_mmap_sem); +} + +/* Should be called with fi->i_mmap_sem lock held exclusively */ +static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, + loff_t start, loff_t end) +{ + struct page *page; + + page = dax_layout_busy_page_range(inode->i_mapping, start, end); + if (!page) + return 0; + + *retry = true; + return ___wait_var_event(&page->_refcount, + atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, + 0, 0, fuse_wait_dax_page(inode)); +} + +/* dmap_end == 0 leads to unmapping of whole file */ +int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, + u64 dmap_end) +{ + bool retry; + int ret; + + do { + retry = false; + ret = __fuse_dax_break_layouts(inode, &retry, dmap_start, + dmap_end); + } while (ret == 0 && retry); + + return ret; +} + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock_shared(inode)) + return -EAGAIN; + } else { + inode_lock_shared(inode); + } + + ret = dax_iomap_rw(iocb, to, &fuse_iomap_ops); + inode_unlock_shared(inode); + + /* TODO file_accessed(iocb->f_filp) */ + return ret; +} + +static bool file_extending_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + return (iov_iter_rw(from) == WRITE && + ((iocb->ki_pos) >= i_size_read(inode) || + (iocb->ki_pos + iov_iter_count(from) > i_size_read(inode)))); +} + +static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + ssize_t ret; + + 
ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + if (ret < 0) + return ret; + + fuse_invalidate_attr(inode); + fuse_write_update_size(inode, iocb->ki_pos); + return ret; +} + +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else { + inode_lock(inode); + } + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; + + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + /* TODO file_update_time() but we don't want metadata I/O */ + + /* Do not use dax for file extending writes as write and on + * disk i_size increase are not atomic otherwise. + */ + if (file_extending_write(iocb, from)) + ret = fuse_dax_direct_write(iocb, from); + else + ret = dax_iomap_rw(iocb, from, &fuse_iomap_ops); + +out: + inode_unlock(inode); + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} + +static int fuse_dax_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + + return dax_writeback_mapping_range(mapping, fc->dax->dev, wbc); +} + +static vm_fault_t __fuse_dax_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, bool write) +{ + vm_fault_t ret; + struct inode *inode = file_inode(vmf->vma->vm_file); + struct super_block *sb = inode->i_sb; + pfn_t pfn; + int error = 0; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_conn_dax *fcd = fc->dax; + bool retry = false; + + if (write) + sb_start_pagefault(sb); +retry: + if (retry && !(fcd->nr_free_ranges > 0)) + wait_event(fcd->range_waitq, (fcd->nr_free_ranges > 0)); + + /* + * We need to serialize against not only truncate but also against + * fuse dax memory range reclaim. 
While a range is being reclaimed, + * we do not want any read/write/mmap to make progress and try + * to populate page cache or access memory we are trying to free. + */ + down_read(&get_fuse_inode(inode)->i_mmap_sem); + ret = dax_iomap_fault(vmf, pe_size, &pfn, &error, &fuse_iomap_ops); + if ((ret & VM_FAULT_ERROR) && error == -EAGAIN) { + error = 0; + retry = true; + up_read(&get_fuse_inode(inode)->i_mmap_sem); + goto retry; + } + + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); + up_read(&get_fuse_inode(inode)->i_mmap_sem); + + if (write) + sb_end_pagefault(sb); + + return ret; +} + +static vm_fault_t fuse_dax_fault(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, + vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_huge_fault(struct vm_fault *vmf, + enum page_entry_size pe_size) +{ + return __fuse_dax_fault(vmf, pe_size, vmf->flags & FAULT_FLAG_WRITE); +} + +static vm_fault_t fuse_dax_page_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static vm_fault_t fuse_dax_pfn_mkwrite(struct vm_fault *vmf) +{ + return __fuse_dax_fault(vmf, PE_SIZE_PTE, true); +} + +static const struct vm_operations_struct fuse_dax_vm_ops = { + .fault = fuse_dax_fault, + .huge_fault = fuse_dax_huge_fault, + .page_mkwrite = fuse_dax_page_mkwrite, + .pfn_mkwrite = fuse_dax_pfn_mkwrite, +}; + +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + vma->vm_ops = &fuse_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + return 0; +} + +static int dmap_writeback_invalidate(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + loff_t start_pos = dmap->itn.start << FUSE_DAX_SHIFT; + loff_t end_pos = (start_pos + FUSE_DAX_SZ - 1); + + ret = filemap_fdatawrite_range(inode->i_mapping, start_pos, end_pos); + if (ret) { + pr_debug("fuse: filemap_fdatawrite_range() failed. 
err=%d start_pos=0x%llx, end_pos=0x%llx\n", + ret, start_pos, end_pos); + return ret; + } + + ret = invalidate_inode_pages2_range(inode->i_mapping, + start_pos >> PAGE_SHIFT, + end_pos >> PAGE_SHIFT); + if (ret) + pr_debug("fuse: invalidate_inode_pages2_range() failed err=%d\n", + ret); + + return ret; +} + +static int reclaim_one_dmap_locked(struct inode *inode, + struct fuse_dax_mapping *dmap) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + + /* + * igrab() was done to make sure inode won't go under us, and this + * further avoids the race with evict(). + */ + ret = dmap_writeback_invalidate(inode, dmap); + if (ret) + return ret; + + /* Remove dax mapping from inode interval tree now */ + interval_tree_remove(&dmap->itn, &fi->dax->tree); + fi->dax->nr--; + + /* It is possible that umount/shutdown has killed the fuse connection + * and worker thread is trying to reclaim memory in parallel. Don't + * warn in that case. + */ + ret = dmap_removemapping_one(inode, dmap); + if (ret && ret != -ENOTCONN) { + pr_warn("Failed to remove mapping. offset=0x%llx len=0x%llx ret=%d\n", + dmap->window_offset, dmap->length, ret); + } + return 0; +} + +/* Find first mapped dmap for an inode and return file offset. Caller needs + * to hold fi->dax->sem lock either shared or exclusive. + */ +static struct fuse_dax_mapping *inode_lookup_first_dmap(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + for (node = interval_tree_iter_first(&fi->dax->tree, 0, -1); node; + node = interval_tree_iter_next(node, 0, -1)) { + dmap = node_to_dmap(node); + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) + continue; + + return dmap; + } + + return NULL; +} + +/* + * Find first mapping in the tree and free it and return it. Do not add + * it back to free pool. 
+ */ +static struct fuse_dax_mapping * +inode_inline_reclaim_one_dmap(struct fuse_conn_dax *fcd, struct inode *inode, + bool *retry) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + u64 dmap_start, dmap_end; + unsigned long start_idx; + int ret; + struct interval_tree_node *node; + + down_write(&fi->i_mmap_sem); + + /* Lookup a dmap and corresponding file offset to reclaim. */ + down_read(&fi->dax->sem); + dmap = inode_lookup_first_dmap(inode); + if (dmap) { + start_idx = dmap->itn.start; + dmap_start = start_idx << FUSE_DAX_SHIFT; + dmap_end = dmap_start + FUSE_DAX_SZ - 1; + } + up_read(&fi->dax->sem); + + if (!dmap) + goto out_mmap_sem; + /* + * Make sure there are no references to inode pages using + * get_user_pages() + */ + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("fuse: fuse_dax_break_layouts() failed. err=%d\n", + ret); + dmap = ERR_PTR(ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + /* Range already got reclaimed by somebody else */ + if (!node) { + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + dmap = node_to_dmap(node); + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) { + dmap = NULL; + if (retry) + *retry = true; + goto out_write_dmap_sem; + } + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) { + dmap = ERR_PTR(ret); + goto out_write_dmap_sem; + } + + /* Clean up dmap. Do not add back to free list */ + dmap_remove_busy_list(fcd, dmap); + dmap->inode = NULL; + dmap->itn.start = dmap->itn.last = 0; + + pr_debug("fuse: %s: inline reclaimed memory range. 
inode=%p, window_offset=0x%llx, length=0x%llx\n", + __func__, inode, dmap->window_offset, dmap->length); + +out_write_dmap_sem: + up_write(&fi->dax->sem); +out_mmap_sem: + up_write(&fi->i_mmap_sem); + return dmap; +} + +static struct fuse_dax_mapping * +alloc_dax_mapping_reclaim(struct fuse_conn_dax *fcd, struct inode *inode) +{ + struct fuse_dax_mapping *dmap; + struct fuse_inode *fi = get_fuse_inode(inode); + + while (1) { + bool retry = false; + + dmap = alloc_dax_mapping(fcd); + if (dmap) + return dmap; + + dmap = inode_inline_reclaim_one_dmap(fcd, inode, &retry); + /* + * Either we got a mapping or it is an error, return in both + * the cases. + */ + if (dmap) + return dmap; + + /* If we could not reclaim a mapping because it + * had a reference or some other temporary failure, + * try again. We want to give up inline reclaim only + * if there is no range assigned to this inode. Otherwise + * a deadlock is possible if we sleep with fi->i_mmap_sem + * held and the worker freeing memory can't make progress due + * to unavailability of fi->i_mmap_sem lock. So sleep + * only if fi->dax->nr=0 + */ + if (retry) + continue; + /* + * There are no mappings which can be reclaimed. Wait for one. + * We are not holding fi->dax->sem. So it is possible + * that range gets added now. But as we are not holding + * fi->i_mmap_sem, worker should still be able to free up + * a range and wake us up. + */ + if (!fi->dax->nr && !(fcd->nr_free_ranges > 0)) { + if (wait_event_killable_exclusive(fcd->range_waitq, + (fcd->nr_free_ranges > 0))) { + return ERR_PTR(-EINTR); + } + } + } +} + +static int lookup_and_reclaim_dmap_locked(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_dax_mapping *dmap; + struct interval_tree_node *node; + + /* Find fuse dax mapping at file offset inode.
*/ + node = interval_tree_iter_first(&fi->dax->tree, start_idx, start_idx); + + /* Range already got cleaned up by somebody else */ + if (!node) + return 0; + dmap = node_to_dmap(node); + + /* still in use. */ + if (refcount_read(&dmap->refcnt) > 1) + return 0; + + ret = reclaim_one_dmap_locked(inode, dmap); + if (ret < 0) + return ret; + + /* Cleanup dmap entry and add back to free list */ + spin_lock(&fcd->lock); + dmap_reinit_add_to_free_pool(fcd, dmap); + spin_unlock(&fcd->lock); + return ret; +} + +/* + * Free a range of memory. + * Locking: + * 1. Take fi->i_mmap_sem to block dax faults. + * 2. Take fi->dax->sem to protect interval tree and also to make sure + * read/write can not reuse a dmap which we might be freeing. + */ +static int lookup_and_reclaim_dmap(struct fuse_conn_dax *fcd, + struct inode *inode, + unsigned long start_idx, + unsigned long end_idx) +{ + int ret; + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t dmap_start = start_idx << FUSE_DAX_SHIFT; + loff_t dmap_end = (dmap_start + FUSE_DAX_SZ) - 1; + + down_write(&fi->i_mmap_sem); + ret = fuse_dax_break_layouts(inode, dmap_start, dmap_end); + if (ret) { + pr_debug("virtio_fs: fuse_dax_break_layouts() failed. 
err=%d\n", + ret); + goto out_mmap_sem; + } + + down_write(&fi->dax->sem); + ret = lookup_and_reclaim_dmap_locked(fcd, inode, start_idx); + up_write(&fi->dax->sem); +out_mmap_sem: + up_write(&fi->i_mmap_sem); + return ret; +} + +static int try_to_free_dmap_chunks(struct fuse_conn_dax *fcd, + unsigned long nr_to_free) +{ + struct fuse_dax_mapping *dmap, *pos, *temp; + int ret, nr_freed = 0; + unsigned long start_idx = 0, end_idx = 0; + struct inode *inode = NULL; + + /* Pick first busy range and free it for now*/ + while (1) { + if (nr_freed >= nr_to_free) + break; + + dmap = NULL; + spin_lock(&fcd->lock); + + if (!fcd->nr_busy_ranges) { + spin_unlock(&fcd->lock); + return 0; + } + + list_for_each_entry_safe(pos, temp, &fcd->busy_ranges, + busy_list) { + /* skip this range if it's in use. */ + if (refcount_read(&pos->refcnt) > 1) + continue; + + inode = igrab(pos->inode); + /* + * This inode is going away. That will free + * up all the ranges anyway, continue to + * next range. + */ + if (!inode) + continue; + /* + * Take this element off list and add it tail. If + * this element can't be freed, it will help with + * selecting new element in next iteration of loop. 
+ */ + dmap = pos; + list_move_tail(&dmap->busy_list, &fcd->busy_ranges); + start_idx = end_idx = dmap->itn.start; + break; + } + spin_unlock(&fcd->lock); + if (!dmap) + return 0; + + ret = lookup_and_reclaim_dmap(fcd, inode, start_idx, end_idx); + iput(inode); + if (ret) + return ret; + nr_freed++; + } + return 0; +} + +static void fuse_dax_free_mem_worker(struct work_struct *work) +{ + int ret; + struct fuse_conn_dax *fcd = container_of(work, struct fuse_conn_dax, + free_work.work); + ret = try_to_free_dmap_chunks(fcd, FUSE_DAX_RECLAIM_CHUNK); + if (ret) { + pr_debug("fuse: try_to_free_dmap_chunks() failed with err=%d\n", + ret); + } + + /* If the number of free ranges is still below the threshold, requeue */ + kick_dmap_free_worker(fcd, 1); +} + +static void fuse_free_dax_mem_ranges(struct list_head *mem_list) +{ + struct fuse_dax_mapping *range, *temp; + + /* Free All allocated elements */ + list_for_each_entry_safe(range, temp, mem_list, list) { + list_del(&range->list); + if (!list_empty(&range->busy_list)) + list_del(&range->busy_list); + kfree(range); + } +} + +void fuse_dax_conn_free(struct fuse_conn *fc) +{ + if (fc->dax) { + fuse_free_dax_mem_ranges(&fc->dax->free_ranges); + kfree(fc->dax); + } +} + +static int fuse_dax_mem_range_init(struct fuse_conn_dax *fcd) +{ + long nr_pages, nr_ranges; + void *kaddr; + pfn_t pfn; + struct fuse_dax_mapping *range; + int ret, id; + size_t dax_size = -1; + unsigned long i; + + init_waitqueue_head(&fcd->range_waitq); + INIT_LIST_HEAD(&fcd->free_ranges); + INIT_LIST_HEAD(&fcd->busy_ranges); + INIT_DELAYED_WORK(&fcd->free_work, fuse_dax_free_mem_worker); + + id = dax_read_lock(); + nr_pages = dax_direct_access(fcd->dev, 0, PHYS_PFN(dax_size), &kaddr, + &pfn); + dax_read_unlock(id); + if (nr_pages < 0) { + pr_debug("dax_direct_access() returned %ld\n", nr_pages); + return nr_pages; + } + + nr_ranges = nr_pages/FUSE_DAX_PAGES; + pr_debug("%s: dax mapped %ld pages.
nr_ranges=%ld\n", + __func__, nr_pages, nr_ranges); + + for (i = 0; i < nr_ranges; i++) { + range = kzalloc(sizeof(struct fuse_dax_mapping), GFP_KERNEL); + ret = -ENOMEM; + if (!range) + goto out_err; + + /* TODO: This offset only works if virtio-fs driver is not + * having some memory hidden at the beginning. This needs + * better handling + */ + range->window_offset = i * FUSE_DAX_SZ; + range->length = FUSE_DAX_SZ; + INIT_LIST_HEAD(&range->busy_list); + refcount_set(&range->refcnt, 1); + list_add_tail(&range->list, &fcd->free_ranges); + } + + fcd->nr_free_ranges = nr_ranges; + fcd->nr_ranges = nr_ranges; + return 0; +out_err: + /* Free All allocated elements */ + fuse_free_dax_mem_ranges(&fcd->free_ranges); + return ret; +} + +int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev) +{ + struct fuse_conn_dax *fcd; + int err; + + if (!dax_dev) + return 0; + + fcd = kzalloc(sizeof(*fcd), GFP_KERNEL); + if (!fcd) + return -ENOMEM; + + spin_lock_init(&fcd->lock); + fcd->dev = dax_dev; + err = fuse_dax_mem_range_init(fcd); + if (err) { + kfree(fcd); + return err; + } + + fc->dax = fcd; + return 0; +} + +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + + fi->dax = NULL; + if (fc->dax) { + fi->dax = kzalloc(sizeof(*fi->dax), GFP_KERNEL_ACCOUNT); + if (!fi->dax) + return false; + + init_rwsem(&fi->dax->sem); + fi->dax->tree = RB_ROOT_CACHED; + } + + return true; +} + +static const struct address_space_operations fuse_dax_file_aops = { + .writepages = fuse_dax_writepages, + .direct_IO = noop_direct_IO, + .set_page_dirty = noop_set_page_dirty, + .invalidatepage = noop_invalidatepage, +}; + +void fuse_dax_inode_init(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->dax) + return; + + inode->i_flags |= S_DAX; + inode->i_data.a_ops = &fuse_dax_file_aops; +} + +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment) +{ + if 
(fc->dax && (map_alignment > FUSE_DAX_SHIFT)) { + pr_warn("FUSE: map_alignment %u incompatible with dax mem range size %u\n", + map_alignment, FUSE_DAX_SZ); + return false; + } + return true; +} + +void fuse_dax_cancel_work(struct fuse_conn *fc) +{ + struct fuse_conn_dax *fcd = fc->dax; + + if (fcd) + cancel_delayed_work_sync(&fcd->free_work); + +} +EXPORT_SYMBOL_GPL(fuse_dax_cancel_work); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 02b3c36b3676..588f8d1240aa 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -40,20 +40,21 @@ static struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } -static void fuse_request_init(struct fuse_req *req) +static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); refcount_set(&req->count, 1); __set_bit(FR_PENDING, &req->flags); + req->fm = fm; } -static struct fuse_req *fuse_request_alloc(gfp_t flags) +static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); if (req) - fuse_request_init(req); + fuse_request_init(fm, req); return req; } @@ -100,10 +101,11 @@ static void fuse_drop_waiting(struct fuse_conn *fc) } } -static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); +static void fuse_put_request(struct fuse_req *req); -static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) +static struct fuse_req *fuse_get_req(struct fuse_mount *fm, bool for_background) { + struct fuse_conn *fc = fm->fc; struct fuse_req *req; int err; atomic_inc(&fc->num_waiting); @@ -125,7 +127,7 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) if (fc->conn_error) goto out; - req = fuse_request_alloc(GFP_KERNEL); + req = fuse_request_alloc(fm, GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -143,7 +145,7 @@ static struct fuse_req 
*fuse_get_req(struct fuse_conn *fc, bool for_background) if (unlikely(req->in.h.uid == ((uid_t)-1) || req->in.h.gid == ((gid_t)-1))) { - fuse_put_request(fc, req); + fuse_put_request(req); return ERR_PTR(-EOVERFLOW); } return req; @@ -153,8 +155,10 @@ static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) return ERR_PTR(err); } -static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_put_request(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; + if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { /* @@ -273,8 +277,10 @@ static void flush_bg_queue(struct fuse_conn *fc) * the 'end' callback is called if given, else the reference to the * request is released */ -void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_end(struct fuse_req *req) { + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; struct fuse_iqueue *fiq = &fc->iq; if (test_and_set_bit(FR_FINISHED, &req->flags)) @@ -309,9 +315,9 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fc->blocked_waitq); } - if (fc->num_background == fc->congestion_threshold && fc->sb) { - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + if (fc->num_background == fc->congestion_threshold && fm->sb) { + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fm->sb->s_bdi, BLK_RW_ASYNC); } fc->num_background--; fc->active_background--; @@ -323,14 +329,16 @@ void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) } if (test_bit(FR_ASYNC, &req->flags)) - req->args->end(fc, req->args, req->out.h.error); + req->args->end(fm, req->args, req->out.h.error); put_request: - fuse_put_request(fc, req); + fuse_put_request(req); } EXPORT_SYMBOL_GPL(fuse_request_end); -static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) +static int queue_interrupt(struct fuse_req 
*req) { + struct fuse_iqueue *fiq = &req->fm->fc->iq; + spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { @@ -357,8 +365,9 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) return 0; } -static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) +static void request_wait_answer(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; struct fuse_iqueue *fiq = &fc->iq; int err; @@ -373,7 +382,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) /* matches barrier in fuse_dev_do_read() */ smp_mb__after_atomic(); if (test_bit(FR_SENT, &req->flags)) - queue_interrupt(fiq, req); + queue_interrupt(req); } if (!test_bit(FR_FORCE, &req->flags)) { @@ -402,9 +411,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } -static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) +static void __fuse_request_send(struct fuse_req *req) { - struct fuse_iqueue *fiq = &fc->iq; + struct fuse_iqueue *fiq = &req->fm->fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); spin_lock(&fiq->lock); @@ -418,7 +427,7 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) __fuse_get_request(req); queue_request_and_unlock(fiq, req); - request_wait_answer(fc, req); + request_wait_answer(req); /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); } @@ -457,8 +466,10 @@ static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) } } -static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_force_creds(struct fuse_req *req) { + struct fuse_conn *fc = req->fm->fc; + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); @@ -473,23 
+484,24 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) __set_bit(FR_ASYNC, &req->flags); } -ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args) { + struct fuse_conn *fc = fm->fc; struct fuse_req *req; ssize_t ret; if (args->force) { atomic_inc(&fc->num_waiting); - req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL); + req = fuse_request_alloc(fm, GFP_KERNEL | __GFP_NOFAIL); if (!args->nocreds) - fuse_force_creds(fc, req); + fuse_force_creds(req); __set_bit(FR_WAITING, &req->flags); __set_bit(FR_FORCE, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, false); + req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); } @@ -500,20 +512,21 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) if (!args->noreply) __set_bit(FR_ISREPLY, &req->flags); - __fuse_request_send(fc, req); + __fuse_request_send(req); ret = req->out.h.error; if (!ret && args->out_argvar) { BUG_ON(args->out_numargs == 0); ret = args->out_args[args->out_numargs - 1].size; } - fuse_put_request(fc, req); + fuse_put_request(req); return ret; } -static bool fuse_request_queue_background(struct fuse_conn *fc, - struct fuse_req *req) +static bool fuse_request_queue_background(struct fuse_req *req) { + struct fuse_mount *fm = req->fm; + struct fuse_conn *fc = fm->fc; bool queued = false; WARN_ON(!test_bit(FR_BACKGROUND, &req->flags)); @@ -527,9 +540,9 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && fc->sb) { - set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); - set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); + if (fc->num_background == fc->congestion_threshold && fm->sb) { + set_bdi_congested(fm->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fm->sb->s_bdi, 
BLK_RW_ASYNC); } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); @@ -540,28 +553,28 @@ static bool fuse_request_queue_background(struct fuse_conn *fc, return queued; } -int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags) { struct fuse_req *req; if (args->force) { WARN_ON(!args->nocreds); - req = fuse_request_alloc(gfp_flags); + req = fuse_request_alloc(fm, gfp_flags); if (!req) return -ENOMEM; __set_bit(FR_BACKGROUND, &req->flags); } else { WARN_ON(args->nocreds); - req = fuse_get_req(fc, true); + req = fuse_get_req(fm, true); if (IS_ERR(req)) return PTR_ERR(req); } fuse_args_to_req(req, args); - if (!fuse_request_queue_background(fc, req)) { - fuse_put_request(fc, req); + if (!fuse_request_queue_background(req)) { + fuse_put_request(req); return -ENOTCONN; } @@ -569,14 +582,14 @@ int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, } EXPORT_SYMBOL_GPL(fuse_simple_background); -static int fuse_simple_notify_reply(struct fuse_conn *fc, +static int fuse_simple_notify_reply(struct fuse_mount *fm, struct fuse_args *args, u64 unique) { struct fuse_req *req; - struct fuse_iqueue *fiq = &fc->iq; + struct fuse_iqueue *fiq = &fm->fc->iq; int err = 0; - req = fuse_get_req(fc, false); + req = fuse_get_req(fm, false); if (IS_ERR(req)) return PTR_ERR(req); @@ -591,7 +604,7 @@ static int fuse_simple_notify_reply(struct fuse_conn *fc, } else { err = -ENODEV; spin_unlock(&fiq->lock); - fuse_put_request(fc, req); + fuse_put_request(req); } return err; @@ -785,15 +798,16 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; + get_page(oldpage); err = unlock_request(cs->req); if (err) - return err; + goto out_put_old; fuse_copy_finish(cs); err = pipe_buf_confirm(cs->pipe, buf); if (err) - return err; + goto out_put_old; BUG_ON(!cs->nr_segs); 
cs->currbuf = buf; @@ -833,7 +847,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); if (err) { unlock_page(newpage); - return err; + goto out_put_old; } get_page(newpage); @@ -852,14 +866,19 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (err) { unlock_page(newpage); put_page(newpage); - return err; + goto out_put_old; } unlock_page(oldpage); + /* Drop ref for ap->pages[] array */ put_page(oldpage); cs->len = 0; - return 0; + err = 0; +out_put_old: + /* Drop ref obtained in this function */ + put_page(oldpage); + return err; out_fallback_unlock: unlock_page(newpage); @@ -868,10 +887,10 @@ out_fallback: cs->offset = buf->offset; err = lock_request(cs->req); - if (err) - return err; + if (!err) + err = 1; - return 1; + goto out_put_old; } static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, @@ -883,14 +902,16 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; + get_page(page); err = unlock_request(cs->req); - if (err) + if (err) { + put_page(page); return err; + } fuse_copy_finish(cs); buf = cs->pipebufs; - get_page(page); buf->page = page; buf->offset = offset; buf->len = count; @@ -1250,7 +1271,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* SETXATTR is special, since it may contain too large data */ if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; - fuse_request_end(fc, req); + fuse_request_end(req); goto restart; } spin_lock(&fpq->lock); @@ -1284,8 +1305,8 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, /* matches barrier in request_wait_answer() */ smp_mb__after_atomic(); if (test_bit(FR_INTERRUPTED, &req->flags)) - queue_interrupt(fiq, req); - fuse_put_request(fc, req); + queue_interrupt(req); + fuse_put_request(req); return reqsize; @@ -1293,7 +1314,7 @@ out_end: 
if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); return err; err_unlock: @@ -1416,11 +1437,8 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, fuse_copy_finish(cs); down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) { - err = fuse_reverse_inval_inode(fc->sb, outarg.ino, - outarg.off, outarg.len); - } + err = fuse_reverse_inval_inode(fc, outarg.ino, + outarg.off, outarg.len); up_read(&fc->killsb); return err; @@ -1466,9 +1484,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1516,10 +1532,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, buf[outarg.namelen] = 0; down_read(&fc->killsb); - err = -ENOENT; - if (fc->sb) - err = fuse_reverse_inval_entry(fc->sb, outarg.parent, - outarg.child, &name); + err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); up_read(&fc->killsb); kfree(buf); return err; @@ -1561,10 +1574,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (!fc->sb) - goto out_up_killsb; - - inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) goto out_up_killsb; @@ -1621,7 +1631,7 @@ struct fuse_retrieve_args { struct fuse_notify_retrieve_in inarg; }; -static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_retrieve_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_retrieve_args *ra = @@ -1631,7 +1641,7 @@ static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, kfree(ra); } -static int 
fuse_retrieve(struct fuse_conn *fc, struct inode *inode, +static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, struct fuse_notify_retrieve_out *outarg) { int err; @@ -1642,6 +1652,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, unsigned int offset; size_t total_len = 0; unsigned int num_pages; + struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); struct fuse_args_pages *ap; @@ -1703,9 +1714,9 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, args->in_args[0].value = &ra->inarg; args->in_args[1].size = total_len; - err = fuse_simple_notify_reply(fc, args, outarg->notify_unique); + err = fuse_simple_notify_reply(fm, args, outarg->notify_unique); if (err) - fuse_retrieve_end(fc, args, err); + fuse_retrieve_end(fm, args, err); return err; } @@ -1714,7 +1725,9 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_retrieve_out outarg; + struct fuse_mount *fm; struct inode *inode; + u64 nodeid; int err; err = -EINVAL; @@ -1729,14 +1742,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = -ENOENT; - if (fc->sb) { - u64 nodeid = outarg.nodeid; + nodeid = outarg.nodeid; - inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); - if (inode) { - err = fuse_retrieve(fc, inode, &outarg); - iput(inode); - } + inode = fuse_ilookup(fc, nodeid, &fm); + if (inode) { + err = fuse_retrieve(fm, inode, &outarg); + iput(inode); } up_read(&fc->killsb); @@ -1875,9 +1886,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, else if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - err = queue_interrupt(&fc->iq, req); + err = queue_interrupt(req); - fuse_put_request(fc, req); + fuse_put_request(req); goto copy_finish; } @@ -1907,7 +1918,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, list_del_init(&req->list); 
spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); out: return err ? err : nbytes; @@ -2045,7 +2056,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) } /* Abort all requests on the given list (pending or processing) */ -static void end_requests(struct fuse_conn *fc, struct list_head *head) +static void end_requests(struct list_head *head) { while (!list_empty(head)) { struct fuse_req *req; @@ -2053,7 +2064,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req->out.h.error = -ECONNABORTED; clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - fuse_request_end(fc, req); + fuse_request_end(req); } } @@ -2148,7 +2159,7 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->blocked_waitq); spin_unlock(&fc->lock); - end_requests(fc, &to_end); + end_requests(&to_end); } else { spin_unlock(&fc->lock); } @@ -2178,7 +2189,7 @@ int fuse_dev_release(struct inode *inode, struct file *file) list_splice_init(&fpq->processing[i], &to_end); spin_unlock(&fpq->lock); - end_requests(fc, &to_end); + end_requests(&to_end); /* Are we the last open device? 
*/ if (atomic_dec_and_test(&fc->dev_count)) { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 26f028bc760b..ff7dbeb16f88 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/file.h> +#include <linux/fs_context.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> @@ -196,7 +197,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; struct dentry *parent; - struct fuse_conn *fc; + struct fuse_mount *fm; struct fuse_inode *fi; int ret; @@ -218,27 +219,29 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) if (flags & LOOKUP_RCU) goto out; - fc = get_fuse_conn(inode); + fm = get_fuse_mount(inode); forget = fuse_alloc_forget(); ret = -ENOMEM; if (!forget) goto out; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); parent = dget_parent(entry); - fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)), + fuse_lookup_init(fm->fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); dput(parent); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; if (!ret) { fi = get_fuse_inode(inode); - if (outarg.nodeid != get_node_id(inode)) { - fuse_queue_forget(fc, forget, outarg.nodeid, 1); + if (outarg.nodeid != get_node_id(inode) || + (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) { + fuse_queue_forget(fm->fc, forget, + outarg.nodeid, 1); goto invalid; } spin_lock(&fi->lock); @@ -298,6 +301,79 @@ static int fuse_dentry_delete(const struct dentry *dentry) return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); } +/* + * Create a fuse_mount object with a new superblock (with path->dentry + * as the root), and return that mount so it can be auto-mounted on + * @path. 
+ */ +static struct vfsmount *fuse_dentry_automount(struct path *path) +{ + struct fs_context *fsc; + struct fuse_mount *parent_fm = get_fuse_mount_super(path->mnt->mnt_sb); + struct fuse_conn *fc = parent_fm->fc; + struct fuse_mount *fm; + struct vfsmount *mnt; + struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); + struct super_block *sb; + int err; + + fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); + if (IS_ERR(fsc)) { + err = PTR_ERR(fsc); + goto out; + } + + err = -ENOMEM; + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) + goto out_put_fsc; + + refcount_set(&fm->count, 1); + fsc->s_fs_info = fm; + sb = sget_fc(fsc, NULL, set_anon_super_fc); + if (IS_ERR(sb)) { + err = PTR_ERR(sb); + fuse_mount_put(fm); + goto out_put_fsc; + } + fm->fc = fuse_conn_get(fc); + + /* Initialize superblock, making @mp_fi its root */ + err = fuse_fill_super_submount(sb, mp_fi); + if (err) + goto out_put_sb; + + sb->s_flags |= SB_ACTIVE; + fsc->root = dget(sb->s_root); + /* We are done configuring the superblock, so unlock it */ + up_write(&sb->s_umount); + + down_write(&fc->killsb); + list_add_tail(&fm->fc_entry, &fc->mounts); + up_write(&fc->killsb); + + /* Create the submount */ + mnt = vfs_create_mount(fsc); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); + goto out_put_fsc; + } + mntget(mnt); + put_fs_context(fsc); + return mnt; + +out_put_sb: + /* + * Only jump here when fsc->root is NULL and sb is still locked + * (otherwise put_fs_context() will put the superblock) + */ + deactivate_locked_super(sb); +out_put_fsc: + put_fs_context(fsc); +out: + return ERR_PTR(err); +} + const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, @@ -305,6 +381,7 @@ const struct dentry_operations fuse_dentry_operations = { .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif + .d_automount = fuse_dentry_automount, }; const struct dentry_operations 
fuse_root_dentry_operations = { @@ -329,7 +406,7 @@ bool fuse_invalid_attr(struct fuse_attr *attr) int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; @@ -346,10 +423,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name if (!forget) goto out; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); - fuse_lookup_init(fc, &args, nodeid, name, outarg); - err = fuse_simple_request(fc, &args); + fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); + err = fuse_simple_request(fm, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; @@ -365,7 +442,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name attr_version); err = -ENOMEM; if (!*inode) { - fuse_queue_forget(fc, forget, outarg->nodeid, 1); + fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); goto out; } err = 0; @@ -434,7 +511,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, { int err; struct inode *inode; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; @@ -452,11 +529,11 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, goto out_err; err = -ENOMEM; - ff = fuse_file_alloc(fc); + ff = fuse_file_alloc(fm); if (!ff) goto out_put_forget_req; - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); flags &= ~O_NOCTTY; @@ -477,7 +554,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, args.out_args[0].value = &outentry; args.out_args[1].size = sizeof(outopen); args.out_args[1].value = &outopen; - 
err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) goto out_free_ff; @@ -494,7 +571,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); - fuse_queue_forget(fc, forget, outentry.nodeid, 1); + fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; } @@ -567,7 +644,7 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, +static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { @@ -586,7 +663,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, args->out_numargs = 1; args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; - err = fuse_simple_request(fc, args); + err = fuse_simple_request(fm, args); if (err) goto out_put_forget_req; @@ -600,7 +677,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr, entry_attr_timeout(&outarg), 0); if (!inode) { - fuse_queue_forget(fc, forget, outarg.nodeid, 1); + fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return -ENOMEM; } kfree(forget); @@ -628,10 +705,10 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); @@ -644,7 +721,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fc, &args, 
dir, entry, mode); + return create_new_entry(fm, &args, dir, entry, mode); } static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, @@ -656,10 +733,10 @@ static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); - if (!fc->dont_mask) + if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); @@ -671,13 +748,13 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(fc, &args, dir, entry, S_IFDIR); + return create_new_entry(fm, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct inode *dir, struct dentry *entry, const char *link) { - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); unsigned len = strlen(link) + 1; FUSE_ARGS(args); @@ -687,7 +764,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, args.in_args[0].value = entry->d_name.name; args.in_args[1].size = len; args.in_args[1].value = link; - return create_new_entry(fc, &args, dir, entry, S_IFLNK); + return create_new_entry(fm, &args, dir, entry, S_IFLNK); } void fuse_update_ctime(struct inode *inode) @@ -701,7 +778,7 @@ void fuse_update_ctime(struct inode *inode) static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); args.opcode = FUSE_UNLINK; @@ -709,13 +786,13 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = 
fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); + fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); /* * If i_nlink == 0 then unlink doesn't make sense, yet this can * happen if userspace filesystem is careless. It would be @@ -737,7 +814,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; - struct fuse_conn *fc = get_fuse_conn(dir); + struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); args.opcode = FUSE_RMDIR; @@ -745,7 +822,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in_numargs = 1; args.in_args[0].size = entry->d_name.len + 1; args.in_args[0].value = entry->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { clear_nlink(d_inode(entry)); fuse_dir_changed(dir); @@ -761,7 +838,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, { int err; struct fuse_rename2_in inarg; - struct fuse_conn *fc = get_fuse_conn(olddir); + struct fuse_mount *fm = get_fuse_mount(olddir); FUSE_ARGS(args); memset(&inarg, 0, argsize); @@ -776,7 +853,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { /* ctime changes */ fuse_invalidate_attr(d_inode(oldent)); @@ -847,7 +924,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, int err; struct fuse_link_in inarg; struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); 
FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); @@ -858,7 +935,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); + err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" inode. We invalidate the attributes of the old one, so it @@ -869,7 +946,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fc->attr_version); + fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); if (likely(inode->i_nlink < UINT_MAX)) inc_nlink(inode); spin_unlock(&fi->lock); @@ -926,11 +1003,11 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, int err; struct fuse_getattr_in inarg; struct fuse_attr_out outarg; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); u64 attr_version; - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); @@ -949,7 +1026,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || (inode->i_mode ^ outarg.attr.mode) & S_IFMT) { @@ -1002,7 +1079,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file) STATX_BASIC_STATS & ~STATX_ATIME, 0); } -int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, +int fuse_reverse_inval_entry(struct fuse_conn *fc, 
u64 parent_nodeid, u64 child_nodeid, struct qstr *name) { int err = -ENOTDIR; @@ -1010,7 +1087,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, struct dentry *dir; struct dentry *entry; - parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid); + parent = fuse_ilookup(fc, parent_nodeid, NULL); if (!parent) return -ENOENT; @@ -1102,14 +1179,14 @@ int fuse_allow_current_process(struct fuse_conn *fc) static int fuse_access(struct inode *inode, int mask) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_access_in inarg; int err; BUG_ON(mask & MAY_NOT_BLOCK); - if (fc->no_access) + if (fm->fc->no_access) return 0; memset(&inarg, 0, sizeof(inarg)); @@ -1119,9 +1196,9 @@ static int fuse_access(struct inode *inode, int mask) args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_access = 1; + fm->fc->no_access = 1; err = 0; } return err; @@ -1209,7 +1286,7 @@ static int fuse_permission(struct inode *inode, int mask) static int fuse_readlink_page(struct inode *inode, struct page *page) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { .num_pages = 1, @@ -1226,7 +1303,7 @@ static int fuse_readlink_page(struct inode *inode, struct page *page) ap.args.page_zeroing = true; ap.args.out_numargs = 1; ap.args.out_args[0].size = desc.length; - res = fuse_simple_request(fc, &ap.args); + res = fuse_simple_request(fm, &ap.args); fuse_invalidate_atime(inode); @@ -1454,7 +1531,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, */ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = 
get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; @@ -1465,7 +1542,7 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; - if (fc->minor >= 23) { + if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; inarg.ctime = inode->i_ctime.tv_sec; inarg.ctimensec = inode->i_ctime.tv_nsec; @@ -1474,9 +1551,9 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } - fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); + fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg); - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } /* @@ -1491,7 +1568,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; @@ -1501,6 +1579,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, loff_t oldsize; int err; bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); + bool fault_blocked = false; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -1509,6 +1588,22 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (err) return err; + if (attr->ia_valid & ATTR_SIZE) { + if (WARN_ON(!S_ISREG(inode->i_mode))) + return -EIO; + is_truncate = true; + } + + if (FUSE_IS_DAX(inode) && is_truncate) { + down_write(&fi->i_mmap_sem); + fault_blocked = true; + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) { + up_write(&fi->i_mmap_sem); + return err; + } + } + if (attr->ia_valid & ATTR_OPEN) { /* This is coming from open(..., ... 
| O_TRUNC); */ WARN_ON(!(attr->ia_valid & ATTR_SIZE)); @@ -1521,17 +1616,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, */ i_size_write(inode, 0); truncate_pagecache(inode, 0); - return 0; + goto out; } file = NULL; } - if (attr->ia_valid & ATTR_SIZE) { - if (WARN_ON(!S_ISREG(inode->i_mode))) - return -EIO; - is_truncate = true; - } - /* Flush dirty data/metadata before non-truncate SETATTR */ if (is_wb && S_ISREG(inode->i_mode) && attr->ia_valid & @@ -1566,7 +1655,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, inarg.lock_owner = fuse_lock_owner_id(fc, current->files); } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); @@ -1614,6 +1703,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); +out: + if (fault_blocked) + up_write(&fi->i_mmap_sem); + return 0; error: @@ -1621,6 +1714,9 @@ error: fuse_release_nowrite(inode); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + + if (fault_blocked) + up_write(&fi->i_mmap_sem); return err; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 43c165e796da..c03034e8c152 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -32,7 +32,7 @@ static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, return pages; } -static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +static int fuse_send_open(struct fuse_mount *fm, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { struct fuse_open_in inarg; @@ -40,7 +40,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); - if (!fc->atomic_o_trunc) + if (!fm->fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; args.opcode = opcode; args.nodeid = nodeid; @@ -51,7 +51,7 @@ 
static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, args.out_args[0].size = sizeof(*outargp); args.out_args[0].value = outargp; - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } struct fuse_release_args { @@ -60,7 +60,7 @@ struct fuse_release_args { struct inode *inode; }; -struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm) { struct fuse_file *ff; @@ -68,7 +68,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) if (unlikely(!ff)) return NULL; - ff->fc = fc; + ff->fm = fm; ff->release_args = kzalloc(sizeof(*ff->release_args), GFP_KERNEL_ACCOUNT); if (!ff->release_args) { @@ -82,7 +82,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); - ff->kh = atomic64_inc_return(&fc->khctr); + ff->kh = atomic64_inc_return(&fm->fc->khctr); return ff; } @@ -100,7 +100,7 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_release_args *ra = container_of(args, typeof(*ra), args); @@ -114,29 +114,30 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) if (refcount_dec_and_test(&ff->count)) { struct fuse_args *args = &ff->release_args->args; - if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { + if (isdir ? 
ff->fm->fc->no_opendir : ff->fm->fc->no_open) { /* Do nothing when client does not implement 'open' */ - fuse_release_end(ff->fc, args, 0); + fuse_release_end(ff->fm, args, 0); } else if (sync) { - fuse_simple_request(ff->fc, args); - fuse_release_end(ff->fc, args, 0); + fuse_simple_request(ff->fm, args); + fuse_release_end(ff->fm, args, 0); } else { args->end = fuse_release_end; - if (fuse_simple_background(ff->fc, args, + if (fuse_simple_background(ff->fm, args, GFP_KERNEL | __GFP_NOFAIL)) - fuse_release_end(ff->fc, args, -ENOTCONN); + fuse_release_end(ff->fm, args, -ENOTCONN); } kfree(ff); } } -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir) { + struct fuse_conn *fc = fm->fc; struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; - ff = fuse_file_alloc(fc); + ff = fuse_file_alloc(fm); if (!ff) return -ENOMEM; @@ -147,7 +148,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, struct fuse_open_out outarg; int err; - err = fuse_send_open(fc, nodeid, file, opcode, &outarg); + err = fuse_send_open(fm, nodeid, file, opcode, &outarg); if (!err) { ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; @@ -216,27 +217,40 @@ void fuse_finish_open(struct inode *inode, struct file *file) int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; int err; bool is_wb_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc && fc->writeback_cache; + bool dax_truncate = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && FUSE_IS_DAX(inode); err = generic_file_open(inode, file); if (err) return err; - if (is_wb_truncate) { + if (is_wb_truncate || dax_truncate) { inode_lock(inode); fuse_set_nowrite(inode); } - err = fuse_do_open(fc, get_node_id(inode), file, isdir); + if (dax_truncate) { + 
down_write(&get_fuse_inode(inode)->i_mmap_sem); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } + err = fuse_do_open(fm, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); - if (is_wb_truncate) { +out: + if (dax_truncate) + up_write(&get_fuse_inode(inode)->i_mmap_sem); + + if (is_wb_truncate | dax_truncate) { fuse_release_nowrite(inode); inode_unlock(inode); } @@ -247,7 +261,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, int flags, int opcode) { - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = ff->fm->fc; struct fuse_release_args *ra = ff->release_args; /* Inode is NULL on error path of fuse_create_open() */ @@ -285,7 +299,7 @@ void fuse_release_common(struct file *file, bool isdir) if (ff->flock) { ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc, + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, (fl_owner_t) file); } /* Hold inode until release is finished */ @@ -300,7 +314,7 @@ void fuse_release_common(struct file *file, bool isdir) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. 
*/ - fuse_file_put(ff, ff->fc->destroy, isdir); + fuse_file_put(ff, ff->fm->fc->destroy, isdir); } static int fuse_open(struct inode *inode, struct file *file) @@ -443,7 +457,7 @@ static void fuse_sync_writes(struct inode *inode) static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; struct fuse_flush_in inarg; FUSE_ARGS(args); @@ -465,12 +479,12 @@ static int fuse_flush(struct file *file, fl_owner_t id) return err; err = 0; - if (fc->no_flush) + if (fm->fc->no_flush) goto inval_attr_out; memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; - inarg.lock_owner = fuse_lock_owner_id(fc, id); + inarg.lock_owner = fuse_lock_owner_id(fm->fc, id); args.opcode = FUSE_FLUSH; args.nodeid = get_node_id(inode); args.in_numargs = 1; @@ -478,9 +492,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) args.in_args[0].value = &inarg; args.force = true; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_flush = 1; + fm->fc->no_flush = 1; err = 0; } @@ -489,7 +503,7 @@ inval_attr_out: * In memory i_blocks is not maintained by fuse, if writeback cache is * enabled, i_blocks from cached attr may not be accurate. 
*/ - if (!err && fc->writeback_cache) + if (!err && fm->fc->writeback_cache) fuse_invalidate_attr(inode); return err; } @@ -498,7 +512,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, int datasync, int opcode) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_fsync_in inarg; @@ -511,7 +525,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - return fuse_simple_request(fc, &args); + return fuse_simple_request(fm, &args); } static int fuse_fsync(struct file *file, loff_t start, loff_t end, @@ -686,7 +700,7 @@ static void fuse_io_free(struct fuse_io_args *ia) kfree(ia); } -static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_aio_complete_req(struct fuse_mount *fm, struct fuse_args *args, int err) { struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); @@ -715,7 +729,7 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, fuse_io_free(ia); } -static ssize_t fuse_async_req_send(struct fuse_conn *fc, +static ssize_t fuse_async_req_send(struct fuse_mount *fm, struct fuse_io_args *ia, size_t num_bytes) { ssize_t err; @@ -729,9 +743,9 @@ static ssize_t fuse_async_req_send(struct fuse_conn *fc, ia->ap.args.end = fuse_aio_complete_req; ia->ap.args.may_block = io->should_dirty; - err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); + err = fuse_simple_background(fm, &ia->ap.args, GFP_KERNEL); if (err) - fuse_aio_complete_req(fc, &ia->ap.args, err); + fuse_aio_complete_req(fm, &ia->ap.args, err); return num_bytes; } @@ -741,18 +755,18 @@ static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, { struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = 
file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; - ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner); + ia->read.in.lock_owner = fuse_lock_owner_id(fm->fc, owner); } if (ia->io->async) - return fuse_async_req_send(fc, ia, count); + return fuse_async_req_send(fm, ia, count); - return fuse_simple_request(fc, &ia->ap.args); + return fuse_simple_request(fm, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -798,7 +812,7 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, static int fuse_do_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); loff_t pos = page_offset(page); struct fuse_page_desc desc = { .length = PAGE_SIZE }; struct fuse_io_args ia = { @@ -818,14 +832,14 @@ static int fuse_do_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - attr_ver = fuse_get_attr_version(fc); + attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ if (pos + (desc.length - 1) == LLONG_MAX) desc.length--; fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); - res = fuse_simple_request(fc, &ia.ap.args); + res = fuse_simple_request(fm, &ia.ap.args); if (res < 0) return res; /* @@ -855,7 +869,7 @@ static int fuse_readpage(struct file *file, struct page *page) return err; } -static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, int err) { int i; @@ -899,7 +913,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) { struct fuse_file *ff = 
file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = page_offset(ap->pages[0]); size_t count = ap->num_pages << PAGE_SHIFT; @@ -918,18 +932,18 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) WARN_ON((loff_t) (pos + count) < 0); fuse_read_args_fill(ia, file, pos, count, FUSE_READ); - ia->read.attr_ver = fuse_get_attr_version(fc); - if (fc->async_read) { + ia->read.attr_ver = fuse_get_attr_version(fm->fc); + if (fm->fc->async_read) { ia->ff = fuse_file_get(ff); ap->args.end = fuse_readpages_end; - err = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + err = fuse_simple_background(fm, &ap->args, GFP_KERNEL); if (!err) return; } else { - res = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fm, &ap->args); err = res < 0 ? res : 0; } - fuse_readpages_end(fc, &ap->args, err); + fuse_readpages_end(fm, &ap->args, err); } static void fuse_readahead(struct readahead_control *rac) @@ -1000,7 +1014,7 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, args->opcode = FUSE_WRITE; args->nodeid = ff->nodeid; args->in_numargs = 2; - if (ff->fc->minor < 9) + if (ff->fm->fc->minor < 9) args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else args->in_args[0].size = sizeof(ia->write.in); @@ -1029,7 +1043,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_write_in *inarg = &ia->write.in; ssize_t err; @@ -1037,13 +1051,13 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + inarg->lock_owner = fuse_lock_owner_id(fm->fc, owner); } if 
(ia->io->async) - return fuse_async_req_send(fc, ia, count); + return fuse_async_req_send(fm, ia, count); - err = fuse_simple_request(fc, &ia->ap.args); + err = fuse_simple_request(fm, &ia->ap.args); if (!err && ia->write.out.size > count) err = -EIO; @@ -1074,7 +1088,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; unsigned int offset, i; int err; @@ -1084,7 +1098,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); - err = fuse_simple_request(fc, &ap->args); + err = fuse_simple_request(fm, &ap->args); if (!err && ia->write.out.size > count) err = -EIO; @@ -1399,7 +1413,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, struct file *file = io->iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_conn *fc = ff->fm->fc; size_t nmax = write ? 
fc->max_write : fc->max_read; loff_t pos = *ppos; size_t count = iov_iter_count(iter); @@ -1539,10 +1553,14 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_read_iter(iocb, to); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_read_iter(iocb, to); else @@ -1553,10 +1571,14 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + struct inode *inode = file_inode(file); - if (is_bad_inode(file_inode(file))) + if (is_bad_inode(inode)) return -EIO; + if (FUSE_IS_DAX(inode)) + return fuse_dax_write_iter(iocb, from); + if (!(ff->open_flags & FOPEN_DIRECT_IO)) return fuse_cache_write_iter(iocb, from); else @@ -1578,7 +1600,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa) kfree(wpa); } -static void fuse_writepage_finish(struct fuse_conn *fc, +static void fuse_writepage_finish(struct fuse_mount *fm, struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; @@ -1596,7 +1618,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, } /* Called under fi->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, +static void fuse_send_writepage(struct fuse_mount *fm, struct fuse_writepage_args *wpa, loff_t size) __releases(fi->lock) __acquires(fi->lock) @@ -1622,10 +1644,10 @@ __acquires(fi->lock) args->force = true; args->nocreds = true; - err = fuse_simple_background(fc, args, GFP_ATOMIC); + err = fuse_simple_background(fm, args, GFP_ATOMIC); if (err == -ENOMEM) { spin_unlock(&fi->lock); - err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL); + err = fuse_simple_background(fm, args, GFP_NOFS | 
__GFP_NOFAIL); spin_lock(&fi->lock); } @@ -1638,7 +1660,7 @@ __acquires(fi->lock) out_free: fi->writectr--; rb_erase(&wpa->writepages_entry, &fi->writepages); - fuse_writepage_finish(fc, wpa); + fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); /* After fuse_writepage_finish() aux request list is private */ @@ -1662,7 +1684,7 @@ void fuse_flush_writepages(struct inode *inode) __releases(fi->lock) __acquires(fi->lock) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t crop = i_size_read(inode); struct fuse_writepage_args *wpa; @@ -1671,7 +1693,7 @@ __acquires(fi->lock) wpa = list_entry(fi->queued_writes.next, struct fuse_writepage_args, queue_entry); list_del_init(&wpa->queue_entry); - fuse_send_writepage(fc, wpa, crop); + fuse_send_writepage(fm, wpa, crop); } } @@ -1712,7 +1734,7 @@ static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) WARN_ON(fuse_insert_writeback(root, wpa)); } -static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, +static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { struct fuse_writepage_args *wpa = @@ -1724,7 +1746,7 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, spin_lock(&fi->lock); rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_write_in *inarg = &wpa->ia.write.in; struct fuse_writepage_args *next = wpa->next; @@ -1756,10 +1778,10 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, * no invocations of fuse_writepage_end() while we're in * fuse_set_nowrite..fuse_release_nowrite section. 
*/ - fuse_send_writepage(fc, next, inarg->offset + inarg->size); + fuse_send_writepage(fm, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fc, wpa); + fuse_writepage_finish(fm, wpa); spin_unlock(&fi->lock); fuse_writepage_free(wpa); } @@ -2317,6 +2339,10 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) { struct fuse_file *ff = file->private_data; + /* DAX mmap is superior to direct_io mmap */ + if (FUSE_IS_DAX(file_inode(file))) + return fuse_dax_mmap(file, vma); + if (ff->open_flags & FOPEN_DIRECT_IO) { /* Can't provide the coherency needed for MAP_SHARED */ if (vma->vm_flags & VM_MAYSHARE) @@ -2395,7 +2421,7 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, static int fuse_getlk(struct file *file, struct file_lock *fl) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; struct fuse_lk_out outarg; @@ -2405,9 +2431,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) - err = convert_fuse_file_lock(fc, &outarg.lk, fl); + err = convert_fuse_file_lock(fm->fc, &outarg.lk, fl); return err; } @@ -2415,12 +2441,12 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_lk_in inarg; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; struct pid *pid = fl->fl_type != F_UNLCK ? 
task_tgid(current) : NULL; - pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns); + pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns); int err; if (fl->fl_lmops && fl->fl_lmops->lm_grant) { @@ -2433,7 +2459,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return 0; fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); /* locking is restartable */ if (err == -EINTR) @@ -2487,13 +2513,13 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl) static sector_t fuse_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_bmap_in inarg; struct fuse_bmap_out outarg; int err; - if (!inode->i_sb->s_bdev || fc->no_bmap) + if (!inode->i_sb->s_bdev || fm->fc->no_bmap) return 0; memset(&inarg, 0, sizeof(inarg)); @@ -2507,9 +2533,9 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) - fc->no_bmap = 1; + fm->fc->no_bmap = 1; return err ? 
0 : outarg.block; } @@ -2517,7 +2543,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_file *ff = file->private_data; FUSE_ARGS(args); struct fuse_lseek_in inarg = { @@ -2528,7 +2554,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) struct fuse_lseek_out outarg; int err; - if (fc->no_lseek) + if (fm->fc->no_lseek) goto fallback; args.opcode = FUSE_LSEEK; @@ -2539,10 +2565,10 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err) { if (err == -ENOSYS) { - fc->no_lseek = 1; + fm->fc->no_lseek = 1; goto fallback; } return err; @@ -2728,7 +2754,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned int flags) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_ioctl_in inarg = { .fh = ff->fh, .cmd = cmd, @@ -2761,12 +2787,12 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs); + ap.pages = fuse_pages_alloc(fm->fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); if (!ap.pages || !iov_page) goto out; - fuse_page_descs_length_init(ap.descs, 0, fc->max_pages); + fuse_page_descs_length_init(ap.descs, 0, fm->fc->max_pages); /* * If restricted, initialize IO parameters as encoded in @cmd. 
@@ -2811,7 +2837,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, /* make sure there are enough buffer pages and init request with them */ err = -ENOMEM; - if (max_pages > fc->max_pages) + if (max_pages > fm->fc->max_pages) goto out; while (ap.num_pages < max_pages) { ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); @@ -2848,7 +2874,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, ap.args.out_pages = true; ap.args.out_argvar = true; - transferred = fuse_simple_request(fc, &ap.args); + transferred = fuse_simple_request(fm, &ap.args); err = transferred; if (transferred < 0) goto out; @@ -2876,7 +2902,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, goto out; vaddr = kmap_atomic(ap.pages[0]); - err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, + err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); kunmap_atomic(vaddr); @@ -2886,11 +2912,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iov = iov_page; out_iov = in_iov + in_iovs; - err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs); + err = fuse_verify_ioctl_iov(fm->fc, in_iov, in_iovs); if (err) goto out; - err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs); + err = fuse_verify_ioctl_iov(fm->fc, out_iov, out_iovs); if (err) goto out; @@ -3000,13 +3026,13 @@ static void fuse_register_polled_file(struct fuse_conn *fc, __poll_t fuse_file_poll(struct file *file, poll_table *wait) { struct fuse_file *ff = file->private_data; - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; struct fuse_poll_out outarg; FUSE_ARGS(args); int err; - if (fc->no_poll) + if (fm->fc->no_poll) return DEFAULT_POLLMASK; poll_wait(file, &ff->poll_wait, wait); @@ -3018,7 +3044,7 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) */ if 
(waitqueue_active(&ff->poll_wait)) { inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; - fuse_register_polled_file(fc, ff); + fuse_register_polled_file(fm->fc, ff); } args.opcode = FUSE_POLL; @@ -3029,12 +3055,12 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) return demangle_poll(outarg.revents); if (err == -ENOSYS) { - fc->no_poll = 1; + fm->fc->no_poll = 1; return DEFAULT_POLLMASK; } return EPOLLERR; @@ -3120,13 +3146,13 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = ff->fc->async_dio; + io->async = ff->fm->fc->async_dio; io->iocb = iocb; io->blocking = is_sync_kiocb(iocb); /* optimization for short read */ if (io->async && !io->write && offset + count > i_size) { - iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset)); + iov_iter_truncate(iter, fuse_round_up(ff->fm->fc, i_size - offset)); shortened = count - iov_iter_count(iter); count -= shortened; } @@ -3196,7 +3222,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, struct fuse_file *ff = file->private_data; struct inode *inode = file_inode(file); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_conn *fc = ff->fc; + struct fuse_mount *fm = ff->fm; FUSE_ARGS(args); struct fuse_fallocate_in inarg = { .fh = ff->fh, @@ -3208,14 +3234,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + bool block_faults = FUSE_IS_DAX(inode) && lock_inode; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - if (fc->no_fallocate) + if (fm->fc->no_fallocate) return -EOPNOTSUPP; if (lock_inode) { 
inode_lock(inode); + if (block_faults) { + down_write(&fi->i_mmap_sem); + err = fuse_dax_break_layouts(inode, 0, 0); + if (err) + goto out; + } + if (mode & FALLOC_FL_PUNCH_HOLE) { loff_t endbyte = offset + length - 1; @@ -3240,9 +3275,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_fallocate = 1; + fm->fc->no_fallocate = 1; err = -EOPNOTSUPP; } if (err) @@ -3252,7 +3287,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) { bool changed = fuse_write_update_size(inode, offset + length); - if (changed && fc->writeback_cache) + if (changed && fm->fc->writeback_cache) file_update_time(file); } @@ -3265,6 +3300,9 @@ out: if (!(mode & FALLOC_FL_KEEP_SIZE)) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (block_faults) + up_write(&fi->i_mmap_sem); + if (lock_inode) inode_unlock(inode); @@ -3280,7 +3318,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); struct fuse_inode *fi_out = get_fuse_inode(inode_out); - struct fuse_conn *fc = ff_in->fc; + struct fuse_mount *fm = ff_in->fm; + struct fuse_conn *fc = fm->fc; FUSE_ARGS(args); struct fuse_copy_file_range_in inarg = { .fh_in = ff_in->fh, @@ -3349,7 +3388,7 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; err = -EOPNOTSUPP; @@ -3404,6 +3443,7 @@ static const struct file_operations fuse_file_operations = { .release = fuse_release, .fsync = fuse_fsync, .lock = 
fuse_file_lock, + .get_unmapped_area = thp_get_unmapped_area, .flock = fuse_file_flock, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, @@ -3439,4 +3479,7 @@ void fuse_init_file_inode(struct inode *inode) fi->writectr = 0; init_waitqueue_head(&fi->page_waitq); fi->writepages = RB_ROOT; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_inode_init(inode); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 740a8a7d7ae6..d51598017d13 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -148,6 +148,20 @@ struct fuse_inode { /** Lock to protect write related fields */ spinlock_t lock; + + /** + * Can't take inode lock in fault path (leads to circular dependency). + * Introduce another semaphore which can be taken in fault path and + * then other filesystem paths can take this to block faults. + */ + struct rw_semaphore i_mmap_sem; + +#ifdef CONFIG_FUSE_DAX + /* + * Dax specific inode data + */ + struct fuse_inode_dax *dax; +#endif }; /** FUSE inode state bits */ @@ -161,12 +175,13 @@ enum { }; struct fuse_conn; +struct fuse_mount; struct fuse_release_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ - struct fuse_conn *fc; + struct fuse_mount *fm; /* Argument space reserved for release */ struct fuse_release_args *release_args; @@ -252,7 +267,7 @@ struct fuse_args { bool may_block:1; struct fuse_in_arg in_args[3]; struct fuse_arg out_args[2]; - void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); + void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); }; struct fuse_args_pages { @@ -360,6 +375,9 @@ struct fuse_req { /** virtio-fs's physically contiguous buffer for in and out args */ void *argbuf; #endif + + /** fuse_mount this request belongs to */ + struct fuse_mount *fm; }; struct fuse_iqueue; @@ -482,11 +500,15 @@ struct fuse_fs_context { bool destroy:1; bool no_control:1; bool no_force_umount:1; - bool no_mount_options:1; + bool legacy_opts_show:1; + 
bool dax:1; unsigned int max_read; unsigned int blksize; const char *subtype; + /* DAX device, may be NULL */ + struct dax_device *dax_dev; + /* fuse_dev pointer to fill in, should contain NULL on entry */ void **fudptr; }; @@ -494,9 +516,9 @@ struct fuse_fs_context { /** * A Fuse connection. * - * This structure is created, when the filesystem is mounted, and is - * destroyed, when the client device is closed and the filesystem is - * unmounted. + * This structure is created, when the root filesystem is mounted, and + * is destroyed, when the client device is closed and the last + * fuse_mount is destroyed. */ struct fuse_conn { /** Lock protecting accessess to members of this structure */ @@ -610,6 +632,9 @@ struct fuse_conn { /** cache READLINK responses in page cache */ unsigned cache_symlinks:1; + /* show legacy mount options */ + unsigned int legacy_opts_show:1; + /* * The following bitfields are only for optimization purposes * and hence races in setting them will not cause malfunction @@ -717,8 +742,8 @@ struct fuse_conn { /** Do not allow MNT_FORCE umount */ unsigned int no_force_umount:1; - /* Do not show mount options */ - unsigned int no_mount_options:1; + /* Auto-mount submounts announced by the server */ + unsigned int auto_submounts:1; /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -726,10 +751,10 @@ struct fuse_conn { /** Negotiated minor version */ unsigned minor; - /** Entry on the fuse_conn_list */ + /** Entry on the fuse_mount_list */ struct list_head entry; - /** Device ID from super block */ + /** Device ID from the root super block */ dev_t dev; /** Dentries in the control filesystem */ @@ -747,24 +772,70 @@ struct fuse_conn { /** Called on final put */ void (*release)(struct fuse_conn *); - /** Super block for this connection. */ - struct super_block *sb; - - /** Read/write semaphore to hold when accessing sb. 
*/ + /** + * Read/write semaphore to hold when accessing the sb of any + * fuse_mount belonging to this connection + */ struct rw_semaphore killsb; /** List of device instances belonging to this connection */ struct list_head devices; + +#ifdef CONFIG_FUSE_DAX + /* Dax specific conn data, non-NULL if DAX is enabled */ + struct fuse_conn_dax *dax; +#endif + + /** List of filesystems using this connection */ + struct list_head mounts; }; -static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +/* + * Represents a mounted filesystem, potentially a submount. + * + * This object allows sharing a fuse_conn between separate mounts to + * allow submounts with dedicated superblocks and thus separate device + * IDs. + */ +struct fuse_mount { + /* Underlying (potentially shared) connection to the FUSE server */ + struct fuse_conn *fc; + + /* Refcount */ + refcount_t count; + + /* + * Super block for this connection (fc->killsb must be held when + * accessing this). + */ + struct super_block *sb; + + /* Entry on fc->mounts */ + struct list_head fc_entry; +}; + +static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb) { return sb->s_fs_info; } +static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + + return fm ? fm->fc : NULL; +} + +static inline struct fuse_mount *get_fuse_mount(struct inode *inode) +{ + return get_fuse_mount_super(inode->i_sb); +} + static inline struct fuse_conn *get_fuse_conn(struct inode *inode) { - return get_fuse_conn_super(inode->i_sb); + struct fuse_mount *fm = get_fuse_mount(inode); + + return fm ? fm->fc : NULL; } static inline struct fuse_inode *get_fuse_inode(struct inode *inode) @@ -794,11 +865,6 @@ extern const struct dentry_operations fuse_dentry_operations; extern const struct dentry_operations fuse_root_dentry_operations; /** - * Inode to nodeid comparison. 
- */ -int fuse_inode_eq(struct inode *inode, void *_nodeidp); - -/** * Get a filled in inode */ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, @@ -848,7 +914,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, */ int fuse_open_common(struct inode *inode, struct file *file, bool isdir); -struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); +struct fuse_file *fuse_file_alloc(struct fuse_mount *fm); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file); @@ -916,14 +982,14 @@ void __exit fuse_ctl_cleanup(void); /** * Simple request sending that does request allocation and freeing */ -ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); -int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, +ssize_t fuse_simple_request(struct fuse_mount *fm, struct fuse_args *args); +int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); /** * End a finished request */ -void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_end(struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -949,7 +1015,8 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); /** @@ -957,11 +1024,21 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, */ void fuse_conn_put(struct fuse_conn *fc); +/** + * Acquire reference to fuse_mount + */ +struct fuse_mount *fuse_mount_get(struct fuse_mount *fm); + +/** + * Release reference to fuse_mount + */ +void fuse_mount_put(struct fuse_mount *fm); + struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev 
*fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_conn *fc); +void fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection @@ -970,12 +1047,26 @@ void fuse_send_init(struct fuse_conn *fc); */ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); -/** - * Disassociate fuse connection from superblock and kill the superblock +/* + * Fill in superblock for submounts + * @sb: partially-initialized superblock to fill in + * @parent_fi: The fuse_inode of the parent filesystem where this submount is + * mounted + */ +int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi); + +/* + * Remove the mount from the connection * - * Calls kill_anon_super(), do not use with bdev mounts. + * Returns whether this was the last mount */ -void fuse_kill_sb_anon(struct super_block *sb); +bool fuse_mount_remove(struct fuse_mount *fm); + +/* + * Shut down the connection (possibly sending DESTROY request). + */ +void fuse_conn_destroy(struct fuse_mount *fm); /** * Add connection to control filesystem @@ -1011,9 +1102,19 @@ void fuse_set_nowrite(struct inode *inode); void fuse_release_nowrite(struct inode *inode); /** + * Scan all fuse_mounts belonging to fc to find the first where + * ilookup5() returns a result. Return that result and the + * respective fuse_mount in *fm (unless fm is NULL). + * + * The caller must hold fc->killsb. + */ +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm); + +/** * File-system tells the kernel to invalidate cache for the given node id. 
*/ -int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len); /** @@ -1026,10 +1127,10 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, * - is a file or oan empty directory * then the dentry is unhashed (d_delete()). */ -int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, +int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name); -int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, +int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); /** @@ -1093,4 +1194,20 @@ unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); u64 fuse_get_unique(struct fuse_iqueue *fiq); void fuse_free_conn(struct fuse_conn *fc); +/* dax.c */ + +#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode)) + +ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to); +ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from); +int fuse_dax_mmap(struct file *file, struct vm_area_struct *vma); +int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end); +int fuse_dax_conn_alloc(struct fuse_conn *fc, struct dax_device *dax_dev); +void fuse_dax_conn_free(struct fuse_conn *fc); +bool fuse_dax_inode_alloc(struct super_block *sb, struct fuse_inode *fi); +void fuse_dax_inode_init(struct inode *inode); +void fuse_dax_inode_cleanup(struct inode *inode); +bool fuse_dax_check_alignment(struct fuse_conn *fc, unsigned int map_alignment); +void fuse_dax_cancel_work(struct fuse_conn *fc); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 581329203d68..1a47afc95f80 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -85,14 +85,22 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->orig_ino = 0; fi->state = 0; mutex_init(&fi->mutex); + 
init_rwsem(&fi->i_mmap_sem); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); - if (!fi->forget) { - kmem_cache_free(fuse_inode_cachep, fi); - return NULL; - } + if (!fi->forget) + goto out_free; + + if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi)) + goto out_free_forget; return &fi->inode; + +out_free_forget: + kfree(fi->forget); +out_free: + kmem_cache_free(fuse_inode_cachep, fi); + return NULL; } static void fuse_free_inode(struct inode *inode) @@ -101,6 +109,9 @@ static void fuse_free_inode(struct inode *inode) mutex_destroy(&fi->mutex); kfree(fi->forget); +#ifdef CONFIG_FUSE_DAX + kfree(fi->dax); +#endif kmem_cache_free(fuse_inode_cachep, fi); } @@ -112,8 +123,14 @@ static void fuse_evict_inode(struct inode *inode) clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); - fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); - fi->forget = NULL; + + if (FUSE_IS_DAX(inode)) + fuse_dax_inode_cleanup(inode); + if (fi->nlookup) { + fuse_queue_forget(fc, fi->forget, fi->nodeid, + fi->nlookup); + fi->forget = NULL; + } } if (S_ISREG(inode->i_mode) && !is_bad_inode(inode)) { WARN_ON(!list_empty(&fi->write_files)); @@ -268,7 +285,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) BUG(); } -int fuse_inode_eq(struct inode *inode, void *_nodeidp) +static int fuse_inode_eq(struct inode *inode, void *_nodeidp) { u64 nodeid = *(u64 *) _nodeidp; if (get_node_id(inode) == nodeid) @@ -292,7 +309,26 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, struct fuse_inode *fi; struct fuse_conn *fc = get_fuse_conn_super(sb); - retry: + /* + * Auto mount points get their node id from the submount root, which is + * not a unique identifier within this filesystem. + * + * To avoid conflicts, do not place submount points into the inode hash + * table. 
+ */ + if (fc->auto_submounts && (attr->flags & FUSE_ATTR_SUBMOUNT) && + S_ISDIR(attr->mode)) { + inode = new_inode(sb); + if (!inode) + return NULL; + + fuse_init_inode(inode, attr); + get_fuse_inode(inode)->nodeid = nodeid; + inode->i_flags |= S_AUTOMOUNT; + goto done; + } + +retry: inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid); if (!inode) return NULL; @@ -310,7 +346,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, iput(inode); goto retry; } - +done: fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->nlookup++; @@ -320,16 +356,37 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, return inode; } -int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid, +struct inode *fuse_ilookup(struct fuse_conn *fc, u64 nodeid, + struct fuse_mount **fm) +{ + struct fuse_mount *fm_iter; + struct inode *inode; + + WARN_ON(!rwsem_is_locked(&fc->killsb)); + list_for_each_entry(fm_iter, &fc->mounts, fc_entry) { + if (!fm_iter->sb) + continue; + + inode = ilookup5(fm_iter->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + if (fm) + *fm = fm_iter; + return inode; + } + } + + return NULL; +} + +int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, loff_t offset, loff_t len) { - struct fuse_conn *fc = get_fuse_conn_super(sb); struct fuse_inode *fi; struct inode *inode; pgoff_t pg_start; pgoff_t pg_end; - inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid); + inode = fuse_ilookup(fc, nodeid, NULL); if (!inode) return -ENOENT; @@ -379,28 +436,23 @@ static void fuse_umount_begin(struct super_block *sb) fuse_abort_conn(fc); } -static void fuse_send_destroy(struct fuse_conn *fc) +static void fuse_send_destroy(struct fuse_mount *fm) { - if (fc->conn_init) { + if (fm->fc->conn_init) { FUSE_ARGS(args); args.opcode = FUSE_DESTROY; args.force = true; args.nocreds = true; - fuse_simple_request(fc, &args); + fuse_simple_request(fm, &args); } } static void fuse_put_super(struct super_block *sb) { - struct fuse_conn *fc 
= get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); - mutex_lock(&fuse_mutex); - list_del(&fc->entry); - fuse_ctl_remove_conn(fc); - mutex_unlock(&fuse_mutex); - - fuse_conn_put(fc); + fuse_mount_put(fm); } static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) @@ -420,12 +472,12 @@ static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_statfs_out outarg; int err; - if (!fuse_allow_current_process(fc)) { + if (!fuse_allow_current_process(fm->fc)) { buf->f_type = FUSE_SUPER_MAGIC; return 0; } @@ -437,7 +489,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); return err; @@ -573,19 +625,25 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); - if (fc->no_mount_options) - return 0; + if (fc->legacy_opts_show) { + seq_printf(m, ",user_id=%u", + from_kuid_munged(fc->user_ns, fc->user_id)); + seq_printf(m, ",group_id=%u", + from_kgid_munged(fc->user_ns, fc->group_id)); + if (fc->default_permissions) + seq_puts(m, ",default_permissions"); + if (fc->allow_other) + seq_puts(m, ",allow_other"); + if (fc->max_read != ~0) + seq_printf(m, ",max_read=%u", fc->max_read); + if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) + seq_printf(m, ",blksize=%lu", sb->s_blocksize); + } +#ifdef CONFIG_FUSE_DAX + if (fc->dax) + seq_puts(m, ",dax"); +#endif - seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, 
fc->user_id)); - seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); - if (fc->default_permissions) - seq_puts(m, ",default_permissions"); - if (fc->allow_other) - seq_puts(m, ",allow_other"); - if (fc->max_read != ~0) - seq_printf(m, ",max_read=%u", fc->max_read); - if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE) - seq_printf(m, ",blksize=%lu", sb->s_blocksize); return 0; } @@ -615,7 +673,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, +void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, + struct user_namespace *user_ns, const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); @@ -642,6 +701,11 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + + INIT_LIST_HEAD(&fc->mounts); + list_add(&fm->fc_entry, &fc->mounts); + fm->fc = fc; + refcount_set(&fm->count, 1); } EXPORT_SYMBOL_GPL(fuse_conn_init); @@ -650,6 +714,8 @@ void fuse_conn_put(struct fuse_conn *fc) if (refcount_dec_and_test(&fc->count)) { struct fuse_iqueue *fiq = &fc->iq; + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); @@ -666,6 +732,23 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); +void fuse_mount_put(struct fuse_mount *fm) +{ + if (refcount_dec_and_test(&fm->count)) { + if (fm->fc) + fuse_conn_put(fm->fc); + kfree(fm); + } +} +EXPORT_SYMBOL_GPL(fuse_mount_put); + +struct fuse_mount *fuse_mount_get(struct fuse_mount *fm) +{ + refcount_inc(&fm->count); + return fm; +} +EXPORT_SYMBOL_GPL(fuse_mount_get); + static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -895,14 +978,16 @@ 
struct fuse_init_args { struct fuse_init_out out; }; -static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, +static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, int error) { + struct fuse_conn *fc = fm->fc; struct fuse_init_args *ia = container_of(args, typeof(*ia), args); struct fuse_init_out *arg = &ia->out; + bool ok = true; if (error || arg->major != FUSE_KERNEL_VERSION) - fc->conn_error = 1; + ok = false; else { unsigned long ra_pages; @@ -950,11 +1035,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, if (arg->flags & FUSE_HANDLE_KILLPRIV) fc->handle_killpriv = 1; if (arg->time_gran && arg->time_gran <= 1000000000) - fc->sb->s_time_gran = arg->time_gran; + fm->sb->s_time_gran = arg->time_gran; if ((arg->flags & FUSE_POSIX_ACL)) { fc->default_permissions = 1; fc->posix_acl = 1; - fc->sb->s_xattr = fuse_acl_xattr_handlers; + fm->sb->s_xattr = fuse_acl_xattr_handlers; } if (arg->flags & FUSE_CACHE_SYMLINKS) fc->cache_symlinks = 1; @@ -965,14 +1050,19 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, min_t(unsigned int, FUSE_MAX_MAX_PAGES, max_t(unsigned int, arg->max_pages, 1)); } + if (IS_ENABLED(CONFIG_FUSE_DAX) && + arg->flags & FUSE_MAP_ALIGNMENT && + !fuse_dax_check_alignment(fc, arg->map_alignment)) { + ok = false; + } } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } - fc->sb->s_bdi->ra_pages = - min(fc->sb->s_bdi->ra_pages, ra_pages); + fm->sb->s_bdi->ra_pages = + min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 
4096 : arg->max_write; fc->max_write = max_t(unsigned, 4096, fc->max_write); @@ -980,11 +1070,16 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, } kfree(ia); + if (!ok) { + fc->conn_init = 0; + fc->conn_error = 1; + } + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_conn *fc) +void fuse_send_init(struct fuse_mount *fm) { struct fuse_init_args *ia; @@ -992,7 +1087,7 @@ void fuse_send_init(struct fuse_conn *fc) ia->in.major = FUSE_KERNEL_VERSION; ia->in.minor = FUSE_KERNEL_MINOR_VERSION; - ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; + ia->in.max_readahead = fm->sb->s_bdi->ra_pages * PAGE_SIZE; ia->in.flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | @@ -1003,6 +1098,13 @@ void fuse_send_init(struct fuse_conn *fc) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; +#ifdef CONFIG_FUSE_DAX + if (fm->fc->dax) + ia->in.flags |= FUSE_MAP_ALIGNMENT; +#endif + if (fm->fc->auto_submounts) + ia->in.flags |= FUSE_SUBMOUNTS; + ia->args.opcode = FUSE_INIT; ia->args.in_numargs = 1; ia->args.in_args[0].size = sizeof(ia->in); @@ -1018,8 +1120,8 @@ void fuse_send_init(struct fuse_conn *fc) ia->args.nocreds = true; ia->args.end = process_init_reply; - if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fc, &ia->args, -ENOTCONN); + if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) + process_init_reply(fm, &ia->args, -ENOTCONN); } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1130,10 +1232,92 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); +static void fuse_fill_attr_from_inode(struct fuse_attr *attr, + const struct fuse_inode *fi) +{ + *attr = (struct fuse_attr){ + .ino = fi->inode.i_ino, + .size = fi->inode.i_size, + .blocks = 
fi->inode.i_blocks, + .atime = fi->inode.i_atime.tv_sec, + .mtime = fi->inode.i_mtime.tv_sec, + .ctime = fi->inode.i_ctime.tv_sec, + .atimensec = fi->inode.i_atime.tv_nsec, + .mtimensec = fi->inode.i_mtime.tv_nsec, + .ctimensec = fi->inode.i_ctime.tv_nsec, + .mode = fi->inode.i_mode, + .nlink = fi->inode.i_nlink, + .uid = fi->inode.i_uid.val, + .gid = fi->inode.i_gid.val, + .rdev = fi->inode.i_rdev, + .blksize = 1u << fi->inode.i_blkbits, + }; +} + +static void fuse_sb_defaults(struct super_block *sb) +{ + sb->s_magic = FUSE_SUPER_MAGIC; + sb->s_op = &fuse_super_operations; + sb->s_xattr = fuse_xattr_handlers; + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; + sb->s_export_op = &fuse_export_operations; + sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; + if (sb->s_user_ns != &init_user_ns) + sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + + /* + * If we are not in the initial user namespace posix + * acls must be translated. + */ + if (sb->s_user_ns != &init_user_ns) + sb->s_xattr = fuse_no_acl_xattr_handlers; +} + +int fuse_fill_super_submount(struct super_block *sb, + struct fuse_inode *parent_fi) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct super_block *parent_sb = parent_fi->inode.i_sb; + struct fuse_attr root_attr; + struct inode *root; + + fuse_sb_defaults(sb); + fm->sb = sb; + + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi_get(parent_sb->s_bdi); + + sb->s_xattr = parent_sb->s_xattr; + sb->s_time_gran = parent_sb->s_time_gran; + sb->s_blocksize = parent_sb->s_blocksize; + sb->s_blocksize_bits = parent_sb->s_blocksize_bits; + sb->s_subtype = kstrdup(parent_sb->s_subtype, GFP_KERNEL); + if (parent_sb->s_subtype && !sb->s_subtype) + return -ENOMEM; + + fuse_fill_attr_from_inode(&root_attr, parent_fi); + root = fuse_iget(sb, parent_fi->nodeid, 0, &root_attr, 0, 0); + /* + * This inode is just a duplicate, so it is not looked up and + * its nlookup should not be incremented. 
fuse_iget() does + * that, though, so undo it here. + */ + /* Bail out if fuse_iget() returned no inode rather than dereferencing NULL below */ + if (!root) + return -ENOMEM; + get_fuse_inode(root)->nlookup--; + sb->s_d_op = &fuse_dentry_operations; + sb->s_root = d_make_root(root); + if (!sb->s_root) + return -ENOMEM; + + return 0; +} + int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { struct fuse_dev *fud = NULL; - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; struct inode *root; struct dentry *root_dentry; int err; @@ -1142,7 +1326,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) if (sb->s_flags & SB_MANDLOCK) goto err; - sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); + fuse_sb_defaults(sb); if (ctx->is_bdev) { #ifdef CONFIG_BLOCK @@ -1157,32 +1341,21 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) sb->s_subtype = ctx->subtype; ctx->subtype = NULL; - sb->s_magic = FUSE_SUPER_MAGIC; - sb->s_op = &fuse_super_operations; - sb->s_xattr = fuse_xattr_handlers; - sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_time_gran = 1; - sb->s_export_op = &fuse_export_operations; - sb->s_iflags |= SB_I_IMA_UNVERIFIABLE_SIGNATURE; - if (sb->s_user_ns != &init_user_ns) - sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - - /* - * If we are not in the initial user namespace posix - * acls must be translated. 
- */ - if (sb->s_user_ns != &init_user_ns) - sb->s_xattr = fuse_no_acl_xattr_handlers; + if (IS_ENABLED(CONFIG_FUSE_DAX)) { + err = fuse_dax_conn_alloc(fc, ctx->dax_dev); + if (err) + goto err; + } if (ctx->fudptr) { err = -ENOMEM; fud = fuse_dev_alloc_install(fc); if (!fud) - goto err; + goto err_free_dax; } fc->dev = sb->s_dev; - fc->sb = sb; + fm->sb = sb; err = fuse_bdi_init(fc, sb); if (err) goto err_dev_free; @@ -1196,11 +1369,11 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) fc->allow_other = ctx->allow_other; fc->user_id = ctx->user_id; fc->group_id = ctx->group_id; - fc->max_read = max_t(unsigned, 4096, ctx->max_read); + fc->legacy_opts_show = ctx->legacy_opts_show; + fc->max_read = max_t(unsigned int, 4096, ctx->max_read); fc->destroy = ctx->destroy; fc->no_control = ctx->no_control; fc->no_force_umount = ctx->no_force_umount; - fc->no_mount_options = ctx->no_mount_options; err = -ENOMEM; root = fuse_get_root_inode(sb, ctx->rootmode); @@ -1233,6 +1406,9 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) err_dev_free: if (fud) fuse_dev_free(fud); + err_free_dax: + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); err: return err; } @@ -1244,6 +1420,7 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) struct file *file; int err; struct fuse_conn *fc; + struct fuse_mount *fm; err = -EINVAL; file = fget(ctx->fd); @@ -1264,9 +1441,16 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) if (!fc) goto err_fput; - fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); + fm = kzalloc(sizeof(*fm), GFP_KERNEL); + if (!fm) { + kfree(fc); + goto err_fput; + } + + fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); fc->release = fuse_free_conn; - sb->s_fs_info = fc; + + sb->s_fs_info = fm; err = fuse_fill_super_common(sb, ctx); if (err) @@ -1277,11 +1461,11 @@ static int fuse_fill_super(struct super_block *sb, struct 
fs_context *fsc) * CPUs after this */ fput(file); - fuse_send_init(get_fuse_conn_super(sb)); + fuse_send_init(get_fuse_mount_super(sb)); return 0; err_put_conn: - fuse_conn_put(fc); + fuse_mount_put(fm); sb->s_fs_info = NULL; err_fput: fput(file); @@ -1325,6 +1509,7 @@ static int fuse_init_fs_context(struct fs_context *fc) ctx->max_read = ~0; ctx->blksize = FUSE_DEFAULT_BLKSIZE; + ctx->legacy_opts_show = true; #ifdef CONFIG_BLOCK if (fc->fs_type == &fuseblk_fs_type) { @@ -1338,29 +1523,52 @@ static int fuse_init_fs_context(struct fs_context *fc) return 0; } -static void fuse_sb_destroy(struct super_block *sb) +bool fuse_mount_remove(struct fuse_mount *fm) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + struct fuse_conn *fc = fm->fc; + bool last = false; - if (fc) { - if (fc->destroy) - fuse_send_destroy(fc); + down_write(&fc->killsb); + list_del_init(&fm->fc_entry); + if (list_empty(&fc->mounts)) + last = true; + up_write(&fc->killsb); - fuse_abort_conn(fc); - fuse_wait_aborted(fc); + return last; +} +EXPORT_SYMBOL_GPL(fuse_mount_remove); - down_write(&fc->killsb); - fc->sb = NULL; - up_write(&fc->killsb); +void fuse_conn_destroy(struct fuse_mount *fm) +{ + struct fuse_conn *fc = fm->fc; + + if (fc->destroy) + fuse_send_destroy(fm); + + fuse_abort_conn(fc); + fuse_wait_aborted(fc); + + if (!list_empty(&fc->entry)) { + mutex_lock(&fuse_mutex); + list_del(&fc->entry); + fuse_ctl_remove_conn(fc); + mutex_unlock(&fuse_mutex); } } +EXPORT_SYMBOL_GPL(fuse_conn_destroy); -void fuse_kill_sb_anon(struct super_block *sb) +static void fuse_kill_sb_anon(struct super_block *sb) { - fuse_sb_destroy(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + if (fm) { + last = fuse_mount_remove(fm); + if (last) + fuse_conn_destroy(fm); + } kill_anon_super(sb); } -EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, @@ -1375,7 +1583,14 @@ MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK static void 
fuse_kill_sb_blk(struct super_block *sb) { - fuse_sb_destroy(sb); + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + if (fm) { + last = fuse_mount_remove(fm); + if (last) + fuse_conn_destroy(fm); + } kill_block_super(sb); } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 90e3f01bd796..3b5e91045871 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -252,7 +252,7 @@ retry: static void fuse_force_forget(struct file *file, u64 nodeid) { struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_forget_in inarg; FUSE_ARGS(args); @@ -266,7 +266,7 @@ static void fuse_force_forget(struct file *file, u64 nodeid) args.force = true; args.noreply = true; - fuse_simple_request(fc, &args); + fuse_simple_request(fm, &args); /* ignore errors */ } @@ -320,7 +320,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ssize_t res; struct page *page; struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_io_args ia = {}; struct fuse_args_pages *ap = &ia.ap; struct fuse_page_desc desc = { .length = PAGE_SIZE }; @@ -337,7 +337,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) ap->pages = &page; ap->descs = &desc; if (plus) { - attr_version = fuse_get_attr_version(fc); + attr_version = fuse_get_attr_version(fm->fc); fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, FUSE_READDIRPLUS); } else { @@ -345,7 +345,7 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fc, &ap->args); + res = fuse_simple_request(fm, &ap->args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 104f35de5270..21a9e534417c 100644 --- a/fs/fuse/virtio_fs.c +++ 
b/fs/fuse/virtio_fs.c @@ -5,12 +5,17 @@ */ #include <linux/fs.h> +#include <linux/dax.h> +#include <linux/pci.h> +#include <linux/pfn_t.h> #include <linux/module.h> #include <linux/virtio.h> #include <linux/virtio_fs.h> #include <linux/delay.h> #include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/highmem.h> +#include <linux/uio.h> #include "fuse_i.h" /* List of virtio-fs device instances and a lock for the list. Also provides @@ -24,6 +29,8 @@ enum { VQ_REQUEST }; +#define VQ_NAME_LEN 24 + /* Per-virtqueue state */ struct virtio_fs_vq { spinlock_t lock; @@ -36,7 +43,7 @@ struct virtio_fs_vq { bool connected; long in_flight; struct completion in_flight_zero; /* No inflight requests */ - char name[24]; + char name[VQ_NAME_LEN]; } ____cacheline_aligned_in_smp; /* A virtio-fs device instance */ @@ -47,6 +54,12 @@ struct virtio_fs { struct virtio_fs_vq *vqs; unsigned int nvqs; /* number of virtqueues */ unsigned int num_request_queues; /* number of request queues */ + struct dax_device *dax_dev; + + /* DAX memory window where file contents are mapped */ + void *window_kaddr; + phys_addr_t window_phys_addr; + size_t window_len; }; struct virtio_fs_forget_req { @@ -69,6 +82,44 @@ struct virtio_fs_req_work { static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, struct fuse_req *req, bool in_flight); +enum { + OPT_DAX, +}; + +static const struct fs_parameter_spec virtio_fs_parameters[] = { + fsparam_flag("dax", OPT_DAX), + {} +}; + +static int virtio_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct fs_parse_result result; + struct fuse_fs_context *ctx = fc->fs_private; + int opt; + + opt = fs_parse(fc, virtio_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case OPT_DAX: + ctx->dax = 1; + break; + default: + return -EINVAL; + } + + return 0; +} + +static void virtio_fs_free_fc(struct fs_context *fc) +{ + struct fuse_fs_context *ctx = fc->fs_private; + + kfree(ctx); +} + static 
inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) { struct virtio_fs *fs = vq->vdev->priv; @@ -289,7 +340,6 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) struct fuse_req *req; struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, dispatch_work.work); - struct fuse_conn *fc = fsvq->fud->fc; int ret; pr_debug("virtio-fs: worker %s called.\n", __func__); @@ -304,7 +354,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) list_del_init(&req->list); spin_unlock(&fsvq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); } /* Dispatch pending requests */ @@ -335,7 +385,7 @@ static void virtio_fs_request_dispatch_work(struct work_struct *work) spin_unlock(&fsvq->lock); pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); - fuse_request_end(fc, req); + fuse_request_end(req); } } } @@ -495,7 +545,6 @@ static void virtio_fs_request_complete(struct fuse_req *req, struct virtio_fs_vq *fsvq) { struct fuse_pqueue *fpq = &fsvq->fud->pq; - struct fuse_conn *fc = fsvq->fud->fc; struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; @@ -528,7 +577,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, clear_bit(FR_SENT, &req->flags); spin_unlock(&fpq->lock); - fuse_request_end(fc, req); + fuse_request_end(req); spin_lock(&fsvq->lock); dec_in_flight_req(fsvq); spin_unlock(&fsvq->lock); @@ -596,6 +645,26 @@ static void virtio_fs_vq_done(struct virtqueue *vq) schedule_work(&fsvq->done_work); } +/* Initialize one virtqueue's name, lock, request lists, completion and work + * items; VQ_REQUEST selects the request handlers, anything else the hiprio ones. + */ +static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, + int vq_type) +{ + /* strscpy() always NUL-terminates the destination, unlike strncpy() */ + strscpy(fsvq->name, name, VQ_NAME_LEN); + spin_lock_init(&fsvq->lock); + INIT_LIST_HEAD(&fsvq->queued_reqs); + INIT_LIST_HEAD(&fsvq->end_reqs); + init_completion(&fsvq->in_flight_zero); + + if (vq_type == VQ_REQUEST) { + INIT_WORK(&fsvq->done_work, virtio_fs_requests_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_request_dispatch_work); + } else { +
INIT_WORK(&fsvq->done_work, virtio_fs_hiprio_done_work); + INIT_DELAYED_WORK(&fsvq->dispatch_work, + virtio_fs_hiprio_dispatch_work); + } +} + /* Initialize virtqueues */ static int virtio_fs_setup_vqs(struct virtio_device *vdev, struct virtio_fs *fs) @@ -611,7 +680,7 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, if (fs->num_request_queues == 0) return -EINVAL; - fs->nvqs = 1 + fs->num_request_queues; + fs->nvqs = VQ_REQUEST + fs->num_request_queues; fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); if (!fs->vqs) return -ENOMEM; @@ -625,29 +694,17 @@ static int virtio_fs_setup_vqs(struct virtio_device *vdev, goto out; } + /* Initialize the hiprio/forget request virtqueue */ callbacks[VQ_HIPRIO] = virtio_fs_vq_done; - snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), - "hiprio"); + virtio_fs_init_vq(&fs->vqs[VQ_HIPRIO], "hiprio", VQ_HIPRIO); names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; - INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); - INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); - INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs); - INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, - virtio_fs_hiprio_dispatch_work); - init_completion(&fs->vqs[VQ_HIPRIO].in_flight_zero); - spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); /* Initialize the requests virtqueues */ for (i = VQ_REQUEST; i < fs->nvqs; i++) { - spin_lock_init(&fs->vqs[i].lock); - INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); - INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, - virtio_fs_request_dispatch_work); - INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); - INIT_LIST_HEAD(&fs->vqs[i].end_reqs); - init_completion(&fs->vqs[i].in_flight_zero); - snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), - "requests.%u", i - VQ_REQUEST); + char vq_name[VQ_NAME_LEN]; + + snprintf(vq_name, VQ_NAME_LEN, "requests.%u", i - VQ_REQUEST); + virtio_fs_init_vq(&fs->vqs[i], vq_name, VQ_REQUEST); callbacks[i] = virtio_fs_vq_done; names[i] = 
fs->vqs[i].name; } @@ -676,6 +733,130 @@ static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, vdev->config->del_vqs(vdev); } +/* Map a window offset to a page frame number. The window offset will have + * been produced by .iomap_begin(), which maps a file offset to a window + * offset. + */ +static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn) +{ + struct virtio_fs *fs = dax_get_private(dax_dev); + phys_addr_t offset = PFN_PHYS(pgoff); + size_t max_nr_pages = fs->window_len/PAGE_SIZE - pgoff; + + if (kaddr) + *kaddr = fs->window_kaddr + offset; + if (pfn) + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, + PFN_DEV | PFN_MAP); + return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; +} + +static size_t virtio_fs_copy_from_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_from_iter(addr, bytes, i); +} + +static size_t virtio_fs_copy_to_iter(struct dax_device *dax_dev, + pgoff_t pgoff, void *addr, + size_t bytes, struct iov_iter *i) +{ + return copy_to_iter(addr, bytes, i); +} + +static int virtio_fs_zero_page_range(struct dax_device *dax_dev, + pgoff_t pgoff, size_t nr_pages) +{ + long rc; + void *kaddr; + + rc = dax_direct_access(dax_dev, pgoff, nr_pages, &kaddr, NULL); + if (rc < 0) + return rc; + memset(kaddr, 0, nr_pages << PAGE_SHIFT); + dax_flush(dax_dev, kaddr, nr_pages << PAGE_SHIFT); + return 0; +} + +static const struct dax_operations virtio_fs_dax_ops = { + .direct_access = virtio_fs_direct_access, + .copy_from_iter = virtio_fs_copy_from_iter, + .copy_to_iter = virtio_fs_copy_to_iter, + .zero_page_range = virtio_fs_zero_page_range, +}; + +static void virtio_fs_cleanup_dax(void *data) +{ + struct dax_device *dax_dev = data; + + kill_dax(dax_dev); + put_dax(dax_dev); +} + +static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs) +{ + struct virtio_shm_region cache_reg; + struct 
dev_pagemap *pgmap; + bool have_cache; + + if (!IS_ENABLED(CONFIG_FUSE_DAX)) + return 0; + + /* Get cache region */ + have_cache = virtio_get_shm_region(vdev, &cache_reg, + (u8)VIRTIO_FS_SHMCAP_ID_CACHE); + if (!have_cache) { + dev_notice(&vdev->dev, "%s: No cache capability\n", __func__); + return 0; + } + + if (!devm_request_mem_region(&vdev->dev, cache_reg.addr, cache_reg.len, + dev_name(&vdev->dev))) { + dev_warn(&vdev->dev, "could not reserve region addr=0x%llx len=0x%llx\n", + cache_reg.addr, cache_reg.len); + return -EBUSY; + } + + dev_notice(&vdev->dev, "Cache len: 0x%llx @ 0x%llx\n", cache_reg.len, + cache_reg.addr); + + pgmap = devm_kzalloc(&vdev->dev, sizeof(*pgmap), GFP_KERNEL); + if (!pgmap) + return -ENOMEM; + + pgmap->type = MEMORY_DEVICE_FS_DAX; + + /* Ideally we would directly use the PCI BAR resource but + * devm_memremap_pages() wants its own copy in pgmap. So + * initialize a struct range from scratch covering the cache + * region [addr, addr + len - 1]. + */ + pgmap->range = (struct range) { + .start = (phys_addr_t) cache_reg.addr, + .end = (phys_addr_t) cache_reg.addr + cache_reg.len - 1, + }; + pgmap->nr_range = 1; + + fs->window_kaddr = devm_memremap_pages(&vdev->dev, pgmap); + if (IS_ERR(fs->window_kaddr)) + return PTR_ERR(fs->window_kaddr); + + fs->window_phys_addr = (phys_addr_t) cache_reg.addr; + fs->window_len = (phys_addr_t) cache_reg.len; + + dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n", + __func__, fs->window_kaddr, cache_reg.addr, cache_reg.len); + + fs->dax_dev = alloc_dax(fs, NULL, &virtio_fs_dax_ops, 0); + if (IS_ERR(fs->dax_dev)) + return PTR_ERR(fs->dax_dev); + + return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax, + fs->dax_dev); +} + static int virtio_fs_probe(struct virtio_device *vdev) { struct virtio_fs *fs; @@ -697,6 +878,10 @@ static int virtio_fs_probe(struct virtio_device *vdev) /* TODO vq affinity */ + ret = virtio_fs_setup_dax(vdev, fs); + if (ret < 0) + goto out_vqs; + /* 
Bring the device online in case the filesystem is mounted and * requests need to be sent before we return. */ @@ -833,18 +1018,37 @@ __releases(fiq->lock) spin_unlock(&fiq->lock); } +/* Count number of scatter-gather elements required */ +static unsigned int sg_count_fuse_pages(struct fuse_page_desc *page_descs, + unsigned int num_pages, + unsigned int total_len) +{ + unsigned int i; + unsigned int this_len; + + for (i = 0; i < num_pages && total_len; i++) { + this_len = min(page_descs[i].length, total_len); + total_len -= this_len; + } + + return i; +} + /* Return the number of scatter-gather list elements required */ static unsigned int sg_count_fuse_req(struct fuse_req *req) { struct fuse_args *args = req->args; struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); - unsigned int total_sgs = 1 /* fuse_in_header */; + unsigned int size, total_sgs = 1 /* fuse_in_header */; if (args->in_numargs - args->in_pages) total_sgs += 1; - if (args->in_pages) - total_sgs += ap->num_pages; + if (args->in_pages) { + size = args->in_args[args->in_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } if (!test_bit(FR_ISREPLY, &req->flags)) return total_sgs; @@ -854,8 +1058,11 @@ static unsigned int sg_count_fuse_req(struct fuse_req *req) if (args->out_numargs - args->out_pages) total_sgs += 1; - if (args->out_pages) - total_sgs += ap->num_pages; + if (args->out_pages) { + size = args->out_args[args->out_numargs - 1].size; + total_sgs += sg_count_fuse_pages(ap->descs, ap->num_pages, + size); + } return total_sgs; } @@ -1071,24 +1278,28 @@ static const struct fuse_iqueue_ops virtio_fs_fiq_ops = { .release = virtio_fs_fiq_release, }; -static int virtio_fs_fill_super(struct super_block *sb) +static inline void virtio_fs_ctx_set_defaults(struct fuse_fs_context *ctx) { - struct fuse_conn *fc = get_fuse_conn_super(sb); + ctx->rootmode = S_IFDIR; + ctx->default_permissions = 1; + ctx->allow_other = 1; + ctx->max_read = UINT_MAX; + 
ctx->blksize = 512; + ctx->destroy = true; + ctx->no_control = true; + ctx->no_force_umount = true; +} + +static int virtio_fs_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + struct fuse_conn *fc = fm->fc; struct virtio_fs *fs = fc->iq.priv; + struct fuse_fs_context *ctx = fsc->fs_private; unsigned int i; int err; - struct fuse_fs_context ctx = { - .rootmode = S_IFDIR, - .default_permissions = 1, - .allow_other = 1, - .max_read = UINT_MAX, - .blksize = 512, - .destroy = true, - .no_control = true, - .no_force_umount = true, - .no_mount_options = true, - }; + virtio_fs_ctx_set_defaults(ctx); mutex_lock(&virtio_fs_mutex); /* After holding mutex, make sure virtiofs device is still there. @@ -1112,8 +1323,10 @@ static int virtio_fs_fill_super(struct super_block *sb) } /* virtiofs allocates and installs its own fuse devices */ - ctx.fudptr = NULL; - err = fuse_fill_super_common(sb, &ctx); + ctx->fudptr = NULL; + if (ctx->dax) + ctx->dax_dev = fs->dax_dev; + err = fuse_fill_super_common(sb, ctx); if (err < 0) goto err_free_fuse_devs; @@ -1125,7 +1338,7 @@ static int virtio_fs_fill_super(struct super_block *sb) /* Previous unmount will stop all queues. Start these again */ virtio_fs_start_all_queues(fs); - fuse_send_init(fc); + fuse_send_init(fm); mutex_unlock(&virtio_fs_mutex); return 0; @@ -1136,18 +1349,17 @@ err: return err; } -static void virtio_kill_sb(struct super_block *sb) +static void virtio_fs_conn_destroy(struct fuse_mount *fm) { - struct fuse_conn *fc = get_fuse_conn_super(sb); - struct virtio_fs *vfs; - struct virtio_fs_vq *fsvq; - - /* If mount failed, we can still be called without any fc */ - if (!fc) - return fuse_kill_sb_anon(sb); + struct fuse_conn *fc = fm->fc; + struct virtio_fs *vfs = fc->iq.priv; + struct virtio_fs_vq *fsvq = &vfs->vqs[VQ_HIPRIO]; - vfs = fc->iq.priv; - fsvq = &vfs->vqs[VQ_HIPRIO]; + /* Stop dax worker. 
Soon evict_inodes() will be called which + * will free all memory ranges belonging to all inodes. + */ + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_cancel_work(fc); /* Stop forget queue. Soon destroy will be sent */ spin_lock(&fsvq->lock); @@ -1155,9 +1367,9 @@ static void virtio_kill_sb(struct super_block *sb) spin_unlock(&fsvq->lock); virtio_fs_drain_all_queues(vfs); - fuse_kill_sb_anon(sb); + fuse_conn_destroy(fm); - /* fuse_kill_sb_anon() must have sent destroy. Stop all queues + /* fuse_conn_destroy() must have sent destroy. Stop all queues * and drain one more time and free fuse devices. Freeing fuse * devices will drop their reference on fuse_conn and that in * turn will drop its reference on virtio_fs object. @@ -1167,12 +1379,27 @@ static void virtio_kill_sb(struct super_block *sb) virtio_fs_free_devs(vfs); } +static void virtio_kill_sb(struct super_block *sb) +{ + struct fuse_mount *fm = get_fuse_mount_super(sb); + bool last; + + /* If mount failed, we can still be called without any fuse_mount */ + if (fm) { + last = fuse_mount_remove(fm); + if (last) + virtio_fs_conn_destroy(fm); + } + kill_anon_super(sb); +} + static int virtio_fs_test_super(struct super_block *sb, struct fs_context *fsc) { - struct fuse_conn *fc = fsc->s_fs_info; + struct fuse_mount *fsc_fm = fsc->s_fs_info; + struct fuse_mount *sb_fm = get_fuse_mount_super(sb); - return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv; + return fsc_fm->fc->iq.priv == sb_fm->fc->iq.priv; } static int virtio_fs_set_super(struct super_block *sb, @@ -1182,7 +1409,7 @@ static int virtio_fs_set_super(struct super_block *sb, err = get_anon_bdev(&sb->s_dev); if (!err) - fuse_conn_get(fsc->s_fs_info); + fuse_mount_get(fsc->s_fs_info); return err; } @@ -1192,6 +1419,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) struct virtio_fs *fs; struct super_block *sb; struct fuse_conn *fc; + struct fuse_mount *fm; int err; /* This gets a reference on virtio_fs object. This ptr gets installed
This ptr gets installed @@ -1212,19 +1440,29 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -ENOMEM; } - fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops, - fs); + fm = kzalloc(sizeof(struct fuse_mount), GFP_KERNEL); + if (!fm) { + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + kfree(fc); + return -ENOMEM; + } + + fuse_conn_init(fc, fm, get_user_ns(current_user_ns()), + &virtio_fs_fiq_ops, fs); fc->release = fuse_free_conn; fc->delete_stale = true; + fc->auto_submounts = true; - fsc->s_fs_info = fc; + fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super); - fuse_conn_put(fc); + fuse_mount_put(fm); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { - err = virtio_fs_fill_super(sb); + err = virtio_fs_fill_super(sb, fsc); if (err) { deactivate_locked_super(sb); return err; @@ -1239,11 +1477,19 @@ static int virtio_fs_get_tree(struct fs_context *fsc) } static const struct fs_context_operations virtio_fs_context_ops = { + .free = virtio_fs_free_fc, + .parse_param = virtio_fs_parse_param, .get_tree = virtio_fs_get_tree, }; static int virtio_fs_init_fs_context(struct fs_context *fsc) { + struct fuse_fs_context *ctx; + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + fsc->fs_private = ctx; fsc->ops = &virtio_fs_context_ops; return 0; } diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 20d052e08b3b..371bdcbc7233 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -14,12 +14,12 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; int err; - if (fc->no_setxattr) + if (fm->fc->no_setxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -34,9 +34,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void 
*value, args.in_args[1].value = name; args.in_args[2].size = size; args.in_args[2].value = value; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_setxattr = 1; + fm->fc->no_setxattr = 1; err = -EOPNOTSUPP; } if (!err) { @@ -49,13 +49,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; - if (fc->no_getxattr) + if (fm->fc->no_getxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -77,11 +77,11 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fc, &args); + ret = fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(ssize_t, outarg.size, XATTR_SIZE_MAX); if (ret == -ENOSYS) { - fc->no_getxattr = 1; + fm->fc->no_getxattr = 1; ret = -EOPNOTSUPP; } return ret; @@ -107,16 +107,16 @@ static int fuse_verify_xattr_list(char *list, size_t size) ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { struct inode *inode = d_inode(entry); - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; struct fuse_getxattr_out outarg; ssize_t ret; - if (!fuse_allow_current_process(fc)) + if (!fuse_allow_current_process(fm->fc)) return -EACCES; - if (fc->no_listxattr) + if (fm->fc->no_listxattr) return -EOPNOTSUPP; memset(&inarg, 0, sizeof(inarg)); @@ -136,13 +136,13 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; } - ret = fuse_simple_request(fc, &args); + ret = 
fuse_simple_request(fm, &args); if (!ret && !size) ret = min_t(ssize_t, outarg.size, XATTR_LIST_MAX); if (ret > 0 && size) ret = fuse_verify_xattr_list(list, ret); if (ret == -ENOSYS) { - fc->no_listxattr = 1; + fm->fc->no_listxattr = 1; ret = -EOPNOTSUPP; } return ret; @@ -150,11 +150,11 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) int fuse_removexattr(struct inode *inode, const char *name) { - struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); int err; - if (fc->no_removexattr) + if (fm->fc->no_removexattr) return -EOPNOTSUPP; args.opcode = FUSE_REMOVEXATTR; @@ -162,9 +162,9 @@ int fuse_removexattr(struct inode *inode, const char *name) args.in_numargs = 1; args.in_args[0].size = strlen(name) + 1; args.in_args[0].value = name; - err = fuse_simple_request(fc, &args); + err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_removexattr = 1; + fm->fc->no_removexattr = 1; err = -EOPNOTSUPP; } if (!err) { diff --git a/fs/io_uring.c b/fs/io_uring.c index 0f4a9c45061d..02dc81622081 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -4081,7 +4081,7 @@ static int io_madvise(struct io_kiocb *req, bool force_nonblock) if (force_nonblock) return -EAGAIN; - ret = do_madvise(ma->addr, ma->len, ma->advice); + ret = do_madvise(current->mm, ma->addr, ma->len, ma->advice); if (ret < 0) req_set_fail_links(req); io_req_complete(req, ret); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index c942910a8649..9167884a61ec 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -531,6 +531,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); unsigned int fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); + struct mem_cgroup *old_memcg; struct inode *child = NULL; bool 
name_event = false; @@ -580,7 +581,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, gfp |= __GFP_RETRY_MAYFAIL; /* Whoever is interested in the event, pays for the allocation. */ - memalloc_use_memcg(group->memcg); + old_memcg = set_active_memcg(group->memcg); if (fanotify_is_perm_event(mask)) { event = fanotify_alloc_perm_event(path, gfp); @@ -608,7 +609,7 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, event->pid = get_pid(task_tgid(current)); out: - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); return event; } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index a65cf8c9f600..9ddcbadc98e2 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -66,6 +66,7 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask, int ret; int len = 0; int alloc_len = sizeof(struct inotify_event_info); + struct mem_cgroup *old_memcg; if ((inode_mark->mask & FS_EXCL_UNLINK) && path && d_unlinked(path->dentry)) @@ -87,9 +88,9 @@ static int inotify_one_event(struct fsnotify_group *group, u32 mask, * trigger OOM killer in the target monitoring memcg as it may have * security repercussion. 
*/ - memalloc_use_memcg(group->memcg); + old_memcg = set_active_memcg(group->memcg); event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL); - memalloc_unuse_memcg(); + set_active_memcg(old_memcg); if (unlikely(!event)) { /* diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index cc5c0abfd536..b93b3cd10bfd 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -54,7 +54,7 @@ static int ubifs_hash_calc_hmac(const struct ubifs_info *c, const u8 *hash, * ubifs_prepare_auth_node - Prepare an authentication node * @c: UBIFS file-system description object * @node: the node to calculate a hash for - * @hash: input hash of previous nodes + * @inhash: input hash of previous nodes * * This function prepares an authentication node for writing onto flash. * It creates a HMAC from the given input hash and writes it to the node. diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 31288d8fa2ce..ebff43f8009c 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -1123,6 +1123,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) err = PTR_ERR(dent); if (err == -ENOENT) break; + kfree(pdent); return err; } diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 62cb3db44e6e..a4aaeea63893 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -57,10 +57,6 @@ /** * switch_gc_head - switch the garbage collection journal head. * @c: UBIFS file-system description object - * @buf: buffer to write - * @len: length of the buffer to write - * @lnum: LEB number written is returned here - * @offs: offset written is returned here * * This function switch the GC head to the next LEB which is reserved in * @c->gc_lnum. 
Returns %0 in case of success, %-EAGAIN if commit is required, diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 3df9be2c684c..4363d85a3fd4 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -134,7 +134,6 @@ static int setflags(struct inode *inode, int flags) return err; out_unlock: - ubifs_err(c, "can't modify inode %lu attributes", inode->i_ino); mutex_unlock(&ui->ui_mutex); ubifs_release_budget(c, &req); return err; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 4a5b06f8d812..091c2ad8f211 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -894,6 +894,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) if (err == -ENOENT) break; + kfree(pxent); goto out_release; } @@ -906,6 +907,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) ubifs_err(c, "dead directory entry '%s', error %d", xent->name, err); ubifs_ro_mode(c, err); + kfree(pxent); kfree(xent); goto out_release; } @@ -936,8 +938,6 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) inode->i_ino); release_head(c, BASEHD); - ubifs_add_auth_dirt(c, lnum); - if (last_reference) { err = ubifs_tnc_remove_ino(c, inode->i_ino); if (err) @@ -947,6 +947,8 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode) } else { union ubifs_key key; + ubifs_add_auth_dirt(c, lnum); + ino_key_init(c, &key, inode->i_ino); err = ubifs_tnc_add(c, &key, lnum, offs, ilen, hash); } @@ -1798,7 +1800,6 @@ int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode, u8 hash[UBIFS_HASH_ARR_SZ]; dbg_jnl("ino %lu, ino %lu", host->i_ino, inode->i_ino); - ubifs_assert(c, host->i_nlink > 0); ubifs_assert(c, inode->i_nlink > 0); ubifs_assert(c, mutex_is_locked(&host_ui->ui_mutex)); diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 2c294085ffed..0fb61956146d 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -173,6 +173,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) err = 
PTR_ERR(xent); if (err == -ENOENT) break; + kfree(pxent); return err; } @@ -182,6 +183,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) xattr_orphan = orphan_add(c, xattr_inum, orphan); if (IS_ERR(xattr_orphan)) { + kfree(pxent); kfree(xent); return PTR_ERR(xattr_orphan); } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index b69ffac7e415..2f8d8f4f411a 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -931,8 +931,6 @@ out: * validate_ref - validate a reference node. * @c: UBIFS file-system description object * @ref: the reference node to validate - * @ref_lnum: LEB number of the reference node - * @ref_offs: reference node offset * * This function returns %1 if a bud reference already exists for the LEB. %0 is * returned if the reference node is new, otherwise %-EINVAL is returned if diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index fbddb2a1c03f..cb3acfb7dd1f 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1110,14 +1110,20 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, break; } case Opt_auth_key: - c->auth_key_name = kstrdup(args[0].from, GFP_KERNEL); - if (!c->auth_key_name) - return -ENOMEM; + if (!is_remount) { + c->auth_key_name = kstrdup(args[0].from, + GFP_KERNEL); + if (!c->auth_key_name) + return -ENOMEM; + } break; case Opt_auth_hash_name: - c->auth_hash_name = kstrdup(args[0].from, GFP_KERNEL); - if (!c->auth_hash_name) - return -ENOMEM; + if (!is_remount) { + c->auth_hash_name = kstrdup(args[0].from, + GFP_KERNEL); + if (!c->auth_hash_name) + return -ENOMEM; + } break; case Opt_ignore: break; @@ -1141,6 +1147,18 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, return 0; } +/* + * ubifs_release_options - release mount parameters which have been dumped. 
+ * @c: UBIFS file-system description object + */ +static void ubifs_release_options(struct ubifs_info *c) +{ + kfree(c->auth_key_name); + c->auth_key_name = NULL; + kfree(c->auth_hash_name); + c->auth_hash_name = NULL; +} + /** * destroy_journal - destroy journal data structures. * @c: UBIFS file-system description object @@ -1313,7 +1331,7 @@ static int mount_ubifs(struct ubifs_info *c) err = ubifs_read_superblock(c); if (err) - goto out_free; + goto out_auth; c->probing = 0; @@ -1325,18 +1343,18 @@ static int mount_ubifs(struct ubifs_info *c) ubifs_err(c, "'compressor \"%s\" is not compiled in", ubifs_compr_name(c, c->default_compr)); err = -ENOTSUPP; - goto out_free; + goto out_auth; } err = init_constants_sb(c); if (err) - goto out_free; + goto out_auth; sz = ALIGN(c->max_idx_node_sz, c->min_io_size) * 2; c->cbuf = kmalloc(sz, GFP_NOFS); if (!c->cbuf) { err = -ENOMEM; - goto out_free; + goto out_auth; } err = alloc_wbufs(c); @@ -1611,6 +1629,8 @@ out_wbufs: free_wbufs(c); out_cbuf: kfree(c->cbuf); +out_auth: + ubifs_exit_authentication(c); out_free: kfree(c->write_reserve_buf); kfree(c->bu.buf); @@ -1650,8 +1670,7 @@ static void ubifs_umount(struct ubifs_info *c) ubifs_lpt_free(c, 0); ubifs_exit_authentication(c); - kfree(c->auth_key_name); - kfree(c->auth_hash_name); + ubifs_release_options(c); kfree(c->cbuf); kfree(c->rcvrd_mst_node); kfree(c->mst_node); @@ -2221,6 +2240,7 @@ out_umount: out_unlock: mutex_unlock(&c->umount_mutex); out_close: + ubifs_release_options(c); ubi_close_volume(c->ubi); out: return err; diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index f609f6cdde70..894f1ab14616 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -360,7 +360,6 @@ static int lnc_add_directly(struct ubifs_info *c, struct ubifs_zbranch *zbr, /** * lnc_free - remove a leaf node from the leaf node cache. 
* @zbr: zbranch of leaf node - * @node: leaf node */ static void lnc_free(struct ubifs_zbranch *zbr) { @@ -2885,6 +2884,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) err = PTR_ERR(xent); if (err == -ENOENT) break; + kfree(pxent); return err; } @@ -2898,6 +2898,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) fname_len(&nm) = le16_to_cpu(xent->nlen); err = ubifs_tnc_remove_nm(c, &key1, &nm); if (err) { + kfree(pxent); kfree(xent); return err; } @@ -2906,6 +2907,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) highest_ino_key(c, &key2, xattr_inum); err = ubifs_tnc_remove_range(c, &key1, &key2); if (err) { + kfree(pxent); kfree(xent); return err; } @@ -3466,7 +3468,7 @@ out_unlock: /** * dbg_check_inode_size - check if inode size is correct. * @c: UBIFS file-system description object - * @inum: inode number + * @inode: inode to check * @size: inode size * * This function makes sure that the inode size (@size) is correct and it does diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 9aefbb60074f..a0b9b349efe6 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -522,6 +522,7 @@ int ubifs_purge_xattrs(struct inode *host) xent->name, err); ubifs_ro_mode(c, err); kfree(pxent); + kfree(xent); return err; } @@ -531,6 +532,7 @@ int ubifs_purge_xattrs(struct inode *host) err = remove_xattr(c, host, xino, &nm); if (err) { kfree(pxent); + kfree(xent); iput(xino); ubifs_err(c, "cannot remove xattr, error %d", err); return err; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index e685299eb3d2..9fac5ea8d0e4 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -22,6 +22,31 @@ config XFS_FS system of your root partition is compiled as a module, you'll need to use an initial ramdisk (initrd) to boot. 
+config XFS_SUPPORT_V4 + bool "Support deprecated V4 (crc=0) format" + depends on XFS_FS + default y + help + The V4 filesystem format lacks certain features that are supported + by the V5 format, such as metadata checksumming, strengthened + metadata verification, and the ability to store timestamps past the + year 2038. Because of this, the V4 format is deprecated. All users + should upgrade by backing up their files, reformatting, and restoring + from the backup. + + Administrators and users can detect a V4 filesystem by running + xfs_info against a filesystem mountpoint and checking for a string + beginning with "crc=". If the string "crc=0" is found, the + filesystem is a V4 filesystem. If no such string is found, please + upgrade xfsprogs to the latest version and try again. + + This option will become default N in September 2025. Support for the + V4 format will be removed entirely in September 2030. Distributors + can say N here to withdraw support earlier. + + To continue supporting the old V4 format (crc=0), say Y. + To close off an attack surface, say N. 
+ config XFS_QUOTA bool "XFS Quota support" depends on XFS_FS diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 3f80cede7406..48d8e9caf86f 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -96,8 +96,6 @@ xfs_attr3_rmt_verify( { struct xfs_attr3_rmt_hdr *rmt = ptr; - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return __this_address; if (!xfs_verify_magic(bp, rmt->rm_magic)) return __this_address; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_meta_uuid)) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 1b0a01b06a05..d9a692484eae 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5046,20 +5046,25 @@ xfs_bmap_del_extent_real( flags = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_fsblock_t bno; xfs_filblks_t len; xfs_extlen_t mod; - bno = div_u64_rem(del->br_startblock, mp->m_sb.sb_rextsize, - &mod); - ASSERT(mod == 0); len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize, &mod); ASSERT(mod == 0); - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); - if (error) - goto done; + if (!(bflags & XFS_BMAPI_REMAP)) { + xfs_fsblock_t bno; + + bno = div_u64_rem(del->br_startblock, + mp->m_sb.sb_rextsize, &mod); + ASSERT(mod == 0); + + error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + if (error) + goto done; + } + do_fx = 0; nblks = len * mp->m_sb.sb_rextsize; qfield = XFS_TRANS_DQ_RTBCOUNT; diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index b40a4e80f5ee..b876b44c0204 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -15,8 +15,8 @@ */ #define XFS_DA_NODE_MAGIC 0xfebe /* magic number: non-leaf blocks */ #define XFS_ATTR_LEAF_MAGIC 0xfbee /* magic number: attribute leaf blks */ -#define XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ +#define 
XFS_DIR2_LEAF1_MAGIC 0xd2f1 /* magic number: v2 dirlf single blks */ +#define XFS_DIR2_LEAFN_MAGIC 0xd2ff /* magic number: v2 dirlf multi blks */ typedef struct xfs_da_blkinfo { __be32 forw; /* previous block in list */ @@ -35,8 +35,8 @@ typedef struct xfs_da_blkinfo { */ #define XFS_DA3_NODE_MAGIC 0x3ebe /* magic number: non-leaf blocks */ #define XFS_ATTR3_LEAF_MAGIC 0x3bee /* magic number: attribute leaf blks */ -#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v2 dirlf single blks */ -#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v2 dirlf multi blks */ +#define XFS_DIR3_LEAF1_MAGIC 0x3df1 /* magic number: v3 dirlf single blks */ +#define XFS_DIR3_LEAFN_MAGIC 0x3dff /* magic number: v3 dirlf multi blks */ struct xfs_da3_blkinfo { /* @@ -61,7 +61,7 @@ struct xfs_da3_blkinfo { * Since we have duplicate keys, use a binary search but always follow * all match in the block, not just the first match found. */ -#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ +#define XFS_DA_NODE_MAXDEPTH 5 /* max depth of Btree */ typedef struct xfs_da_node_hdr { struct xfs_da_blkinfo info; /* block type, links, etc. 
*/ @@ -746,14 +746,14 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) */ static inline int xfs_attr_leaf_entsize_remote(int nlen) { - return ((uint)sizeof(xfs_attr_leaf_name_remote_t) - 1 + (nlen) + \ - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); + return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 + + nlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) { - return ((uint)sizeof(xfs_attr_leaf_name_local_t) - 1 + (nlen) + (vlen) + - XFS_ATTR_LEAF_NAME_ALIGN - 1) & ~(XFS_ATTR_LEAF_NAME_ALIGN - 1); + return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 + + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local_max(int bsize) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index d8f586256add..eff4a127188e 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -16,6 +16,8 @@ #include "xfs_inode.h" #include "xfs_inode_item.h" #include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_log.h" /* * Deferred Operations in XFS @@ -186,8 +188,9 @@ xfs_defer_create_intent( { const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type]; - dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, - dfp->dfp_count, sort); + if (!dfp->dfp_intent) + dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work, + dfp->dfp_count, sort); } /* @@ -312,22 +315,6 @@ xfs_defer_trans_roll( } /* - * Reset an already used dfops after finish. - */ -static void -xfs_defer_reset( - struct xfs_trans *tp) -{ - ASSERT(list_empty(&tp->t_dfops)); - - /* - * Low mode state transfers across transaction rolls to mirror dfops - * lifetime. Clear it now that dfops is reset. - */ - tp->t_flags &= ~XFS_TRANS_LOWMODE; -} - -/* * Free up any items left in the list. 
*/ static void @@ -360,6 +347,58 @@ xfs_defer_cancel_list( } /* + * Prevent a log intent item from pinning the tail of the log by logging a + * done item to release the intent item; and then log a new intent item. + * The caller should provide a fresh transaction and roll it after we're done. + */ +static int +xfs_defer_relog( + struct xfs_trans **tpp, + struct list_head *dfops) +{ + struct xlog *log = (*tpp)->t_mountp->m_log; + struct xfs_defer_pending *dfp; + xfs_lsn_t threshold_lsn = NULLCOMMITLSN; + + + ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES); + + list_for_each_entry(dfp, dfops, dfp_list) { + /* + * If the log intent item for this deferred op is not a part of + * the current log checkpoint, relog the intent item to keep + * the log tail moving forward. We're ok with this being racy + * because an incorrect decision means we'll be a little slower + * at pushing the tail. + */ + if (dfp->dfp_intent == NULL || + xfs_log_item_in_current_chkpt(dfp->dfp_intent)) + continue; + + /* + * Figure out where we need the tail to be in order to maintain + * the minimum required free space in the log. Only sample + * the log threshold once per call. + */ + if (threshold_lsn == NULLCOMMITLSN) { + threshold_lsn = xlog_grant_push_threshold(log, 0); + if (threshold_lsn == NULLCOMMITLSN) + break; + } + if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0) + continue; + + trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp); + XFS_STATS_INC((*tpp)->t_mountp, defer_relog); + dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp); + } + + if ((*tpp)->t_flags & XFS_TRANS_DIRTY) + return xfs_defer_trans_roll(tpp); + return 0; +} + +/* * Log an intent-done item for the first pending intent, and finish the work * items. 
*/ @@ -390,6 +429,7 @@ xfs_defer_finish_one( list_add(li, &dfp->dfp_work); dfp->dfp_count++; dfp->dfp_done = NULL; + dfp->dfp_intent = NULL; xfs_defer_create_intent(tp, dfp, false); } @@ -428,13 +468,27 @@ xfs_defer_finish_noroll( /* Until we run out of pending work to finish... */ while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) { + /* + * Deferred items that are created in the process of finishing + * other deferred work items should be queued at the head of + * the pending list, which puts them ahead of the deferred work + * that was created by the caller. This keeps the number of + * pending work items to a minimum, which decreases the amount + * of time that any one intent item can stick around in memory, + * pinning the log tail. + */ xfs_defer_create_intents(*tp); - list_splice_tail_init(&(*tp)->t_dfops, &dop_pending); + list_splice_init(&(*tp)->t_dfops, &dop_pending); error = xfs_defer_trans_roll(tp); if (error) goto out_shutdown; + /* Possibly relog intent items to keep the log moving. */ + error = xfs_defer_relog(tp, &dop_pending); + if (error) + goto out_shutdown; + dfp = list_first_entry(&dop_pending, struct xfs_defer_pending, dfp_list); error = xfs_defer_finish_one(*tp, dfp); @@ -475,7 +529,10 @@ xfs_defer_finish( return error; } } - xfs_defer_reset(*tp); + + /* Reset LOWMODE now that we've finished all the dfops. */ + ASSERT(list_empty(&(*tp)->t_dfops)); + (*tp)->t_flags &= ~XFS_TRANS_LOWMODE; return 0; } @@ -549,6 +606,139 @@ xfs_defer_move( * that behavior. */ dtp->t_flags |= (stp->t_flags & XFS_TRANS_LOWMODE); + stp->t_flags &= ~XFS_TRANS_LOWMODE; +} + +/* + * Prepare a chain of fresh deferred ops work items to be completed later. Log + * recovery requires the ability to put off until later the actual finishing + * work so that it can process unfinished items recovered from the log in + * correct order. 
+ * + * Create and log intent items for all the work that we're capturing so that we + * can be assured that the items will get replayed if the system goes down + * before log recovery gets a chance to finish the work it put off. The entire + * deferred ops state is transferred to the capture structure and the + * transaction is then ready for the caller to commit it. If there are no + * intent items to capture, this function returns NULL. + * + * If capture_ip is not NULL, the capture structure will obtain an extra + * reference to the inode. + */ +static struct xfs_defer_capture * +xfs_defer_ops_capture( + struct xfs_trans *tp, + struct xfs_inode *capture_ip) +{ + struct xfs_defer_capture *dfc; + + if (list_empty(&tp->t_dfops)) + return NULL; + + /* Create an object to capture the defer ops. */ + dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS); + INIT_LIST_HEAD(&dfc->dfc_list); + INIT_LIST_HEAD(&dfc->dfc_dfops); + + xfs_defer_create_intents(tp); + + /* Move the dfops chain and transaction state to the capture struct. */ + list_splice_init(&tp->t_dfops, &dfc->dfc_dfops); + dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE; + tp->t_flags &= ~XFS_TRANS_LOWMODE; + + /* Capture the remaining block reservations along with the dfops. */ + dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used; + dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used; + + /* Preserve the log reservation size. */ + dfc->dfc_logres = tp->t_log_res; + + /* + * Grab an extra reference to this inode and attach it to the capture + * structure. + */ + if (capture_ip) { + ihold(VFS_I(capture_ip)); + dfc->dfc_capture_ip = capture_ip; + } + + return dfc; +} + +/* Release all resources that we used to capture deferred ops. */ +void +xfs_defer_ops_release( + struct xfs_mount *mp, + struct xfs_defer_capture *dfc) +{ + xfs_defer_cancel_list(mp, &dfc->dfc_dfops); + if (dfc->dfc_capture_ip) + xfs_irele(dfc->dfc_capture_ip); + kmem_free(dfc); +} + +/* + * Capture any deferred ops and commit the transaction. 
This is the last step + * needed to finish a log intent item that we recovered from the log. If any + * of the deferred ops operate on an inode, the caller must pass in that inode + * so that the reference can be transferred to the capture structure. The + * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling + * xfs_defer_ops_continue. + */ +int +xfs_defer_ops_capture_and_commit( + struct xfs_trans *tp, + struct xfs_inode *capture_ip, + struct list_head *capture_list) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_defer_capture *dfc; + int error; + + ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL)); + + /* If we don't capture anything, commit transaction and exit. */ + dfc = xfs_defer_ops_capture(tp, capture_ip); + if (!dfc) + return xfs_trans_commit(tp); + + /* Commit the transaction and add the capture structure to the list. */ + error = xfs_trans_commit(tp); + if (error) { + xfs_defer_ops_release(mp, dfc); + return error; + } + + list_add_tail(&dfc->dfc_list, capture_list); + return 0; +} + +/* + * Attach a chain of captured deferred ops to a new transaction and free the + * capture structure. If an inode was captured, it will be passed back to the + * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0. + * The caller now owns the inode reference. + */ +void +xfs_defer_ops_continue( + struct xfs_defer_capture *dfc, + struct xfs_trans *tp, + struct xfs_inode **captured_ipp) +{ + ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); + ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); + + /* Lock and join the captured inode to the new transaction. */ + if (dfc->dfc_capture_ip) { + xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0); + } + *captured_ipp = dfc->dfc_capture_ip; + + /* Move captured dfops chain and state to the transaction. 
*/ + list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); + tp->t_flags |= dfc->dfc_tpflags; - xfs_defer_reset(stp); + kmem_free(dfc); } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 6b2ca580f2b0..05472f71fffe 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -8,6 +8,7 @@ struct xfs_btree_cur; struct xfs_defer_op_type; +struct xfs_defer_capture; /* * Header for deferred operation list. @@ -63,4 +64,40 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type; extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; +/* + * This structure enables a dfops user to detach the chain of deferred + * operations from a transaction so that they can be continued later. + */ +struct xfs_defer_capture { + /* List of other capture structures. */ + struct list_head dfc_list; + + /* Deferred ops state saved from the transaction. */ + struct list_head dfc_dfops; + unsigned int dfc_tpflags; + + /* Block reservations for the data and rt devices. */ + unsigned int dfc_blkres; + unsigned int dfc_rtxres; + + /* Log reservation saved from the transaction. */ + unsigned int dfc_logres; + + /* + * An inode reference that must be maintained to complete the deferred + * work. + */ + struct xfs_inode *dfc_capture_ip; +}; + +/* + * Functions to capture a chain of deferred operations and continue them later. + * This doesn't normally happen except log recovery. 
+ */ +int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, + struct xfs_inode *capture_ip, struct list_head *capture_list); +void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, + struct xfs_inode **captured_ipp); +void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d); + #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 536666143fe7..ef5eaf33d146 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -17,7 +17,7 @@ struct xfs_dinode; */ struct xfs_icdinode { uint16_t di_flushiter; /* incremented on flush */ - uint32_t di_projid; /* owner's project id */ + prid_t di_projid; /* owner's project id */ xfs_fsize_t di_size; /* number of bytes in file */ xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */ xfs_extlen_t di_extsize; /* basic/minimum extent size for file */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 27c39268c31f..340c83f76c80 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2505,12 +2505,15 @@ xfs_rmap_map_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_MAP; + if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_MAP_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* Unmap an extent out of a file. */ @@ -2521,12 +2524,15 @@ xfs_rmap_unmap_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_UNMAP; + if (!xfs_rmap_update_is_needed(tp->t_mountp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? 
- XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_UNMAP_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* @@ -2543,12 +2549,15 @@ xfs_rmap_convert_extent( int whichfork, struct xfs_bmbt_irec *PREV) { + enum xfs_rmap_intent_type type = XFS_RMAP_CONVERT; + if (!xfs_rmap_update_is_needed(mp, whichfork)) return; - __xfs_rmap_add(tp, xfs_is_reflink_inode(ip) ? - XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino, - whichfork, PREV); + if (whichfork != XFS_ATTR_FORK && xfs_is_reflink_inode(ip)) + type = XFS_RMAP_CONVERT_SHARED; + + __xfs_rmap_add(tp, type, ip->i_ino, whichfork, PREV); } /* Schedule the creation of an rmap for non-file data. */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 1d9fa8a300f1..6c1aba16113c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1018,7 +1018,6 @@ xfs_rtalloc_query_range( struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; - xfs_rtblock_t rem; int is_free; int error = 0; @@ -1027,13 +1026,12 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - if (high_rec->ar_startext > mp->m_sb.sb_rextents) - high_rec->ar_startext = mp->m_sb.sb_rextents; + high_rec->ar_startext = min(high_rec->ar_startext, + mp->m_sb.sb_rextents - 1); /* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; - rem = high_rec->ar_startext - rtstart; - while (rem) { + while (rtstart <= high_rec->ar_startext) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, &is_free); @@ -1042,7 +1040,7 @@ xfs_rtalloc_query_range( /* How long does the extent go for? 
*/ error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startext - 1, &rtend); + high_rec->ar_startext, &rtend); if (error) break; @@ -1055,7 +1053,6 @@ xfs_rtalloc_query_range( break; } - rem -= rtend - rtstart + 1; rtstart = rtend + 1; } diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index e56786f0a13c..653f3280e1c1 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -441,6 +441,20 @@ xchk_da_btree_block( goto out_freebp; } + /* + * If we've been handed a block that is below the dabtree root, does + * its hashval match what the parent block expected to see? + */ + if (level > 0) { + struct xfs_da_node_entry *key; + + key = xchk_da_btree_node_entry(ds, level - 1); + if (be32_to_cpu(key->hashval) != blk->hashval) { + xchk_da_set_corrupt(ds, level); + goto out_freebp; + } + } + out: return error; out_freebp: diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index ec3691372e7c..9e16a4d0f97c 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -24,6 +24,7 @@ #include "xfs_error.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_quota.h" kmem_zone_t *xfs_bui_zone; kmem_zone_t *xfs_bud_zone; @@ -423,30 +424,26 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = { STATIC int xfs_bui_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_bmbt_irec irec; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; struct xfs_map_extent *bmap; struct xfs_bud_log_item *budp; xfs_fsblock_t startblock_fsb; xfs_fsblock_t inode_fsb; xfs_filblks_t count; xfs_exntst_t state; - enum xfs_bmap_intent_type type; - bool op_ok; unsigned int bui_type; int whichfork; int error = 0; /* Only one mapping operation per BUI... 
*/ - if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) { - xfs_bui_release(buip); + if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) return -EFSCORRUPTED; - } /* * First check the validity of the extent described by the @@ -457,76 +454,58 @@ xfs_bui_item_recover( XFS_FSB_TO_DADDR(mp, bmap->me_startblock)); inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp, XFS_INO_TO_FSB(mp, bmap->me_owner))); - switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) { + state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? + XFS_EXT_UNWRITTEN : XFS_EXT_NORM; + whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? + XFS_ATTR_FORK : XFS_DATA_FORK; + bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; + switch (bui_type) { case XFS_BMAP_MAP: case XFS_BMAP_UNMAP: - op_ok = true; break; default: - op_ok = false; - break; + return -EFSCORRUPTED; } - if (!op_ok || startblock_fsb == 0 || + if (startblock_fsb == 0 || bmap->me_len == 0 || inode_fsb == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || bmap->me_len >= mp->m_sb.sb_agblocks || inode_fsb >= mp->m_sb.sb_dblocks || - (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) { - /* - * This will pull the BUI from the AIL and - * free the memory associated with it. - */ - xfs_bui_release(buip); + (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) return -EFSCORRUPTED; - } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + /* Grab the inode. */ + error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip); if (error) return error; - /* - * Recovery stashes all deferred ops during intent processing and - * finishes them on completion. Transfer current dfops state to this - * transaction and transfer the result back before we return. - */ - xfs_defer_move(tp, parent_tp); - budp = xfs_trans_get_bud(tp, buip); - /* Grab the inode. 
*/ - error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip); + error = xfs_qm_dqattach(ip); if (error) - goto err_inode; + goto err_rele; if (VFS_I(ip)->i_nlink == 0) xfs_iflags_set(ip, XFS_IRECOVERY); - /* Process deferred bmap item. */ - state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ? - XFS_EXT_UNWRITTEN : XFS_EXT_NORM; - whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ? - XFS_ATTR_FORK : XFS_DATA_FORK; - bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK; - switch (bui_type) { - case XFS_BMAP_MAP: - case XFS_BMAP_UNMAP: - type = bui_type; - break; - default: - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp); - error = -EFSCORRUPTED; - goto err_inode; - } + /* Allocate transaction and do the work. */ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); + if (error) + goto err_rele; + + budp = xfs_trans_get_bud(tp, buip); + xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); count = bmap->me_len; - error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork, - bmap->me_startoff, bmap->me_startblock, &count, state); + error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip, + whichfork, bmap->me_startoff, bmap->me_startblock, + &count, state); if (error) - goto err_inode; + goto err_cancel; if (count > 0) { - ASSERT(type == XFS_BMAP_UNMAP); + ASSERT(bui_type == XFS_BMAP_UNMAP); irec.br_startblock = bmap->me_startblock; irec.br_blockcount = count; irec.br_startoff = bmap->me_startoff; @@ -534,20 +513,24 @@ xfs_bui_item_recover( xfs_bmap_unmap_extent(tp, ip, &irec); } - xfs_defer_move(parent_tp, tp); - error = xfs_trans_commit(tp); + /* + * Commit transaction, which frees the transaction and saves the inode + * for later replay activities. 
+ */ + error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list); + if (error) + goto err_unlock; + xfs_iunlock(ip, XFS_ILOCK_EXCL); xfs_irele(ip); + return 0; - return error; - -err_inode: - xfs_defer_move(parent_tp, tp); +err_cancel: xfs_trans_cancel(tp); - if (ip) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - } +err_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +err_rele: + xfs_irele(ip); return error; } @@ -559,6 +542,32 @@ xfs_bui_item_match( return BUI_ITEM(lip)->bui_format.bui_id == intent_id; } +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_bui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_bud_log_item *budp; + struct xfs_bui_log_item *buip; + struct xfs_map_extent *extp; + unsigned int count; + + count = BUI_ITEM(intent)->bui_format.bui_nextents; + extp = BUI_ITEM(intent)->bui_format.bui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + budp = xfs_trans_get_bud(tp, BUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); + + buip = xfs_bui_init(tp->t_mountp); + memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp)); + atomic_set(&buip->bui_next_extent, count); + xfs_trans_add_item(tp, &buip->bui_item); + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); + return &buip->bui_item; +} + static const struct xfs_item_ops xfs_bui_item_ops = { .iop_size = xfs_bui_item_size, .iop_format = xfs_bui_item_format, @@ -566,6 +575,7 @@ static const struct xfs_item_ops xfs_bui_item_ops = { .iop_release = xfs_bui_item_release, .iop_recover = xfs_bui_item_recover, .iop_match = xfs_bui_item_match, + .iop_relog = xfs_bui_item_relog, }; /* diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 24c7a8d11e1a..d44e8b4a3391 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -719,6 +719,8 @@ xlog_recover_get_buf_lsn( case XFS_ABTC_MAGIC: case XFS_RMAP_CRC_MAGIC: case XFS_REFC_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: 
+ case XFS_FIBT_MAGIC: case XFS_IBT_CRC_MAGIC: case XFS_IBT_MAGIC: { struct xfs_btree_block *btb = blk; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 3072814e407d..1d95ed387d66 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -831,8 +831,8 @@ xfs_qm_dqget_checks( } /* - * Given the file system, id, and type (UDQUOT/GDQUOT), return a locked - * dquot, doing an allocation (if requested) as needed. + * Given the file system, id, and type (UDQUOT/GDQUOT/PDQUOT), return a + * locked dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 6cb8cd11072a..6c11bfc3d452 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -585,10 +585,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = { STATIC int xfs_efi_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_efi_log_item *efip = EFI_ITEM(lip); - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; struct xfs_efd_log_item *efdp; struct xfs_trans *tp; struct xfs_extent *extp; @@ -608,14 +608,8 @@ xfs_efi_item_recover( if (startblock_fsb == 0 || extp->ext_len == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || - extp->ext_len >= mp->m_sb.sb_agblocks) { - /* - * This will pull the EFI from the AIL and - * free the memory associated with it. - */ - xfs_efi_release(efip); + extp->ext_len >= mp->m_sb.sb_agblocks) return -EFSCORRUPTED; - } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); @@ -633,8 +627,7 @@ xfs_efi_item_recover( } - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_trans_cancel(tp); @@ -649,6 +642,34 @@ xfs_efi_item_match( return EFI_ITEM(lip)->efi_format.efi_id == intent_id; } +/* Relog an intent item to push the log tail forward. 
*/ +static struct xfs_log_item * +xfs_efi_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_efd_log_item *efdp; + struct xfs_efi_log_item *efip; + struct xfs_extent *extp; + unsigned int count; + + count = EFI_ITEM(intent)->efi_format.efi_nextents; + extp = EFI_ITEM(intent)->efi_format.efi_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count); + efdp->efd_next_extent = count; + memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp)); + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + + efip = xfs_efi_init(tp->t_mountp, count); + memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp)); + atomic_set(&efip->efi_next_extent, count); + xfs_trans_add_item(tp, &efip->efi_item); + set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); + return &efip->efi_item; +} + static const struct xfs_item_ops xfs_efi_item_ops = { .iop_size = xfs_efi_item_size, .iop_format = xfs_efi_item_format, @@ -656,6 +677,7 @@ static const struct xfs_item_ops xfs_efi_item_ops = { .iop_release = xfs_efi_item_release, .iop_recover = xfs_efi_item_recover, .iop_match = xfs_efi_item_match, + .iop_relog = xfs_efi_item_relog, }; /* diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 1a88025e68a3..db23e455eb91 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -33,39 +33,7 @@ enum xfs_fstrm_alloc { /* * Allocation group filestream associations are tracked with per-ag atomic * counters. These counters allow xfs_filestream_pick_ag() to tell whether a - * particular AG already has active filestreams associated with it. The mount - * point's m_peraglock is used to protect these counters from per-ag array - * re-allocation during a growfs operation. When xfs_growfs_data_private() is - * about to reallocate the array, it calls xfs_filestream_flush() with the - * m_peraglock held in write mode. 
- * - * Since xfs_mru_cache_flush() guarantees that all the free functions for all - * the cache elements have finished executing before it returns, it's safe for - * the free functions to use the atomic counters without m_peraglock protection. - * This allows the implementation of xfs_fstrm_free_func() to be agnostic about - * whether it was called with the m_peraglock held in read mode, write mode or - * not held at all. The race condition this addresses is the following: - * - * - The work queue scheduler fires and pulls a filestream directory cache - * element off the LRU end of the cache for deletion, then gets pre-empted. - * - A growfs operation grabs the m_peraglock in write mode, flushes all the - * remaining items from the cache and reallocates the mount point's per-ag - * array, resetting all the counters to zero. - * - The work queue thread resumes and calls the free function for the element - * it started cleaning up earlier. In the process it decrements the - * filestreams counter for an AG that now has no references. - * - * With a shrinkfs feature, the above scenario could panic the system. - * - * All other uses of the following macros should be protected by either the - * m_peraglock held in read mode, or the cache's internal locking exposed by the - * interval between a call to xfs_mru_cache_lookup() and a call to - * xfs_mru_cache_done(). In addition, the m_peraglock must be held in read mode - * when new elements are added to the cache. - * - * Combined, these locking rules ensure that no associations will ever exist in - * the cache that reference per-ag array elements that have since been - * reallocated. + * particular AG already has active filestreams associated with it. */ int xfs_filestream_peek_ag( diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 4eebcec4aae6..9ce5e7d5bf8f 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -26,7 +26,7 @@ #include "xfs_rtalloc.h" /* Convert an xfs_fsmap to an fsmap. 
*/ -void +static void xfs_fsmap_from_internal( struct fsmap *dest, struct xfs_fsmap *src) @@ -155,8 +155,7 @@ xfs_fsmap_owner_from_rmap( /* getfsmap query state */ struct xfs_getfsmap_info { struct xfs_fsmap_head *head; - xfs_fsmap_format_t formatter; /* formatting fn */ - void *format_arg; /* format buffer */ + struct fsmap *fsmap_recs; /* mapping records */ struct xfs_buf *agf_bp; /* AGF, for refcount queries */ xfs_daddr_t next_daddr; /* next daddr we expect */ u64 missing_owner; /* owner of holes */ @@ -224,6 +223,20 @@ xfs_getfsmap_is_shared( return 0; } +static inline void +xfs_getfsmap_format( + struct xfs_mount *mp, + struct xfs_fsmap *xfm, + struct xfs_getfsmap_info *info) +{ + struct fsmap *rec; + + trace_xfs_getfsmap_mapping(mp, xfm); + + rec = &info->fsmap_recs[info->head->fmh_entries++]; + xfs_fsmap_from_internal(rec, xfm); +} + /* * Format a reverse mapping for getfsmap, having translated rm_startblock * into the appropriate daddr units. @@ -256,6 +269,9 @@ xfs_getfsmap_helper( /* Are we just counting mappings? */ if (info->head->fmh_count == 0) { + if (info->head->fmh_entries == UINT_MAX) + return -ECANCELED; + if (rec_daddr > info->next_daddr) info->head->fmh_entries++; @@ -285,10 +301,7 @@ xfs_getfsmap_helper( fmr.fmr_offset = 0; fmr.fmr_length = rec_daddr - info->next_daddr; fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; - error = info->formatter(&fmr, info->format_arg); - if (error) - return error; - info->head->fmh_entries++; + xfs_getfsmap_format(mp, &fmr, info); } if (info->last) @@ -320,11 +333,8 @@ xfs_getfsmap_helper( if (shared) fmr.fmr_flags |= FMR_OF_SHARED; } - error = info->formatter(&fmr, info->format_arg); - if (error) - return error; - info->head->fmh_entries++; + xfs_getfsmap_format(mp, &fmr, info); out: rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); if (info->next_daddr < rec_daddr) @@ -792,11 +802,11 @@ xfs_getfsmap_check_keys( #endif /* CONFIG_XFS_RT */ /* - * Get filesystem's extents as described in head, and format for - * output. 
Calls formatter to fill the user's buffer until all - * extents are mapped, until the passed-in head->fmh_count slots have - * been filled, or until the formatter short-circuits the loop, if it - * is tracking filled-in extents on its own. + * Get filesystem's extents as described in head, and format for output. Fills + * in the supplied records array until there are no more reverse mappings to + * return or head.fmh_entries == head.fmh_count. In the second case, this + * function returns -ECANCELED to indicate that more records would have been + * returned. * * Key to Confusion * ---------------- @@ -816,8 +826,7 @@ int xfs_getfsmap( struct xfs_mount *mp, struct xfs_fsmap_head *head, - xfs_fsmap_format_t formatter, - void *arg) + struct fsmap *fsmap_recs) { struct xfs_trans *tp = NULL; struct xfs_fsmap dkeys[2]; /* per-dev keys */ @@ -892,8 +901,7 @@ xfs_getfsmap( info.next_daddr = head->fmh_keys[0].fmr_physical + head->fmh_keys[0].fmr_length; - info.formatter = formatter; - info.format_arg = arg; + info.fsmap_recs = fsmap_recs; info.head = head; /* diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h index c6c57739b862..a0775788e7b1 100644 --- a/fs/xfs/xfs_fsmap.h +++ b/fs/xfs/xfs_fsmap.h @@ -27,13 +27,9 @@ struct xfs_fsmap_head { struct xfs_fsmap fmh_keys[2]; /* low and high keys */ }; -void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src); void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src); -/* fsmap to userspace formatter - copy to user & advance pointer */ -typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *); - int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head, - xfs_fsmap_format_t formatter, void *arg); + struct fsmap *out_recs); #endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 49624973eecc..2bfbcf28b1bd 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -698,6 +698,68 @@ out_unlock: return error; } +/* Propagate di_flags from a parent inode to a 
child inode. */ +static void +xfs_inode_inherit_flags( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + unsigned int di_flags = 0; + umode_t mode = VFS_I(ip)->i_mode; + + if (S_ISDIR(mode)) { + if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) + di_flags |= XFS_DIFLAG_RTINHERIT; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSZINHERIT; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; + } else if (S_ISREG(mode)) { + if ((pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) && + xfs_sb_version_hasrealtime(&ip->i_mount->m_sb)) + di_flags |= XFS_DIFLAG_REALTIME; + if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { + di_flags |= XFS_DIFLAG_EXTSIZE; + ip->i_d.di_extsize = pip->i_d.di_extsize; + } + } + if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && + xfs_inherit_noatime) + di_flags |= XFS_DIFLAG_NOATIME; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && + xfs_inherit_nodump) + di_flags |= XFS_DIFLAG_NODUMP; + if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && + xfs_inherit_sync) + di_flags |= XFS_DIFLAG_SYNC; + if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && + xfs_inherit_nosymlinks) + di_flags |= XFS_DIFLAG_NOSYMLINKS; + if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && + xfs_inherit_nodefrag) + di_flags |= XFS_DIFLAG_NODEFRAG; + if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) + di_flags |= XFS_DIFLAG_FILESTREAM; + + ip->i_d.di_flags |= di_flags; +} + +/* Propagate di_flags2 from a parent inode to a child inode. */ +static void +xfs_inode_inherit_flags2( + struct xfs_inode *ip, + const struct xfs_inode *pip) +{ + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { + ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; + } + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; +} + /* * Allocate an inode on disk and return a copy of its in-core version. 
* The in-core inode is locked exclusively. Set mode, nlink, and rdev @@ -841,54 +903,10 @@ xfs_ialloc( break; case S_IFREG: case S_IFDIR: - if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; - - if (S_ISDIR(mode)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_RTINHERIT; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSZINHERIT; - ip->i_d.di_extsize = pip->i_d.di_extsize; - } - if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; - } else if (S_ISREG(mode)) { - if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) - di_flags |= XFS_DIFLAG_REALTIME; - if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) { - di_flags |= XFS_DIFLAG_EXTSIZE; - ip->i_d.di_extsize = pip->i_d.di_extsize; - } - } - if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) && - xfs_inherit_noatime) - di_flags |= XFS_DIFLAG_NOATIME; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) && - xfs_inherit_nodump) - di_flags |= XFS_DIFLAG_NODUMP; - if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) && - xfs_inherit_sync) - di_flags |= XFS_DIFLAG_SYNC; - if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && - xfs_inherit_nosymlinks) - di_flags |= XFS_DIFLAG_NOSYMLINKS; - if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && - xfs_inherit_nodefrag) - di_flags |= XFS_DIFLAG_NODEFRAG; - if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) - di_flags |= XFS_DIFLAG_FILESTREAM; - - ip->i_d.di_flags |= di_flags; - } - if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) { - if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; - } - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX; - } + if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) + xfs_inode_inherit_flags(ip, pip); + if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) + xfs_inode_inherit_flags2(ip, pip); /* FALLTHROUGH */ case S_IFLNK: ip->i_df.if_format = 
XFS_DINODE_FMT_EXTENTS; @@ -1516,17 +1534,10 @@ xfs_itruncate_extents_flags( if (error) goto out; - /* - * Duplicate the transaction that has the permanent - * reservation and commit the old transaction. - */ + /* free the just unmapped extents */ error = xfs_defer_finish(&tp); if (error) goto out; - - error = xfs_trans_roll_inode(&tp, ip); - if (error) - goto out; } if (whichfork == XFS_DATA_FORK) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index bca7659fb5c6..3fbd98f61ea5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1716,39 +1716,17 @@ out_free_buf: return error; } -struct getfsmap_info { - struct xfs_mount *mp; - struct fsmap_head __user *data; - unsigned int idx; - __u32 last_flags; -}; - -STATIC int -xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv) -{ - struct getfsmap_info *info = priv; - struct fsmap fm; - - trace_xfs_getfsmap_mapping(info->mp, xfm); - - info->last_flags = xfm->fmr_flags; - xfs_fsmap_from_internal(&fm, xfm); - if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm, - sizeof(struct fsmap))) - return -EFAULT; - - return 0; -} - STATIC int xfs_ioc_getfsmap( struct xfs_inode *ip, struct fsmap_head __user *arg) { - struct getfsmap_info info = { NULL }; struct xfs_fsmap_head xhead = {0}; struct fsmap_head head; - bool aborted = false; + struct fsmap *recs; + unsigned int count; + __u32 last_flags = 0; + bool done = false; int error; if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) @@ -1760,38 +1738,112 @@ xfs_ioc_getfsmap( sizeof(head.fmh_keys[1].fmr_reserved))) return -EINVAL; + /* + * Use an internal memory buffer so that we don't have to copy fsmap + * data to userspace while holding locks. Start by trying to allocate + * up to 128k for the buffer, but fall back to a single page if needed. 
+ */ + count = min_t(unsigned int, head.fmh_count, + 131072 / sizeof(struct fsmap)); + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + if (!recs) { + count = min_t(unsigned int, head.fmh_count, + PAGE_SIZE / sizeof(struct fsmap)); + recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + if (!recs) + return -ENOMEM; + } + xhead.fmh_iflags = head.fmh_iflags; - xhead.fmh_count = head.fmh_count; xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); - info.mp = ip->i_mount; - info.data = arg; - error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); - if (error == -ECANCELED) { - error = 0; - aborted = true; - } else if (error) - return error; + head.fmh_entries = 0; + do { + struct fsmap __user *user_recs; + struct fsmap *last_rec; + + user_recs = &arg->fmh_recs[head.fmh_entries]; + xhead.fmh_entries = 0; + xhead.fmh_count = min_t(unsigned int, count, + head.fmh_count - head.fmh_entries); + + /* Run query, record how many entries we got. */ + error = xfs_getfsmap(ip->i_mount, &xhead, recs); + switch (error) { + case 0: + /* + * There are no more records in the result set. Copy + * whatever we got to userspace and break out. + */ + done = true; + break; + case -ECANCELED: + /* + * The internal memory buffer is full. Copy whatever + * records we got to userspace and go again if we have + * not yet filled the userspace buffer. 
+ */ + error = 0; + break; + default: + goto out_free; + } + head.fmh_entries += xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; - /* If we didn't abort, set the "last" flag in the last fmx */ - if (!aborted && info.idx) { - info.last_flags |= FMR_OF_LAST; - if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags, - &info.last_flags, sizeof(info.last_flags))) - return -EFAULT; + /* + * If the caller wanted a record count or there aren't any + * new records to return, we're done. + */ + if (head.fmh_count == 0 || xhead.fmh_entries == 0) + break; + + /* Copy all the records we got out to userspace. */ + if (copy_to_user(user_recs, recs, + xhead.fmh_entries * sizeof(struct fsmap))) { + error = -EFAULT; + goto out_free; + } + + /* Remember the last record flags we copied to userspace. */ + last_rec = &recs[xhead.fmh_entries - 1]; + last_flags = last_rec->fmr_flags; + + /* Set up the low key for the next iteration. */ + xfs_fsmap_to_internal(&xhead.fmh_keys[0], last_rec); + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); + } while (!done && head.fmh_entries < head.fmh_count); + + /* + * If there are no more records in the query result set and we're not + * in counting mode, mark the last record returned with the LAST flag. 
+ */ + if (done && head.fmh_count > 0 && head.fmh_entries > 0) { + struct fsmap __user *user_rec; + + last_flags |= FMR_OF_LAST; + user_rec = &arg->fmh_recs[head.fmh_entries - 1]; + + if (copy_to_user(&user_rec->fmr_flags, &last_flags, + sizeof(last_flags))) { + error = -EFAULT; + goto out_free; + } } /* copy back header */ - head.fmh_entries = xhead.fmh_entries; - head.fmh_oflags = xhead.fmh_oflags; - if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) - return -EFAULT; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) { + error = -EFAULT; + goto out_free; + } - return 0; +out_free: + kmem_free(recs); + return error; } STATIC int diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 80a13c8561d8..5e165456da68 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -237,7 +237,7 @@ xfs_vn_create( umode_t mode, bool flags) { - return xfs_vn_mknod(dir, dentry, mode, 0); + return xfs_generic_create(dir, dentry, mode, 0, false); } STATIC int @@ -246,7 +246,7 @@ xfs_vn_mkdir( struct dentry *dentry, umode_t mode) { - return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0); + return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false); } STATIC struct dentry * diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ab737fed7b12..ad1009778d33 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -123,7 +123,6 @@ typedef __u32 xfs_nlink_t; #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ -#define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) /* diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ad0c69ee8947..fa2d05e65ff1 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1475,14 +1475,14 @@ xlog_commit_record( } /* - * Push on the buffer cache code if we ever use more than 75% of the on-disk - * log space. This code pushes on the lsn which would supposedly free up - * the 25% which we want to leave free. 
We may need to adopt a policy which - * pushes on an lsn which is further along in the log once we reach the high - * water mark. In this manner, we would be creating a low water mark. + * Compute the LSN that we'd need to push the log tail towards in order to have + * (a) enough on-disk log space to log the number of bytes specified, (b) at + * least 25% of the log space free, and (c) at least 256 blocks free. If the + * log free space already meets all three thresholds, this function returns + * NULLCOMMITLSN. */ -STATIC void -xlog_grant_push_ail( +xfs_lsn_t +xlog_grant_push_threshold( struct xlog *log, int need_bytes) { @@ -1508,7 +1508,7 @@ xlog_grant_push_ail( free_threshold = max(free_threshold, (log->l_logBBsize >> 2)); free_threshold = max(free_threshold, 256); if (free_blocks >= free_threshold) - return; + return NULLCOMMITLSN; xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle, &threshold_block); @@ -1528,13 +1528,33 @@ xlog_grant_push_ail( if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0) threshold_lsn = last_sync_lsn; + return threshold_lsn; +} + +/* + * Push the tail of the log if we need to do so to maintain the free log space + * thresholds set out by xlog_grant_push_threshold. We may need to adopt a + * policy which pushes on an lsn which is further along in the log once we + * reach the high water mark. In this manner, we would be creating a low water + * mark. + */ +STATIC void +xlog_grant_push_ail( + struct xlog *log, + int need_bytes) +{ + xfs_lsn_t threshold_lsn; + + threshold_lsn = xlog_grant_push_threshold(log, need_bytes); + if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log)) + return; + /* * Get the transaction layer to kick the dirty buffers out to * disk asynchronously. No point in trying to do this if * the filesystem is shutting down. 
*/ - if (!XLOG_FORCED_SHUTDOWN(log)) - xfs_ail_push(log->l_ailp, threshold_lsn); + xfs_ail_push(log->l_ailp, threshold_lsn); } /* @@ -1604,9 +1624,7 @@ xlog_cksum( int i; int xheads; - xheads = size / XLOG_HEADER_CYCLE_SIZE; - if (size % XLOG_HEADER_CYCLE_SIZE) - xheads++; + xheads = DIV_ROUND_UP(size, XLOG_HEADER_CYCLE_SIZE); for (i = 1; i < xheads; i++) { crc = crc32c(crc, &xhdr[i].hic_xheader, diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 1412d6993f1e..58c3fcbec94a 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -141,4 +141,6 @@ void xfs_log_quiesce(struct xfs_mount *mp); bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t); bool xfs_log_in_recovery(struct xfs_mount *); +xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes); + #endif /* __XFS_LOG_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a17d788921d6..a8289adc1b29 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -371,6 +371,19 @@ out: return error; } +static inline int +xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh) +{ + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + int h_size = be32_to_cpu(rh->h_size); + + if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) && + h_size > XLOG_HEADER_CYCLE_SIZE) + return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE); + } + return 1; +} + /* * Potentially backup over partial log record write. * @@ -463,15 +476,7 @@ xlog_find_verify_log_record( * reset last_blk. Only when last_blk points in the middle of a log * record do we update last_blk. */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - uint h_size = be32_to_cpu(head->h_size); - - xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - xhdrs++; - } else { - xhdrs = 1; - } + xhdrs = xlog_logrec_hblks(log, head); if (*last_blk - i + extra_bblks != BTOBB(be32_to_cpu(head->h_len)) + xhdrs) @@ -1158,22 +1163,7 @@ xlog_check_unmount_rec( * below. 
We won't want to clear the unmount record if there is one, so * we pass the lsn of the unmount record rather than the block after it. */ - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - int h_size = be32_to_cpu(rhead->h_size); - int h_version = be32_to_cpu(rhead->h_version); - - if ((h_version & XLOG_VERSION_2) && - (h_size > XLOG_HEADER_CYCLE_SIZE)) { - hblks = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - hblks++; - } else { - hblks = 1; - } - } else { - hblks = 1; - } - + hblks = xlog_logrec_hblks(log, rhead); after_umount_blk = xlog_wrap_logbno(log, rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len))); @@ -2444,44 +2434,66 @@ xlog_recover_process_data( /* Take all the collected deferred ops and finish them in order. */ static int xlog_finish_defer_ops( - struct xfs_trans *parent_tp) + struct xfs_mount *mp, + struct list_head *capture_list) { - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_defer_capture *dfc, *next; struct xfs_trans *tp; - int64_t freeblks; - uint resblks; - int error; + struct xfs_inode *ip; + int error = 0; - /* - * We're finishing the defer_ops that accumulated as a result of - * recovering unfinished intent items during log recovery. We - * reserve an itruncate transaction because it is the largest - * permanent transaction type. Since we're the only user of the fs - * right now, take 93% (15/16) of the available free blocks. Use - * weird math to avoid a 64-bit division. 
- */ - freeblks = percpu_counter_sum(&mp->m_fdblocks); - if (freeblks <= 0) - return -ENOSPC; - resblks = min_t(int64_t, UINT_MAX, freeblks); - resblks = (resblks * 15) >> 4; - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks, - 0, XFS_TRANS_RESERVE, &tp); - if (error) - return error; - /* transfer all collected dfops to this transaction */ - xfs_defer_move(tp, parent_tp); + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + struct xfs_trans_res resv; + + /* + * Create a new transaction reservation from the captured + * information. Set logcount to 1 to force the new transaction + * to regrant every roll so that we can make forward progress + * in recovery no matter how full the log might be. + */ + resv.tr_logres = dfc->dfc_logres; + resv.tr_logcount = 1; + resv.tr_logflags = XFS_TRANS_PERM_LOG_RES; + + error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres, + dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp); + if (error) + return error; - return xfs_trans_commit(tp); + /* + * Transfer to this new transaction all the dfops we captured + * from recovering a single intent item. + */ + list_del_init(&dfc->dfc_list); + xfs_defer_ops_continue(dfc, tp, &ip); + + error = xfs_trans_commit(tp); + if (ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_irele(ip); + } + if (error) + return error; + } + + ASSERT(list_empty(capture_list)); + return 0; } -/* Is this log item a deferred action intent? */ -static inline bool xlog_item_is_intent(struct xfs_log_item *lip) +/* Release all the captured defer ops and capture structures in this list. 
*/ +static void +xlog_abort_defer_ops( + struct xfs_mount *mp, + struct list_head *capture_list) { - return lip->li_ops->iop_recover != NULL && - lip->li_ops->iop_match != NULL; -} + struct xfs_defer_capture *dfc; + struct xfs_defer_capture *next; + list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { + list_del_init(&dfc->dfc_list); + xfs_defer_ops_release(mp, dfc); + } +} /* * When this is called, all of the log intent items which did not have * corresponding log done items should be in the AIL. What we do now @@ -2502,35 +2514,23 @@ STATIC int xlog_recover_process_intents( struct xlog *log) { - struct xfs_trans *parent_tp; + LIST_HEAD(capture_list); struct xfs_ail_cursor cur; struct xfs_log_item *lip; struct xfs_ail *ailp; - int error; + int error = 0; #if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; #endif - /* - * The intent recovery handlers commit transactions to complete recovery - * for individual intents, but any new deferred operations that are - * queued during that process are held off until the very end. The - * purpose of this transaction is to serve as a container for deferred - * operations. Each intent recovery handler must transfer dfops here - * before its local transaction commits, and we'll finish the entire - * list below. - */ - error = xfs_trans_alloc_empty(log->l_mp, &parent_tp); - if (error) - return error; - ailp = log->l_ailp; spin_lock(&ailp->ail_lock); - lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); #if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); #endif - while (lip != NULL) { + for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); + lip != NULL; + lip = xfs_trans_ail_cursor_next(ailp, &cur)) { /* * We're done when we see something other than an intent. * There should be no intents left in the AIL now. 
@@ -2552,26 +2552,29 @@ xlog_recover_process_intents( /* * NOTE: If your intent processing routine can create more - * deferred ops, you /must/ attach them to the transaction in - * this routine or else those subsequent intents will get + * deferred ops, you /must/ attach them to the capture list in + * the recover routine or else those subsequent intents will be * replayed in the wrong order! */ - if (!test_and_set_bit(XFS_LI_RECOVERED, &lip->li_flags)) { - spin_unlock(&ailp->ail_lock); - error = lip->li_ops->iop_recover(lip, parent_tp); - spin_lock(&ailp->ail_lock); - } + spin_unlock(&ailp->ail_lock); + error = lip->li_ops->iop_recover(lip, &capture_list); + spin_lock(&ailp->ail_lock); if (error) - goto out; - lip = xfs_trans_ail_cursor_next(ailp, &cur); + break; } -out: + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); - if (!error) - error = xlog_finish_defer_ops(parent_tp); - xfs_trans_cancel(parent_tp); + if (error) + goto err; + + error = xlog_finish_defer_ops(log->l_mp, &capture_list); + if (error) + goto err; + return 0; +err: + xlog_abort_defer_ops(log->l_mp, &capture_list); return error; } @@ -2878,7 +2881,8 @@ STATIC int xlog_valid_rec_header( struct xlog *log, struct xlog_rec_header *rhead, - xfs_daddr_t blkno) + xfs_daddr_t blkno, + int bufsize) { int hlen; @@ -2894,10 +2898,14 @@ xlog_valid_rec_header( return -EFSCORRUPTED; } - /* LR body must have data or it wouldn't have been written */ + /* + * LR body must have data (or it wouldn't have been written) + * and h_len must not be greater than LR buffer size. 
+ */ hlen = be32_to_cpu(rhead->h_len); - if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > INT_MAX)) + if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize)) return -EFSCORRUPTED; + if (XFS_IS_CORRUPT(log->l_mp, blkno > log->l_logBBsize || blkno > INT_MAX)) return -EFSCORRUPTED; @@ -2958,9 +2966,6 @@ xlog_do_recovery_pass( goto bread_err1; rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, tail_blk); - if (error) - goto bread_err1; /* * xfsprogs has a bug where record length is based on lsunit but @@ -2975,30 +2980,22 @@ xlog_do_recovery_pass( */ h_size = be32_to_cpu(rhead->h_size); h_len = be32_to_cpu(rhead->h_len); - if (h_len > h_size) { - if (h_len <= log->l_mp->m_logbsize && - be32_to_cpu(rhead->h_num_logops) == 1) { - xfs_warn(log->l_mp, + if (h_len > h_size && h_len <= log->l_mp->m_logbsize && + rhead->h_num_logops == cpu_to_be32(1)) { + xfs_warn(log->l_mp, "invalid iclog size (%d bytes), using lsunit (%d bytes)", - h_size, log->l_mp->m_logbsize); - h_size = log->l_mp->m_logbsize; - } else { - XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, - log->l_mp); - error = -EFSCORRUPTED; - goto bread_err1; - } + h_size, log->l_mp->m_logbsize); + h_size = log->l_mp->m_logbsize; } - if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && - (h_size > XLOG_HEADER_CYCLE_SIZE)) { - hblks = h_size / XLOG_HEADER_CYCLE_SIZE; - if (h_size % XLOG_HEADER_CYCLE_SIZE) - hblks++; + error = xlog_valid_rec_header(log, rhead, tail_blk, h_size); + if (error) + goto bread_err1; + + hblks = xlog_logrec_hblks(log, rhead); + if (hblks != 1) { kmem_free(hbp); hbp = xlog_alloc_buffer(log, hblks); - } else { - hblks = 1; } } else { ASSERT(log->l_sectBBsize == 1); @@ -3070,7 +3067,7 @@ xlog_do_recovery_pass( } rhead = (xlog_rec_header_t *)offset; error = xlog_valid_rec_header(log, rhead, - split_hblks ? blk_no : 0); + split_hblks ? 
blk_no : 0, h_size); if (error) goto bread_err2; @@ -3151,7 +3148,7 @@ xlog_do_recovery_pass( goto bread_err2; rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, blk_no); + error = xlog_valid_rec_header(log, rhead, blk_no, h_size); if (error) goto bread_err2; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 3f82e0c92c2d..b2a9abee8b2b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -249,7 +249,6 @@ xfs_qm_unmount_quotas( STATIC int xfs_qm_dqattach_one( struct xfs_inode *ip, - xfs_dqid_t id, xfs_dqtype_t type, bool doalloc, struct xfs_dquot **IO_idqpp) @@ -330,23 +329,23 @@ xfs_qm_dqattach_locked( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { - error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)), - XFS_DQTYPE_USER, doalloc, &ip->i_udquot); + error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER, + doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); } if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { - error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)), - XFS_DQTYPE_GROUP, doalloc, &ip->i_gdquot); + error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_GROUP, + doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); } if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { - error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQTYPE_PROJ, + error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_PROJ, doalloc, &ip->i_pdquot); if (error) goto done; @@ -1663,6 +1662,7 @@ xfs_qm_vop_dqalloc( } if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) { + ASSERT(O_udqpp); if (!uid_eq(inode->i_uid, uid)) { /* * What we need is the dquot that has this uid, and @@ -1696,6 +1696,7 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { + ASSERT(O_gdqpp); if (!gid_eq(inode->i_gid, gid)) { xfs_iunlock(ip, lockflags); error = xfs_qm_dqget(mp, from_kgid(user_ns, gid), @@ -1713,9 +1714,10 @@ xfs_qm_vop_dqalloc( } } if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { + 
ASSERT(O_pdqpp); if (ip->i_d.di_projid != prid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, + error = xfs_qm_dqget(mp, prid, XFS_DQTYPE_PROJ, true, &pq); if (error) { ASSERT(error != -ENOENT); diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index ca93b6488377..7529eb63ce94 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -424,7 +424,7 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = { STATIC int xfs_cui_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_bmbt_irec irec; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); @@ -432,7 +432,7 @@ xfs_cui_item_recover( struct xfs_cud_log_item *cudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; xfs_fsblock_t startblock_fsb; xfs_fsblock_t new_fsb; xfs_extlen_t new_len; @@ -467,14 +467,8 @@ xfs_cui_item_recover( refc->pe_len == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || refc->pe_len >= mp->m_sb.sb_agblocks || - (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) { - /* - * This will pull the CUI from the AIL and - * free the memory associated with it. - */ - xfs_cui_release(cuip); + (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) return -EFSCORRUPTED; - } } /* @@ -493,12 +487,7 @@ xfs_cui_item_recover( mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; - /* - * Recovery stashes all deferred ops during intent processing and - * finishes them on completion. Transfer current dfops state to this - * transaction and transfer the result back before we return. 
- */ - xfs_defer_move(tp, parent_tp); + cudp = xfs_trans_get_cud(tp, cuip); for (i = 0; i < cuip->cui_format.cui_nextents; i++) { @@ -555,13 +544,10 @@ xfs_cui_item_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - xfs_defer_move(parent_tp, tp); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); - xfs_defer_move(parent_tp, tp); xfs_trans_cancel(tp); return error; } @@ -574,6 +560,32 @@ xfs_cui_item_match( return CUI_ITEM(lip)->cui_format.cui_id == intent_id; } +/* Relog an intent item to push the log tail forward. */ +static struct xfs_log_item * +xfs_cui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_cud_log_item *cudp; + struct xfs_cui_log_item *cuip; + struct xfs_phys_extent *extp; + unsigned int count; + + count = CUI_ITEM(intent)->cui_format.cui_nextents; + extp = CUI_ITEM(intent)->cui_format.cui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); + + cuip = xfs_cui_init(tp->t_mountp, count); + memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp)); + atomic_set(&cuip->cui_next_extent, count); + xfs_trans_add_item(tp, &cuip->cui_item); + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); + return &cuip->cui_item; +} + static const struct xfs_item_ops xfs_cui_item_ops = { .iop_size = xfs_cui_item_size, .iop_format = xfs_cui_item_format, @@ -581,6 +593,7 @@ static const struct xfs_item_ops xfs_cui_item_ops = { .iop_release = xfs_cui_item_release, .iop_recover = xfs_cui_item_recover, .iop_match = xfs_cui_item_match, + .iop_relog = xfs_cui_item_relog, }; /* diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index dc5b0753cd51..7adc996ca6e3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -467,14 +467,14 @@ const struct xfs_defer_op_type 
xfs_rmap_update_defer_type = { STATIC int xfs_rui_item_recover( struct xfs_log_item *lip, - struct xfs_trans *parent_tp) + struct list_head *capture_list) { struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_map_extent *rmap; struct xfs_rud_log_item *rudp; struct xfs_trans *tp; struct xfs_btree_cur *rcur = NULL; - struct xfs_mount *mp = parent_tp->t_mountp; + struct xfs_mount *mp = lip->li_mountp; xfs_fsblock_t startblock_fsb; enum xfs_rmap_intent_type type; xfs_exntst_t state; @@ -511,14 +511,8 @@ xfs_rui_item_recover( rmap->me_len == 0 || startblock_fsb >= mp->m_sb.sb_dblocks || rmap->me_len >= mp->m_sb.sb_agblocks || - (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) { - /* - * This will pull the RUI from the AIL and - * free the memory associated with it. - */ - xfs_rui_release(ruip); + (rmap->me_flags & ~XFS_RMAP_EXTENT_FLAGS)) return -EFSCORRUPTED; - } } error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, @@ -573,8 +567,7 @@ xfs_rui_item_recover( } xfs_rmap_finish_one_cleanup(tp, rcur, error); - error = xfs_trans_commit(tp); - return error; + return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); abort_error: xfs_rmap_finish_one_cleanup(tp, rcur, error); @@ -590,6 +583,32 @@ xfs_rui_item_match( return RUI_ITEM(lip)->rui_format.rui_id == intent_id; } +/* Relog an intent item to push the log tail forward. 
*/ +static struct xfs_log_item * +xfs_rui_item_relog( + struct xfs_log_item *intent, + struct xfs_trans *tp) +{ + struct xfs_rud_log_item *rudp; + struct xfs_rui_log_item *ruip; + struct xfs_map_extent *extp; + unsigned int count; + + count = RUI_ITEM(intent)->rui_format.rui_nextents; + extp = RUI_ITEM(intent)->rui_format.rui_extents; + + tp->t_flags |= XFS_TRANS_DIRTY; + rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent)); + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); + + ruip = xfs_rui_init(tp->t_mountp, count); + memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp)); + atomic_set(&ruip->rui_next_extent, count); + xfs_trans_add_item(tp, &ruip->rui_item); + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); + return &ruip->rui_item; +} + static const struct xfs_item_ops xfs_rui_item_ops = { .iop_size = xfs_rui_item_size, .iop_format = xfs_rui_item_format, @@ -597,6 +616,7 @@ static const struct xfs_item_ops xfs_rui_item_ops = { .iop_release = xfs_rui_item_release, .iop_recover = xfs_rui_item_recover, .iop_match = xfs_rui_item_match, + .iop_relog = xfs_rui_item_relog, }; /* diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 5b89c12f1566..ede1baf31413 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -18,7 +18,7 @@ #include "xfs_trans_space.h" #include "xfs_icache.h" #include "xfs_rtalloc.h" - +#include "xfs_sb.h" /* * Read and return the summary information for a given extent size, @@ -778,8 +778,14 @@ xfs_growfs_rt_alloc( struct xfs_bmbt_irec map; /* block map output */ int nmap; /* number of block maps */ int resblks; /* space reservation */ + enum xfs_blft buf_type; struct xfs_trans *tp; + if (ip == mp->m_rsumip) + buf_type = XFS_BLFT_RTSUMMARY_BUF; + else + buf_type = XFS_BLFT_RTBITMAP_BUF; + /* * Allocate space to the file, as necessary. 
*/ @@ -841,6 +847,9 @@ xfs_growfs_rt_alloc( mp->m_bsize, 0, &bp); if (error) goto out_trans_cancel; + + xfs_trans_buf_set_type(tp, bp, buf_type); + bp->b_ops = &xfs_rtbuf_ops; memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); /* @@ -1015,23 +1024,29 @@ xfs_growfs_rt( /* * Lock out other callers by grabbing the bitmap inode lock. */ - xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP); xfs_trans_ijoin(tp, mp->m_rbmip, XFS_ILOCK_EXCL); /* - * Update the bitmap inode's size. + * Update the bitmap inode's size ondisk and incore. We need + * to update the incore size so that inode inactivation won't + * punch what it thinks are "posteof" blocks. */ mp->m_rbmip->i_d.di_size = nsbp->sb_rbmblocks * nsbp->sb_blocksize; + i_size_write(VFS_I(mp->m_rbmip), mp->m_rbmip->i_d.di_size); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); /* * Get the summary inode into the transaction. */ - xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL); + xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM); xfs_trans_ijoin(tp, mp->m_rsumip, XFS_ILOCK_EXCL); /* - * Update the summary inode's size. + * Update the summary inode's size. We need to update the + * incore size so that inode inactivation won't punch what it + * thinks are "posteof" blocks. */ mp->m_rsumip->i_d.di_size = nmp->m_rsumsize; + i_size_write(VFS_I(mp->m_rsumip), mp->m_rsumip->i_d.di_size); xfs_trans_log_inode(tp, mp->m_rsumip, XFS_ILOG_CORE); /* * Copy summary data from old to new sizes. @@ -1087,7 +1102,13 @@ error_cancel: if (error) break; } + if (error) + goto out_free; + + /* Update secondary superblocks now the physical grow has completed */ + error = xfs_update_secondary_sbs(mp); +out_free: /* * Free the fake mp structure. 
*/ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index f70f1255220b..20e0534a772c 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) uint64_t xs_xstrat_bytes = 0; uint64_t xs_write_bytes = 0; uint64_t xs_read_bytes = 0; + uint64_t defer_relog = 0; static const struct xstats_entry { char *desc; @@ -70,10 +71,13 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; + defer_relog += per_cpu_ptr(stats, i)->s.defer_relog; } len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); + len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n", + defer_relog); len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n", #if defined(DEBUG) 1); diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 34d704f703d2..43ffba74f045 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -137,6 +137,7 @@ struct __xfsstats { uint64_t xs_xstrat_bytes; uint64_t xs_write_bytes; uint64_t xs_read_bytes; + uint64_t defer_relog; }; #define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t)) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index baf5de30eebb..d1b5f2d2a245 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1234,25 +1234,12 @@ xfs_fc_parse_param( case Opt_nouuid: mp->m_flags |= XFS_MOUNT_NOUUID; return 0; - case Opt_ikeep: - mp->m_flags |= XFS_MOUNT_IKEEP; - return 0; - case Opt_noikeep: - mp->m_flags &= ~XFS_MOUNT_IKEEP; - return 0; case Opt_largeio: mp->m_flags |= XFS_MOUNT_LARGEIO; return 0; case Opt_nolargeio: mp->m_flags &= ~XFS_MOUNT_LARGEIO; return 0; - case Opt_attr2: - mp->m_flags |= XFS_MOUNT_ATTR2; - return 0; - case Opt_noattr2: - mp->m_flags &= ~XFS_MOUNT_ATTR2; - mp->m_flags |= 
XFS_MOUNT_NOATTR2; - return 0; case Opt_filestreams: mp->m_flags |= XFS_MOUNT_FILESTREAMS; return 0; @@ -1304,6 +1291,24 @@ xfs_fc_parse_param( xfs_mount_set_dax_mode(mp, result.uint_32); return 0; #endif + /* Following mount options will be removed in September 2025 */ + case Opt_ikeep: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags |= XFS_MOUNT_IKEEP; + return 0; + case Opt_noikeep: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags &= ~XFS_MOUNT_IKEEP; + return 0; + case Opt_attr2: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags |= XFS_MOUNT_ATTR2; + return 0; + case Opt_noattr2: + xfs_warn(mp, "%s mount option is deprecated.", param->key); + mp->m_flags &= ~XFS_MOUNT_ATTR2; + mp->m_flags |= XFS_MOUNT_NOATTR2; + return 0; default: xfs_warn(mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1450,6 +1455,19 @@ xfs_fc_fill_super( if (error) goto out_free_sb; + /* V4 support is undergoing deprecation. */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) { +#ifdef CONFIG_XFS_SUPPORT_V4 + xfs_warn_once(mp, + "Deprecated V4 format (crc=0) will not be supported after September 2030."); +#else + xfs_warn(mp, + "Deprecated V4 format (crc=0) not supported by kernel."); + error = -EINVAL; + goto out_free_sb; +#endif + } + /* * XFS block mappings use 54 bits to store the logical block offset. 
* This should suffice to handle the maximum file size that the VFS diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 021ef96d0542..fac9de7ee6d0 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -50,13 +50,45 @@ xfs_panic_mask_proc_handler( } #endif /* CONFIG_PROC_FS */ +STATIC int +xfs_deprecate_irix_sgid_inherit_proc_handler( + struct ctl_table *ctl, + int write, + void *buffer, + size_t *lenp, + loff_t *ppos) +{ + if (write) { + printk_once(KERN_WARNING + "XFS: " "%s sysctl option is deprecated.\n", + ctl->procname); + } + return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); +} + +STATIC int +xfs_deprecate_irix_symlink_mode_proc_handler( + struct ctl_table *ctl, + int write, + void *buffer, + size_t *lenp, + loff_t *ppos) +{ + if (write) { + printk_once(KERN_WARNING + "XFS: " "%s sysctl option is deprecated.\n", + ctl->procname); + } + return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); +} + static struct ctl_table xfs_table[] = { { .procname = "irix_sgid_inherit", .data = &xfs_params.sgid_inherit.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler, .extra1 = &xfs_params.sgid_inherit.min, .extra2 = &xfs_params.sgid_inherit.max }, @@ -65,7 +97,7 @@ static struct ctl_table xfs_table[] = { .data = &xfs_params.symlink_mode.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler, .extra1 = &xfs_params.symlink_mode.min, .extra2 = &xfs_params.symlink_mode.max }, diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index dcdcf99cfa5d..86951652d3ed 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2533,6 +2533,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent); DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); 
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index ca18a040336a..c94e71f741b6 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -959,7 +959,7 @@ xfs_trans_cancel( struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) - ASSERT(!(lip->li_type == XFS_LI_EFD)); + ASSERT(!xlog_item_is_intent_done(lip)); } #endif xfs_trans_unreserve_and_mod_sb(tp); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index f46534b75236..084658946cc8 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -55,14 +55,12 @@ struct xfs_log_item { #define XFS_LI_ABORTED 1 #define XFS_LI_FAILED 2 #define XFS_LI_DIRTY 3 /* log item dirty in transaction */ -#define XFS_LI_RECOVERED 4 /* log intent item has been recovered */ #define XFS_LI_FLAGS \ { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ { (1 << XFS_LI_ABORTED), "ABORTED" }, \ { (1 << XFS_LI_FAILED), "FAILED" }, \ - { (1 << XFS_LI_DIRTY), "DIRTY" }, \ - { (1 << XFS_LI_RECOVERED), "RECOVERED" } + { (1 << XFS_LI_DIRTY), "DIRTY" } struct xfs_item_ops { unsigned flags; @@ -74,10 +72,29 @@ struct xfs_item_ops { void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn); void (*iop_release)(struct xfs_log_item *); xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t); - int (*iop_recover)(struct xfs_log_item *lip, struct xfs_trans *tp); + int (*iop_recover)(struct xfs_log_item *lip, + struct list_head *capture_list); bool (*iop_match)(struct xfs_log_item *item, uint64_t id); + struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent, + struct xfs_trans *tp); }; +/* Is this log item a deferred action intent? */ +static inline bool +xlog_item_is_intent(struct xfs_log_item *lip) +{ + return lip->li_ops->iop_recover != NULL && + lip->li_ops->iop_match != NULL; +} + +/* Is this a log intent-done item? 
*/ +static inline bool +xlog_item_is_intent_done(struct xfs_log_item *lip) +{ + return lip->li_ops->iop_unpin == NULL && + lip->li_ops->iop_push == NULL; +} + /* * Release the log item as soon as committed. This is for items just logging * intents that never need to be written back in place. @@ -243,4 +260,12 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, extern kmem_zone_t *xfs_trans_zone; +static inline struct xfs_log_item * +xfs_trans_item_relog( + struct xfs_log_item *lip, + struct xfs_trans *tp) +{ + return lip->li_ops->iop_relog(lip, tp); +} + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 133fc6fc3edd..fe45b0c3970c 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -221,36 +221,27 @@ xfs_trans_mod_dquot( } switch (field) { - - /* - * regular disk blk reservation - */ - case XFS_TRANS_DQ_RES_BLKS: + /* regular disk blk reservation */ + case XFS_TRANS_DQ_RES_BLKS: qtrx->qt_blk_res += delta; break; - /* - * inode reservation - */ - case XFS_TRANS_DQ_RES_INOS: + /* inode reservation */ + case XFS_TRANS_DQ_RES_INOS: qtrx->qt_ino_res += delta; break; - /* - * disk blocks used. - */ - case XFS_TRANS_DQ_BCOUNT: + /* disk blocks used. 
*/ + case XFS_TRANS_DQ_BCOUNT: qtrx->qt_bcount_delta += delta; break; - case XFS_TRANS_DQ_DELBCOUNT: + case XFS_TRANS_DQ_DELBCOUNT: qtrx->qt_delbcnt_delta += delta; break; - /* - * Inode Count - */ - case XFS_TRANS_DQ_ICOUNT: + /* Inode Count */ + case XFS_TRANS_DQ_ICOUNT: if (qtrx->qt_ino_res && delta > 0) { qtrx->qt_ino_res_used += delta; ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used); @@ -258,17 +249,13 @@ xfs_trans_mod_dquot( qtrx->qt_icount_delta += delta; break; - /* - * rtblk reservation - */ - case XFS_TRANS_DQ_RES_RTBLKS: + /* rtblk reservation */ + case XFS_TRANS_DQ_RES_RTBLKS: qtrx->qt_rtblk_res += delta; break; - /* - * rtblk count - */ - case XFS_TRANS_DQ_RTBCOUNT: + /* rtblk count */ + case XFS_TRANS_DQ_RTBCOUNT: if (qtrx->qt_rtblk_res && delta > 0) { qtrx->qt_rtblk_res_used += delta; ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used); @@ -276,11 +263,11 @@ xfs_trans_mod_dquot( qtrx->qt_rtbcount_delta += delta; break; - case XFS_TRANS_DQ_DELRTBCOUNT: + case XFS_TRANS_DQ_DELRTBCOUNT: qtrx->qt_delrtb_delta += delta; break; - default: + default: ASSERT(0); } diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 8ec7c8f109d7..64cc2a9c38c8 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -24,6 +24,39 @@ #include "zonefs.h" +static inline int zonefs_zone_mgmt(struct inode *inode, + enum req_opf op) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + int ret; + + lockdep_assert_held(&zi->i_truncate_mutex); + + ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, + zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); + if (ret) { + zonefs_err(inode->i_sb, + "Zone management operation %s at %llu failed %d\n", + blk_op_str(op), zi->i_zsector, ret); + return ret; + } + + return 0; +} + +static inline void zonefs_i_size_write(struct inode *inode, loff_t isize) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + i_size_write(inode, isize); + /* + * A full zone is no longer open/active and does not need + * explicit closing. 
+ */ + if (isize >= zi->i_max_size) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +} + static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) @@ -302,6 +335,17 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, } /* + * If the filesystem is mounted with the explicit-open mount option, we + * need to clear the ZONEFS_ZONE_OPEN flag if the zone transitioned to + * the read-only or offline condition, to avoid attempting an explicit + * close of the zone when the inode file is closed. + */ + if ((sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) && + (zone->cond == BLK_ZONE_COND_OFFLINE || + zone->cond == BLK_ZONE_COND_READONLY)) + zi->i_flags &= ~ZONEFS_ZONE_OPEN; + + /* * If error=remount-ro was specified, any error result in remounting * the volume as read-only. */ @@ -315,7 +359,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * invalid data. */ zonefs_update_stats(inode, data_size); - i_size_write(inode, data_size); + zonefs_i_size_write(inode, data_size); zi->i_wpoffset = data_size; return 0; @@ -328,7 +372,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx, * eventually correct the file size and zonefs inode write pointer offset * (which can be out of sync with the drive due to partial write failures). 
*/ -static void zonefs_io_error(struct inode *inode, bool write) +static void __zonefs_io_error(struct inode *inode, bool write) { struct zonefs_inode_info *zi = ZONEFS_I(inode); struct super_block *sb = inode->i_sb; @@ -342,8 +386,6 @@ static void zonefs_io_error(struct inode *inode, bool write) }; int ret; - mutex_lock(&zi->i_truncate_mutex); - /* * Memory allocations in blkdev_report_zones() can trigger a memory * reclaim which may in turn cause a recursion into zonefs as well as @@ -359,7 +401,14 @@ static void zonefs_io_error(struct inode *inode, bool write) zonefs_err(sb, "Get inode %lu zone information failed %d\n", inode->i_ino, ret); memalloc_noio_restore(noio_flag); +} +static void zonefs_io_error(struct inode *inode, bool write) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + + mutex_lock(&zi->i_truncate_mutex); + __zonefs_io_error(inode, write); mutex_unlock(&zi->i_truncate_mutex); } @@ -397,13 +446,27 @@ static int zonefs_file_truncate(struct inode *inode, loff_t isize) if (isize == old_isize) goto unlock; - ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, - zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); - if (ret) { - zonefs_err(inode->i_sb, - "Zone management operation at %llu failed %d", - zi->i_zsector, ret); + ret = zonefs_zone_mgmt(inode, op); + if (ret) goto unlock; + + /* + * If the mount option ZONEFS_MNTOPT_EXPLICIT_OPEN is set, + * take care of open zones. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN) { + /* + * Truncating a zone to EMPTY or FULL is the equivalent of + * closing the zone. For a truncation to 0, we need to + * re-open the zone to ensure new writes can be processed. + * For a truncation to the maximum file size, the zone is + * closed and writes cannot be accepted anymore, so clear + * the open flag. 
+ */ + if (!isize) + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + else + zi->i_flags &= ~ZONEFS_ZONE_OPEN; } zonefs_update_stats(inode, isize); @@ -584,7 +647,7 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size, mutex_lock(&zi->i_truncate_mutex); if (i_size_read(inode) < iocb->ki_pos + size) { zonefs_update_stats(inode, iocb->ki_pos + size); - i_size_write(inode, iocb->ki_pos + size); + zonefs_i_size_write(inode, iocb->ki_pos + size); } mutex_unlock(&zi->i_truncate_mutex); } @@ -865,8 +928,128 @@ inode_unlock: return ret; } +static inline bool zonefs_file_use_exp_open(struct inode *inode, struct file *file) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + + if (!(sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN)) + return false; + + if (zi->i_ztype != ZONEFS_ZTYPE_SEQ) + return false; + + if (!(file->f_mode & FMODE_WRITE)) + return false; + + return true; +} + +static int zonefs_open_zone(struct inode *inode) +{ + struct zonefs_inode_info *zi = ZONEFS_I(inode); + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + + zi->i_wr_refcnt++; + if (zi->i_wr_refcnt == 1) { + + if (atomic_inc_return(&sbi->s_open_zones) > sbi->s_max_open_zones) { + atomic_dec(&sbi->s_open_zones); + ret = -EBUSY; + goto unlock; + } + + if (i_size_read(inode) < zi->i_max_size) { + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_OPEN); + if (ret) { + zi->i_wr_refcnt--; + atomic_dec(&sbi->s_open_zones); + goto unlock; + } + zi->i_flags |= ZONEFS_ZONE_OPEN; + } + } + +unlock: + mutex_unlock(&zi->i_truncate_mutex); + + return ret; +} + +static int zonefs_file_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = generic_file_open(inode, file); + if (ret) + return ret; + + if (zonefs_file_use_exp_open(inode, file)) + return zonefs_open_zone(inode); + + return 0; +} + +static void zonefs_close_zone(struct inode *inode) +{ + struct 
zonefs_inode_info *zi = ZONEFS_I(inode); + int ret = 0; + + mutex_lock(&zi->i_truncate_mutex); + zi->i_wr_refcnt--; + if (!zi->i_wr_refcnt) { + struct zonefs_sb_info *sbi = ZONEFS_SB(inode->i_sb); + struct super_block *sb = inode->i_sb; + + /* + * If the file zone is full, it is not open anymore and we only + * need to decrement the open count. + */ + if (!(zi->i_flags & ZONEFS_ZONE_OPEN)) + goto dec; + + ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE); + if (ret) { + __zonefs_io_error(inode, false); + /* + * Leaving zones explicitly open may lead to a state + * where most zones cannot be written (zone resources + * exhausted). So take preventive action by remounting + * read-only. + */ + if (zi->i_flags & ZONEFS_ZONE_OPEN && + !(sb->s_flags & SB_RDONLY)) { + zonefs_warn(sb, "closing zone failed, remounting filesystem read-only\n"); + sb->s_flags |= SB_RDONLY; + } + } + zi->i_flags &= ~ZONEFS_ZONE_OPEN; +dec: + atomic_dec(&sbi->s_open_zones); + } + mutex_unlock(&zi->i_truncate_mutex); +} + +static int zonefs_file_release(struct inode *inode, struct file *file) +{ + /* + * If we explicitly open a zone we must close it again as well, but the + * zone management operation can fail (either due to an IO error or as + * the zone has gone offline or read-only). Make sure we don't fail the + * close(2) for user-space. 
+ */ + if (zonefs_file_use_exp_open(inode, file)) + zonefs_close_zone(inode); + + return 0; +} + static const struct file_operations zonefs_file_operations = { - .open = generic_file_open, + .open = zonefs_file_open, + .release = zonefs_file_release, .fsync = zonefs_file_fsync, .mmap = zonefs_file_mmap, .llseek = zonefs_file_llseek, @@ -890,6 +1073,7 @@ static struct inode *zonefs_alloc_inode(struct super_block *sb) inode_init_once(&zi->i_vnode); mutex_init(&zi->i_truncate_mutex); init_rwsem(&zi->i_mmap_sem); + zi->i_wr_refcnt = 0; return &zi->i_vnode; } @@ -940,7 +1124,7 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf) enum { Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair, - Opt_err, + Opt_explicit_open, Opt_err, }; static const match_table_t tokens = { @@ -948,6 +1132,7 @@ static const match_table_t tokens = { { Opt_errors_zro, "errors=zone-ro"}, { Opt_errors_zol, "errors=zone-offline"}, { Opt_errors_repair, "errors=repair"}, + { Opt_explicit_open, "explicit-open" }, { Opt_err, NULL} }; @@ -984,6 +1169,9 @@ static int zonefs_parse_options(struct super_block *sb, char *options) sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK; sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR; break; + case Opt_explicit_open: + sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN; + break; default: return -EINVAL; } @@ -1403,6 +1591,13 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_gid = GLOBAL_ROOT_GID; sbi->s_perm = 0640; sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO; + sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev); + atomic_set(&sbi->s_open_zones, 0); + if (!sbi->s_max_open_zones && + sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) { + zonefs_info(sb, "No open zones limit. 
Ignoring explicit_open mount option\n"); + sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN; + } ret = zonefs_read_super(sb); if (ret) diff --git a/fs/zonefs/zonefs.h b/fs/zonefs/zonefs.h index 55b39970acb2..51141907097c 100644 --- a/fs/zonefs/zonefs.h +++ b/fs/zonefs/zonefs.h @@ -38,6 +38,8 @@ static inline enum zonefs_ztype zonefs_zone_type(struct blk_zone *zone) return ZONEFS_ZTYPE_SEQ; } +#define ZONEFS_ZONE_OPEN (1 << 0) + /* * In-memory inode data. */ @@ -74,6 +76,10 @@ struct zonefs_inode_info { */ struct mutex i_truncate_mutex; struct rw_semaphore i_mmap_sem; + + /* guarded by i_truncate_mutex */ + unsigned int i_wr_refcnt; + unsigned int i_flags; }; static inline struct zonefs_inode_info *ZONEFS_I(struct inode *inode) @@ -154,6 +160,7 @@ enum zonefs_features { #define ZONEFS_MNTOPT_ERRORS_MASK \ (ZONEFS_MNTOPT_ERRORS_RO | ZONEFS_MNTOPT_ERRORS_ZRO | \ ZONEFS_MNTOPT_ERRORS_ZOL | ZONEFS_MNTOPT_ERRORS_REPAIR) +#define ZONEFS_MNTOPT_EXPLICIT_OPEN (1 << 4) /* Explicit open/close of zones on open/close */ /* * In-memory Super block information. @@ -175,6 +182,9 @@ struct zonefs_sb_info { loff_t s_blocks; loff_t s_used_blocks; + + unsigned int s_max_open_zones; + atomic_t s_open_zones; }; static inline struct zonefs_sb_info *ZONEFS_SB(struct super_block *sb) |