diff options
Diffstat (limited to 'fs')
203 files changed, 8091 insertions, 4254 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 995e332eee5c..eb2151fb6049 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -51,6 +51,8 @@ void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) if (!v9ses->cachetag) { if (v9fs_random_cachetag(v9ses) < 0) { v9ses->fscache = NULL; + kfree(v9ses->cachetag); + v9ses->cachetag = NULL; return; } } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 4cc966a31cb3..fe7f0bd2048e 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -513,6 +513,7 @@ v9fs_mmap_file_mmap(struct file *filp, struct vm_area_struct *vma) v9inode = V9FS_I(inode); mutex_lock(&v9inode->v_mutex); if (!v9inode->writeback_fid && + (vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) { /* * clone a fid and add it to writeback_fid @@ -614,6 +615,8 @@ static void v9fs_mmap_vm_close(struct vm_area_struct *vma) (vma->vm_end - vma->vm_start - 1), }; + if (!(vma->vm_flags & VM_SHARED)) + return; p9_debug(P9_DEBUG_VFS, "9p VMA close, %p, flushing", vma); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index ca243e658d71..74df32be4c6a 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -58,7 +58,7 @@ static int v9fs_set_super(struct super_block *s, void *data) static int v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, - int flags, void *data) + int flags) { int ret; @@ -132,7 +132,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto clunk_fid; } - retval = v9fs_fill_super(sb, v9ses, flags, data); + retval = v9fs_fill_super(sb, v9ses, flags); if (retval) goto release_sb; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index bcd1bafb0278..4150280509ff 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -10,13 +10,6 @@ #include <linux/dns_resolver.h> #include "internal.h" -const struct file_operations afs_dynroot_file_operations = { - .open = dcache_dir_open, - .release = dcache_dir_close, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, -}; - /* * Probe to see if a cell may exist. This prevents positive dentries from * being created unnecessarily. diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 7b1c18c32f48..46d2d7cb461d 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -443,7 +443,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { inode->i_op = &afs_dynroot_inode_operations; - inode->i_fop = &afs_dynroot_file_operations; + inode->i_fop = &simple_dir_operations; } else { inode->i_op = &afs_autocell_inode_operations; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9cdfabaeaa0b..759e0578012c 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -910,7 +910,6 @@ extern int afs_silly_iput(struct dentry *, struct inode *); /* * dynroot.c */ -extern const struct file_operations afs_dynroot_file_operations; extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d4e11b2e04f6..c5642bcb6b46 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -670,26 +670,6 @@ out: * libraries. There is no binary dependent code anywhere else. */ -#ifndef STACK_RND_MASK -#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */ -#endif - -static unsigned long randomize_stack_top(unsigned long stack_top) -{ - unsigned long random_variable = 0; - - if (current->flags & PF_RANDOMIZE) { - random_variable = get_random_long(); - random_variable &= STACK_RND_MASK; - random_variable <<= PAGE_SHIFT; - } -#ifdef CONFIG_STACK_GROWSUP - return PAGE_ALIGN(stack_top) + random_variable; -#else - return PAGE_ALIGN(stack_top) - random_variable; -#endif -} - static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ @@ -899,7 +879,7 @@ out_free_interp: the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { - int elf_prot, elf_flags, elf_fixed = MAP_FIXED_NOREPLACE; + int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; @@ -931,13 +911,6 @@ out_free_interp: */ } } - - /* - * Some binaries have overlapping elf segments and then - * we have to forcefully map over an existing mapping - * e.g. over this newly established brk mapping. - */ - elf_fixed = MAP_FIXED; } elf_prot = make_prot(elf_ppnt->p_flags); @@ -950,7 +923,7 @@ out_free_interp: * the ET_DYN load_addr calculations, proceed normally. */ if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { - elf_flags |= elf_fixed; + elf_flags |= MAP_FIXED; } else if (loc->elf_ex.e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program @@ -986,7 +959,7 @@ out_free_interp: load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - elf_flags |= elf_fixed; + elf_flags |= MAP_FIXED; } else load_bias = 0; @@ -1141,7 +1114,8 @@ out_free_interp: * (since it grows up, and may collide early with the stack * growing down), and into the unused ELF_ET_DYN_BASE region. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && !interpreter) + if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + loc->elf_ex.e_type == ET_DYN && !interpreter) current->mm->brk = current->mm->start_brk = ELF_ET_DYN_BASE; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bf7e3f23bba7..670700cb1110 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1761,6 +1761,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_err(info, "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups", cache->key.objectid); + btrfs_put_block_group(cache); ret = -EINVAL; goto error; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 19d669d12ca1..fe2b8765d9e6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -734,8 +734,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *fixup_workers; struct btrfs_workqueue *delayed_workers; - /* the extent workers do delayed refs on the extent allocation tree */ - struct btrfs_workqueue *extent_workers; struct task_struct *transaction_kthread; struct task_struct *cleaner_kthread; u32 thread_pool_size; @@ -2489,8 +2487,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, int nitems, bool use_global_rsv); void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free); +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index d949d7d2abed..db9f2c58eb4a 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -381,7 +381,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) out_qgroup: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); out_fail: - btrfs_inode_rsv_release(inode, true); if (delalloc_lock) mutex_unlock(&inode->delalloc_mutex); return ret; @@ -418,7 +417,6 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, * btrfs_delalloc_release_extents - release our outstanding_extents * @inode: the inode to balance the reservation for. * @num_bytes: the number of bytes we originally reserved with - * @qgroup_free: do we need to free qgroup meta reservation or convert them. * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -426,8 +424,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, * temporarily tracked outstanding_extents. This _must_ be used in conjunction * with btrfs_delalloc_reserve_metadata. */ -void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, - bool qgroup_free) +void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned num_extents; @@ -441,7 +438,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes, if (btrfs_is_testing(fs_info)) return; - btrfs_inode_rsv_release(inode, qgroup_free); + btrfs_inode_rsv_release(inode, true); } /** diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 044981cf6df9..402b61bf345c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2008,7 +2008,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); - btrfs_destroy_workqueue(fs_info->extent_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work @@ -2214,10 +2213,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); - fs_info->extent_workers = - btrfs_alloc_workqueue(fs_info, "extent-refs", flags, - min_t(u64, fs_devices->num_devices, - max_active), 8); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->submit_workers && fs_info->flush_workers && @@ -2228,7 +2223,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { return -ENOMEM; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7b32b6af322d..cceaf05aada2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3745,12 +3745,21 @@ err_unlock: static void set_btree_ioerr(struct page *page) { struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_fs_info *fs_info; SetPageError(page); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; /* + * If we error out, we should add back the dirty_metadata_bytes + * to make it consistent. + */ + fs_info = eb->fs_info; + percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, + eb->len, fs_info->dirty_metadata_batch); + + /* * If writeback for a btree extent that doesn't belong to a log tree * failed, increment the counter transaction->eb_write_errors. * We do this because while the transaction is running and before it's @@ -3986,6 +3995,10 @@ retry: if (!ret) { free_extent_buffer(eb); continue; + } else if (ret < 0) { + done = 1; + free_extent_buffer(eb); + break; } ret = write_one_eb(eb, wbc, &epd); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8fe4eb7e5045..435a502a3226 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1591,7 +1591,6 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; - struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; u64 release_bytes = 0; u64 lockstart; @@ -1611,6 +1610,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, return -ENOMEM; while (iov_iter_count(i) > 0) { + struct extent_state *cached_state = NULL; size_t offset = offset_in_page(pos); size_t sector_offset; size_t write_bytes = min(iov_iter_count(i), @@ -1692,7 +1692,7 @@ again: force_page_uptodate); if (ret) { btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes, true); + reserve_bytes); break; } @@ -1704,7 +1704,7 @@ again: if (extents_locked == -EAGAIN) goto again; btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes, true); + reserve_bytes); ret = extents_locked; break; } @@ -1758,11 +1758,21 @@ again: if (copied > 0) ret = btrfs_dirty_pages(inode, pages, dirty_pages, pos, copied, &cached_state); + + /* + * If we have not locked the extent range, because the range's + * start offset is >= i_size, we might still have a non-NULL + * cached extent state, acquired while marking the extent range + * as delalloc through btrfs_dirty_pages(). Therefore free any + * possible cached extent state to avoid a memory leak. + */ if (extents_locked) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, - true); + else + free_extent_state(cached_state); + + btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { btrfs_drop_pages(pages, num_pages); break; @@ -2057,25 +2067,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; - u64 len; - /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges, and assertion failures during - * hole detection. - */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } - - /* - * The range length can be represented by u64, we have to do the typecasts - * to avoid signed overflow if it's [0, LLONG_MAX] eg. from fsync() - */ - len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); @@ -2102,6 +2094,19 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* + * If the inode needs a full sync, make sure we use a full range to + * avoid log tree corruption, due to hole detection racing with ordered + * extent completion for adjacent ranges, and assertion failures during + * hole detection. Do this while holding the inode lock, to avoid races + * with other tasks. + */ + if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags)) { + start = 0; + end = LLONG_MAX; + } + + /* * Before we acquired the inode's lock, someone may have dirtied more * pages in the target range. We need to make sure that writeback for * any such pages does not start while we are logging the inode, because @@ -2128,8 +2133,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. + * + * Also, the range length can be represented by u64, we have to do the + * typecasts to avoid signed overflow if it's [0, LLONG_MAX]. */ - ret = btrfs_wait_ordered_range(inode, start, len); + ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1); if (ret) { up_write(&BTRFS_I(inode)->dio_sem); inode_unlock(inode); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 63cad7865d75..37345fb6191d 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -501,13 +501,13 @@ again: ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, prealloc, prealloc, &alloc_hint); if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true); goto out_put; } ret = btrfs_write_out_ino_cache(root, trans, path, inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc); out_put: iput(inode); out_release: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a0546401bc0a..c3f386b7cc0b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2206,7 +2206,7 @@ again: ClearPageChecked(page); set_page_dirty(page); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); @@ -4951,7 +4951,7 @@ again: if (!page) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); ret = -ENOMEM; goto out; } @@ -5018,7 +5018,7 @@ out_unlock: if (ret) btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize); unlock_page(page); put_page(page); out: @@ -6305,13 +6305,16 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, u32 sizes[2]; int nitems = name ? 2 : 1; unsigned long ptr; + unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); + nofs_flag = memalloc_nofs_save(); inode = new_inode(fs_info->sb); + memalloc_nofs_restore(nofs_flag); if (!inode) { btrfs_free_path(path); return ERR_PTR(-ENOMEM); @@ -8706,7 +8709,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, data_reserved, offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count, false); + btrfs_delalloc_release_extents(BTRFS_I(inode), count); } out: if (wakeup) @@ -9056,7 +9059,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); if (!ret2) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -9065,7 +9068,7 @@ again: out_unlock: unlock_page(page); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0)); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index de730e56d3f5..7c145a41decd 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1360,8 +1360,7 @@ again: unlock_page(pages[i]); put_page(pages[i]); } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, - false); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return i_done; out: @@ -1372,8 +1371,7 @@ out: btrfs_delalloc_release_space(inode, data_reserved, start_index << PAGE_SHIFT, page_cnt << PAGE_SHIFT, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT, - true); + btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); extent_changeset_free(data_reserved); return ret; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 8d3bd799ac7d..3ad151655eb8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3166,9 +3166,6 @@ out: btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); - if (!btrfs_fs_closing(fs_info)) - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; @@ -3184,16 +3181,30 @@ out: trans = btrfs_start_transaction(fs_info->quota_root, 1); if (IS_ERR(trans)) { err = PTR_ERR(trans); + trans = NULL; btrfs_err(fs_info, "fail to start transaction for status update: %d", err); - goto done; } - ret = update_qgroup_status_item(trans); - if (ret < 0) { - err = ret; - btrfs_err(fs_info, "fail to update qgroup status: %d", err); + + mutex_lock(&fs_info->qgroup_rescan_lock); + if (!btrfs_fs_closing(fs_info)) + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + if (trans) { + ret = update_qgroup_status_item(trans); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d", + err); + } } + fs_info->qgroup_rescan_running = false; + complete_all(&fs_info->qgroup_rescan_completion); + mutex_unlock(&fs_info->qgroup_rescan_lock); + + if (!trans) + return; + btrfs_end_transaction(trans); if (btrfs_fs_closing(fs_info)) { @@ -3204,12 +3215,6 @@ out: } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } - -done: - mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = false; - mutex_unlock(&fs_info->qgroup_rescan_lock); - complete_all(&fs_info->qgroup_rescan_completion); } /* @@ -3437,6 +3442,9 @@ cleanup: while ((unode = ulist_next(&reserved->range_changed, &uiter))) clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); + /* Also free data bytes of already reserved one */ + btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, + orig_reserved, BTRFS_QGROUP_RSV_DATA); extent_changeset_release(reserved); return ret; } @@ -3481,7 +3489,7 @@ static int qgroup_free_reserved_data(struct inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So not need to rush. */ - ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, + ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, free_start, free_start + free_len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) @@ -3621,7 +3629,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, type, (s64)num_bytes); + trace_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; @@ -3668,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, type, -(s64)num_bytes); + trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, num_bytes, type); } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index e87cbdad02a3..b57f3618e58e 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -500,7 +500,7 @@ static int process_leaf(struct btrfs_root *root, struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; u32 count; - int i = 0, tree_block_level = 0, ret; + int i = 0, tree_block_level = 0, ret = 0; struct btrfs_key key; int nritems = btrfs_header_nritems(leaf); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2f0e25afa486..5cd42b66818c 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1435,6 +1435,13 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int clear_rsv = 0; int ret; + /* + * The subvolume has reloc tree but the swap is finished, no need to + * create/update the dead reloc tree + */ + if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state)) + return 0; + if (root->reloc_root) { reloc_root = root->reloc_root; reloc_root->last_trans = trans->transid; @@ -2187,7 +2194,6 @@ static int clean_dirty_subvols(struct reloc_control *rc) /* Merged subvolume, cleanup its reloc root */ struct btrfs_root *reloc_root = root->reloc_root; - clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); list_del_init(&root->reloc_dirty_list); root->reloc_root = NULL; if (reloc_root) { @@ -2196,6 +2202,7 @@ static int clean_dirty_subvols(struct reloc_control *rc) if (ret2 < 0 && !ret) ret = ret2; } + clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state); btrfs_put_fs_root(root); } else { /* Orphan reloc tree, just clean it up */ @@ -3270,6 +3277,8 @@ static int relocate_file_extent_cluster(struct inode *inode, if (!page) { btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); ret = -ENOMEM; goto out; } @@ -3290,7 +3299,7 @@ static int relocate_file_extent_cluster(struct inode *inode, btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE, true); + PAGE_SIZE); ret = -EIO; goto out; } @@ -3319,7 +3328,7 @@ static int relocate_file_extent_cluster(struct inode *inode, btrfs_delalloc_release_metadata(BTRFS_I(inode), PAGE_SIZE, true); btrfs_delalloc_release_extents(BTRFS_I(inode), - PAGE_SIZE, true); + PAGE_SIZE); clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, @@ -3335,8 +3344,7 @@ static int relocate_file_extent_cluster(struct inode *inode, put_page(page); index++; - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, - false); + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); balance_dirty_pages_ratelimited(inode->i_mapping); btrfs_throttle(fs_info); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f3215028235c..123ac54af071 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5085,7 +5085,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *path; struct btrfs_key key; int ret; - u64 clone_src_i_size; + u64 clone_src_i_size = 0; /* * Prevent cloning from a zero offset with a length matching the sector diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b5e80563efaa..99fe9bf3fdac 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -52,7 +52,13 @@ static struct file_system_type test_type = { struct inode *btrfs_new_test_inode(void) { - return new_inode(test_mnt->mnt_sb); + struct inode *inode; + + inode = new_inode(test_mnt->mnt_sb); + if (inode) + inode_init_owner(inode, NULL, S_IFREG); + + return inode; } static int btrfs_init_test_fs(void) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 29b82a795522..8a6cc600bf18 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2932,7 +2932,8 @@ out: * in the tree of log roots */ static int update_log_root(struct btrfs_trans_handle *trans, - struct btrfs_root *log) + struct btrfs_root *log, + struct btrfs_root_item *root_item) { struct btrfs_fs_info *fs_info = log->fs_info; int ret; @@ -2940,10 +2941,10 @@ static int update_log_root(struct btrfs_trans_handle *trans, if (log->log_transid == 1) { /* insert root item on the first sync */ ret = btrfs_insert_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } else { ret = btrfs_update_root(trans, fs_info->log_root_tree, - &log->root_key, &log->root_item); + &log->root_key, root_item); } return ret; } @@ -3041,6 +3042,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = fs_info->log_root_tree; + struct btrfs_root_item new_root_item; int log_transid = 0; struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; @@ -3104,18 +3106,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } + /* + * We _must_ update under the root->log_mutex in order to make sure we + * have a consistent view of the log root we are trying to commit at + * this moment. + * + * We _must_ copy this into a local copy, because we are not holding the + * log_root_tree->log_mutex yet. This is important because when we + * commit the log_root_tree we must have a consistent view of the + * log_root_tree when we update the super block to point at the + * log_root_tree bytenr. If we update the log_root_tree here we'll race + * with the commit and possibly point at the new block which we may not + * have written out. + */ btrfs_set_root_node(&log->root_item, log->node); + memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; /* - * Update or create log root item under the root's log_mutex to prevent - * races with concurrent log syncs that can lead to failure to update - * log root item because it was not created yet. - */ - ret = update_log_root(trans, log); - /* * IO has been started, blocks of the log tree have WRITTEN flag set * in their headers. new modifications of the log will be written to * new positions. so it's safe to allow log writers to go in. @@ -3135,6 +3145,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, mutex_unlock(&log_root_tree->log_mutex); mutex_lock(&log_root_tree->log_mutex); + + /* + * Now we are safe to update the log_root_tree because we're under the + * log_mutex, and we're a current writer so we're holding the commit + * open until we drop the log_mutex. + */ + ret = update_log_root(trans, log, &new_root_item); + if (atomic_dec_and_test(&log_root_tree->log_writers)) { /* atomic_dec_and_test implies a barrier */ cond_wake_up_nomb(&log_root_tree->log_writer_wait); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a324480bc88b..bdfe4493e43a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3845,7 +3845,11 @@ static int alloc_profile_is_valid(u64 flags, int extended) return !extended; /* "0" is valid for usual profiles */ /* true if exactly one bit set */ - return is_power_of_2(flags); + /* + * Don't use is_power_of_2(unsigned long) because it won't work + * for the single profile (1ULL << 48) on 32-bit CPUs. + */ + return flags != 0 && (flags & (flags - 1)) == 0; } static inline int balance_need_close(struct btrfs_fs_info *fs_info) @@ -4063,7 +4067,13 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, } num_devices = btrfs_num_devices(fs_info); - allowed = 0; + + /* + * SINGLE profile on-disk has no profile bit, but in-memory we have a + * special bit for it, to make it easier to distinguish. Thus we need + * to set it manually, or balance would refuse the profile. + */ + allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) if (num_devices >= btrfs_raid_array[i].devs_min) allowed |= btrfs_raid_array[i].bg_flag; diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile index a699e320393f..c1da294418d1 100644 --- a/fs/ceph/Makefile +++ b/fs/ceph/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ - export.o caps.o snap.o xattr.o quota.o \ + export.o caps.o snap.o xattr.o quota.o io.o \ mds_client.o mdsmap.o strings.o ceph_frag.o \ debugfs.o diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b3c8b886bf64..7ab616601141 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -189,8 +189,7 @@ static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int err = 0; u64 off = page_offset(page); u64 len = PAGE_SIZE; @@ -219,8 +218,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); - err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, - off, &len, + err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, off, &len, ci->i_truncate_seq, ci->i_truncate_size, &page, 1, 0); if (err == -ENOENT) @@ -228,6 +227,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); ceph_fscache_readpage_cancel(inode, page); + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out; } if (err < PAGE_SIZE) @@ -266,6 +267,8 @@ static void finish_read(struct ceph_osd_request *req) int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); + if (rc == -EBLACKLISTED) + ceph_inode_to_client(inode)->blacklisted = true; /* unlock all pages, zeroing any data we didn't read */ osd_data = osd_req_op_extent_osd_data(req, 0); @@ -323,7 +326,8 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, /* caller of readpages does not hold buffer and read caps * (fadvise, madvise and readahead cases) */ int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got); + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, + true, &got); if (ret < 0) { dout("start_read %p, error getting cap\n", inode); } else if (!(got & want)) { @@ -569,7 +573,7 @@ static u64 get_writepages_data_length(struct inode *inode, /* * Write a single page, but leave the page locked. * - * If we get a write error, set the page error bit, but still adjust the + * If we get a write error, mark the mapping for error, but still adjust the * dirty page accounting (i.e., page is no longer dirty). */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) @@ -640,9 +644,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) end_page_writeback(page); return err; } + if (err == -EBLACKLISTED) + fsc->blacklisted = true; dout("writepage setting page/mapping error %d %p\n", err, page); - SetPageError(page); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { @@ -680,23 +685,6 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc) } /* - * lame release_pages helper. release_pages() isn't exported to - * modules. - */ -static void ceph_release_pages(struct page **pages, int num) -{ - struct pagevec pvec; - int i; - - pagevec_init(&pvec); - for (i = 0; i < num; i++) { - if (pagevec_add(&pvec, pages[i]) == 0) - pagevec_release(&pvec); - } - pagevec_release(&pvec); -} - -/* * async writeback completion handler. * * If we get an error, set the mapping error bit, but not the individual @@ -720,6 +708,8 @@ static void writepages_finish(struct ceph_osd_request *req) if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); + if (rc == -EBLACKLISTED) + fsc->blacklisted = true; } else { ceph_clear_error_write(ci); } @@ -769,7 +759,7 @@ static void writepages_finish(struct ceph_osd_request *req) dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n", inode, osd_data->length, rc >= 0 ? num_pages : 0); - ceph_release_pages(osd_data->pages, num_pages); + release_pages(osd_data->pages, num_pages); } ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc); @@ -1452,7 +1442,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (err < 0) goto out_restore; @@ -1540,6 +1531,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) if (!prealloc_cf) return VM_FAULT_OOM; + sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); if (ci->i_inline_version != CEPH_INLINE_NONE) { @@ -1568,7 +1560,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got, NULL); if (err < 0) goto out_free; @@ -1614,6 +1606,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) ceph_put_cap_refs(ci, got); out_free: ceph_restore_sigs(&oldset); + sb_end_pagefault(inode->i_sb); ceph_free_cap_flush(prealloc_cf); if (err < 0) ret = vmf_error(err); @@ -1946,12 +1939,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, if (err >= 0 || err == -ENOENT) have |= POOL_READ; - else if (err != -EPERM) + else if (err != -EPERM) { + if (err == -EBLACKLISTED) + fsc->blacklisted = true; goto out_unlock; + } if (err2 == 0 || err2 == -EEXIST) have |= POOL_WRITE; else if (err2 != -EPERM) { + if (err2 == -EBLACKLISTED) + fsc->blacklisted = true; err = err2; goto out_unlock; } @@ -1989,10 +1987,11 @@ out: return err; } -int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) +int ceph_pool_perm_check(struct inode *inode, int need) { - s64 pool; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_string *pool_ns; + s64 pool; int ret, flags; if (ci->i_vino.snap != CEPH_NOSNAP) { @@ -2004,7 +2003,7 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) return 0; } - if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), + if (ceph_test_mount_opt(ceph_inode_to_client(inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index bc90cf6ad7ed..b2ec29eeb4c4 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -6,6 +6,8 @@ * Written by Milosz Tanski (milosz@adfin.com) */ +#include <linux/ceph/ceph_debug.h> + #include "super.h" #include "cache.h" diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ce0f5658720a..d3b9c9d5c1bd 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -458,37 +458,6 @@ struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) } /* - * Return id of any MDS with a cap, preferably FILE_WR|BUFFER|EXCL, else -1. - */ -static int __ceph_get_cap_mds(struct ceph_inode_info *ci) -{ - struct ceph_cap *cap; - int mds = -1; - struct rb_node *p; - - /* prefer mds with WR|BUFFER|EXCL caps */ - for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { - cap = rb_entry(p, struct ceph_cap, ci_node); - mds = cap->mds; - if (cap->issued & (CEPH_CAP_FILE_WR | - CEPH_CAP_FILE_BUFFER | - CEPH_CAP_FILE_EXCL)) - break; - } - return mds; -} - -int ceph_get_cap_mds(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int mds; - spin_lock(&ci->i_ceph_lock); - mds = __ceph_get_cap_mds(ceph_inode(inode)); - spin_unlock(&ci->i_ceph_lock); - return mds; -} - -/* * Called under i_ceph_lock. */ static void __insert_cap_node(struct ceph_inode_info *ci, @@ -628,7 +597,7 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, /* * Add a capability under the given MDS session. * - * Caller should hold session snap_rwsem (read) and s_mutex. + * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an @@ -645,6 +614,9 @@ void ceph_add_cap(struct inode *inode, struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; + u32 gen; + + lockdep_assert_held(&ci->i_ceph_lock); dout("add_cap %p mds%d cap %llx %s seq %d\n", inode, session->s_mds, cap_id, ceph_cap_string(issued), seq); @@ -656,6 +628,10 @@ void ceph_add_cap(struct inode *inode, if (fmode >= 0) wanted |= ceph_caps_for_mode(fmode); + spin_lock(&session->s_gen_ttl_lock); + gen = session->s_cap_gen; + spin_unlock(&session->s_gen_ttl_lock); + cap = __get_cap_for_mds(ci, mds); if (!cap) { cap = *new_cap; @@ -681,7 +657,7 @@ void ceph_add_cap(struct inode *inode, list_move_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); - if (cap->cap_gen < session->s_cap_gen) + if (cap->cap_gen < gen) cap->issued = cap->implemented = CEPH_CAP_PIN; /* @@ -775,7 +751,7 @@ void ceph_add_cap(struct inode *inode, cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; - cap->cap_gen = session->s_cap_gen; + cap->cap_gen = gen; if (fmode >= 0) __ceph_get_fmode(ci, fmode); @@ -1284,10 +1260,6 @@ void __ceph_remove_caps(struct ceph_inode_info *ci) * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. * - * Make half-hearted attempt ot to invalidate page cache if we are - * dropping RDCACHE. Note that this will leave behind locked pages - * that we'll then need to deal with elsewhere. - * * Return non-zero if delayed release, or we experienced an error * such that the caller should requeue + retry later. * @@ -1746,11 +1718,11 @@ static bool __finish_cap_flush(struct ceph_mds_client *mdsc, * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * - * Called under i_ceph_lock. + * Called under i_ceph_lock. Returns the flush tid. */ -static int __mark_caps_flushing(struct inode *inode, +static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, - u64 *flush_tid, u64 *oldest_flush_tid) + u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); @@ -1789,8 +1761,7 @@ static int __mark_caps_flushing(struct inode *inode, list_add_tail(&cf->i_list, &ci->i_cap_flush_list); - *flush_tid = cf->tid; - return flushing; + return cf->tid; } /* @@ -2028,11 +1999,6 @@ retry_locked: } ack: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - dout(" skipping %p I_NOFLUSH set\n", inode); - continue; - } - if (session && session != cap->session) { dout("oops, wrong session %p mutex\n", session); mutex_unlock(&session->s_mutex); @@ -2080,9 +2046,9 @@ ack: } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { - flushing = __mark_caps_flushing(inode, session, false, - &flush_tid, - &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, false, + &oldest_flush_tid); } else { flushing = 0; flush_tid = 0; @@ -2130,16 +2096,11 @@ static int try_flush_caps(struct inode *inode, u64 *ptid) retry: spin_lock(&ci->i_ceph_lock); retry_locked: - if (ci->i_ceph_flags & CEPH_I_NOFLUSH) { - spin_unlock(&ci->i_ceph_lock); - dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode); - goto out; - } if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; int delayed; - if (!session || session != cap->session) { + if (session != cap->session) { spin_unlock(&ci->i_ceph_lock); if (session) mutex_unlock(&session->s_mutex); @@ -2161,8 +2122,9 @@ retry_locked: goto retry_locked; } - flushing = __mark_caps_flushing(inode, session, true, - &flush_tid, &oldest_flush_tid); + flushing = ci->i_dirty_caps; + flush_tid = __mark_caps_flushing(inode, session, true, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, @@ -2261,35 +2223,45 @@ static int unsafe_request_wait(struct inode *inode) int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { + struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); u64 flush_tid; - int ret; + int ret, err; int dirty; dout("fsync %p%s\n", inode, datasync ? " datasync" : ""); ret = file_write_and_wait_range(file, start, end); - if (ret < 0) - goto out; - if (datasync) goto out; dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); - ret = unsafe_request_wait(inode); + err = unsafe_request_wait(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - ret = wait_event_interruptible(ci->i_cap_wq, + if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { + err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } + + if (err < 0) + ret = err; + + if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { + spin_lock(&file->f_lock); + err = errseq_check_and_advance(&ci->i_meta_err, + &fi->meta_err); + spin_unlock(&file->f_lock); + if (err < 0) + ret = err; + } out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; @@ -2560,10 +2532,15 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got, * * FIXME: how does a 0 return differ from -EAGAIN? */ -static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, bool nonblock, int *got) +enum { + NON_BLOCKING = 1, + CHECK_FILELOCK = 2, +}; + +static int try_get_cap_refs(struct inode *inode, int need, int want, + loff_t endoff, int flags, int *got) { - struct inode *inode = &ci->vfs_inode; + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; @@ -2576,6 +2553,13 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, again: spin_lock(&ci->i_ceph_lock); + if ((flags & CHECK_FILELOCK) && + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { + dout("try_get_cap_refs %p error filelock\n", inode); + ret = -EIO; + goto out_unlock; + } + /* make sure file is actually open */ file_wanted = __ceph_caps_file_wanted(ci); if ((file_wanted & need) != need) { @@ -2637,7 +2621,7 @@ again: * we can not call down_read() when * task isn't in TASK_RUNNING state */ - if (nonblock) { + if (flags & NON_BLOCKING) { ret = -EAGAIN; goto out_unlock; } @@ -2731,18 +2715,19 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } -int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { int ret; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED)); - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; - ret = try_get_cap_refs(ci, need, want, 0, nonblock, got); + ret = try_get_cap_refs(inode, need, want, 0, + (nonblock ? NON_BLOCKING : 0), got); return ret == -EAGAIN ? 0 : ret; } @@ -2751,30 +2736,40 @@ int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, ret; + struct ceph_file_info *fi = filp->private_data; + struct inode *inode = file_inode(filp); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int ret, _got, flags; - ret = ceph_pool_perm_check(ci, need); + ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) + return -EBADF; + while (true) { if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); + check_max_size(inode, endoff); + flags = atomic_read(&fi->num_locks) ? CHECK_FILELOCK : 0; _got = 0; - ret = try_get_cap_refs(ci, need, want, endoff, - false, &_got); + ret = try_get_cap_refs(inode, need, want, endoff, + flags, &_got); if (ret == -EAGAIN) continue; if (!ret) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&ci->i_cap_wq, &wait); - while (!(ret = try_get_cap_refs(ci, need, want, endoff, - true, &_got))) { + flags |= NON_BLOCKING; + while (!(ret = try_get_cap_refs(inode, need, want, + endoff, flags, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -2786,10 +2781,18 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ret == -EAGAIN) continue; } + + if ((fi->fmode & CEPH_FILE_MODE_WR) && + fi->filp_gen != READ_ONCE(fsc->filp_gen)) { + if (ret >= 0 && _got) + ceph_put_cap_refs(ci, _got); + return -EBADF; + } + if (ret < 0) { if (ret == -ESTALE) { /* session was killed, try renew caps */ - ret = ceph_renew_caps(&ci->vfs_inode); + ret = ceph_renew_caps(inode); if (ret == 0) continue; } @@ -2798,9 +2801,9 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (ci->i_inline_version != CEPH_INLINE_NONE && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { + i_size_read(inode) > 0) { struct page *page = - find_get_page(ci->vfs_inode.i_mapping, 0); + find_get_page(inode->i_mapping, 0); if (page) { if (PageUptodate(page)) { *pinned_page = page; @@ -2819,7 +2822,7 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, * getattr request will bring inline data into * page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + ret = __ceph_do_getattr(inode, NULL, CEPH_STAT_CAP_INLINE_DATA, true); if (ret < 0) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 2eb88ed22993..facb387c2735 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -294,7 +294,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) { - return 0; } void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 15ff1b09cfa2..b6bfa94332c3 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -35,7 +35,7 @@ struct ceph_nfs_snapfh { static int ceph_encode_snapfh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int snap_handle_length = + static const int snap_handle_length = sizeof(struct ceph_nfs_snapfh) >> 2; struct ceph_nfs_snapfh *sfh = (void *)rawfh; u64 snapid = ceph_snap(inode); @@ -85,9 +85,9 @@ out: static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len, struct inode *parent_inode) { - const static int handle_length = + static const int handle_length = sizeof(struct ceph_nfs_fh) >> 2; - const static int connected_handle_length = + static const int connected_handle_length = sizeof(struct ceph_nfs_confh) >> 2; int type; @@ -458,33 +458,33 @@ static int __get_snap_name(struct dentry *parent, char *name, if (err < 0) goto out; - rinfo = &req->r_reply_info; - for (i = 0; i < rinfo->dir_nr; i++) { - rde = rinfo->dir_entries + i; - BUG_ON(!rde->inode.in); - if (ceph_snap(inode) == - le64_to_cpu(rde->inode.in->snapid)) { - memcpy(name, rde->name, rde->name_len); - name[rde->name_len] = '\0'; - err = 0; - goto out; - } - } - - if (rinfo->dir_end) - break; - - BUG_ON(rinfo->dir_nr <= 0); - rde = rinfo->dir_entries + (rinfo->dir_nr - 1); - next_offset += rinfo->dir_nr; - last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); - if (!last_name) { - err = -ENOMEM; - goto out; - } - - ceph_mdsc_put_request(req); - req = NULL; + rinfo = &req->r_reply_info; + for (i = 0; i < rinfo->dir_nr; i++) { + rde = rinfo->dir_entries + i; + BUG_ON(!rde->inode.in); + if (ceph_snap(inode) == + le64_to_cpu(rde->inode.in->snapid)) { + memcpy(name, rde->name, rde->name_len); + name[rde->name_len] = '\0'; + err = 0; + goto out; + } + } + + if (rinfo->dir_end) + break; + + BUG_ON(rinfo->dir_nr <= 0); + rde = rinfo->dir_entries + (rinfo->dir_nr - 1); + next_offset += rinfo->dir_nr; + last_name = kstrndup(rde->name, rde->name_len, GFP_KERNEL); + if (!last_name) { + err = -ENOMEM; + goto out; + } + + ceph_mdsc_put_request(req); + req = NULL; } err = -ENOENT; out: diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 685a03cc4b77..d277f71abe0b 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -15,6 +15,7 @@ #include "super.h" #include "mds_client.h" #include "cache.h" +#include "io.h" static __le32 ceph_flags_sys2wire(u32 flags) { @@ -201,6 +202,7 @@ out: static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { + struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -211,7 +213,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, struct ceph_dir_file_info *dfi = kmem_cache_zalloc(ceph_dir_file_cachep, GFP_KERNEL); if (!dfi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -222,7 +224,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, } else { fi = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); if (!fi) { - ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ + ceph_put_fmode(ci, fmode); /* clean up */ return -ENOMEM; } @@ -232,6 +234,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, fi->fmode = fmode; spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); + fi->meta_err = errseq_sample(&ci->i_meta_err); + fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); return 0; } @@ -695,7 +699,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ceph_release_page_vector(pages, num_pages); } - if (ret <= 0 || off >= i_size || !more) + if (ret < 0) { + if (ret == -EBLACKLISTED) + fsc->blacklisted = true; + break; + } + + if (off >= i_size || !more) break; } @@ -921,7 +931,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, struct ceph_aio_request *aio_req = NULL; int num_pages = 0; int flags; - int ret; + int ret = 0; struct timespec64 mtime = current_time(inode); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; @@ -935,11 +945,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, (write ? "write" : "read"), file, pos, (unsigned)count, snapc, snapc ? snapc->seq : 0); - ret = filemap_write_and_wait_range(inode->i_mapping, - pos, pos + count - 1); - if (ret < 0) - return ret; - if (write) { int ret2 = invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, @@ -1260,7 +1265,8 @@ again: want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, + &got, &pinned_page); if (ret < 0) return ret; @@ -1274,12 +1280,16 @@ again: if (ci->i_inline_version == CEPH_INLINE_NONE) { if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ceph_start_io_direct(inode); ret = ceph_direct_read_write(iocb, to, NULL, NULL); + ceph_end_io_direct(inode); if (ret >= 0 && ret < len) retry_op = CHECK_EOF; } else { + ceph_start_io_read(inode); ret = ceph_sync_read(iocb, to, &retry_op); + ceph_end_io_read(inode); } } else { retry_op = READ_INLINE; @@ -1290,7 +1300,9 @@ again: inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); ceph_add_rw_context(fi, &rw_ctx); + ceph_start_io_read(inode); ret = generic_file_read_iter(iocb, to); + ceph_end_io_read(inode); ceph_del_rw_context(fi, &rw_ctx); } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", @@ -1399,7 +1411,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) return -ENOMEM; retry_snap: - inode_lock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_start_io_direct(inode); + else + ceph_start_io_write(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1457,7 +1472,7 @@ retry_snap: else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got, NULL); if (err < 0) goto out; @@ -1470,7 +1485,6 @@ retry_snap: (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { struct ceph_snap_context *snapc; struct iov_iter data; - inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1487,11 +1501,14 @@ retry_snap: /* we might need to revert back to that point */ data = *from; - if (iocb->ki_flags & IOCB_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) { written = ceph_direct_read_write(iocb, &data, snapc, &prealloc_cf); - else + ceph_end_io_direct(inode); + } else { written = ceph_sync_write(iocb, &data, pos, snapc); + ceph_end_io_write(inode); + } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1506,7 +1523,7 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - inode_unlock(inode); + ceph_end_io_write(inode); } if (written >= 0) { @@ -1541,9 +1558,11 @@ retry_snap: } goto out_unlocked; - out: - inode_unlock(inode); + if (iocb->ki_flags & IOCB_DIRECT) + ceph_end_io_direct(inode); + else + ceph_end_io_write(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; @@ -1781,7 +1800,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); if (ret < 0) goto unlock; @@ -1810,16 +1829,15 @@ unlock: * src_ci. Two attempts are made to obtain both caps, and an error is return if * this fails; zero is returned on success. */ -static int get_rd_wr_caps(struct ceph_inode_info *src_ci, - loff_t src_endoff, int *src_got, - struct ceph_inode_info *dst_ci, +static int get_rd_wr_caps(struct file *src_filp, int *src_got, + struct file *dst_filp, loff_t dst_endoff, int *dst_got) { int ret = 0; bool retrying = false; retry_caps: - ret = ceph_get_caps(dst_ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, + ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, dst_endoff, dst_got, NULL); if (ret < 0) return ret; @@ -1829,24 +1847,24 @@ retry_caps: * we would risk a deadlock by using ceph_get_caps. Thus, we'll do some * retry dance instead to try to get both capabilities. */ - ret = ceph_try_get_caps(src_ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, + ret = ceph_try_get_caps(file_inode(src_filp), + CEPH_CAP_FILE_RD, CEPH_CAP_FILE_SHARED, false, src_got); if (ret <= 0) { /* Start by dropping dst_ci caps and getting src_ci caps */ - ceph_put_cap_refs(dst_ci, *dst_got); + ceph_put_cap_refs(ceph_inode(file_inode(dst_filp)), *dst_got); if (retrying) { if (!ret) /* ceph_try_get_caps masks EAGAIN */ ret = -EAGAIN; return ret; } - ret = ceph_get_caps(src_ci, CEPH_CAP_FILE_RD, - CEPH_CAP_FILE_SHARED, src_endoff, - src_got, NULL); + ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, + CEPH_CAP_FILE_SHARED, -1, src_got, NULL); if (ret < 0) return ret; /*... drop src_ci caps too, and retry */ - ceph_put_cap_refs(src_ci, *src_got); + ceph_put_cap_refs(ceph_inode(file_inode(src_filp)), *src_got); retrying = true; goto retry_caps; } @@ -1904,6 +1922,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, struct ceph_inode_info *src_ci = ceph_inode(src_inode); struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); struct ceph_cap_flush *prealloc_cf; + struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); struct ceph_object_locator src_oloc, dst_oloc; struct ceph_object_id src_oid, dst_oid; loff_t endoff = 0, size; @@ -1913,10 +1932,16 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, int src_got = 0, dst_got = 0, err, dirty; bool do_final_copy = false; - if (src_inode == dst_inode) - return -EINVAL; - if (src_inode->i_sb != dst_inode->i_sb) - return -EXDEV; + if (src_inode->i_sb != dst_inode->i_sb) { + struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); + + if (ceph_fsid_compare(&src_fsc->client->fsid, + &dst_fsc->client->fsid)) { + dout("Copying files across clusters: src: %pU dst: %pU\n", + &src_fsc->client->fsid, &dst_fsc->client->fsid); + return -EXDEV; + } + } if (ceph_snap(dst_inode) != CEPH_NOSNAP) return -EROFS; @@ -1928,7 +1953,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * efficient). */ - if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM)) + if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) return -EOPNOTSUPP; if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) || @@ -1960,8 +1985,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, * clients may have dirty data in their caches. And OSDs know nothing * about caps, so they can't safely do the remote object copies. */ - err = get_rd_wr_caps(src_ci, (src_off + len), &src_got, - dst_ci, (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) { dout("get_rd_wr_caps returned %d\n", err); ret = -EOPNOTSUPP; @@ -2018,9 +2043,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, goto out; } len -= ret; - err = get_rd_wr_caps(src_ci, (src_off + len), - &src_got, dst_ci, - (dst_off + len), &dst_got); + err = get_rd_wr_caps(src_file, &src_got, + dst_file, (dst_off + len), &dst_got); if (err < 0) goto out; err = is_file_size_ok(src_inode, dst_inode, @@ -2044,7 +2068,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, dst_ci->i_vino.ino, dst_objnum); /* Do an object remote copy */ err = ceph_osdc_copy_from( - &ceph_inode_to_client(src_inode)->client->osdc, + &src_fsc->client->osdc, src_ci->i_vino.snap, 0, &src_oid, &src_oloc, CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 18500edefc56..9f135624ae47 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -515,6 +515,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); + ci->i_meta_err = 0; + return &ci->vfs_inode; } @@ -801,7 +803,12 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* update inode */ inode->i_rdev = le32_to_cpu(info->rdev); - inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + /* directories have fl_stripe_unit set to zero */ + if (le32_to_cpu(info->layout.fl_stripe_unit)) + inode->i_blkbits = + fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; + else + inode->i_blkbits = CEPH_BLOCK_SHIFT; __ceph_update_quota(ci, iinfo->max_bytes, iinfo->max_files); @@ -1982,7 +1989,7 @@ static const struct inode_operations ceph_symlink_iops = { int __ceph_setattr(struct inode *inode, struct iattr *attr) { struct ceph_inode_info *ci = ceph_inode(inode); - const unsigned int ia_valid = attr->ia_valid; + unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_cap_flush *prealloc_cf; @@ -2087,6 +2094,26 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } + if (ia_valid & ATTR_SIZE) { + dout("setattr %p size %lld -> %lld\n", inode, + inode->i_size, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && + attr->ia_size > inode->i_size) { + i_size_write(inode, attr->ia_size); + inode->i_blocks = calc_inode_blocks(attr->ia_size); + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + ia_valid |= ATTR_MTIME; + } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || + attr->ia_size != inode->i_size) { + req->r_args.setattr.size = cpu_to_le64(attr->ia_size); + req->r_args.setattr.old_size = + cpu_to_le64(inode->i_size); + mask |= CEPH_SETATTR_SIZE; + release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | + CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; + } + } if (ia_valid & ATTR_MTIME) { dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, @@ -2109,25 +2136,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; } } - if (ia_valid & ATTR_SIZE) { - dout("setattr %p size %lld -> %lld\n", inode, - inode->i_size, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && - attr->ia_size > inode->i_size) { - i_size_write(inode, attr->ia_size); - inode->i_blocks = calc_inode_blocks(attr->ia_size); - ci->i_reported_size = attr->ia_size; - dirtied |= CEPH_CAP_FILE_EXCL; - } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - attr->ia_size != inode->i_size) { - req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = - cpu_to_le64(inode->i_size); - mask |= CEPH_SETATTR_SIZE; - release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; - } - } /* these do nothing */ if (ia_valid & ATTR_CTIME) { diff --git a/fs/ceph/io.c b/fs/ceph/io.c new file mode 100644 index 000000000000..97602ea92ff4 --- /dev/null +++ b/fs/ceph/io.c @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Trond Myklebust + * Copyright (c) 2019 Jeff Layton + * + * I/O and data path helper functionality. + * + * Heavily borrowed from equivalent code in fs/nfs/io.c + */ + +#include <linux/ceph/ceph_debug.h> + +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/rwsem.h> +#include <linux/fs.h> + +#include "super.h" +#include "io.h" + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + inode_dio_wait(inode); + } +} + +/** + * ceph_start_io_read - declare the file is being used for buffered reads + * @inode: file inode + * + * Declare that a buffered read operation is about to start, and ensure + * that we block all direct I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is unset, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that buffered read operations are allowed to + * execute in parallel, thanks to the shared lock, whereas direct I/O + * operations need to wait to grab an exclusive lock in order to set + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. + */ +void +ceph_start_io_read(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_read - declare that the buffered read operation is done + * @inode: file inode + * + * Declare that a buffered read operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_read(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} + +/** + * ceph_start_io_write - declare the file is being used for buffered writes + * @inode: file inode + * + * Declare that a buffered write operation is about to start, and ensure + * that we block all direct I/O. + */ +void +ceph_start_io_write(struct inode *inode) +{ + down_write(&inode->i_rwsem); + ceph_block_o_direct(ceph_inode(inode), inode); +} + +/** + * ceph_end_io_write - declare that the buffered write operation is done + * @inode: file inode + * + * Declare that a buffered write operation is done, and release the + * lock on inode->i_rwsem. + */ +void +ceph_end_io_write(struct inode *inode) +{ + up_write(&inode->i_rwsem); +} + +/* Call with exclusively locked inode->i_rwsem */ +static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) +{ + lockdep_assert_held_write(&inode->i_rwsem); + + if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + /* FIXME: unmap_mapping_range? */ + filemap_write_and_wait(inode->i_mapping); + } +} + +/** + * ceph_end_io_direct - declare the file is being used for direct i/o + * @inode: file inode + * + * Declare that a direct I/O operation is about to start, and ensure + * that we block all buffered I/O. + * On exit, the function ensures that the CEPH_I_ODIRECT flag is set, + * and holds a shared lock on inode->i_rwsem to ensure that the flag + * cannot be changed. + * In practice, this means that direct I/O operations are allowed to + * execute in parallel, thanks to the shared lock, whereas buffered I/O + * operations need to wait to grab an exclusive lock in order to clear + * CEPH_I_ODIRECT. + * Note that buffered writes and truncates both take a write lock on + * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. + */ +void +ceph_start_io_direct(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + + /* Be an optimist! */ + down_read(&inode->i_rwsem); + if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) + return; + up_read(&inode->i_rwsem); + /* Slow path.... */ + down_write(&inode->i_rwsem); + ceph_block_buffered(ci, inode); + downgrade_write(&inode->i_rwsem); +} + +/** + * ceph_end_io_direct - declare that the direct i/o operation is done + * @inode: file inode + * + * Declare that a direct I/O operation is done, and release the shared + * lock on inode->i_rwsem. + */ +void +ceph_end_io_direct(struct inode *inode) +{ + up_read(&inode->i_rwsem); +} diff --git a/fs/ceph/io.h b/fs/ceph/io.h new file mode 100644 index 000000000000..fa594cd77348 --- /dev/null +++ b/fs/ceph/io.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_CEPH_IO_H +#define _FS_CEPH_IO_H + +void ceph_start_io_read(struct inode *inode); +void ceph_end_io_read(struct inode *inode); +void ceph_start_io_write(struct inode *inode); +void ceph_end_io_write(struct inode *inode); +void ceph_start_io_direct(struct inode *inode); +void ceph_end_io_direct(struct inode *inode); + +#endif /* FS_CEPH_IO_H */ diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 5083e238ad15..544e9e85b120 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -32,14 +32,18 @@ void __init ceph_flock_init(void) static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) { - struct inode *inode = file_inode(src->fl_file); + struct ceph_file_info *fi = dst->fl_file->private_data; + struct inode *inode = file_inode(dst->fl_file); atomic_inc(&ceph_inode(inode)->i_filelock_ref); + atomic_inc(&fi->num_locks); } static void ceph_fl_release_lock(struct file_lock *fl) { + struct ceph_file_info *fi = fl->fl_file->private_data; struct inode *inode = file_inode(fl->fl_file); struct ceph_inode_info *ci = ceph_inode(inode); + atomic_dec(&fi->num_locks); if (atomic_dec_and_test(&ci->i_filelock_ref)) { /* clear error when all locks are released */ spin_lock(&ci->i_ceph_lock); @@ -73,7 +77,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, * window. Caller function will decrease the counter. */ fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ceph_inode(inode)->i_filelock_ref); + fl->fl_ops->fl_copy_lock(fl, NULL); } if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 920e9f048bd8..a5163296d9d9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -384,8 +384,8 @@ static int parse_reply_info_readdir(void **p, void *end, } done: - if (*p != end) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; bad: @@ -406,12 +406,10 @@ static int parse_reply_info_filelock(void **p, void *end, goto bad; info->filelock_reply = *p; - *p += sizeof(*info->filelock_reply); - if (unlikely(*p != end)) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; - bad: return -EIO; } @@ -425,18 +423,21 @@ static int parse_reply_info_create(void **p, void *end, { if (features == (u64)-1 || (features & CEPH_FEATURE_REPLY_CREATE_INODE)) { + /* Malformed reply? */ if (*p == end) { info->has_create_ino = false; } else { info->has_create_ino = true; - info->ino = ceph_decode_64(p); + ceph_decode_64_safe(p, end, info->ino, bad); } + } else { + if (*p != end) + goto bad; } - if (unlikely(*p != end)) - goto bad; + /* Skip over any unrecognized fields */ + *p = end; return 0; - bad: return -EIO; } @@ -639,7 +640,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_renew_seq = 0; INIT_LIST_HEAD(&s->s_caps); s->s_nr_caps = 0; - s->s_trim_caps = 0; refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); @@ -1270,6 +1270,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req; struct rb_node *p; + struct ceph_inode_info *ci; dout("cleanup_session_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); @@ -1278,6 +1279,16 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); + if (req->r_target_inode) { + /* dropping unsafe change of inode's attributes */ + ci = ceph_inode(req->r_target_inode); + errseq_set(&ci->i_meta_err, -EIO); + } + if (req->r_unsafe_dir) { + /* dropping unsafe directory operation */ + ci = ceph_inode(req->r_unsafe_dir); + errseq_set(&ci->i_meta_err, -EIO); + } __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1370,7 +1381,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; struct ceph_inode_info *ci = ceph_inode(inode); LIST_HEAD(to_remove); - bool drop = false; + bool dirty_dropped = false; bool invalidate = false; dout("removing cap %p, ci is %p, inode is %p\n", @@ -1383,9 +1394,12 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = fsc->mdsc; - if (ci->i_wrbuffer_ref > 0 && - READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) - invalidate = true; + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + if (inode->i_data.nrpages > 0) + invalidate = true; + if (ci->i_wrbuffer_ref > 0) + mapping_set_error(&inode->i_data, -EIO); + } while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, @@ -1405,7 +1419,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, inode, ceph_ino(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); - drop = true; + dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited( @@ -1415,10 +1429,22 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; - drop = true; + dirty_dropped = true; } spin_unlock(&mdsc->cap_dirty_lock); + if (dirty_dropped) { + errseq_set(&ci->i_meta_err, -EIO); + + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } + if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; @@ -1430,15 +1456,6 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; } - - if (drop && - ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ceph_put_snap_context(ci->i_head_snapc); - ci->i_head_snapc = NULL; - } } spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { @@ -1452,7 +1469,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); - if (drop) + if (dirty_dropped) iput(inode); return 0; } @@ -1705,11 +1722,11 @@ out: */ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_mds_session *session = arg; + int *remaining = arg; struct ceph_inode_info *ci = ceph_inode(inode); int used, wanted, oissued, mine; - if (session->s_trim_caps <= 0) + if (*remaining <= 0) return -1; spin_lock(&ci->i_ceph_lock); @@ -1746,7 +1763,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) if (oissued) { /* we aren't the only cap.. just remove us */ __ceph_remove_cap(cap, true); - session->s_trim_caps--; + (*remaining)--; } else { struct dentry *dentry; /* try dropping referring dentries */ @@ -1758,7 +1775,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) d_prune_aliases(inode); count = atomic_read(&inode->i_count); if (count == 1) - session->s_trim_caps--; + (*remaining)--; dout("trim_caps_cb %p cap %p pruned, count now %d\n", inode, cap, count); } else { @@ -1784,12 +1801,12 @@ int ceph_trim_caps(struct ceph_mds_client *mdsc, dout("trim_caps mds%d start: %d / %d, trim %d\n", session->s_mds, session->s_nr_caps, max_caps, trim_caps); if (trim_caps > 0) { - session->s_trim_caps = trim_caps; - ceph_iterate_session_caps(session, trim_caps_cb, session); + int remaining = trim_caps; + + ceph_iterate_session_caps(session, trim_caps_cb, &remaining); dout("trim_caps mds%d done: %d / %d, trimmed %d\n", session->s_mds, session->s_nr_caps, max_caps, - trim_caps - session->s_trim_caps); - session->s_trim_caps = 0; + trim_caps - remaining); } ceph_flush_cap_releases(mdsc, session); @@ -3015,18 +3032,23 @@ bad: pr_err("mdsc_handle_forward decode error err=%d\n", err); } -static int __decode_and_drop_session_metadata(void **p, void *end) +static int __decode_session_metadata(void **p, void *end, + bool *blacklisted) { /* map<string,string> */ u32 n; + bool err_str; ceph_decode_32_safe(p, end, n, bad); while (n-- > 0) { u32 len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + err_str = !strncmp(*p, "error_string", len); *p += len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + if (err_str && strnstr(*p, "blacklisted", len)) + *blacklisted = true; *p += len; } return 0; @@ -3050,6 +3072,7 @@ static void handle_session(struct ceph_mds_session *session, u64 seq; unsigned long features = 0; int wake = 0; + bool blacklisted = false; /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); @@ -3062,7 +3085,7 @@ static void handle_session(struct ceph_mds_session *session, if (msg_version >= 3) { u32 len; /* version >= 2, metadata */ - if (__decode_and_drop_session_metadata(&p, end) < 0) + if (__decode_session_metadata(&p, end, &blacklisted) < 0) goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); @@ -3149,6 +3172,8 @@ static void handle_session(struct ceph_mds_session *session, session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); + if (blacklisted) + mdsc->fsc->blacklisted = true; wake = 2; /* for good measure */ break; @@ -3998,7 +4023,27 @@ static void lock_unlock_sessions(struct ceph_mds_client *mdsc) mutex_unlock(&mdsc->mutex); } +static void maybe_recover_session(struct ceph_mds_client *mdsc) +{ + struct ceph_fs_client *fsc = mdsc->fsc; + + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) + return; + + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) + return; + + if (!READ_ONCE(fsc->blacklisted)) + return; + if (fsc->last_auto_reconnect && + time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) + return; + + pr_info("auto reconnect after blacklisted\n"); + fsc->last_auto_reconnect = jiffies; + ceph_force_reconnect(fsc->sb); +} /* * delayed work -- periodically trim expired leases, renew caps with mds @@ -4044,7 +4089,9 @@ static void delayed_work(struct work_struct *work) pr_info("mds%d hung\n", s->s_mds); } } - if (s->s_state < CEPH_MDS_SESSION_OPEN) { + if (s->s_state == CEPH_MDS_SESSION_NEW || + s->s_state == CEPH_MDS_SESSION_RESTARTING || + s->s_state == CEPH_MDS_SESSION_REJECTED) { /* this mds is failed or recovering, just wait */ ceph_put_mds_session(s); continue; @@ -4072,6 +4119,8 @@ static void delayed_work(struct work_struct *work) ceph_trim_snapid_map(mdsc); + maybe_recover_session(mdsc); + schedule_delayed(mdsc); } @@ -4355,7 +4404,12 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) session = __ceph_lookup_mds_session(mdsc, mds); if (!session) continue; + + if (session->s_state == CEPH_MDS_SESSION_REJECTED) + __unregister_session(mdsc, session); + __wake_requests(mdsc, &session->s_waiting); mutex_unlock(&mdsc->mutex); + mutex_lock(&session->s_mutex); __close_session(mdsc, session); if (session->s_state == CEPH_MDS_SESSION_CLOSING) { @@ -4364,6 +4418,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc) } mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); kick_requests(mdsc, mds); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index f7c8603484fe..5cd131b41d84 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -148,9 +148,9 @@ enum { CEPH_MDS_SESSION_OPENING = 2, CEPH_MDS_SESSION_OPEN = 3, CEPH_MDS_SESSION_HUNG = 4, - CEPH_MDS_SESSION_CLOSING = 5, - CEPH_MDS_SESSION_RESTARTING = 6, - CEPH_MDS_SESSION_RECONNECTING = 7, + CEPH_MDS_SESSION_RESTARTING = 5, + CEPH_MDS_SESSION_RECONNECTING = 6, + CEPH_MDS_SESSION_CLOSING = 7, CEPH_MDS_SESSION_REJECTED = 8, }; @@ -176,7 +176,7 @@ struct ceph_mds_session { spinlock_t s_cap_lock; struct list_head s_caps; /* all caps issued by this session */ struct ceph_cap *s_cap_iterator; - int s_nr_caps, s_trim_caps; + int s_nr_caps; int s_num_cap_releases; int s_cap_reconnect; int s_readonly; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 377fafc76f20..edfd643a8205 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -143,6 +143,7 @@ enum { Opt_snapdirname, Opt_mds_namespace, Opt_fscache_uniq, + Opt_recover_session, Opt_last_string, /* string args above */ Opt_dirstat, @@ -184,6 +185,7 @@ static match_table_t fsopt_tokens = { /* int args above */ {Opt_snapdirname, "snapdirname=%s"}, {Opt_mds_namespace, "mds_namespace=%s"}, + {Opt_recover_session, "recover_session=%s"}, {Opt_fscache_uniq, "fsc=%s"}, /* string args above */ {Opt_dirstat, "dirstat"}, @@ -254,6 +256,17 @@ static int parse_fsopt_token(char *c, void *private) if (!fsopt->mds_namespace) return -ENOMEM; break; + case Opt_recover_session: + if (!strncmp(argstr[0].from, "no", + argstr[0].to - argstr[0].from)) { + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; + } else if (!strncmp(argstr[0].from, "clean", + argstr[0].to - argstr[0].from)) { + fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; + } else { + return -EINVAL; + } + break; case Opt_fscache_uniq: kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = kstrndup(argstr[0].from, @@ -576,6 +589,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->mds_namespace) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) + seq_show_option(m, "recover_session", "clean"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%d", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) @@ -664,6 +681,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->sb = NULL; fsc->mount_state = CEPH_MOUNT_MOUNTING; + fsc->filp_gen = 1; atomic_long_set(&fsc->writeback_count, 0); @@ -713,6 +731,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) { dout("destroy_fs_client %p\n", fsc); + ceph_mdsc_destroy(fsc); destroy_workqueue(fsc->inode_wq); destroy_workqueue(fsc->cap_wq); @@ -829,7 +848,7 @@ static void ceph_umount_begin(struct super_block *sb) fsc->mount_state = CEPH_MOUNT_SHUTDOWN; ceph_osdc_abort_requests(&fsc->client->osdc, -EIO); ceph_mdsc_force_umount(fsc->mdsc); - return; + fsc->filp_gen++; // invalidate open files } static int ceph_remount(struct super_block *sb, int *flags, char *data) @@ -1089,7 +1108,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, } if (ceph_sb_to_client(sb) != fsc) { - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); fsc = ceph_sb_to_client(sb); dout("get_sb got existing client %p\n", fsc); @@ -1115,7 +1133,6 @@ out_splat: goto out_final; out: - ceph_mdsc_destroy(fsc); destroy_fs_client(fsc); out_final: dout("ceph_mount fail %ld\n", PTR_ERR(res)); @@ -1139,8 +1156,6 @@ static void ceph_kill_sb(struct super_block *s) ceph_fscache_unregister_fs(fsc); - ceph_mdsc_destroy(fsc); - destroy_fs_client(fsc); free_anon_bdev(dev); } @@ -1154,6 +1169,33 @@ static struct file_system_type ceph_fs_type = { }; MODULE_ALIAS_FS("ceph"); +int ceph_force_reconnect(struct super_block *sb) +{ + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); + int err = 0; + + ceph_umount_begin(sb); + + /* Make sure all page caches get invalidated. + * see remove_session_caps_cb() */ + flush_workqueue(fsc->inode_wq); + + /* In case that we were blacklisted. This also reset + * all mon/osd connections */ + ceph_reset_client_addr(fsc->client); + + ceph_osdc_clear_abort_err(&fsc->client->osdc); + + fsc->blacklisted = false; + fsc->mount_state = CEPH_MOUNT_MOUNTED; + + if (sb->s_root) { + err = __ceph_do_getattr(d_inode(sb->s_root), NULL, + CEPH_STAT_CAP_INODE, true); + } + return err; +} + static int __init init_ceph(void) { int ret = init_caches(); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 6b9f1ee7de85..f98d9247f9cb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/posix_acl.h> #include <linux/refcount.h> +#include <linux/security.h> #include <linux/ceph/libceph.h> @@ -31,6 +32,7 @@ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ @@ -101,6 +103,11 @@ struct ceph_fs_client { struct ceph_client *client; unsigned long mount_state; + + unsigned long last_auto_reconnect; + bool blacklisted; + + u32 filp_gen; loff_t max_file_size; struct ceph_mds_client *mdsc; @@ -395,6 +402,8 @@ struct ceph_inode_info { struct fscache_cookie *fscache; u32 i_fscache_gen; #endif + errseq_t i_meta_err; + struct inode vfs_inode; /* at end */ }; @@ -499,17 +508,16 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ #define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ #define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ -#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ -#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ -#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ -#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */ -#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ -#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ -#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ -#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ -#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ - +#define CEPH_I_POOL_PERM (1 << 3) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 4) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 5) /* can write to pool */ +#define CEPH_I_SEC_INITED (1 << 6) /* security initialized */ +#define CEPH_I_CAP_DROPPED (1 << 7) /* caps were forcibly dropped */ +#define CEPH_I_KICK_FLUSH (1 << 8) /* kick flushing caps */ +#define CEPH_I_FLUSH_SNAPS (1 << 9) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 10) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 11) /* have seen file lock errors */ +#define CEPH_I_ODIRECT (1 << 12) /* inode in direct I/O mode */ /* * Masks of ceph inode work. @@ -703,6 +711,10 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; + + errseq_t meta_err; + u32 filp_gen; + atomic_t num_locks; }; struct ceph_dir_file_info { @@ -842,7 +854,8 @@ static inline int default_congestion_kb(void) } - +/* super.c */ +extern int ceph_force_reconnect(struct super_block *sb); /* snap.c */ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); @@ -959,7 +972,10 @@ static inline bool ceph_security_xattr_wanted(struct inode *in) #ifdef CONFIG_CEPH_FS_SECURITY_LABEL extern int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx); -extern void ceph_security_invalidate_secctx(struct inode *inode); +static inline void ceph_security_invalidate_secctx(struct inode *inode) +{ + security_inode_invalidate_secctx(inode); +} #else static inline int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, struct ceph_acl_sec_ctx *ctx) @@ -1039,7 +1055,6 @@ extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds); -extern int ceph_get_cap_mds(struct inode *inode); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, @@ -1058,9 +1073,9 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, struct inode *dir, int mds, int drop, int unless); -extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, +extern int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got, struct page **pinned_page); -extern int ceph_try_get_caps(struct ceph_inode_info *ci, +extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); /* for counting open files by mode */ @@ -1071,7 +1086,7 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_uninline_data(struct file *filp, struct page *locked_page); -extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); +extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 939eab7aa219..cb18ee637cb7 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -20,7 +20,8 @@ static int __remove_xattr(struct ceph_inode_info *ci, static bool ceph_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || + return !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN) || !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } @@ -892,7 +893,8 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, memcpy(value, xattr->val, xattr->val_len); if (current->journal_info && - !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) + !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(name + XATTR_SECURITY_PREFIX_LEN)) ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: spin_unlock(&ci->i_ceph_lock); @@ -903,11 +905,9 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); bool len_only = (size == 0); u32 namelen; int err; - int i; spin_lock(&ci->i_ceph_lock); dout("listxattr %p ver=%lld index_ver=%lld\n", inode, @@ -936,33 +936,6 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) names = __copy_xattr_names(ci, names); size -= namelen; } - - - /* virtual xattr names, too */ - if (vxattrs) { - for (i = 0; vxattrs[i].name; i++) { - size_t this_len; - - if (vxattrs[i].flags & VXATTR_FLAG_HIDDEN) - continue; - if (vxattrs[i].exists_cb && !vxattrs[i].exists_cb(ci)) - continue; - - this_len = strlen(vxattrs[i].name) + 1; - namelen += this_len; - if (len_only) - continue; - - if (this_len > size) { - err = -ERANGE; - goto out; - } - - memcpy(names, vxattrs[i].name, this_len); - names += this_len; - size -= this_len; - } - } err = namelen; out: spin_unlock(&ci->i_ceph_lock); @@ -1293,42 +1266,8 @@ out: ceph_pagelist_release(pagelist); return err; } - -void ceph_security_invalidate_secctx(struct inode *inode) -{ - security_inode_invalidate_secctx(inode); -} - -static int ceph_xattr_set_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, const void *buf, - size_t buflen, int flags) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_setxattr(inode, name, buf, buflen, flags); - } - return -EOPNOTSUPP; -} - -static int ceph_xattr_get_security_label(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *key, void *buf, size_t buflen) -{ - if (security_ismaclabel(key)) { - const char *name = xattr_full_name(handler, key); - return __ceph_getxattr(inode, name, buf, buflen); - } - return -EOPNOTSUPP; -} - -static const struct xattr_handler ceph_security_label_handler = { - .prefix = XATTR_SECURITY_PREFIX, - .get = ceph_xattr_get_security_label, - .set = ceph_xattr_set_security_label, -}; -#endif -#endif +#endif /* CONFIG_CEPH_FS_SECURITY_LABEL */ +#endif /* CONFIG_SECURITY */ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) { @@ -1352,9 +1291,6 @@ const struct xattr_handler *ceph_xattr_handlers[] = { &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, #endif -#ifdef CONFIG_CEPH_FS_SECURITY_LABEL - &ceph_security_label_handler, -#endif &ceph_other_xattr_handler, NULL, }; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 6c3bd07868d7..0f0dc1c1fe41 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -57,9 +57,18 @@ struct smb_query_info { /* char buffer[]; */ } __packed; +struct smb3_key_debug_info { + __u64 Suid; + __u16 cipher_type; + __u8 auth_key[16]; /* SMB2_NTLMV2_SESSKEY_SIZE */ + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; +} __packed; + #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) #define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) #define CIFS_IOC_GET_MNT_INFO _IOR(CIFS_IOCTL_MAGIC, 5, struct smb_mnt_fs_info) #define CIFS_ENUMERATE_SNAPSHOTS _IOR(CIFS_IOCTL_MAGIC, 6, struct smb_snapshot_array) #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) +#define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index eb428349f29a..439b99cefeb0 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -90,8 +90,39 @@ struct cifs_acl { __le32 num_aces; } __attribute__((packed)); +/* ACE types - see MS-DTYP 2.4.4.1 */ +#define ACCESS_ALLOWED_ACE_TYPE 0x00 +#define ACCESS_DENIED_ACE_TYPE 0x01 +#define SYSTEM_AUDIT_ACE_TYPE 0x02 +#define SYSTEM_ALARM_ACE_TYPE 0x03 +#define ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E /* Reserved */ +#define SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 /* reserved */ +#define SYSTEM_MANDATORY_LABEL_ACE_TYPE 0x11 +#define SYSTEM_RESOURCE_ATTRIBUTE_ACE_TYPE 0x12 +#define SYSTEM_SCOPED_POLICY_ID_ACE_TYPE 0x13 + +/* ACE flags */ +#define OBJECT_INHERIT_ACE 0x01 +#define CONTAINER_INHERIT_ACE 0x02 +#define NO_PROPAGATE_INHERIT_ACE 0x04 +#define INHERIT_ONLY_ACE 0x08 +#define INHERITED_ACE 0x10 +#define SUCCESSFUL_ACCESS_ACE_FLAG 0x40 +#define FAILED_ACCESS_ACE_FLAG 0x80 + struct cifs_ace { - __u8 type; + __u8 type; /* see above and MS-DTYP 2.4.4.1 */ __u8 flags; __le16 size; __le32 access_req; @@ -99,6 +130,54 @@ struct cifs_ace { } __attribute__((packed)); /* + * The current SMB3 form of security descriptor is similar to what was used for + * cifs (see above) but some fields are split, and fields in the struct below + * matches names of fields to the the spec, MS-DTYP (see sections 2.4.5 and + * 2.4.6). Note that "CamelCase" fields are used in this struct in order to + * match the MS-DTYP and MS-SMB2 specs which define the wire format. + */ +struct smb3_sd { + __u8 Revision; /* revision level, MUST be one */ + __u8 Sbz1; /* only meaningful if 'RM' flag set below */ + __le16 Control; + __le32 OffsetOwner; + __le32 OffsetGroup; + __le32 OffsetSacl; + __le32 OffsetDacl; +} __packed; + +/* Meaning of 'Control' field flags */ +#define ACL_CONTROL_SR 0x0001 /* Self relative */ +#define ACL_CONTROL_RM 0x0002 /* Resource manager control bits */ +#define ACL_CONTROL_PS 0x0004 /* SACL protected from inherits */ +#define ACL_CONTROL_PD 0x0008 /* DACL protected from inherits */ +#define ACL_CONTROL_SI 0x0010 /* SACL Auto-Inherited */ +#define ACL_CONTROL_DI 0x0020 /* DACL Auto-Inherited */ +#define ACL_CONTROL_SC 0x0040 /* SACL computed through inheritance */ +#define ACL_CONTROL_DC 0x0080 /* DACL computed through inheritence */ +#define ACL_CONTROL_SS 0x0100 /* Create server ACL */ +#define ACL_CONTROL_DT 0x0200 /* DACL provided by trusteed source */ +#define ACL_CONTROL_SD 0x0400 /* SACL defaulted */ +#define ACL_CONTROL_SP 0x0800 /* SACL is present on object */ +#define ACL_CONTROL_DD 0x1000 /* DACL defaulted */ +#define ACL_CONTROL_DP 0x2000 /* DACL is present on object */ +#define ACL_CONTROL_GD 0x4000 /* Group was defaulted */ +#define ACL_CONTROL_OD 0x8000 /* User was defaulted */ + +/* Meaning of AclRevision flags */ +#define ACL_REVISION 0x02 /* See section 2.4.4.1 of MS-DTYP */ +#define ACL_REVISION_DS 0x04 /* Additional AceTypes allowed */ + +struct smb3_acl { + u8 AclRevision; /* revision level */ + u8 Sbz1; /* MBZ */ + __le16 AclSize; + __le16 AceCount; + __le16 Sbz2; /* MBZ */ +} __packed; + + +/* * Minimum security identifier can be one for system defined Users * and Groups such as NULL SID and World or Built-in accounts such * as Administrator and Guest and consists of diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 2e9c7f493f99..1a135d1b85bd 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -169,18 +169,32 @@ cifs_read_super(struct super_block *sb) else sb->s_maxbytes = MAX_NON_LFS; - /* BB FIXME fix time_gran to be larger for LANMAN sessions */ - sb->s_time_gran = 100; - - if (tcon->unix_ext) { - ts = cifs_NTtimeToUnix(0); + /* + * Some very old servers like DOS and OS/2 used 2 second granularity + * (while all current servers use 100ns granularity - see MS-DTYP) + * but 1 second is the maximum allowed granularity for the VFS + * so for old servers set time granularity to 1 second while for + * everything else (current servers) set it to 100ns. + */ + if ((tcon->ses->server->vals->protocol_id == SMB10_PROT_ID) && + ((tcon->ses->capabilities & + tcon->ses->server->vals->cap_nt_find) == 0) && + !tcon->unix_ext) { + sb->s_time_gran = 1000000000; /* 1 second is max allowed gran */ + ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0); sb->s_time_min = ts.tv_sec; - ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX)); + ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), + cpu_to_le16(SMB_TIME_MAX), 0); sb->s_time_max = ts.tv_sec; } else { - ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MIN), 0, 0); + /* + * Almost every server, including all SMB2+, uses DCE TIME + * ie 100 nanosecond units, since 1601. See MS-DTYP and MS-FSCC + */ + sb->s_time_gran = 100; + ts = cifs_NTtimeToUnix(0); sb->s_time_min = ts.tv_sec; - ts = cnvrtDosUnixTm(cpu_to_le16(SMB_DATE_MAX), cpu_to_le16(SMB_TIME_MAX), 0); + ts = cifs_NTtimeToUnix(cpu_to_le64(S64_MAX)); sb->s_time_max = ts.tv_sec; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 54e204589cb9..d78bfcc19156 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -331,8 +331,9 @@ struct smb_version_operations { umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); - int (*mkdir)(const unsigned int, struct cifs_tcon *, const char *, - struct cifs_sb_info *); + int (*mkdir)(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, + struct cifs_sb_info *sb); /* set info on created directory */ void (*mkdir_setinfo)(struct inode *, const char *, struct cifs_sb_info *, struct cifs_tcon *, @@ -1209,6 +1210,7 @@ struct cifs_search_info { bool smallBuf:1; /* so we know which buf_release function to call */ }; +#define ACL_NO_MODE ((umode_t)(-1)) struct cifs_open_parms { struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; @@ -1389,6 +1391,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file); struct cifsInodeInfo { bool can_cache_brlcks; struct list_head llist; /* locks helb by this inode */ + /* + * NOTE: Some code paths call down_read(lock_sem) twice, so + * we must always use use cifs_down_write() instead of down_write() + * for this semaphore to avoid deadlocks. + */ struct rw_semaphore lock_sem; /* protect the fields above */ /* BB add in lists for dirty pages i.e. write caching info for oplock */ struct list_head openFileList; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 99b1b1ef558c..fe597d3d5208 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -170,6 +170,7 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile); +extern void cifs_down_write(struct rw_semaphore *sem); extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, @@ -372,7 +373,8 @@ extern int CIFSSMBUnixSetPathInfo(const unsigned int xid, const struct nls_table *nls_codepage, int remap); -extern int CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, +extern int CIFSSMBMkDir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern int CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index dbee2132e419..4f554f019a98 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1078,7 +1078,8 @@ RmDirRetry: } int -CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +CIFSSMBMkDir(const unsigned int xid, struct inode *inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { int rc = 0; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2850c3ce4391..ccaa8bad336f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -564,9 +564,11 @@ cifs_reconnect(struct TCP_Server_Info *server) spin_lock(&GlobalMid_Lock); list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + kref_get(&mid_entry->refcount); if (mid_entry->mid_state == MID_REQUEST_SUBMITTED) mid_entry->mid_state = MID_RETRY_NEEDED; list_move(&mid_entry->qhead, &retry_list); + mid_entry->mid_flags |= MID_DELETED; } spin_unlock(&GlobalMid_Lock); mutex_unlock(&server->srv_mutex); @@ -576,6 +578,7 @@ cifs_reconnect(struct TCP_Server_Info *server) mid_entry = list_entry(tmp, struct mid_q_entry, qhead); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); + cifs_mid_q_entry_release(mid_entry); } if (cifs_rdma_enabled(server)) { @@ -895,8 +898,10 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) if (mid->mid_flags & MID_DELETED) printk_once(KERN_WARNING "trying to dequeue a deleted mid\n"); - else + else { list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; + } spin_unlock(&GlobalMid_Lock); } @@ -966,8 +971,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) list_for_each_safe(tmp, tmp2, &server->pending_mid_q) { mid_entry = list_entry(tmp, struct mid_q_entry, qhead); cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid); + kref_get(&mid_entry->refcount); mid_entry->mid_state = MID_SHUTDOWN; list_move(&mid_entry->qhead, &dispose_list); + mid_entry->mid_flags |= MID_DELETED; } spin_unlock(&GlobalMid_Lock); @@ -977,6 +984,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid); list_del_init(&mid_entry->qhead); mid_entry->callback(mid_entry); + cifs_mid_q_entry_release(mid_entry); } /* 1/8th of sec is more than enough time for them to exit */ msleep(125); @@ -3882,8 +3890,12 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = socket->ops->connect(socket, saddr, slen, server->noblockcnt ? O_NONBLOCK : 0); - - if (rc == -EINPROGRESS) + /* + * When mounting SMB root file systems, we do not want to block in + * connect. Otherwise bail out and then let cifs_reconnect() perform + * reconnect failover - if possible. + */ + if (server->noblockcnt && rc == -EINPROGRESS) rc = 0; if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); @@ -4264,7 +4276,7 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, server->ops->qfs_tcon(*xid, tcon); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) { if (tcon->fsDevInfo.DeviceCharacteristics & - FILE_READ_ONLY_DEVICE) + cpu_to_le32(FILE_READ_ONLY_DEVICE)) cifs_dbg(VFS, "mounted to read only share\n"); else if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RW_CACHE) == 0) @@ -4445,7 +4457,7 @@ static int setup_dfs_tgt_conn(const char *path, int rc; struct dfs_info3_param ref = {0}; char *mdata = NULL, *fake_devname = NULL; - struct smb_vol fake_vol = {0}; + struct smb_vol fake_vol = {NULL}; cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path); diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index dd5ac841aefa..7ce689d31aa2 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -738,10 +738,16 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, static int cifs_d_revalidate(struct dentry *direntry, unsigned int flags) { + struct inode *inode; + if (flags & LOOKUP_RCU) return -ECHILD; if (d_really_is_positive(direntry)) { + inode = d_inode(direntry); + if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode))) + CIFS_I(inode)->time = 0; /* force reval */ + if (cifs_revalidate_dentry(direntry)) return 0; else { @@ -752,7 +758,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) * attributes will have been updated by * cifs_revalidate_dentry(). */ - if (IS_AUTOMOUNT(d_inode(direntry)) && + if (IS_AUTOMOUNT(inode) && !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) { spin_lock(&direntry->d_lock); direntry->d_flags |= DCACHE_NEED_AUTOMOUNT; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 4b95700c507c..fa7b0fa72bb3 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -253,6 +253,12 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, rc = cifs_get_inode_info(&inode, full_path, buf, inode->i_sb, xid, fid); + if (rc) { + server->ops->close(xid, tcon, fid); + if (rc == -ESTALE) + rc = -EOPENSTALE; + } + out: kfree(buf); return rc; @@ -275,6 +281,13 @@ cifs_has_mand_locks(struct cifsInodeInfo *cinode) return has_locks; } +void +cifs_down_write(struct rw_semaphore *sem) +{ + while (!down_write_trylock(sem)) + msleep(10); +} + struct cifsFileInfo * cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) @@ -300,7 +313,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, INIT_LIST_HEAD(&fdlocks->locks); fdlocks->cfile = cfile; cfile->llist = fdlocks; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_add(&fdlocks->llist, &cinode->llist); up_write(&cinode->lock_sem); @@ -399,10 +412,11 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) bool oplock_break_cancelled; spin_lock(&tcon->open_file_lock); - + spin_lock(&cifsi->open_file_lock); spin_lock(&cifs_file->file_info_lock); if (--cifs_file->count > 0) { spin_unlock(&cifs_file->file_info_lock); + spin_unlock(&cifsi->open_file_lock); spin_unlock(&tcon->open_file_lock); return; } @@ -415,9 +429,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) cifs_add_pending_open_locked(&fid, cifs_file->tlink, &open); /* remove it from the lists */ - spin_lock(&cifsi->open_file_lock); list_del(&cifs_file->flist); - spin_unlock(&cifsi->open_file_lock); list_del(&cifs_file->tlist); atomic_dec(&tcon->num_local_opens); @@ -434,6 +446,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) cifs_set_oplock_level(cifsi, 0); } + spin_unlock(&cifsi->open_file_lock); spin_unlock(&tcon->open_file_lock); oplock_break_cancelled = wait_oplock_handler ? @@ -458,7 +471,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) * Delete any outstanding lock records. We'll lose them when the file * is closed anyway. */ - down_write(&cifsi->lock_sem); + cifs_down_write(&cifsi->lock_sem); list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) { list_del(&li->llist); cifs_del_lock_waiters(li); @@ -1021,7 +1034,7 @@ static void cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) { struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); } @@ -1043,7 +1056,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock, try_again: exist = false; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, lock->type, lock->flags, &conf_lock, @@ -1066,7 +1079,7 @@ try_again: (lock->blist.next == &lock->blist)); if (!rc) goto try_again; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_del_init(&lock->blist); } @@ -1119,7 +1132,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock) return rc; try_again: - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { up_write(&cinode->lock_sem); return rc; @@ -1325,7 +1338,7 @@ cifs_push_locks(struct cifsFileInfo *cfile) int rc = 0; /* we are going to update can_cache_brlcks here - need a write access */ - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); if (!cinode->can_cache_brlcks) { up_write(&cinode->lock_sem); return rc; @@ -1516,7 +1529,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, if (!buf) return -ENOMEM; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); for (i = 0; i < 2; i++) { cur = buf; num = 0; @@ -1840,13 +1853,12 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, { struct cifsFileInfo *open_file = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); /* we could simply get the first_list_entry since write-only entries are always at the end of the list but since the first entry might have a close pending, we go through the whole list */ @@ -1858,7 +1870,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, /* found a good file */ /* lock it so it will not be closed on us */ cifsFileInfo_get(open_file); - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return open_file; } /* else might as well continue, and look for another, or simply have the caller reopen it @@ -1866,7 +1878,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, } else /* write only file */ break; /* write only files are last so must be done */ } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return NULL; } @@ -1877,7 +1889,6 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, { struct cifsFileInfo *open_file, *inv_file = NULL; struct cifs_sb_info *cifs_sb; - struct cifs_tcon *tcon; bool any_available = false; int rc = -EBADF; unsigned int refind = 0; @@ -1897,16 +1908,15 @@ cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, bool fsuid_only, } cifs_sb = CIFS_SB(cifs_inode->vfs_inode.i_sb); - tcon = cifs_sb_master_tcon(cifs_sb); /* only filter by fsuid on multiuser mounts */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER)) fsuid_only = false; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); refind_writable: if (refind > MAX_REOPEN_ATT) { - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return rc; } list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { @@ -1918,7 +1928,7 @@ refind_writable: if (!open_file->invalidHandle) { /* found a good writable file */ cifsFileInfo_get(open_file); - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); *ret_file = open_file; return 0; } else { @@ -1938,7 +1948,7 @@ refind_writable: cifsFileInfo_get(inv_file); } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); if (inv_file) { rc = cifs_reopen_file(inv_file, false); @@ -1953,7 +1963,7 @@ refind_writable: cifsFileInfo_put(inv_file); ++refind; inv_file = NULL; - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); goto refind_writable; } @@ -4461,17 +4471,15 @@ static int cifs_readpage(struct file *file, struct page *page) static int is_inode_writable(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; - struct cifs_tcon *tcon = - cifs_sb_master_tcon(CIFS_SB(cifs_inode->vfs_inode.i_sb)); - spin_lock(&tcon->open_file_lock); + spin_lock(&cifs_inode->open_file_lock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE) { - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return 1; } } - spin_unlock(&tcon->open_file_lock); + spin_unlock(&cifs_inode->open_file_lock); return 0; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 26cdfbf1e164..df9377828e2f 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -414,6 +414,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* if uniqueid is different, return error */ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { + CIFS_I(*pinode)->time = 0; /* force reval */ rc = -ESTALE; goto cgiiu_exit; } @@ -421,6 +422,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* if filetype is different, return error */ if (unlikely(((*pinode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { + CIFS_I(*pinode)->time = 0; /* force reval */ rc = -ESTALE; goto cgiiu_exit; } @@ -933,6 +935,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* if uniqueid is different, return error */ if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { + CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; goto cgii_exit; } @@ -940,6 +943,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, /* if filetype is different, return error */ if (unlikely(((*inode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { + CIFS_I(*inode)->time = 0; /* force reval */ rc = -ESTALE; goto cgii_exit; } @@ -1622,13 +1626,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) } /* BB add setting the equivalent of mode via CreateX w/ACLs */ - rc = server->ops->mkdir(xid, tcon, full_path, cifs_sb); + rc = server->ops->mkdir(xid, inode, mode, tcon, full_path, cifs_sb); if (rc) { cifs_dbg(FYI, "cifs_mkdir returned 0x%x\n", rc); d_drop(direntry); goto mkdir_out; } + /* TODO: skip this for smb2/smb3 */ rc = cifs_mkdir_qinfo(inode, direntry, mode, full_path, cifs_sb, tcon, xid); mkdir_out: @@ -2470,9 +2475,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid); cifsFileInfo_put(wfile); if (rc) - return rc; + goto cifs_setattr_exit; } else if (rc != -EBADF) - return rc; + goto cifs_setattr_exit; else rc = 0; } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 76ddd98b6298..1a01e108d75e 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -164,6 +164,7 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); + struct smb3_key_debug_info pkey_inf; int rc = -ENOTTY; /* strange error - but the precedent */ unsigned int xid; struct cifsFileInfo *pSMBFile = filep->private_data; @@ -270,6 +271,34 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = -EOPNOTSUPP; break; + case CIFS_DUMP_KEY: + if (pSMBFile == NULL) + break; + if (!capable(CAP_SYS_ADMIN)) { + rc = -EACCES; + break; + } + + tcon = tlink_tcon(pSMBFile->tlink); + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + break; + } + pkey_inf.cipher_type = + le16_to_cpu(tcon->ses->server->cipher_type); + pkey_inf.Suid = tcon->ses->Suid; + memcpy(pkey_inf.auth_key, tcon->ses->auth_key.response, + 16 /* SMB2_NTLMV2_SESSKEY_SIZE */); + memcpy(pkey_inf.smb3decryptionkey, + tcon->ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); + memcpy(pkey_inf.smb3encryptionkey, + tcon->ses->smb3encryptionkey, SMB3_SIGN_KEY_SIZE); + if (copy_to_user((void __user *)arg, &pkey_inf, + sizeof(struct smb3_key_debug_info))) + rc = -EFAULT; + else + rc = 0; + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 49c17ee18254..9b41436fb8db 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -117,10 +117,6 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = { {0, 0} }; -static const struct smb_to_posix_error mapping_table_ERRHRD[] = { - {0, 0} -}; - /* * Convert a string containing text IPv4 or IPv6 address to binary form. * diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 4c764ff7edd2..85bd644f9773 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -698,7 +698,6 @@ sess_auth_lanman(struct sess_data *sess_data) char *bcc_ptr; struct cifs_ses *ses = sess_data->ses; char lnm_session_key[CIFS_AUTH_RESP_SIZE]; - __u32 capabilities; __u16 bytes_remaining; /* lanman 2 style sessionsetup */ @@ -709,7 +708,7 @@ sess_auth_lanman(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; bcc_ptr = sess_data->iov[2].iov_base; - capabilities = cifs_ssetup_hdr(ses, pSMB); + (void)cifs_ssetup_hdr(ses, pSMB); pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index b7421a096319..514810694c0f 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -171,6 +171,9 @@ cifs_get_next_mid(struct TCP_Server_Info *server) /* we do not want to loop forever */ last_mid = cur_mid; cur_mid++; + /* avoid 0xFFFF MID */ + if (cur_mid == 0xffff) + cur_mid++; /* * This nested loop looks more expensive than it is. diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index e6a1fc72018f..8b0b512c5792 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -145,7 +145,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, cur = buf; - down_write(&cinode->lock_sem); + cifs_down_write(&cinode->lock_sem); list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) { if (flock->fl_start > li->offset || (flock->fl_start + length) < diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index d2a3fb7e5c8d..4121ac1163ca 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -51,7 +51,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const char *full_path, __u32 desired_access, __u32 create_disposition, - __u32 create_options, void *ptr, int command, + __u32 create_options, umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile) { int rc; @@ -103,6 +103,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; oparms.fid = &fid; oparms.reconnect = false; + oparms.mode = mode; memset(&open_iov, 0, sizeof(open_iov)); rqst[num_rqst].rq_iov = open_iov; @@ -478,7 +479,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, cifs_get_readable_path(tcon, full_path, &cfile); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, create_options, - smb2_data, SMB2_OP_QUERY_INFO, cfile); + ACL_NO_MODE, smb2_data, SMB2_OP_QUERY_INFO, cfile); if (rc == -EOPNOTSUPP) { *symlink = true; create_options |= OPEN_REPARSE_POINT; @@ -486,8 +487,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* Failed on a symbolic link - query a reparse point info */ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN, - create_options, smb2_data, - SMB2_OP_QUERY_INFO, NULL); + create_options, ACL_NO_MODE, + smb2_data, SMB2_OP_QUERY_INFO, NULL); } if (rc) goto out; @@ -499,12 +500,14 @@ out: } int -smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, +smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode, + struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, NULL, SMB2_OP_MKDIR, NULL); + CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR, + NULL); } void @@ -525,8 +528,8 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name, cifs_get_writable_path(tcon, name, &cfile); tmprc = smb2_compound_op(xid, tcon, cifs_sb, name, FILE_WRITE_ATTRIBUTES, FILE_CREATE, - CREATE_NOT_FILE, &data, SMB2_OP_SET_INFO, - cfile); + CREATE_NOT_FILE, ACL_NO_MODE, + &data, SMB2_OP_SET_INFO, cfile); if (tmprc == 0) cifs_i->cifsAttrs = dosattrs; } @@ -536,7 +539,7 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb) { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, - CREATE_NOT_FILE, + CREATE_NOT_FILE, ACL_NO_MODE, NULL, SMB2_OP_RMDIR, NULL); } @@ -546,7 +549,7 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name, { return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN, CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT, - NULL, SMB2_OP_DELETE, NULL); + ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL); } static int @@ -564,7 +567,8 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon, goto smb2_rename_path; } rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access, - FILE_OPEN, 0, smb2_to_name, command, cfile); + FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name, + command, cfile); smb2_rename_path: kfree(smb2_to_name); return rc; @@ -601,8 +605,8 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon, __le64 eof = cpu_to_le64(size); return smb2_compound_op(xid, tcon, cifs_sb, full_path, - FILE_WRITE_DATA, FILE_OPEN, 0, &eof, - SMB2_OP_SET_EOF, NULL); + FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE, + &eof, SMB2_OP_SET_EOF, NULL); } int @@ -623,8 +627,8 @@ smb2_set_file_info(struct inode *inode, const char *full_path, return PTR_ERR(tlink); rc = smb2_compound_op(xid, tlink_tcon(tlink), cifs_sb, full_path, - FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, buf, - SMB2_OP_SET_INFO, NULL); + FILE_WRITE_ATTRIBUTES, FILE_OPEN, + 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, NULL); cifs_put_tlink(tlink); return rc; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index eaed18061314..cd55af9b7cc5 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -751,6 +751,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) goto oshr_exit; } + atomic_inc(&tcon->num_remote_opens); + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; oparms.fid->persistent_fid = o_rsp->PersistentFileId; oparms.fid->volatile_fid = o_rsp->VolatileFileId; @@ -1176,6 +1178,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, rc = compound_send_recv(xid, ses, flags, 3, rqst, resp_buftype, rsp_iov); + /* no need to bump num_remote_opens because handle immediately closed */ sea_exit: kfree(ea); @@ -1518,6 +1521,8 @@ smb2_ioctl_query_info(const unsigned int xid, resp_buftype, rsp_iov); if (rc) goto iqinf_exit; + + /* No need to bump num_remote_opens since handle immediately closed */ if (qi.flags & PASSTHRU_FSCTL) { pqi = (struct smb_query_info __user *)arg; io_rsp = (struct smb2_ioctl_rsp *)rsp_iov[1].iov_base; @@ -3328,6 +3333,11 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; + /* Check if the server granted an oplock rather than a lease */ + if (oplock & SMB2_OPLOCK_LEVEL_EXCLUSIVE) + return smb2_set_oplock_level(cinode, oplock, epoch, + purge_cache); + if (oplock & SMB2_LEASE_READ_CACHING_HE) { new_oplock |= CIFS_CACHE_READ_FLG; strcat(message, "R"); @@ -4074,6 +4084,7 @@ free_pages: kfree(dw->ppages); cifs_small_buf_release(dw->buf); + kfree(dw); } @@ -4147,7 +4158,7 @@ receive_encrypted_read(struct TCP_Server_Info *server, struct mid_q_entry **mid, dw->server = server; dw->ppages = pages; dw->len = len; - queue_work(cifsiod_wq, &dw->decrypt); + queue_work(decrypt_wq, &dw->decrypt); *num_mids = 0; /* worker thread takes care of finding mid */ return -1; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 87066f1af12c..05149862aea4 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -751,6 +751,8 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) unsigned int num = *num_iovec; iov[num].iov_base = create_posix_buf(mode); + if (mode == ACL_NO_MODE) + cifs_dbg(FYI, "illegal mode\n"); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -2352,6 +2354,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, rqst.rq_iov = iov; rqst.rq_nvec = n_iov; + /* no need to inc num_remote_opens because we close it just below */ trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); /* resource #4: response buffer */ @@ -2416,6 +2419,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, /* File attributes ignored on open (used in create though) */ req->FileAttributes = cpu_to_le32(file_attributes); req->ShareAccess = FILE_SHARE_ALL_LE; + req->CreateDisposition = cpu_to_le32(oparms->disposition); req->CreateOptions = cpu_to_le32(oparms->create_options & CREATE_OPTIONS_MASK); req->NameOffset = cpu_to_le16(sizeof(struct smb2_create_req)); @@ -2517,6 +2521,20 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock, return rc; } + if ((oparms->disposition == FILE_CREATE) && + (oparms->mode != ACL_NO_MODE)) { + if (n_iov > 2) { + struct create_context *ccontext = + (struct create_context *)iov[n_iov-1].iov_base; + ccontext->Next = + cpu_to_le32(iov[n_iov-1].iov_len); + } + + /* rc = add_sd_context(iov, &n_iov, oparms->mode); */ + if (rc) + return rc; + } + if (n_iov > 2) { struct create_context *ccontext = (struct create_context *)iov[n_iov-1].iov_base; @@ -3180,7 +3198,7 @@ SMB2_get_srv_num(const unsigned int xid, struct cifs_tcon *tcon, * See MS-SMB2 2.2.35 and 2.2.36 */ -int +static int SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 completion_filter, bool watch_tree) @@ -3196,7 +3214,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, req->PersistentFileId = persistent_fid; req->VolatileFileId = volatile_fid; - req->OutputBufferLength = SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE; + req->OutputBufferLength = + cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE); req->CompletionFilter = cpu_to_le32(completion_filter); if (watch_tree) req->Flags = cpu_to_le16(SMB2_WATCH_TREE); diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 67a91b11fd59..71b2930b8e0b 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -84,7 +84,8 @@ extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, umode_t mode, struct cifs_tcon *tcon, const char *full_path, struct cifs_sb_info *cifs_sb); -extern int smb2_mkdir(const unsigned int xid, struct cifs_tcon *tcon, +extern int smb2_mkdir(const unsigned int xid, struct inode *inode, + umode_t mode, struct cifs_tcon *tcon, const char *name, struct cifs_sb_info *cifs_sb); extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path, struct cifs_sb_info *cifs_sb, @@ -149,6 +150,10 @@ extern int SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, bool is_fsctl, char *in_data, u32 indatalen, __u32 max_response_size); extern void SMB2_ioctl_free(struct smb_rqst *rqst); +extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, bool watch_tree, + u32 completion_filter); + extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_close_flags(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 08628e6a42ac..1ff28529cf4b 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -144,6 +144,17 @@ #define IO_REPARSE_APPXSTREAM 0xC0000014 /* NFS symlinks, Win 8/SMB3 and later */ #define IO_REPARSE_TAG_NFS 0x80000014 +/* + * AzureFileSync - see + * https://docs.microsoft.com/en-us/azure/storage/files/storage-sync-cloud-tiering + */ +#define IO_REPARSE_TAG_AZ_FILE_SYNC 0x8000001e +/* WSL reparse tags */ +#define IO_REPARSE_TAG_LX_SYMLINK 0xA000001D +#define IO_REPARSE_TAG_AF_UNIX 0x80000023 +#define IO_REPARSE_TAG_LX_FIFO 0x80000024 +#define IO_REPARSE_TAG_LX_CHR 0x80000025 +#define IO_REPARSE_TAG_LX_BLK 0x80000026 /* fsctl flags */ /* If Flags is set to this value, the request is an FSCTL not ioctl request */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 308ad0f495e1..ca3de62688d6 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -86,22 +86,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) static void _cifs_mid_q_entry_release(struct kref *refcount) { - struct mid_q_entry *mid = container_of(refcount, struct mid_q_entry, - refcount); - - mempool_free(mid, cifs_mid_poolp); -} - -void cifs_mid_q_entry_release(struct mid_q_entry *midEntry) -{ - spin_lock(&GlobalMid_Lock); - kref_put(&midEntry->refcount, _cifs_mid_q_entry_release); - spin_unlock(&GlobalMid_Lock); -} - -void -DeleteMidQEntry(struct mid_q_entry *midEntry) -{ + struct mid_q_entry *midEntry = + container_of(refcount, struct mid_q_entry, refcount); #ifdef CONFIG_CIFS_STATS2 __le16 command = midEntry->server->vals->lock_cmd; __u16 smb_cmd = le16_to_cpu(midEntry->command); @@ -166,6 +152,19 @@ DeleteMidQEntry(struct mid_q_entry *midEntry) } } #endif + + mempool_free(midEntry, cifs_mid_poolp); +} + +void cifs_mid_q_entry_release(struct mid_q_entry *midEntry) +{ + spin_lock(&GlobalMid_Lock); + kref_put(&midEntry->refcount, _cifs_mid_q_entry_release); + spin_unlock(&GlobalMid_Lock); +} + +void DeleteMidQEntry(struct mid_q_entry *midEntry) +{ cifs_mid_q_entry_release(midEntry); } @@ -173,8 +172,10 @@ void cifs_delete_mid(struct mid_q_entry *mid) { spin_lock(&GlobalMid_Lock); - list_del_init(&mid->qhead); - mid->mid_flags |= MID_DELETED; + if (!(mid->mid_flags & MID_DELETED)) { + list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; + } spin_unlock(&GlobalMid_Lock); DeleteMidQEntry(mid); @@ -872,7 +873,10 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) rc = -EHOSTDOWN; break; default: - list_del_init(&mid->qhead); + if (!(mid->mid_flags & MID_DELETED)) { + list_del_init(&mid->qhead); + mid->mid_flags |= MID_DELETED; + } cifs_server_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n", __func__, mid->mid, mid->mid_state); rc = -EIO; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 9076150758d8..db4ba8f6077e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -31,7 +31,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" -#define MAX_EA_VALUE_SIZE 65535 +#define MAX_EA_VALUE_SIZE CIFSMaxBufSize #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" #define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ #define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ @@ -220,10 +220,11 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) for (;;) { entry = xas_find_conflict(xas); + if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) + return entry; if (dax_entry_order(entry) < order) return XA_RETRY_ENTRY; - if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) || - !dax_is_locked(entry)) + if (!dax_is_locked(entry)) return entry; wq = dax_entry_waitqueue(xas, entry, &ewait.key); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 93e4ca6b2ad7..87846aad594b 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -19,6 +19,7 @@ #include <linux/atomic.h> #include <linux/device.h> #include <linux/poll.h> +#include <linux/security.h> #include "internal.h" @@ -136,6 +137,25 @@ void debugfs_file_put(struct dentry *dentry) } EXPORT_SYMBOL_GPL(debugfs_file_put); +/* + * Only permit access to world-readable files when the kernel is locked down. + * We also need to exclude any file that has ways to write or alter it as root + * can bypass the permissions check. + */ +static bool debugfs_is_locked_down(struct inode *inode, + struct file *filp, + const struct file_operations *real_fops) +{ + if ((inode->i_mode & 07777) == 0444 && + !(filp->f_mode & FMODE_WRITE) && + !real_fops->unlocked_ioctl && + !real_fops->compat_ioctl && + !real_fops->mmap) + return false; + + return security_locked_down(LOCKDOWN_DEBUGFS); +} + static int open_proxy_open(struct inode *inode, struct file *filp) { struct dentry *dentry = F_DENTRY(filp); @@ -147,6 +167,11 @@ static int open_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + + r = debugfs_is_locked_down(inode, filp, real_fops); + if (r) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not clean up after itself at exit? */ @@ -272,6 +297,11 @@ static int full_proxy_open(struct inode *inode, struct file *filp) return r == -EIO ? -ENOENT : r; real_fops = debugfs_real_fops(filp); + + r = debugfs_is_locked_down(inode, filp, real_fops); + if (r) + goto out; + real_fops = fops_get(real_fops); if (!real_fops) { /* Huh? Module did not cleanup after itself at exit? */ diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 042b688ed124..7b975dbb2bb4 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -26,6 +26,7 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> +#include <linux/security.h> #include "internal.h" @@ -35,6 +36,32 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; +/* + * Don't allow access attributes to be changed whilst the kernel is locked down + * so that we can use the file mode as part of a heuristic to determine whether + * to lock down individual files. + */ +static int debugfs_setattr(struct dentry *dentry, struct iattr *ia) +{ + int ret = security_locked_down(LOCKDOWN_DEBUGFS); + + if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) + return ret; + return simple_setattr(dentry, ia); +} + +static const struct inode_operations debugfs_file_inode_operations = { + .setattr = debugfs_setattr, +}; +static const struct inode_operations debugfs_dir_inode_operations = { + .lookup = simple_lookup, + .setattr = debugfs_setattr, +}; +static const struct inode_operations debugfs_symlink_inode_operations = { + .get_link = simple_get_link, + .setattr = debugfs_setattr, +}; + static struct inode *debugfs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -369,6 +396,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, inode->i_mode = mode; inode->i_private = data; + inode->i_op = &debugfs_file_inode_operations; inode->i_fop = proxy_fops; dentry->d_fsdata = (void *)((unsigned long)real_fops | DEBUGFS_FSDATA_IS_REAL_FOPS_BIT); @@ -532,7 +560,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) } inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - inode->i_op = &simple_dir_inode_operations; + inode->i_op = &debugfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -632,7 +660,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; - inode->i_op = &simple_symlink_inode_operations; + inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); diff --git a/fs/direct-io.c b/fs/direct-io.c index ae196784f487..9329ced91f1d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -241,9 +241,8 @@ void dio_warn_stale_pagecache(struct file *filp) } } -/** +/* * dio_complete() - called when all DIO BIO I/O has been completed - * @offset: the byte offset in the file of the completed operation * * This drops i_dio_count, lets interested parties know that a DIO operation * has completed, and calculates the resulting return code for the operation. diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 8a9fcbd0e8ac..fc3a8d8064f8 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -34,11 +34,15 @@ static void erofs_readendio(struct bio *bio) struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) { - struct inode *const bd_inode = sb->s_bdev->bd_inode; - struct address_space *const mapping = bd_inode->i_mapping; + struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; + struct page *page; - return read_cache_page_gfp(mapping, blkaddr, + page = read_cache_page_gfp(mapping, blkaddr, mapping_gfp_constraint(mapping, ~__GFP_FS)); + /* should already be PageUptodate */ + if (!IS_ERR(page)) + lock_page(page); + return page; } static int erofs_map_blocks_flatmode(struct inode *inode, diff --git a/fs/erofs/super.c b/fs/erofs/super.c index caf9a95173b0..0e369494f2f2 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -105,9 +105,9 @@ static int erofs_read_superblock(struct super_block *sb) int ret; page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL); - if (!page) { + if (IS_ERR(page)) { erofs_err(sb, "cannot read erofs superblock"); - return -EIO; + return PTR_ERR(page); } sbi = EROFS_SB(sb); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 96e34c90f814..fad80c97d247 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -575,7 +575,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; struct z_erofs_collector *const clt = &fe->clt; const loff_t offset = page_offset(page); - bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED); + bool tight = true; enum z_erofs_cache_alloctype cache_strategy; enum z_erofs_page_type page_type; @@ -628,8 +628,16 @@ restart_now: preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy, pagepool); - tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED); hitted: + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or pagevec (should be processed in strict order.) + */ + tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED && + clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); diff --git a/fs/exec.c b/fs/exec.c index f7f6a140856a..555e93c7dec8 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm) } task_lock(tsk); active_mm = tsk->active_mm; + membarrier_exec_mmap(mm); tsk->mm = mm; tsk->active_mm = mm; activate_mm(active_mm, mm); @@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - membarrier_execve(current); rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 123e3dee7733..516faa280ced 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4551,6 +4551,7 @@ static int __ext4_get_inode_loc(struct inode *inode, struct buffer_head *bh; struct super_block *sb = inode->i_sb; ext4_fsblk_t block; + struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; @@ -4639,6 +4640,7 @@ make_io: * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ + blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; @@ -4669,6 +4671,7 @@ make_io: get_bh(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, bh); + blk_finish_plug(&plug); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { EXT4_ERROR_INODE_BLOCK(inode, block, diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 1bda2ab6745b..054acd9fd033 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -88,9 +88,7 @@ static int fat__get_entry(struct inode *dir, loff_t *pos, int err, offset; next: - if (*bh) - brelse(*bh); - + brelse(*bh); *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); @@ -1100,8 +1098,11 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used, err = -ENOMEM; goto error; } + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[n]); memset(bhs[n]->b_data, 0, sb->s_blocksize); set_buffer_uptodate(bhs[n]); + unlock_buffer(bhs[n]); mark_buffer_dirty_inode(bhs[n], dir); n++; @@ -1158,6 +1159,8 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) fat_time_unix2fat(sbi, ts, &time, &date, &time_cs); de = (struct msdos_dir_entry *)bhs[0]->b_data; + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[0]); /* filling the new directory slots ("." and ".." entries) */ memcpy(de[0].name, MSDOS_DOT, MSDOS_NAME); memcpy(de[1].name, MSDOS_DOTDOT, MSDOS_NAME); @@ -1180,6 +1183,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec64 *ts) de[0].size = de[1].size = 0; memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de)); set_buffer_uptodate(bhs[0]); + unlock_buffer(bhs[0]); mark_buffer_dirty_inode(bhs[0], dir); err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE); @@ -1237,11 +1241,14 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots, /* fill the directory entry */ copy = min(size, sb->s_blocksize); + /* Avoid race with userspace read via bdev */ + lock_buffer(bhs[n]); memcpy(bhs[n]->b_data, slots, copy); - slots += copy; - size -= copy; set_buffer_uptodate(bhs[n]); + unlock_buffer(bhs[n]); mark_buffer_dirty_inode(bhs[n], dir); + slots += copy; + size -= copy; if (!size) break; n++; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 265983635f2b..3647c65a0f48 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -388,8 +388,11 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, err = -ENOMEM; goto error; } + /* Avoid race with userspace read via bdev */ + lock_buffer(c_bh); memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); + unlock_buffer(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); diff --git a/fs/fhandle.c b/fs/fhandle.c index 0ee727485615..01263ffbc4c0 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -246,7 +246,7 @@ static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, * sys_open_by_handle_at: Open the file handle * @mountdirfd: directory file descriptor * @handle: file handle to be opened - * @flag: open flags. + * @flags: open flags. * * @mountdirfd indicate the directory file descriptor * of the mount point. file handle is decoded relative diff --git a/fs/file_table.c b/fs/file_table.c index b07b53f24ff5..30d55c9a1744 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -327,6 +327,7 @@ void flush_delayed_fput(void) { delayed_fput(NULL); } +EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8aaa7eec7b74..8461a6322039 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -164,8 +164,13 @@ static void finish_writeback_work(struct bdi_writeback *wb, if (work->auto_free) kfree(work); - if (done && atomic_dec_and_test(&done->cnt)) - wake_up_all(done->waitq); + if (done) { + wait_queue_head_t *waitq = done->waitq; + + /* @done can't be accessed after the following dec */ + if (atomic_dec_and_test(&done->cnt)) + wake_up_all(waitq); + } } static void wb_queue_work(struct bdi_writeback *wb, @@ -900,7 +905,7 @@ restart: * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id - * @nr_pages: number of pages to write, 0 for best-effort dirty flushing + * @nr: number of pages to write, 0 for best-effort dirty flushing * @reason: reason why some writeback work initiated * @done: target wb_completion * diff --git a/fs/fs_context.c b/fs/fs_context.c index 87c2c9687d90..138b5b4d621d 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -504,7 +504,6 @@ void put_fs_context(struct fs_context *fc) put_net(fc->net_ns); put_user_ns(fc->user_ns); put_cred(fc->cred); - kfree(fc->subtype); put_fc_log(fc); put_filesystem(fc->fs_type); kfree(fc->source); @@ -571,17 +570,6 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param) return 0; } - if ((fc->fs_type->fs_flags & FS_HAS_SUBTYPE) && - strcmp(param->key, "subtype") == 0) { - if (param->type != fs_value_is_string) - return invalf(fc, "VFS: Legacy: Non-string subtype"); - if (fc->subtype) - return invalf(fc, "VFS: Legacy: Multiple subtype"); - fc->subtype = param->string; - param->string = NULL; - return 0; - } - if (ctx->param_type == LEGACY_FS_MONOLITHIC_PARAMS) return invalf(fc, "VFS: Legacy: Can't mix monolithic and individual options"); @@ -738,8 +726,6 @@ void vfs_clean_context(struct fs_context *fc) fc->s_fs_info = NULL; fc->sb_flags = 0; security_free_mnt_opts(&fc->security); - kfree(fc->subtype); - fc->subtype = NULL; kfree(fc->source); fc->source = NULL; diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 24fc5a5c1b97..0635cba19971 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -27,3 +27,14 @@ config CUSE If you want to develop or use a userspace character device based on CUSE, answer Y or M. + +config VIRTIO_FS + tristate "Virtio Filesystem" + depends on FUSE_FS + select VIRTIO + help + The Virtio Filesystem allows guests to mount file systems from the + host. + + If you want to share files between guests or with the host, answer Y + or M. diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 9485019c2a14..3e8cebfb59b7 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -5,5 +5,7 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o +obj-$(CONFIG_VIRTIO_FS) += virtiofs.o fuse-objs := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o +virtiofs-y += virtio_fs.o diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index bab7a0db81dd..00015d851382 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -142,11 +142,10 @@ static int cuse_open(struct inode *inode, struct file *file) static int cuse_release(struct inode *inode, struct file *file) { - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_sync_release(fi, ff, file->f_flags); + fuse_sync_release(NULL, ff, file->f_flags); fuse_conn_put(fc); return 0; @@ -299,6 +298,14 @@ static void cuse_gendev_release(struct device *dev) kfree(dev); } +struct cuse_init_args { + struct fuse_args_pages ap; + struct cuse_init_in in; + struct cuse_init_out out; + struct page *page; + struct fuse_page_desc desc; +}; + /** * cuse_process_init_reply - finish initializing CUSE channel * @@ -306,21 +313,22 @@ static void cuse_gendev_release(struct device *dev) * required data structures for it. Please read the comment at the * top of this file for high level overview. */ -static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +static void cuse_process_init_reply(struct fuse_conn *fc, + struct fuse_args *args, int error) { + struct cuse_init_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; struct cuse_conn *cc = fc_to_cc(fc), *pos; - struct cuse_init_out *arg = req->out.args[0].value; - struct page *page = req->pages[0]; + struct cuse_init_out *arg = &ia->out; + struct page *page = ap->pages[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; dev_t devt; int rc, i; - if (req->out.h.error || - arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { + if (error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) goto err; - } fc->minor = arg->minor; fc->max_read = max_t(unsigned, arg->max_read, 4096); @@ -329,7 +337,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) /* parse init reply */ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL; - rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size, + rc = cuse_parse_devinfo(page_address(page), ap->args.out_args[1].size, &devinfo); if (rc) goto err; @@ -396,7 +404,7 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) dev_set_uevent_suppress(dev, 0); kobject_uevent(&dev->kobj, KOBJ_ADD); out: - kfree(arg); + kfree(ia); __free_page(page); return; @@ -415,55 +423,49 @@ err: static int cuse_send_init(struct cuse_conn *cc) { int rc; - struct fuse_req *req; struct page *page; struct fuse_conn *fc = &cc->fc; - struct cuse_init_in *arg; - void *outarg; + struct cuse_init_args *ia; + struct fuse_args_pages *ap; BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); - req = fuse_get_req_for_background(fc, 1); - if (IS_ERR(req)) { - rc = PTR_ERR(req); - goto err; - } - rc = -ENOMEM; page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) - goto err_put_req; + goto err; - outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL); - if (!outarg) + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (!ia) goto err_free_page; - arg = &req->misc.cuse_init_in; - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->flags |= CUSE_UNRESTRICTED_IOCTL; - req->in.h.opcode = CUSE_INIT; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct cuse_init_in); - req->in.args[0].value = arg; - req->out.numargs = 2; - req->out.args[0].size = sizeof(struct cuse_init_out); - req->out.args[0].value = outarg; - req->out.args[1].size = CUSE_INIT_INFO_MAX; - req->out.argvar = 1; - req->out.argpages = 1; - req->pages[0] = page; - req->page_descs[0].length = req->out.args[1].size; - req->num_pages = 1; - req->end = cuse_process_init_reply; - fuse_request_send_background(fc, req); - - return 0; - + ap = &ia->ap; + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.flags |= CUSE_UNRESTRICTED_IOCTL; + ap->args.opcode = CUSE_INIT; + ap->args.in_numargs = 1; + ap->args.in_args[0].size = sizeof(ia->in); + ap->args.in_args[0].value = &ia->in; + ap->args.out_numargs = 2; + ap->args.out_args[0].size = sizeof(ia->out); + ap->args.out_args[0].value = &ia->out; + ap->args.out_args[1].size = CUSE_INIT_INFO_MAX; + ap->args.out_argvar = 1; + ap->args.out_pages = 1; + ap->num_pages = 1; + ap->pages = &ia->page; + ap->descs = &ia->desc; + ia->page = page; + ia->desc.length = ap->args.out_args[1].size; + ap->args.end = cuse_process_init_reply; + + rc = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + if (rc) { + kfree(ia); err_free_page: - __free_page(page); -err_put_req: - fuse_put_request(fc, req); + __free_page(page); + } err: return rc; } @@ -504,9 +506,9 @@ static int cuse_channel_open(struct inode *inode, struct file *file) * Limit the cuse channel to requests that can * be represented in file->f_cred->user_ns. */ - fuse_conn_init(&cc->fc, file->f_cred->user_ns); + fuse_conn_init(&cc->fc, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); - fud = fuse_dev_alloc(&cc->fc); + fud = fuse_dev_alloc_install(&cc->fc); if (!fud) { kfree(cc); return -ENOMEM; @@ -519,6 +521,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) rc = cuse_send_init(cc); if (rc) { fuse_dev_free(fud); + fuse_conn_put(&cc->fc); return rc; } file->private_data = fud; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ea8237513dfa..ed1abc9e33cf 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -40,107 +40,30 @@ static struct fuse_dev *fuse_get_dev(struct file *file) return READ_ONCE(file->private_data); } -static void fuse_request_init(struct fuse_req *req, struct page **pages, - struct fuse_page_desc *page_descs, - unsigned npages) +static void fuse_request_init(struct fuse_req *req) { INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); refcount_set(&req->count, 1); - req->pages = pages; - req->page_descs = page_descs; - req->max_pages = npages; __set_bit(FR_PENDING, &req->flags); } -static struct page **fuse_req_pages_alloc(unsigned int npages, gfp_t flags, - struct fuse_page_desc **desc) -{ - struct page **pages; - - pages = kzalloc(npages * (sizeof(struct page *) + - sizeof(struct fuse_page_desc)), flags); - *desc = (void *) pages + npages * sizeof(struct page *); - - return pages; -} - -static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) +static struct fuse_req *fuse_request_alloc(gfp_t flags) { struct fuse_req *req = kmem_cache_zalloc(fuse_req_cachep, flags); - if (req) { - struct page **pages = NULL; - struct fuse_page_desc *page_descs = NULL; - - WARN_ON(npages > FUSE_MAX_MAX_PAGES); - if (npages > FUSE_REQ_INLINE_PAGES) { - pages = fuse_req_pages_alloc(npages, flags, - &page_descs); - if (!pages) { - kmem_cache_free(fuse_req_cachep, req); - return NULL; - } - } else if (npages) { - pages = req->inline_pages; - page_descs = req->inline_page_descs; - } + if (req) + fuse_request_init(req); - fuse_request_init(req, pages, page_descs, npages); - } return req; } -struct fuse_req *fuse_request_alloc(unsigned npages) -{ - return __fuse_request_alloc(npages, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(fuse_request_alloc); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages) -{ - return __fuse_request_alloc(npages, GFP_NOFS); -} - -static void fuse_req_pages_free(struct fuse_req *req) -{ - if (req->pages != req->inline_pages) - kfree(req->pages); -} - -bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, - gfp_t flags) -{ - struct page **pages; - struct fuse_page_desc *page_descs; - unsigned int npages = min_t(unsigned int, - max_t(unsigned int, req->max_pages * 2, - FUSE_DEFAULT_MAX_PAGES_PER_REQ), - fc->max_pages); - WARN_ON(npages <= req->max_pages); - - pages = fuse_req_pages_alloc(npages, flags, &page_descs); - if (!pages) - return false; - - memcpy(pages, req->pages, sizeof(struct page *) * req->max_pages); - memcpy(page_descs, req->page_descs, - sizeof(struct fuse_page_desc) * req->max_pages); - fuse_req_pages_free(req); - req->pages = pages; - req->page_descs = page_descs; - req->max_pages = npages; - - return true; -} - -void fuse_request_free(struct fuse_req *req) +static void fuse_request_free(struct fuse_req *req) { - fuse_req_pages_free(req); kmem_cache_free(fuse_req_cachep, req); } -void __fuse_get_request(struct fuse_req *req) +static void __fuse_get_request(struct fuse_req *req) { refcount_inc(&req->count); } @@ -177,8 +100,9 @@ static void fuse_drop_waiting(struct fuse_conn *fc) } } -static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, - bool for_background) +static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); + +static struct fuse_req *fuse_get_req(struct fuse_conn *fc, bool for_background) { struct fuse_req *req; int err; @@ -201,7 +125,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (fc->conn_error) goto out; - req = fuse_request_alloc(npages); + req = fuse_request_alloc(GFP_KERNEL); err = -ENOMEM; if (!req) { if (for_background) @@ -229,101 +153,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, return ERR_PTR(err); } -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) -{ - return __fuse_get_req(fc, npages, false); -} -EXPORT_SYMBOL_GPL(fuse_get_req); - -struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, - unsigned npages) -{ - return __fuse_get_req(fc, npages, true); -} -EXPORT_SYMBOL_GPL(fuse_get_req_for_background); - -/* - * Return request in fuse_file->reserved_req. However that may - * currently be in use. If that is the case, wait for it to become - * available. - */ -static struct fuse_req *get_reserved_req(struct fuse_conn *fc, - struct file *file) -{ - struct fuse_req *req = NULL; - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; - - do { - wait_event(fc->reserved_req_waitq, ff->reserved_req); - spin_lock(&fi->lock); - if (ff->reserved_req) { - req = ff->reserved_req; - ff->reserved_req = NULL; - req->stolen_file = get_file(file); - } - spin_unlock(&fi->lock); - } while (!req); - - return req; -} - -/* - * Put stolen request back into fuse_file->reserved_req - */ -static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) -{ - struct file *file = req->stolen_file; - struct fuse_inode *fi = get_fuse_inode(file_inode(file)); - struct fuse_file *ff = file->private_data; - - WARN_ON(req->max_pages); - spin_lock(&fi->lock); - memset(req, 0, sizeof(*req)); - fuse_request_init(req, NULL, NULL, 0); - BUG_ON(ff->reserved_req); - ff->reserved_req = req; - wake_up_all(&fc->reserved_req_waitq); - spin_unlock(&fi->lock); - fput(file); -} - -/* - * Gets a requests for a file operation, always succeeds - * - * This is used for sending the FLUSH request, which must get to - * userspace, due to POSIX locks which may need to be unlocked. - * - * If allocation fails due to OOM, use the reserved request in - * fuse_file. - * - * This is very unlikely to deadlock accidentally, since the - * filesystem should not have it's own file open. If deadlock is - * intentional, it can still be broken by "aborting" the filesystem. - */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, - struct file *file) -{ - struct fuse_req *req; - - atomic_inc(&fc->num_waiting); - wait_event(fc->blocked_waitq, fc->initialized); - /* Matches smp_wmb() in fuse_set_initialized() */ - smp_rmb(); - req = fuse_request_alloc(0); - if (!req) - req = get_reserved_req(fc, file); - - req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); - req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); - req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); - - __set_bit(FR_WAITING, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - return req; -} - -void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { @@ -342,15 +172,11 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) fuse_drop_waiting(fc); } - if (req->stolen_file) - put_reserved_req(fc, req); - else - fuse_request_free(req); + fuse_request_free(req); } } -EXPORT_SYMBOL_GPL(fuse_put_request); -static unsigned len_args(unsigned numargs, struct fuse_arg *args) +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args) { unsigned nbytes = 0; unsigned i; @@ -360,25 +186,47 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) return nbytes; } +EXPORT_SYMBOL_GPL(fuse_len_args); -static u64 fuse_get_unique(struct fuse_iqueue *fiq) +u64 fuse_get_unique(struct fuse_iqueue *fiq) { fiq->reqctr += FUSE_REQ_ID_STEP; return fiq->reqctr; } +EXPORT_SYMBOL_GPL(fuse_get_unique); static unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } -static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) +/** + * A new request is available, wake fiq->waitq + */ +static void fuse_dev_wake_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + wake_up(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + spin_unlock(&fiq->lock); +} + +const struct fuse_iqueue_ops fuse_dev_fiq_ops = { + .wake_forget_and_unlock = fuse_dev_wake_and_unlock, + .wake_interrupt_and_unlock = fuse_dev_wake_and_unlock, + .wake_pending_and_unlock = fuse_dev_wake_and_unlock, +}; +EXPORT_SYMBOL_GPL(fuse_dev_fiq_ops); + +static void queue_request_and_unlock(struct fuse_iqueue *fiq, + struct fuse_req *req) +__releases(fiq->lock) { req->in.h.len = sizeof(struct fuse_in_header) + - len_args(req->in.numargs, (struct fuse_arg *) req->in.args); + fuse_len_args(req->args->in_numargs, + (struct fuse_arg *) req->args->in_args); list_add_tail(&req->list, &fiq->pending); - wake_up_locked(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_pending_and_unlock(fiq); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, @@ -389,16 +237,15 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (fiq->connected) { fiq->forget_list_tail->next = forget; fiq->forget_list_tail = forget; - wake_up_locked(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_forget_and_unlock(fiq); } else { kfree(forget); + spin_unlock(&fiq->lock); } - spin_unlock(&fiq->waitq.lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -412,10 +259,9 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_first_entry(&fc->bg_queue, struct fuse_req, list); list_del(&req->list); fc->active_background++; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); req->in.h.unique = fuse_get_unique(fiq); - queue_request(fiq, req); - spin_unlock(&fiq->waitq.lock); + queue_request_and_unlock(fiq, req); } } @@ -427,21 +273,24 @@ static void flush_bg_queue(struct fuse_conn *fc) * the 'end' callback is called if given, else the reference to the * request is released */ -static void request_end(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req) { struct fuse_iqueue *fiq = &fc->iq; + bool async; if (test_and_set_bit(FR_FINISHED, &req->flags)) goto put_request; + + async = req->args->end; /* * test_and_set_bit() implies smp_mb() between bit * changing and below intr_entry check. Pairs with * smp_mb() from queue_interrupt(). */ if (!list_empty(&req->intr_entry)) { - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } WARN_ON(test_bit(FR_PENDING, &req->flags)); WARN_ON(test_bit(FR_SENT, &req->flags)); @@ -475,18 +324,19 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); } - if (req->end) - req->end(fc, req); + if (async) + req->args->end(fc, req->args, req->out.h.error); put_request: fuse_put_request(fc, req); } +EXPORT_SYMBOL_GPL(fuse_request_end); static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); /* Check for we've sent request to interrupt this req */ if (unlikely(!test_bit(FR_INTERRUPTED, &req->flags))) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return -EINVAL; } @@ -499,13 +349,13 @@ static int queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) smp_mb(); if (test_bit(FR_FINISHED, &req->flags)) { list_del_init(&req->intr_entry); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return 0; } - wake_up_locked(&fiq->waitq); - kill_fasync(&fiq->fasync, SIGIO, POLL_IN); + fiq->ops->wake_interrupt_and_unlock(fiq); + } else { + spin_unlock(&fiq->lock); } - spin_unlock(&fiq->waitq.lock); return 0; } @@ -535,16 +385,16 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) if (!err) return; - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); /* Request is not yet in userspace, bail out */ if (test_bit(FR_PENDING, &req->flags)) { list_del(&req->list); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); __fuse_put_request(req); req->out.h.error = -EINTR; return; } - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); } /* @@ -559,101 +409,110 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) struct fuse_iqueue *fiq = &fc->iq; BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (!fiq->connected) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); req->out.h.error = -ENOTCONN; } else { req->in.h.unique = fuse_get_unique(fiq); - queue_request(fiq, req); /* acquire extra reference, since request is still needed - after request_end() */ + after fuse_request_end() */ __fuse_get_request(req); - spin_unlock(&fiq->waitq.lock); + queue_request_and_unlock(fiq, req); request_wait_answer(fc, req); - /* Pairs with smp_wmb() in request_end() */ + /* Pairs with smp_wmb() in fuse_request_end() */ smp_rmb(); } } -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) -{ - __set_bit(FR_ISREPLY, &req->flags); - if (!test_bit(FR_WAITING, &req->flags)) { - __set_bit(FR_WAITING, &req->flags); - atomic_inc(&fc->num_waiting); - } - __fuse_request_send(fc, req); -} -EXPORT_SYMBOL_GPL(fuse_request_send); - static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args) { - if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS) - args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE; + if (fc->minor < 4 && args->opcode == FUSE_STATFS) + args->out_args[0].size = FUSE_COMPAT_STATFS_SIZE; if (fc->minor < 9) { - switch (args->in.h.opcode) { + switch (args->opcode) { case FUSE_LOOKUP: case FUSE_CREATE: case FUSE_MKNOD: case FUSE_MKDIR: case FUSE_SYMLINK: case FUSE_LINK: - args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; + args->out_args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE; break; case FUSE_GETATTR: case FUSE_SETATTR: - args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; + args->out_args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE; break; } } if (fc->minor < 12) { - switch (args->in.h.opcode) { + switch (args->opcode) { case FUSE_CREATE: - args->in.args[0].size = sizeof(struct fuse_open_in); + args->in_args[0].size = sizeof(struct fuse_open_in); break; case FUSE_MKNOD: - args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; + args->in_args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE; break; } } } +static void fuse_force_creds(struct fuse_conn *fc, struct fuse_req *req) +{ + req->in.h.uid = from_kuid_munged(fc->user_ns, current_fsuid()); + req->in.h.gid = from_kgid_munged(fc->user_ns, current_fsgid()); + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); +} + +static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) +{ + req->in.h.opcode = args->opcode; + req->in.h.nodeid = args->nodeid; + req->args = args; +} + ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) { struct fuse_req *req; ssize_t ret; - req = fuse_get_req(fc, 0); - if (IS_ERR(req)) - return PTR_ERR(req); + if (args->force) { + atomic_inc(&fc->num_waiting); + req = fuse_request_alloc(GFP_KERNEL | __GFP_NOFAIL); + + if (!args->nocreds) + fuse_force_creds(fc, req); + + __set_bit(FR_WAITING, &req->flags); + __set_bit(FR_FORCE, &req->flags); + } else { + WARN_ON(args->nocreds); + req = fuse_get_req(fc, false); + if (IS_ERR(req)) + return PTR_ERR(req); + } /* Needs to be done after fuse_get_req() so that fc->minor is valid */ fuse_adjust_compat(fc, args); + fuse_args_to_req(req, args); - req->in.h.opcode = args->in.h.opcode; - req->in.h.nodeid = args->in.h.nodeid; - req->in.numargs = args->in.numargs; - memcpy(req->in.args, args->in.args, - args->in.numargs * sizeof(struct fuse_in_arg)); - req->out.argvar = args->out.argvar; - req->out.numargs = args->out.numargs; - memcpy(req->out.args, args->out.args, - args->out.numargs * sizeof(struct fuse_arg)); - fuse_request_send(fc, req); + if (!args->noreply) + __set_bit(FR_ISREPLY, &req->flags); + __fuse_request_send(fc, req); ret = req->out.h.error; - if (!ret && args->out.argvar) { - BUG_ON(args->out.numargs != 1); - ret = req->out.args[0].size; + if (!ret && args->out_argvar) { + BUG_ON(args->out_numargs == 0); + ret = args->out_args[args->out_numargs - 1].size; } fuse_put_request(fc, req); return ret; } -bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) +static bool fuse_request_queue_background(struct fuse_conn *fc, + struct fuse_req *req) { bool queued = false; @@ -681,56 +540,63 @@ bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req) return queued; } -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) +int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, + gfp_t gfp_flags) { - WARN_ON(!req->end); + struct fuse_req *req; + + if (args->force) { + WARN_ON(!args->nocreds); + req = fuse_request_alloc(gfp_flags); + if (!req) + return -ENOMEM; + __set_bit(FR_BACKGROUND, &req->flags); + } else { + WARN_ON(args->nocreds); + req = fuse_get_req(fc, true); + if (IS_ERR(req)) + return PTR_ERR(req); + } + + fuse_args_to_req(req, args); + if (!fuse_request_queue_background(fc, req)) { - req->out.h.error = -ENOTCONN; - req->end(fc, req); fuse_put_request(fc, req); + return -ENOTCONN; } + + return 0; } -EXPORT_SYMBOL_GPL(fuse_request_send_background); +EXPORT_SYMBOL_GPL(fuse_simple_background); -static int fuse_request_send_notify_reply(struct fuse_conn *fc, - struct fuse_req *req, u64 unique) +static int fuse_simple_notify_reply(struct fuse_conn *fc, + struct fuse_args *args, u64 unique) { - int err = -ENODEV; + struct fuse_req *req; struct fuse_iqueue *fiq = &fc->iq; + int err = 0; + + req = fuse_get_req(fc, false); + if (IS_ERR(req)) + return PTR_ERR(req); __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; - spin_lock(&fiq->waitq.lock); + + fuse_args_to_req(req, args); + + spin_lock(&fiq->lock); if (fiq->connected) { - queue_request(fiq, req); - err = 0; + queue_request_and_unlock(fiq, req); + } else { + err = -ENODEV; + spin_unlock(&fiq->lock); + fuse_put_request(fc, req); } - spin_unlock(&fiq->waitq.lock); return err; } -void fuse_force_forget(struct file *file, u64 nodeid) -{ - struct inode *inode = file_inode(file); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - struct fuse_forget_in inarg; - - memset(&inarg, 0, sizeof(inarg)); - inarg.nlookup = 1; - req = fuse_get_req_nofail_nopages(fc, file); - req->in.h.opcode = FUSE_FORGET; - req->in.h.nodeid = nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - __clear_bit(FR_ISREPLY, &req->flags); - __fuse_request_send(fc, req); - /* ignore errors */ - fuse_put_request(fc, req); -} - /* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already @@ -1084,14 +950,15 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, { unsigned i; struct fuse_req *req = cs->req; + struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); + - for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { + for (i = 0; i < ap->num_pages && (nbytes || zeroing); i++) { int err; - unsigned offset = req->page_descs[i].offset; - unsigned count = min(nbytes, req->page_descs[i].length); + unsigned int offset = ap->descs[i].offset; + unsigned int count = min(nbytes, ap->descs[i].length); - err = fuse_copy_page(cs, &req->pages[i], offset, count, - zeroing); + err = fuse_copy_page(cs, &ap->pages[i], offset, count, zeroing); if (err) return err; @@ -1149,12 +1016,12 @@ static int request_pending(struct fuse_iqueue *fiq) * Unlike other requests this is assembled on demand, without a need * to allocate a separate fuse_req structure. * - * Called with fiq->waitq.lock held, releases it + * Called with fiq->lock held, releases it */ static int fuse_read_interrupt(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1169,7 +1036,7 @@ __releases(fiq->waitq.lock) ih.unique = (req->in.h.unique | FUSE_INT_REQ_BIT); arg.unique = req->in.h.unique; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); if (nbytes < reqsize) return -EINVAL; @@ -1181,9 +1048,9 @@ __releases(fiq->waitq.lock) return err ? err : reqsize; } -static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, - unsigned max, - unsigned *countp) +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp) { struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; @@ -1202,14 +1069,15 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, return head; } +EXPORT_SYMBOL(fuse_dequeue_forget); static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { int err; - struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); + struct fuse_forget_link *forget = fuse_dequeue_forget(fiq, 1, NULL); struct fuse_forget_in arg = { .nlookup = forget->forget_one.nlookup, }; @@ -1220,7 +1088,7 @@ __releases(fiq->waitq.lock) .len = sizeof(ih) + sizeof(arg), }; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); kfree(forget); if (nbytes < ih.len) return -EINVAL; @@ -1238,7 +1106,7 @@ __releases(fiq->waitq.lock) static int fuse_read_batch_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { int err; unsigned max_forgets; @@ -1252,13 +1120,13 @@ __releases(fiq->waitq.lock) }; if (nbytes < ih.len) { - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return -EINVAL; } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); - head = dequeue_forget(fiq, max_forgets, &count); - spin_unlock(&fiq->waitq.lock); + head = fuse_dequeue_forget(fiq, max_forgets, &count); + spin_unlock(&fiq->lock); arg.count = count; ih.len += count * sizeof(struct fuse_forget_one); @@ -1288,7 +1156,7 @@ __releases(fiq->waitq.lock) static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fiq->waitq.lock) +__releases(fiq->lock) { if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) return fuse_read_single_forget(fiq, cs, nbytes); @@ -1302,7 +1170,7 @@ __releases(fiq->waitq.lock) * the pending list and copies request data to userspace buffer. If * no reply is needed (FORGET) or request has been aborted or there * was an error during the copying then it's finished by calling - * request_end(). Otherwise add it to the processing list, and set + * fuse_request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, @@ -1313,21 +1181,42 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_iqueue *fiq = &fc->iq; struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; - struct fuse_in *in; + struct fuse_args *args; unsigned reqsize; unsigned int hash; + /* + * Require sane minimum read buffer - that has capacity for fixed part + * of any request header + negotiated max_write room for data. + * + * Historically libfuse reserves 4K for fixed header room, but e.g. + * GlusterFS reserves only 80 bytes + * + * = `sizeof(fuse_in_header) + sizeof(fuse_write_in)` + * + * which is the absolute minimum any sane filesystem should be using + * for header room. + */ + if (nbytes < max_t(size_t, FUSE_MIN_READ_BUFFER, + sizeof(struct fuse_in_header) + + sizeof(struct fuse_write_in) + + fc->max_write)) + return -EINVAL; + restart: - spin_lock(&fiq->waitq.lock); - err = -EAGAIN; - if ((file->f_flags & O_NONBLOCK) && fiq->connected && - !request_pending(fiq)) - goto err_unlock; + for (;;) { + spin_lock(&fiq->lock); + if (!fiq->connected || request_pending(fiq)) + break; + spin_unlock(&fiq->lock); - err = wait_event_interruptible_exclusive_locked(fiq->waitq, + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + err = wait_event_interruptible_exclusive(fiq->waitq, !fiq->connected || request_pending(fiq)); - if (err) - goto err_unlock; + if (err) + return err; + } if (!fiq->connected) { err = fc->aborted ? -ECONNABORTED : -ENODEV; @@ -1351,28 +1240,28 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, req = list_entry(fiq->pending.next, struct fuse_req, list); clear_bit(FR_PENDING, &req->flags); list_del_init(&req->list); - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); - in = &req->in; - reqsize = in->h.len; + args = req->args; + reqsize = req->in.h.len; /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; /* SETXATTR is special, since it may contain too large data */ - if (in->h.opcode == FUSE_SETXATTR) + if (args->opcode == FUSE_SETXATTR) req->out.h.error = -E2BIG; - request_end(fc, req); + fuse_request_end(fc, req); goto restart; } spin_lock(&fpq->lock); list_add(&req->list, &fpq->io); spin_unlock(&fpq->lock); cs->req = req; - err = fuse_copy_one(cs, &in->h, sizeof(in->h)); + err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h)); if (!err) - err = fuse_copy_args(cs, in->numargs, in->argpages, - (struct fuse_arg *) in->args, 0); + err = fuse_copy_args(cs, args->in_numargs, args->in_pages, + (struct fuse_arg *) args->in_args, 0); fuse_copy_finish(cs); spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); @@ -1405,11 +1294,11 @@ out_end: if (!test_bit(FR_PRIVATE, &req->flags)) list_del_init(&req->list); spin_unlock(&fpq->lock); - request_end(fc, req); + fuse_request_end(fc, req); return err; err_unlock: - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return err; } @@ -1728,9 +1617,19 @@ out_finish: return err; } -static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +struct fuse_retrieve_args { + struct fuse_args_pages ap; + struct fuse_notify_retrieve_in inarg; +}; + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - release_pages(req->pages, req->num_pages); + struct fuse_retrieve_args *ra = + container_of(args, typeof(*ra), ap.args); + + release_pages(ra->ap.pages, ra->ap.num_pages); + kfree(ra); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, @@ -1738,13 +1637,16 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, { int err; struct address_space *mapping = inode->i_mapping; - struct fuse_req *req; pgoff_t index; loff_t file_size; unsigned int num; unsigned int offset; size_t total_len = 0; unsigned int num_pages; + struct fuse_retrieve_args *ra; + size_t args_size = sizeof(*ra); + struct fuse_args_pages *ap; + struct fuse_args *args; offset = outarg->offset & ~PAGE_MASK; file_size = i_size_read(inode); @@ -1758,19 +1660,26 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); - req = fuse_get_req(fc, num_pages); - if (IS_ERR(req)) - return PTR_ERR(req); + args_size += num_pages * (sizeof(ap->pages[0]) + sizeof(ap->descs[0])); - req->in.h.opcode = FUSE_NOTIFY_REPLY; - req->in.h.nodeid = outarg->nodeid; - req->in.numargs = 2; - req->in.argpages = 1; - req->end = fuse_retrieve_end; + ra = kzalloc(args_size, GFP_KERNEL); + if (!ra) + return -ENOMEM; + + ap = &ra->ap; + ap->pages = (void *) (ra + 1); + ap->descs = (void *) (ap->pages + num_pages); + + args = &ap->args; + args->nodeid = outarg->nodeid; + args->opcode = FUSE_NOTIFY_REPLY; + args->in_numargs = 2; + args->in_pages = true; + args->end = fuse_retrieve_end; index = outarg->offset >> PAGE_SHIFT; - while (num && req->num_pages < num_pages) { + while (num && ap->num_pages < num_pages) { struct page *page; unsigned int this_num; @@ -1779,27 +1688,25 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, break; this_num = min_t(unsigned, num, PAGE_SIZE - offset); - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].offset = offset; - req->page_descs[req->num_pages].length = this_num; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].offset = offset; + ap->descs[ap->num_pages].length = this_num; + ap->num_pages++; offset = 0; num -= this_num; total_len += this_num; index++; } - req->misc.retrieve_in.offset = outarg->offset; - req->misc.retrieve_in.size = total_len; - req->in.args[0].size = sizeof(req->misc.retrieve_in); - req->in.args[0].value = &req->misc.retrieve_in; - req->in.args[1].size = total_len; + ra->inarg.offset = outarg->offset; + ra->inarg.size = total_len; + args->in_args[0].size = sizeof(ra->inarg); + args->in_args[0].value = &ra->inarg; + args->in_args[1].size = total_len; - err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); - if (err) { - fuse_retrieve_end(fc, req); - fuse_put_request(fc, req); - } + err = fuse_simple_notify_reply(fc, args, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, args, err); return err; } @@ -1885,27 +1792,25 @@ static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) return NULL; } -static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, +static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, unsigned nbytes) { unsigned reqsize = sizeof(struct fuse_out_header); - if (out->h.error) - return nbytes != reqsize ? -EINVAL : 0; - - reqsize += len_args(out->numargs, out->args); + reqsize += fuse_len_args(args->out_numargs, args->out_args); - if (reqsize < nbytes || (reqsize > nbytes && !out->argvar)) + if (reqsize < nbytes || (reqsize > nbytes && !args->out_argvar)) return -EINVAL; else if (reqsize > nbytes) { - struct fuse_arg *lastarg = &out->args[out->numargs-1]; + struct fuse_arg *lastarg = &args->out_args[args->out_numargs-1]; unsigned diffsize = reqsize - nbytes; + if (diffsize > lastarg->size) return -EINVAL; lastarg->size -= diffsize; } - return fuse_copy_args(cs, out->numargs, out->argpages, out->args, - out->page_zeroing); + return fuse_copy_args(cs, args->out_numargs, args->out_pages, + args->out_args, args->page_zeroing); } /* @@ -1913,7 +1818,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, * the write buffer. The request is then searched on the processing * list by the unique ID found in the header. If found, then remove * it from the list and copy the rest of the buffer to the request. - * The request is finished by calling request_end() + * The request is finished by calling fuse_request_end(). */ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_copy_state *cs, size_t nbytes) @@ -1984,10 +1889,13 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, set_bit(FR_LOCKED, &req->flags); spin_unlock(&fpq->lock); cs->req = req; - if (!req->out.page_replace) + if (!req->args->page_replace) cs->move_pages = 0; - err = copy_out_args(cs, &req->out, nbytes); + if (oh.error) + err = nbytes != sizeof(oh) ? -EINVAL : 0; + else + err = copy_out_args(cs, req->args, nbytes); fuse_copy_finish(cs); spin_lock(&fpq->lock); @@ -2000,7 +1908,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, list_del_init(&req->list); spin_unlock(&fpq->lock); - request_end(fc, req); + fuse_request_end(fc, req); out: return err ? err : nbytes; @@ -2121,12 +2029,12 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) fiq = &fud->fc->iq; poll_wait(file, &fiq->waitq, wait); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); if (!fiq->connected) mask = EPOLLERR; else if (request_pending(fiq)) mask |= EPOLLIN | EPOLLRDNORM; - spin_unlock(&fiq->waitq.lock); + spin_unlock(&fiq->lock); return mask; } @@ -2140,7 +2048,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req->out.h.error = -ECONNABORTED; clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - request_end(fc, req); + fuse_request_end(fc, req); } } @@ -2221,15 +2129,15 @@ void fuse_abort_conn(struct fuse_conn *fc) flush_bg_queue(fc); spin_unlock(&fc->bg_lock); - spin_lock(&fiq->waitq.lock); + spin_lock(&fiq->lock); fiq->connected = 0; list_for_each_entry(req, &fiq->pending, list) clear_bit(FR_PENDING, &req->flags); list_splice_tail_init(&fiq->pending, &to_end); while (forget_pending(fiq)) - kfree(dequeue_forget(fiq, 1, NULL)); - wake_up_all_locked(&fiq->waitq); - spin_unlock(&fiq->waitq.lock); + kfree(fuse_dequeue_forget(fiq, 1, NULL)); + wake_up_all(&fiq->waitq); + spin_unlock(&fiq->lock); kill_fasync(&fiq->fasync, SIGIO, POLL_IN); end_polls(fc); wake_up_all(&fc->blocked_waitq); @@ -2296,7 +2204,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) if (new->private_data) return -EINVAL; - fud = fuse_dev_alloc(fc); + fud = fuse_dev_alloc_install(fc); if (!fud) return -ENOMEM; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index dd0f64f7bc06..54d638f9ba1c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -24,20 +24,54 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } +#if BITS_PER_LONG >= 64 +static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) +{ + entry->d_fsdata = (void *) time; +} + +static inline u64 fuse_dentry_time(const struct dentry *entry) +{ + return (u64)entry->d_fsdata; +} + +#else union fuse_dentry { u64 time; struct rcu_head rcu; }; -static inline void fuse_dentry_settime(struct dentry *entry, u64 time) +static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) { - ((union fuse_dentry *) entry->d_fsdata)->time = time; + ((union fuse_dentry *) dentry->d_fsdata)->time = time; } -static inline u64 fuse_dentry_time(struct dentry *entry) +static inline u64 fuse_dentry_time(const struct dentry *entry) { return ((union fuse_dentry *) entry->d_fsdata)->time; } +#endif + +static void fuse_dentry_settime(struct dentry *dentry, u64 time) +{ + struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); + bool delete = !time && fc->delete_stale; + /* + * Mess with DCACHE_OP_DELETE because dput() will be faster without it. + * Don't care about races, either way it's just an optimization + */ + if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) || + (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) { + spin_lock(&dentry->d_lock); + if (!delete) + dentry->d_flags &= ~DCACHE_OP_DELETE; + else + dentry->d_flags |= DCACHE_OP_DELETE; + spin_unlock(&dentry->d_lock); + } + + __fuse_dentry_settime(dentry, time); +} /* * FUSE caches dentries and attributes with separate timeout. The @@ -139,14 +173,14 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); - args->in.h.opcode = FUSE_LOOKUP; - args->in.h.nodeid = nodeid; - args->in.numargs = 1; - args->in.args[0].size = name->len + 1; - args->in.args[0].value = name->name; - args->out.numargs = 1; - args->out.args[0].size = sizeof(struct fuse_entry_out); - args->out.args[0].value = outarg; + args->opcode = FUSE_LOOKUP; + args->nodeid = nodeid; + args->in_numargs = 1; + args->in_args[0].size = name->len + 1; + args->in_args[0].value = name->name; + args->out_numargs = 1; + args->out_args[0].size = sizeof(struct fuse_entry_out); + args->out_args[0].value = outarg; } /* @@ -242,9 +276,11 @@ invalid: goto out; } +#if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL); + dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } @@ -254,16 +290,27 @@ static void fuse_dentry_release(struct dentry *dentry) kfree_rcu(fd, rcu); } +#endif + +static int fuse_dentry_delete(const struct dentry *dentry) +{ + return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); +} const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, + .d_delete = fuse_dentry_delete, +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; const struct dentry_operations fuse_root_dentry_operations = { +#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, +#endif }; int fuse_valid_type(int m) @@ -358,7 +405,8 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, else fuse_invalidate_entry_cache(entry); - fuse_advise_use_readdirplus(dir); + if (inode) + fuse_advise_use_readdirplus(dir); return newent; out_iput: @@ -410,18 +458,18 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); - args.in.h.opcode = FUSE_CREATE; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; - args.out.numargs = 2; - args.out.args[0].size = sizeof(outentry); - args.out.args[0].value = &outentry; - args.out.args[1].size = sizeof(outopen); - args.out.args[1].value = &outopen; + args.opcode = FUSE_CREATE; + args.nodeid = get_node_id(dir); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; + args.out_numargs = 2; + args.out_args[0].size = sizeof(outentry); + args.out_args[0].value = &outentry; + args.out_args[1].size = sizeof(outopen); + args.out_args[1].value = &outopen; err = fuse_simple_request(fc, &args); if (err) goto out_free_ff; @@ -526,10 +574,10 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args, return -ENOMEM; memset(&outarg, 0, sizeof(outarg)); - args->in.h.nodeid = get_node_id(dir); - args->out.numargs = 1; - args->out.args[0].size = sizeof(outarg); - args->out.args[0].value = &outarg; + args->nodeid = get_node_id(dir); + args->out_numargs = 1; + args->out_args[0].size = sizeof(outarg); + args->out_args[0].value = &outarg; err = fuse_simple_request(fc, args); if (err) goto out_put_forget_req; @@ -582,12 +630,12 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); inarg.umask = current_umask(); - args.in.h.opcode = FUSE_MKNOD; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; + args.opcode = FUSE_MKNOD; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, mode); } @@ -609,12 +657,12 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.umask = current_umask(); - args.in.h.opcode = FUSE_MKDIR; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = entry->d_name.len + 1; - args.in.args[1].value = entry->d_name.name; + args.opcode = FUSE_MKDIR; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = entry->d_name.len + 1; + args.in_args[1].value = entry->d_name.name; return create_new_entry(fc, &args, dir, entry, S_IFDIR); } @@ -625,12 +673,12 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, unsigned len = strlen(link) + 1; FUSE_ARGS(args); - args.in.h.opcode = FUSE_SYMLINK; - args.in.numargs = 2; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; - args.in.args[1].size = len; - args.in.args[1].value = link; + args.opcode = FUSE_SYMLINK; + args.in_numargs = 2; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; + args.in_args[1].size = len; + args.in_args[1].value = link; return create_new_entry(fc, &args, dir, entry, S_IFLNK); } @@ -648,11 +696,11 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); - args.in.h.opcode = FUSE_UNLINK; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 1; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; + args.opcode = FUSE_UNLINK; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { struct inode *inode = d_inode(entry); @@ -684,11 +732,11 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) struct fuse_conn *fc = get_fuse_conn(dir); FUSE_ARGS(args); - args.in.h.opcode = FUSE_RMDIR; - args.in.h.nodeid = get_node_id(dir); - args.in.numargs = 1; - args.in.args[0].size = entry->d_name.len + 1; - args.in.args[0].value = entry->d_name.name; + args.opcode = FUSE_RMDIR; + args.nodeid = get_node_id(dir); + args.in_numargs = 1; + args.in_args[0].size = entry->d_name.len + 1; + args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { clear_nlink(d_inode(entry)); @@ -711,15 +759,15 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); inarg.flags = flags; - args.in.h.opcode = opcode; - args.in.h.nodeid = get_node_id(olddir); - args.in.numargs = 3; - args.in.args[0].size = argsize; - args.in.args[0].value = &inarg; - args.in.args[1].size = oldent->d_name.len + 1; - args.in.args[1].value = oldent->d_name.name; - args.in.args[2].size = newent->d_name.len + 1; - args.in.args[2].value = newent->d_name.name; + args.opcode = opcode; + args.nodeid = get_node_id(olddir); + args.in_numargs = 3; + args.in_args[0].size = argsize; + args.in_args[0].value = &inarg; + args.in_args[1].size = oldent->d_name.len + 1; + args.in_args[1].value = oldent->d_name.name; + args.in_args[2].size = newent->d_name.len + 1; + args.in_args[2].value = newent->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ @@ -796,12 +844,12 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); - args.in.h.opcode = FUSE_LINK; - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = newent->d_name.len + 1; - args.in.args[1].value = newent->d_name.name; + args.opcode = FUSE_LINK; + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = newent->d_name.len + 1; + args.in_args[1].value = newent->d_name.name; err = create_new_entry(fc, &args, newdir, newent, inode->i_mode); /* Contrary to "normal" filesystems it can happen that link makes two "logical" inodes point to the same "physical" @@ -884,14 +932,14 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } - args.in.h.opcode = FUSE_GETATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_GETATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) { if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { @@ -1056,11 +1104,11 @@ static int fuse_access(struct inode *inode, int mask) memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); - args.in.h.opcode = FUSE_ACCESS; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_ACCESS; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_access = 1; @@ -1152,38 +1200,36 @@ static int fuse_permission(struct inode *inode, int mask) static int fuse_readlink_page(struct inode *inode, struct page *page) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - int err; + struct fuse_page_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_args_pages ap = { + .num_pages = 1, + .pages = &page, + .descs = &desc, + }; + char *link; + ssize_t res; + + ap.args.opcode = FUSE_READLINK; + ap.args.nodeid = get_node_id(inode); + ap.args.out_pages = true; + ap.args.out_argvar = true; + ap.args.page_zeroing = true; + ap.args.out_numargs = 1; + ap.args.out_args[0].size = desc.length; + res = fuse_simple_request(fc, &ap.args); - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->out.page_zeroing = 1; - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE - 1; - req->in.h.opcode = FUSE_READLINK; - req->in.h.nodeid = get_node_id(inode); - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = PAGE_SIZE - 1; - fuse_request_send(fc, req); - err = req->out.h.error; + fuse_invalidate_atime(inode); - if (!err) { - char *link = page_address(page); - size_t len = req->out.args[0].size; + if (res < 0) + return res; - BUG_ON(len >= PAGE_SIZE); - link[len] = '\0'; - } + if (WARN_ON(res >= PAGE_SIZE)) + return -EIO; - fuse_put_request(fc, req); - fuse_invalidate_atime(inode); + link = page_address(page); + link[res] = '\0'; - return err; + return 0; } static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, @@ -1383,14 +1429,14 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, struct fuse_setattr_in *inarg_p, struct fuse_attr_out *outarg_p) { - args->in.h.opcode = FUSE_SETATTR; - args->in.h.nodeid = get_node_id(inode); - args->in.numargs = 1; - args->in.args[0].size = sizeof(*inarg_p); - args->in.args[0].value = inarg_p; - args->out.numargs = 1; - args->out.args[0].size = sizeof(*outarg_p); - args->out.args[0].value = outarg_p; + args->opcode = FUSE_SETATTR; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg_p); + args->in_args[0].value = inarg_p; + args->out_numargs = 1; + args->out_args[0].size = sizeof(*outarg_p); + args->out_args[0].value = outarg_p; } /* @@ -1476,6 +1522,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, is_truncate = true; } + /* Flush dirty data/metadata before non-truncate SETATTR */ + if (is_wb && S_ISREG(inode->i_mode) && + attr->ia_valid & + (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | + ATTR_TIMES_SET)) { + err = write_inode_now(inode, true); + if (err) + return err; + + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); + } + if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ae2828beb00..db48a5cf8620 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -19,6 +19,18 @@ #include <linux/falloc.h> #include <linux/uio.h> +static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags, + struct fuse_page_desc **desc) +{ + struct page **pages; + + pages = kzalloc(npages * (sizeof(struct page *) + + sizeof(struct fuse_page_desc)), flags); + *desc = (void *) (pages + npages); + + return pages; +} + static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, int opcode, struct fuse_open_out *outargp) { @@ -29,29 +41,36 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY); if (!fc->atomic_o_trunc) inarg.flags &= ~O_TRUNC; - args.in.h.opcode = opcode; - args.in.h.nodeid = nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(*outargp); - args.out.args[0].value = outargp; + args.opcode = opcode; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(*outargp); + args.out_args[0].value = outargp; return fuse_simple_request(fc, &args); } +struct fuse_release_args { + struct fuse_args args; + struct fuse_release_in inarg; + struct inode *inode; +}; + struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; - ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL); + ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT); if (unlikely(!ff)) return NULL; ff->fc = fc; - ff->reserved_req = fuse_request_alloc(0); - if (unlikely(!ff->reserved_req)) { + ff->release_args = kzalloc(sizeof(*ff->release_args), + GFP_KERNEL_ACCOUNT); + if (!ff->release_args) { kfree(ff); return NULL; } @@ -69,7 +88,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) void fuse_file_free(struct fuse_file *ff) { - fuse_request_free(ff->reserved_req); + kfree(ff->release_args); mutex_destroy(&ff->readdir.lock); kfree(ff); } @@ -80,34 +99,31 @@ static struct fuse_file *fuse_file_get(struct fuse_file *ff) return ff; } -static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - iput(req->misc.release.inode); + struct fuse_release_args *ra = container_of(args, typeof(*ra), args); + + iput(ra->inode); + kfree(ra); } static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) { if (refcount_dec_and_test(&ff->count)) { - struct fuse_req *req = ff->reserved_req; + struct fuse_args *args = &ff->release_args->args; if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { - /* - * Drop the release request when client does not - * implement 'open' - */ - __clear_bit(FR_BACKGROUND, &req->flags); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + /* Do nothing when client does not implement 'open' */ + fuse_release_end(ff->fc, args, 0); } else if (sync) { - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(ff->fc, req); - iput(req->misc.release.inode); - fuse_put_request(ff->fc, req); + fuse_simple_request(ff->fc, args); + fuse_release_end(ff->fc, args, 0); } else { - req->end = fuse_release_end; - __set_bit(FR_BACKGROUND, &req->flags); - fuse_request_send_background(ff->fc, req); + args->end = fuse_release_end; + if (fuse_simple_background(ff->fc, args, + GFP_KERNEL | __GFP_NOFAIL)) + fuse_release_end(ff->fc, args, -ENOTCONN); } kfree(ff); } @@ -201,7 +217,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; - bool lock_inode = (file->f_flags & O_TRUNC) && + bool is_wb_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc && fc->writeback_cache; @@ -209,16 +225,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) if (err) return err; - if (lock_inode) + if (is_wb_truncate) { inode_lock(inode); + fuse_set_nowrite(inode); + } err = fuse_do_open(fc, get_node_id(inode), file, isdir); if (!err) fuse_finish_open(inode, file); - if (lock_inode) + if (is_wb_truncate) { + fuse_release_nowrite(inode); inode_unlock(inode); + } return err; } @@ -227,8 +247,7 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, int flags, int opcode) { struct fuse_conn *fc = ff->fc; - struct fuse_req *req = ff->reserved_req; - struct fuse_release_in *inarg = &req->misc.release.in; + struct fuse_release_args *ra = ff->release_args; /* Inode is NULL on error path of fuse_create_open() */ if (likely(fi)) { @@ -243,32 +262,33 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff, wake_up_interruptible_all(&ff->poll_wait); - inarg->fh = ff->fh; - inarg->flags = flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_release_in); - req->in.args[0].value = inarg; + ra->inarg.fh = ff->fh; + ra->inarg.flags = flags; + ra->args.in_numargs = 1; + ra->args.in_args[0].size = sizeof(struct fuse_release_in); + ra->args.in_args[0].value = &ra->inarg; + ra->args.opcode = opcode; + ra->args.nodeid = ff->nodeid; + ra->args.force = true; + ra->args.nocreds = true; } void fuse_release_common(struct file *file, bool isdir) { struct fuse_inode *fi = get_fuse_inode(file_inode(file)); struct fuse_file *ff = file->private_data; - struct fuse_req *req = ff->reserved_req; + struct fuse_release_args *ra = ff->release_args; int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE; fuse_prepare_release(fi, ff, file->f_flags, opcode); if (ff->flock) { - struct fuse_release_in *inarg = &req->misc.release.in; - inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; - inarg->lock_owner = fuse_lock_owner_id(ff->fc, - (fl_owner_t) file); + ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK; + ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc, + (fl_owner_t) file); } /* Hold inode until release is finished */ - req->misc.release.inode = igrab(file_inode(file)); + ra->inode = igrab(file_inode(file)); /* * Normally this will send the RELEASE request, however if @@ -279,7 +299,7 @@ void fuse_release_common(struct file *file, bool isdir) * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. */ - fuse_file_put(ff, ff->fc->destroy_req != NULL, isdir); + fuse_file_put(ff, ff->fc->destroy, isdir); } static int fuse_open(struct inode *inode, struct file *file) @@ -335,19 +355,27 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } -static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi, +struct fuse_writepage_args { + struct fuse_io_args ia; + struct list_head writepages_entry; + struct list_head queue_entry; + struct fuse_writepage_args *next; + struct inode *inode; +}; + +static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, pgoff_t idx_from, pgoff_t idx_to) { - struct fuse_req *req; + struct fuse_writepage_args *wpa; - list_for_each_entry(req, &fi->writepages, writepages_entry) { + list_for_each_entry(wpa, &fi->writepages, writepages_entry) { pgoff_t curr_index; - WARN_ON(get_fuse_inode(req->inode) != fi); - curr_index = req->misc.write.in.offset >> PAGE_SHIFT; - if (idx_from < curr_index + req->num_pages && + WARN_ON(get_fuse_inode(wpa->inode) != fi); + curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; + if (idx_from < curr_index + wpa->ia.ap.num_pages && curr_index <= idx_to) { - return req; + return wpa; } } return NULL; @@ -383,12 +411,11 @@ static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) * Since fuse doesn't rely on the VM writeback tracking, this has to * use some other means. */ -static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) { struct fuse_inode *fi = get_fuse_inode(inode); wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); - return 0; } /* @@ -411,8 +438,8 @@ static int fuse_flush(struct file *file, fl_owner_t id) struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_file *ff = file->private_data; - struct fuse_req *req; struct fuse_flush_in inarg; + FUSE_ARGS(args); int err; if (is_bad_inode(inode)) @@ -433,19 +460,17 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.lock_owner = fuse_lock_owner_id(fc, id); - req->in.h.opcode = FUSE_FLUSH; - req->in.h.nodeid = get_node_id(inode); - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; - __set_bit(FR_FORCE, &req->flags); - fuse_request_send(fc, req); - err = req->out.h.error; - fuse_put_request(fc, req); + args.opcode = FUSE_FLUSH; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + + err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_flush = 1; err = 0; @@ -465,11 +490,11 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0; - args.in.h.opcode = opcode; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = opcode; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; return fuse_simple_request(fc, &args); } @@ -523,35 +548,35 @@ out: return err; } -void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, - size_t count, int opcode) +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode) { - struct fuse_read_in *inarg = &req->misc.read.in; struct fuse_file *ff = file->private_data; + struct fuse_args *args = &ia->ap.args; - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - inarg->flags = file->f_flags; - req->in.h.opcode = opcode; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(struct fuse_read_in); - req->in.args[0].value = inarg; - req->out.argvar = 1; - req->out.numargs = 1; - req->out.args[0].size = count; + ia->read.in.fh = ff->fh; + ia->read.in.offset = pos; + ia->read.in.size = count; + ia->read.in.flags = file->f_flags; + args->opcode = opcode; + args->nodeid = ff->nodeid; + args->in_numargs = 1; + args->in_args[0].size = sizeof(ia->read.in); + args->in_args[0].value = &ia->read.in; + args->out_argvar = true; + args->out_numargs = 1; + args->out_args[0].size = count; } -static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty) +static void fuse_release_user_pages(struct fuse_args_pages *ap, + bool should_dirty) { - unsigned i; + unsigned int i; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + for (i = 0; i < ap->num_pages; i++) { if (should_dirty) - set_page_dirty_lock(page); - put_page(page); + set_page_dirty_lock(ap->pages[i]); + put_page(ap->pages[i]); } } @@ -621,64 +646,94 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) kref_put(&io->refcnt, fuse_io_release); } -static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req) +static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io, + unsigned int npages) { - struct fuse_io_priv *io = req->io; + struct fuse_io_args *ia; + + ia = kzalloc(sizeof(*ia), GFP_KERNEL); + if (ia) { + ia->io = io; + ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL, + &ia->ap.descs); + if (!ia->ap.pages) { + kfree(ia); + ia = NULL; + } + } + return ia; +} + +static void fuse_io_free(struct fuse_io_args *ia) +{ + kfree(ia->ap.pages); + kfree(ia); +} + +static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args, + int err) +{ + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_io_priv *io = ia->io; ssize_t pos = -1; - fuse_release_user_pages(req, io->should_dirty); + fuse_release_user_pages(&ia->ap, io->should_dirty); - if (io->write) { - if (req->misc.write.in.size != req->misc.write.out.size) - pos = req->misc.write.in.offset - io->offset + - req->misc.write.out.size; + if (err) { + /* Nothing */ + } else if (io->write) { + if (ia->write.out.size > ia->write.in.size) { + err = -EIO; + } else if (ia->write.in.size != ia->write.out.size) { + pos = ia->write.in.offset - io->offset + + ia->write.out.size; + } } else { - if (req->misc.read.in.size != req->out.args[0].size) - pos = req->misc.read.in.offset - io->offset + - req->out.args[0].size; + u32 outsize = args->out_args[0].size; + + if (ia->read.in.size != outsize) + pos = ia->read.in.offset - io->offset + outsize; } - fuse_aio_complete(io, req->out.h.error, pos); + fuse_aio_complete(io, err, pos); + fuse_io_free(ia); } -static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, - size_t num_bytes, struct fuse_io_priv *io) +static ssize_t fuse_async_req_send(struct fuse_conn *fc, + struct fuse_io_args *ia, size_t num_bytes) { + ssize_t err; + struct fuse_io_priv *io = ia->io; + spin_lock(&io->lock); kref_get(&io->refcnt); io->size += num_bytes; io->reqs++; spin_unlock(&io->lock); - req->io = io; - req->end = fuse_aio_complete_req; + ia->ap.args.end = fuse_aio_complete_req; + err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL); - __fuse_get_request(req); - fuse_request_send_background(fc, req); - - return num_bytes; + return err ?: num_bytes; } -static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count, + fl_owner_t owner) { - struct file *file = io->iocb->ki_filp; + struct file *file = ia->io->iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - fuse_read_fill(req, file, pos, count, FUSE_READ); + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); if (owner != NULL) { - struct fuse_read_in *inarg = &req->misc.read.in; - - inarg->read_flags |= FUSE_READ_LOCKOWNER; - inarg->lock_owner = fuse_lock_owner_id(fc, owner); + ia->read.in.read_flags |= FUSE_READ_LOCKOWNER; + ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fc, ia, count); - fuse_request_send(fc, req); - return req->out.args[0].size; + return fuse_simple_request(fc, &ia->ap.args); } static void fuse_read_update_size(struct inode *inode, loff_t size, @@ -696,10 +751,9 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, spin_unlock(&fi->lock); } -static void fuse_short_read(struct fuse_req *req, struct inode *inode, - u64 attr_ver) +static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, + struct fuse_args_pages *ap) { - size_t num_read = req->out.args[0].size; struct fuse_conn *fc = get_fuse_conn(inode); if (fc->writeback_cache) { @@ -712,28 +766,31 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, int start_idx = num_read >> PAGE_SHIFT; size_t off = num_read & (PAGE_SIZE - 1); - for (i = start_idx; i < req->num_pages; i++) { - zero_user_segment(req->pages[i], off, PAGE_SIZE); + for (i = start_idx; i < ap->num_pages; i++) { + zero_user_segment(ap->pages[i], off, PAGE_SIZE); off = 0; } } else { - loff_t pos = page_offset(req->pages[0]) + num_read; + loff_t pos = page_offset(ap->pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } } static int fuse_do_readpage(struct file *file, struct page *page) { - struct kiocb iocb; - struct fuse_io_priv io; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; - size_t num_read; loff_t pos = page_offset(page); - size_t count = PAGE_SIZE; + struct fuse_page_desc desc = { .length = PAGE_SIZE }; + struct fuse_io_args ia = { + .ap.args.page_zeroing = true, + .ap.args.out_pages = true, + .ap.num_pages = 1, + .ap.pages = &page, + .ap.descs = &desc, + }; + ssize_t res; u64 attr_ver; - int err; /* * Page writeback can extend beyond the lifetime of the @@ -742,35 +799,21 @@ static int fuse_do_readpage(struct file *file, struct page *page) */ fuse_wait_on_page_writeback(inode, page->index); - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - attr_ver = fuse_get_attr_version(fc); - req->out.page_zeroing = 1; - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = count; - init_sync_kiocb(&iocb, file); - io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb); - num_read = fuse_send_read(req, &io, pos, count, NULL); - err = req->out.h.error; - - if (!err) { - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (num_read < count) - fuse_short_read(req, inode, attr_ver); - - SetPageUptodate(page); - } + fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ); + res = fuse_simple_request(fc, &ia.ap.args); + if (res < 0) + return res; + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (res < desc.length) + fuse_short_read(inode, attr_ver, res, &ia.ap); - fuse_put_request(fc, req); + SetPageUptodate(page); - return err; + return 0; } static int fuse_readpage(struct file *file, struct page *page) @@ -789,15 +832,18 @@ static int fuse_readpage(struct file *file, struct page *page) return err; } -static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args, + int err) { int i; - size_t count = req->misc.read.in.size; - size_t num_read = req->out.args[0].size; + struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args); + struct fuse_args_pages *ap = &ia->ap; + size_t count = ia->read.in.size; + size_t num_read = args->out_args[0].size; struct address_space *mapping = NULL; - for (i = 0; mapping == NULL && i < req->num_pages; i++) - mapping = req->pages[i]->mapping; + for (i = 0; mapping == NULL && i < ap->num_pages; i++) + mapping = ap->pages[i]->mapping; if (mapping) { struct inode *inode = mapping->host; @@ -805,93 +851,97 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) /* * Short read means EOF. If file size is larger, truncate it */ - if (!req->out.h.error && num_read < count) - fuse_short_read(req, inode, req->misc.read.attr_ver); + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); fuse_invalidate_atime(inode); } - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; - if (!req->out.h.error) + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (!err) SetPageUptodate(page); else SetPageError(page); unlock_page(page); put_page(page); } - if (req->ff) - fuse_file_put(req->ff, false, false); + if (ia->ff) + fuse_file_put(ia->ff, false, false); + + fuse_io_free(ia); } -static void fuse_send_readpages(struct fuse_req *req, struct file *file) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - loff_t pos = page_offset(req->pages[0]); - size_t count = req->num_pages << PAGE_SHIFT; - - req->out.argpages = 1; - req->out.page_zeroing = 1; - req->out.page_replace = 1; - fuse_read_fill(req, file, pos, count, FUSE_READ); - req->misc.read.attr_ver = fuse_get_attr_version(fc); + struct fuse_args_pages *ap = &ia->ap; + loff_t pos = page_offset(ap->pages[0]); + size_t count = ap->num_pages << PAGE_SHIFT; + int err; + + ap->args.out_pages = true; + ap->args.page_zeroing = true; + ap->args.page_replace = true; + fuse_read_args_fill(ia, file, pos, count, FUSE_READ); + ia->read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { - req->ff = fuse_file_get(ff); - req->end = fuse_readpages_end; - fuse_request_send_background(fc, req); + ia->ff = fuse_file_get(ff); + ap->args.end = fuse_readpages_end; + err = fuse_simple_background(fc, &ap->args, GFP_KERNEL); + if (!err) + return; } else { - fuse_request_send(fc, req); - fuse_readpages_end(fc, req); - fuse_put_request(fc, req); + err = fuse_simple_request(fc, &ap->args); } + fuse_readpages_end(fc, &ap->args, err); } struct fuse_fill_data { - struct fuse_req *req; + struct fuse_io_args *ia; struct file *file; struct inode *inode; - unsigned nr_pages; + unsigned int nr_pages; + unsigned int max_pages; }; static int fuse_readpages_fill(void *_data, struct page *page) { struct fuse_fill_data *data = _data; - struct fuse_req *req = data->req; + struct fuse_io_args *ia = data->ia; + struct fuse_args_pages *ap = &ia->ap; struct inode *inode = data->inode; struct fuse_conn *fc = get_fuse_conn(inode); fuse_wait_on_page_writeback(inode, page->index); - if (req->num_pages && - (req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_read || - req->pages[req->num_pages - 1]->index + 1 != page->index)) { - unsigned int nr_alloc = min_t(unsigned int, data->nr_pages, - fc->max_pages); - fuse_send_readpages(req, data->file); - if (fc->async_read) - req = fuse_get_req_for_background(fc, nr_alloc); - else - req = fuse_get_req(fc, nr_alloc); - - data->req = req; - if (IS_ERR(req)) { + if (ap->num_pages && + (ap->num_pages == fc->max_pages || + (ap->num_pages + 1) * PAGE_SIZE > fc->max_read || + ap->pages[ap->num_pages - 1]->index + 1 != page->index)) { + data->max_pages = min_t(unsigned int, data->nr_pages, + fc->max_pages); + fuse_send_readpages(ia, data->file); + data->ia = ia = fuse_io_alloc(NULL, data->max_pages); + if (!ia) { unlock_page(page); - return PTR_ERR(req); + return -ENOMEM; } + ap = &ia->ap; } - if (WARN_ON(req->num_pages >= req->max_pages)) { + if (WARN_ON(ap->num_pages >= data->max_pages)) { unlock_page(page); - fuse_put_request(fc, req); + fuse_io_free(ia); return -EIO; } get_page(page); - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = PAGE_SIZE; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].length = PAGE_SIZE; + ap->num_pages++; data->nr_pages--; return 0; } @@ -903,7 +953,6 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_fill_data data; int err; - unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages); err = -EIO; if (is_bad_inode(inode)) @@ -911,21 +960,20 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - if (fc->async_read) - data.req = fuse_get_req_for_background(fc, nr_alloc); - else - data.req = fuse_get_req(fc, nr_alloc); data.nr_pages = nr_pages; - err = PTR_ERR(data.req); - if (IS_ERR(data.req)) + data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages); +; + data.ia = fuse_io_alloc(NULL, data.max_pages); + err = -ENOMEM; + if (!data.ia) goto out; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { - if (data.req->num_pages) - fuse_send_readpages(data.req, file); + if (data.ia->ap.num_pages) + fuse_send_readpages(data.ia, file); else - fuse_put_request(fc, data.req); + fuse_io_free(data.ia); } out: return err; @@ -952,54 +1000,65 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } -static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, - loff_t pos, size_t count) +static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, + loff_t pos, size_t count) { - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_write_out *outarg = &req->misc.write.out; + struct fuse_args *args = &ia->ap.args; - inarg->fh = ff->fh; - inarg->offset = pos; - inarg->size = count; - req->in.h.opcode = FUSE_WRITE; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 2; + ia->write.in.fh = ff->fh; + ia->write.in.offset = pos; + ia->write.in.size = count; + args->opcode = FUSE_WRITE; + args->nodeid = ff->nodeid; + args->in_numargs = 2; if (ff->fc->minor < 9) - req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; + args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; else - req->in.args[0].size = sizeof(struct fuse_write_in); - req->in.args[0].value = inarg; - req->in.args[1].size = count; - req->out.numargs = 1; - req->out.args[0].size = sizeof(struct fuse_write_out); - req->out.args[0].value = outarg; + args->in_args[0].size = sizeof(ia->write.in); + args->in_args[0].value = &ia->write.in; + args->in_args[1].size = count; + args->out_numargs = 1; + args->out_args[0].size = sizeof(ia->write.out); + args->out_args[0].value = &ia->write.out; +} + +static unsigned int fuse_write_flags(struct kiocb *iocb) +{ + unsigned int flags = iocb->ki_filp->f_flags; + + if (iocb->ki_flags & IOCB_DSYNC) + flags |= O_DSYNC; + if (iocb->ki_flags & IOCB_SYNC) + flags |= O_SYNC; + + return flags; } -static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, - loff_t pos, size_t count, fl_owner_t owner) +static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, + size_t count, fl_owner_t owner) { - struct kiocb *iocb = io->iocb; + struct kiocb *iocb = ia->io->iocb; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; - struct fuse_write_in *inarg = &req->misc.write.in; + struct fuse_write_in *inarg = &ia->write.in; + ssize_t err; - fuse_write_fill(req, ff, pos, count); - inarg->flags = file->f_flags; - if (iocb->ki_flags & IOCB_DSYNC) - inarg->flags |= O_DSYNC; - if (iocb->ki_flags & IOCB_SYNC) - inarg->flags |= O_SYNC; + fuse_write_args_fill(ia, ff, pos, count); + inarg->flags = fuse_write_flags(iocb); if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } - if (io->async) - return fuse_async_req_send(fc, req, count, io); + if (ia->io->async) + return fuse_async_req_send(fc, ia, count); + + err = fuse_simple_request(fc, &ia->ap.args); + if (!err && ia->write.out.size > count) + err = -EIO; - fuse_request_send(fc, req); - return req->misc.write.out.size; + return err ?: ia->write.out.size; } bool fuse_write_update_size(struct inode *inode, loff_t pos) @@ -1019,26 +1078,31 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) return ret; } -static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, - struct inode *inode, loff_t pos, - size_t count) +static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, + struct kiocb *iocb, struct inode *inode, + loff_t pos, size_t count) { - size_t res; - unsigned offset; - unsigned i; - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + struct fuse_args_pages *ap = &ia->ap; + struct file *file = iocb->ki_filp; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = ff->fc; + unsigned int offset, i; + int err; - for (i = 0; i < req->num_pages; i++) - fuse_wait_on_page_writeback(inode, req->pages[i]->index); + for (i = 0; i < ap->num_pages; i++) + fuse_wait_on_page_writeback(inode, ap->pages[i]->index); - res = fuse_send_write(req, &io, pos, count, NULL); + fuse_write_args_fill(ia, ff, pos, count); + ia->write.in.flags = fuse_write_flags(iocb); - offset = req->page_descs[0].offset; - count = res; - for (i = 0; i < req->num_pages; i++) { - struct page *page = req->pages[i]; + err = fuse_simple_request(fc, &ap->args); - if (!req->out.h.error && !offset && count >= PAGE_SIZE) + offset = ap->descs[0].offset; + count = ia->write.out.size; + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + + if (!err && !offset && count >= PAGE_SIZE) SetPageUptodate(page); if (count > PAGE_SIZE - offset) @@ -1051,20 +1115,21 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, put_page(page); } - return res; + return err; } -static ssize_t fuse_fill_write_pages(struct fuse_req *req, - struct address_space *mapping, - struct iov_iter *ii, loff_t pos) +static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos, + unsigned int max_pages) { struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; int err; - req->in.argpages = 1; - req->page_descs[0].offset = offset; + ap->args.in_pages = true; + ap->descs[0].offset = offset; do { size_t tmp; @@ -1100,9 +1165,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, } err = 0; - req->pages[req->num_pages] = page; - req->page_descs[req->num_pages].length = tmp; - req->num_pages++; + ap->pages[ap->num_pages] = page; + ap->descs[ap->num_pages].length = tmp; + ap->num_pages++; count += tmp; pos += tmp; @@ -1113,7 +1178,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && - req->num_pages < req->max_pages && offset == 0); + ap->num_pages < max_pages && offset == 0); return count > 0 ? count : err; } @@ -1141,27 +1206,27 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); do { - struct fuse_req *req; ssize_t count; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii), fc->max_pages); - req = fuse_get_req(fc, nr_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); + ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs); + if (!ap->pages) { + err = -ENOMEM; break; } - count = fuse_fill_write_pages(req, mapping, ii, pos); + count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { - size_t num_written; - - num_written = fuse_send_write_pages(req, iocb, inode, - pos, count); - err = req->out.h.error; + err = fuse_send_write_pages(&ia, iocb, inode, + pos, count); if (!err) { + size_t num_written = ia.write.out.size; + res += num_written; pos += num_written; @@ -1170,7 +1235,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, err = -EIO; } } - fuse_put_request(fc, req); + kfree(ap->pages); } while (!err && iov_iter_count(ii)); if (res > 0) @@ -1258,14 +1323,14 @@ out: return written ? written : err; } -static inline void fuse_page_descs_length_init(struct fuse_req *req, - unsigned index, unsigned nr_pages) +static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs, + unsigned int index, + unsigned int nr_pages) { int i; for (i = index; i < index + nr_pages; i++) - req->page_descs[i].length = PAGE_SIZE - - req->page_descs[i].offset; + descs[i].length = PAGE_SIZE - descs[i].offset; } static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) @@ -1279,8 +1344,9 @@ static inline size_t fuse_get_frag_size(const struct iov_iter *ii, return min(iov_iter_single_seg_count(ii), max_size); } -static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, - size_t *nbytesp, int write) +static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, + size_t *nbytesp, int write, + unsigned int max_pages) { size_t nbytes = 0; /* # bytes already packed in req */ ssize_t ret = 0; @@ -1291,21 +1357,21 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, size_t frag_size = fuse_get_frag_size(ii, *nbytesp); if (write) - req->in.args[1].value = (void *) user_addr; + ap->args.in_args[1].value = (void *) user_addr; else - req->out.args[0].value = (void *) user_addr; + ap->args.out_args[0].value = (void *) user_addr; iov_iter_advance(ii, frag_size); *nbytesp = frag_size; return 0; } - while (nbytes < *nbytesp && req->num_pages < req->max_pages) { + while (nbytes < *nbytesp && ap->num_pages < max_pages) { unsigned npages; size_t start; - ret = iov_iter_get_pages(ii, &req->pages[req->num_pages], + ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages], *nbytesp - nbytes, - req->max_pages - req->num_pages, + max_pages - ap->num_pages, &start); if (ret < 0) break; @@ -1316,18 +1382,18 @@ static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, ret += start; npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE; - req->page_descs[req->num_pages].offset = start; - fuse_page_descs_length_init(req, req->num_pages, npages); + ap->descs[ap->num_pages].offset = start; + fuse_page_descs_length_init(ap->descs, ap->num_pages, npages); - req->num_pages += npages; - req->page_descs[req->num_pages - 1].length -= + ap->num_pages += npages; + ap->descs[ap->num_pages - 1].length -= (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } if (write) - req->in.argpages = 1; + ap->args.in_pages = 1; else - req->out.argpages = 1; + ap->args.out_pages = 1; *nbytesp = nbytes; @@ -1349,17 +1415,16 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, pgoff_t idx_from = pos >> PAGE_SHIFT; pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT; ssize_t res = 0; - struct fuse_req *req; int err = 0; + struct fuse_io_args *ia; + unsigned int max_pages; - if (io->async) - req = fuse_get_req_for_background(fc, iov_iter_npages(iter, - fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages)); - if (IS_ERR(req)) - return PTR_ERR(req); + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) + return -ENOMEM; + ia->io = io; if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); @@ -1370,54 +1435,49 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, io->should_dirty = !write && iter_is_iovec(iter); while (count) { - size_t nres; + ssize_t nres; fl_owner_t owner = current->files; size_t nbytes = min(count, nmax); - err = fuse_get_user_pages(req, iter, &nbytes, write); + + err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, + max_pages); if (err && !nbytes) break; if (write) { - if (!capable(CAP_FSETID)) { - struct fuse_write_in *inarg; + if (!capable(CAP_FSETID)) + ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV; - inarg = &req->misc.write.in; - inarg->write_flags |= FUSE_WRITE_KILL_PRIV; - } - nres = fuse_send_write(req, io, pos, nbytes, owner); + nres = fuse_send_write(ia, pos, nbytes, owner); } else { - nres = fuse_send_read(req, io, pos, nbytes, owner); + nres = fuse_send_read(ia, pos, nbytes, owner); } - if (!io->async) - fuse_release_user_pages(req, io->should_dirty); - if (req->out.h.error) { - err = req->out.h.error; - break; - } else if (nres > nbytes) { - res = 0; - err = -EIO; + if (!io->async || nres < 0) { + fuse_release_user_pages(&ia->ap, io->should_dirty); + fuse_io_free(ia); + } + ia = NULL; + if (nres < 0) { + err = nres; break; } + WARN_ON(nres > nbytes); + count -= nres; res += nres; pos += nres; if (nres != nbytes) break; if (count) { - fuse_put_request(fc, req); - if (io->async) - req = fuse_get_req_for_background(fc, - iov_iter_npages(iter, fc->max_pages)); - else - req = fuse_get_req(fc, iov_iter_npages(iter, - fc->max_pages)); - if (IS_ERR(req)) + max_pages = iov_iter_npages(iter, fc->max_pages); + ia = fuse_io_alloc(io, max_pages); + if (!ia) break; } } - if (!IS_ERR(req)) - fuse_put_request(fc, req); + if (ia) + fuse_io_free(ia); if (res > 0) *ppos = pos; @@ -1509,45 +1569,53 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return fuse_direct_write_iter(iocb, from); } -static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_free(struct fuse_writepage_args *wpa) { + struct fuse_args_pages *ap = &wpa->ia.ap; int i; - for (i = 0; i < req->num_pages; i++) - __free_page(req->pages[i]); + for (i = 0; i < ap->num_pages; i++) + __free_page(ap->pages[i]); + + if (wpa->ia.ff) + fuse_file_put(wpa->ia.ff, false, false); - if (req->ff) - fuse_file_put(req->ff, false, false); + kfree(ap->pages); + kfree(wpa); } -static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_finish(struct fuse_conn *fc, + struct fuse_writepage_args *wpa) { - struct inode *inode = req->inode; + struct fuse_args_pages *ap = &wpa->ia.ap; + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - list_del(&req->writepages_entry); - for (i = 0; i < req->num_pages; i++) { + list_del(&wpa->writepages_entry); + for (i = 0; i < ap->num_pages; i++) { dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP); + dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); } /* Called under fi->lock, may release and reacquire it */ -static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req, - loff_t size) +static void fuse_send_writepage(struct fuse_conn *fc, + struct fuse_writepage_args *wpa, loff_t size) __releases(fi->lock) __acquires(fi->lock) { - struct fuse_req *aux, *next; - struct fuse_inode *fi = get_fuse_inode(req->inode); - struct fuse_write_in *inarg = &req->misc.write.in; - __u64 data_size = req->num_pages * PAGE_SIZE; - bool queued; + struct fuse_writepage_args *aux, *next; + struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_args *args = &wpa->ia.ap.args; + __u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE; + int err; + fi->writectr++; if (inarg->offset + data_size <= size) { inarg->size = data_size; } else if (inarg->offset < size) { @@ -1557,29 +1625,36 @@ __acquires(fi->lock) goto out_free; } - req->in.args[1].size = inarg->size; - queued = fuse_request_queue_background(fc, req); + args->in_args[1].size = inarg->size; + args->force = true; + args->nocreds = true; + + err = fuse_simple_background(fc, args, GFP_ATOMIC); + if (err == -ENOMEM) { + spin_unlock(&fi->lock); + err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL); + spin_lock(&fi->lock); + } + /* Fails on broken connection only */ - if (unlikely(!queued)) + if (unlikely(err)) goto out_free; - fi->writectr++; return; out_free: - fuse_writepage_finish(fc, req); + fi->writectr--; + fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); /* After fuse_writepage_finish() aux request list is private */ - for (aux = req->misc.write.next; aux; aux = next) { - next = aux->misc.write.next; - aux->misc.write.next = NULL; - fuse_writepage_free(fc, aux); - fuse_put_request(fc, aux); + for (aux = wpa->next; aux; aux = next) { + next = aux->next; + aux->next = NULL; + fuse_writepage_free(aux); } - fuse_writepage_free(fc, req); - fuse_put_request(fc, req); + fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1596,29 +1671,34 @@ __acquires(fi->lock) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); loff_t crop = i_size_read(inode); - struct fuse_req *req; + struct fuse_writepage_args *wpa; while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { - req = list_entry(fi->queued_writes.next, struct fuse_req, list); - list_del_init(&req->list); - fuse_send_writepage(fc, req, crop); + wpa = list_entry(fi->queued_writes.next, + struct fuse_writepage_args, queue_entry); + list_del_init(&wpa->queue_entry); + fuse_send_writepage(fc, wpa, crop); } } -static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args, + int error) { - struct inode *inode = req->inode; + struct fuse_writepage_args *wpa = + container_of(args, typeof(*wpa), ia.ap.args); + struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - mapping_set_error(inode->i_mapping, req->out.h.error); + mapping_set_error(inode->i_mapping, error); spin_lock(&fi->lock); - while (req->misc.write.next) { + while (wpa->next) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_write_in *inarg = &req->misc.write.in; - struct fuse_req *next = req->misc.write.next; - req->misc.write.next = next->misc.write.next; - next->misc.write.next = NULL; - next->ff = fuse_file_get(req->ff); + struct fuse_write_in *inarg = &wpa->ia.write.in; + struct fuse_writepage_args *next = wpa->next; + + wpa->next = next->next; + next->next = NULL; + next->ia.ff = fuse_file_get(wpa->ia.ff); list_add(&next->writepages_entry, &fi->writepages); /* @@ -1647,9 +1727,9 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_send_writepage(fc, next, inarg->offset + inarg->size); } fi->writectr--; - fuse_writepage_finish(fc, req); + fuse_writepage_finish(fc, wpa); spin_unlock(&fi->lock); - fuse_writepage_free(fc, req); + fuse_writepage_free(wpa); } static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, @@ -1691,52 +1771,71 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } +static struct fuse_writepage_args *fuse_writepage_args_alloc(void) +{ + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; + + wpa = kzalloc(sizeof(*wpa), GFP_NOFS); + if (wpa) { + ap = &wpa->ia.ap; + ap->num_pages = 0; + ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs); + if (!ap->pages) { + kfree(wpa); + wpa = NULL; + } + } + return wpa; + +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - struct fuse_req *req; + struct fuse_writepage_args *wpa; + struct fuse_args_pages *ap; struct page *tmp_page; int error = -ENOMEM; set_page_writeback(page); - req = fuse_request_alloc_nofs(1); - if (!req) + wpa = fuse_writepage_args_alloc(); + if (!wpa) goto err; + ap = &wpa->ia.ap; - /* writeback always goes to bg_queue */ - __set_bit(FR_BACKGROUND, &req->flags); tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; error = -EIO; - req->ff = fuse_write_file_get(fc, fi); - if (!req->ff) + wpa->ia.ff = fuse_write_file_get(fc, fi); + if (!wpa->ia.ff) goto err_nofile; - fuse_write_fill(req, req->ff, page_offset(page), 0); + fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0); copy_highpage(tmp_page, page); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - req->num_pages = 1; - req->pages[0] = tmp_page; - req->page_descs[0].offset = 0; - req->page_descs[0].length = PAGE_SIZE; - req->end = fuse_writepage_end; - req->inode = inode; + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->num_pages = 1; + ap->pages[0] = tmp_page; + ap->descs[0].offset = 0; + ap->descs[0].length = PAGE_SIZE; + ap->args.end = fuse_writepage_end; + wpa->inode = inode; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fi->lock); - list_add(&req->writepages_entry, &fi->writepages); - list_add_tail(&req->list, &fi->queued_writes); + list_add(&wpa->writepages_entry, &fi->writepages); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1747,7 +1846,7 @@ static int fuse_writepage_locked(struct page *page) err_nofile: __free_page(tmp_page); err_free: - fuse_request_free(req); + kfree(wpa); err: mapping_set_error(page->mapping, error); end_page_writeback(page); @@ -1767,6 +1866,7 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) WARN_ON(wbc->sync_mode == WB_SYNC_ALL); redirty_page_for_writepage(wbc, page); + unlock_page(page); return 0; } @@ -1777,23 +1877,50 @@ static int fuse_writepage(struct page *page, struct writeback_control *wbc) } struct fuse_fill_wb_data { - struct fuse_req *req; + struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; struct page **orig_pages; + unsigned int max_pages; }; +static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) +{ + struct fuse_args_pages *ap = &data->wpa->ia.ap; + struct fuse_conn *fc = get_fuse_conn(data->inode); + struct page **pages; + struct fuse_page_desc *descs; + unsigned int npages = min_t(unsigned int, + max_t(unsigned int, data->max_pages * 2, + FUSE_DEFAULT_MAX_PAGES_PER_REQ), + fc->max_pages); + WARN_ON(npages <= data->max_pages); + + pages = fuse_pages_alloc(npages, GFP_NOFS, &descs); + if (!pages) + return false; + + memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages); + memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages); + kfree(ap->pages); + ap->pages = pages; + ap->descs = descs; + data->max_pages = npages; + + return true; +} + static void fuse_writepages_send(struct fuse_fill_wb_data *data) { - struct fuse_req *req = data->req; + struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_pages = req->num_pages; + int num_pages = wpa->ia.ap.num_pages; int i; - req->ff = fuse_file_get(data->ff); + wpa->ia.ff = fuse_file_get(data->ff); spin_lock(&fi->lock); - list_add_tail(&req->list, &fi->queued_writes); + list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); @@ -1808,54 +1935,52 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) * this new request onto the auxiliary list, otherwise reuse the existing one by * copying the new page contents over to the old temporary page. */ -static bool fuse_writepage_in_flight(struct fuse_req *new_req, +static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa, struct page *page) { - struct fuse_conn *fc = get_fuse_conn(new_req->inode); - struct fuse_inode *fi = get_fuse_inode(new_req->inode); - struct fuse_req *tmp; - struct fuse_req *old_req; + struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); + struct fuse_writepage_args *tmp; + struct fuse_writepage_args *old_wpa; + struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - WARN_ON(new_req->num_pages != 0); + WARN_ON(new_ap->num_pages != 0); spin_lock(&fi->lock); - list_del(&new_req->writepages_entry); - old_req = fuse_find_writeback(fi, page->index, page->index); - if (!old_req) { - list_add(&new_req->writepages_entry, &fi->writepages); + list_del(&new_wpa->writepages_entry); + old_wpa = fuse_find_writeback(fi, page->index, page->index); + if (!old_wpa) { + list_add(&new_wpa->writepages_entry, &fi->writepages); spin_unlock(&fi->lock); return false; } - new_req->num_pages = 1; - for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) { + new_ap->num_pages = 1; + for (tmp = old_wpa->next; tmp; tmp = tmp->next) { pgoff_t curr_index; - WARN_ON(tmp->inode != new_req->inode); - curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT; + WARN_ON(tmp->inode != new_wpa->inode); + curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; if (curr_index == page->index) { - WARN_ON(tmp->num_pages != 1); - WARN_ON(!test_bit(FR_PENDING, &tmp->flags)); - swap(tmp->pages[0], new_req->pages[0]); + WARN_ON(tmp->ia.ap.num_pages != 1); + swap(tmp->ia.ap.pages[0], new_ap->pages[0]); break; } } if (!tmp) { - new_req->misc.write.next = old_req->misc.write.next; - old_req->misc.write.next = new_req; + new_wpa->next = old_wpa->next; + old_wpa->next = new_wpa; } spin_unlock(&fi->lock); if (tmp) { - struct backing_dev_info *bdi = inode_to_bdi(new_req->inode); + struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode); dec_wb_stat(&bdi->wb, WB_WRITEBACK); - dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP); + dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP); wb_writeout_inc(&bdi->wb); - fuse_writepage_free(fc, new_req); - fuse_request_free(new_req); + fuse_writepage_free(new_wpa); } return true; @@ -1865,7 +1990,8 @@ static int fuse_writepages_fill(struct page *page, struct writeback_control *wbc, void *_data) { struct fuse_fill_wb_data *data = _data; - struct fuse_req *req = data->req; + struct fuse_writepage_args *wpa = data->wpa; + struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1875,7 +2001,7 @@ static int fuse_writepages_fill(struct page *page, if (!data->ff) { err = -EIO; - data->ff = fuse_write_file_get(fc, get_fuse_inode(inode)); + data->ff = fuse_write_file_get(fc, fi); if (!data->ff) goto out_unlock; } @@ -1888,16 +2014,16 @@ static int fuse_writepages_fill(struct page *page, */ is_writeback = fuse_page_is_writeback(inode, page->index); - if (req && req->num_pages && - (is_writeback || req->num_pages == fc->max_pages || - (req->num_pages + 1) * PAGE_SIZE > fc->max_write || - data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) { + if (wpa && ap->num_pages && + (is_writeback || ap->num_pages == fc->max_pages || + (ap->num_pages + 1) * PAGE_SIZE > fc->max_write || + data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) { fuse_writepages_send(data); - data->req = NULL; - } else if (req && req->num_pages == req->max_pages) { - if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) { + data->wpa = NULL; + } else if (wpa && ap->num_pages == data->max_pages) { + if (!fuse_pages_realloc(data)) { fuse_writepages_send(data); - req = data->req = NULL; + data->wpa = NULL; } } @@ -1915,59 +2041,58 @@ static int fuse_writepages_fill(struct page *page, * This is ensured by holding the page lock in page_mkwrite() while * checking fuse_page_is_writeback(). We already hold the page lock * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment req->num_pages. + * request to the fi->writepages list and increment ap->num_pages. * After this fuse_page_is_writeback() will indicate that the page is * under writeback, so we can release the page lock. */ - if (data->req == NULL) { - struct fuse_inode *fi = get_fuse_inode(inode); - + if (data->wpa == NULL) { err = -ENOMEM; - req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES); - if (!req) { + wpa = fuse_writepage_args_alloc(); + if (!wpa) { __free_page(tmp_page); goto out_unlock; } + data->max_pages = 1; - fuse_write_fill(req, data->ff, page_offset(page), 0); - req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; - req->misc.write.next = NULL; - req->in.argpages = 1; - __set_bit(FR_BACKGROUND, &req->flags); - req->num_pages = 0; - req->end = fuse_writepage_end; - req->inode = inode; + ap = &wpa->ia.ap; + fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0); + wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE; + wpa->next = NULL; + ap->args.in_pages = true; + ap->args.end = fuse_writepage_end; + ap->num_pages = 0; + wpa->inode = inode; spin_lock(&fi->lock); - list_add(&req->writepages_entry, &fi->writepages); + list_add(&wpa->writepages_entry, &fi->writepages); spin_unlock(&fi->lock); - data->req = req; + data->wpa = wpa; } set_page_writeback(page); copy_highpage(tmp_page, page); - req->pages[req->num_pages] = tmp_page; - req->page_descs[req->num_pages].offset = 0; - req->page_descs[req->num_pages].length = PAGE_SIZE; + ap->pages[ap->num_pages] = tmp_page; + ap->descs[ap->num_pages].offset = 0; + ap->descs[ap->num_pages].length = PAGE_SIZE; inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; - if (is_writeback && fuse_writepage_in_flight(req, page)) { + if (is_writeback && fuse_writepage_in_flight(wpa, page)) { end_page_writeback(page); - data->req = NULL; + data->wpa = NULL; goto out_unlock; } - data->orig_pages[req->num_pages] = page; + data->orig_pages[ap->num_pages] = page; /* * Protected by fi->lock against concurrent access by * fuse_page_is_writeback(). */ spin_lock(&fi->lock); - req->num_pages++; + ap->num_pages++; spin_unlock(&fi->lock); out_unlock: @@ -1989,7 +2114,7 @@ static int fuse_writepages(struct address_space *mapping, goto out; data.inode = inode; - data.req = NULL; + data.wpa = NULL; data.ff = NULL; err = -ENOMEM; @@ -2000,9 +2125,9 @@ static int fuse_writepages(struct address_space *mapping, goto out; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); - if (data.req) { + if (data.wpa) { /* Ignore errors if we can write at least one page */ - BUG_ON(!data.req->num_pages); + WARN_ON(!data.wpa->ia.ap.num_pages); fuse_writepages_send(&data); err = 0; } @@ -2222,11 +2347,11 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file, inarg->lk.pid = pid; if (flock) inarg->lk_flags |= FUSE_LK_FLOCK; - args->in.h.opcode = opcode; - args->in.h.nodeid = get_node_id(inode); - args->in.numargs = 1; - args->in.args[0].size = sizeof(*inarg); - args->in.args[0].value = inarg; + args->opcode = opcode; + args->nodeid = get_node_id(inode); + args->in_numargs = 1; + args->in_args[0].size = sizeof(*inarg); + args->in_args[0].value = inarg; } static int fuse_getlk(struct file *file, struct file_lock *fl) @@ -2239,9 +2364,9 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) int err; fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) err = convert_fuse_file_lock(fc, &outarg.lk, fl); @@ -2336,14 +2461,14 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) memset(&inarg, 0, sizeof(inarg)); inarg.block = block; inarg.blocksize = inode->i_sb->s_blocksize; - args.in.h.opcode = FUSE_BMAP; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_BMAP; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) fc->no_bmap = 1; @@ -2368,14 +2493,14 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) if (fc->no_lseek) goto fallback; - args.in.h.opcode = FUSE_LSEEK; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_LSEEK; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err) { if (err == -ENOSYS) { @@ -2573,14 +2698,14 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, .flags = flags }; struct fuse_ioctl_out outarg; - struct fuse_req *req = NULL; - struct page **pages = NULL; struct iovec *iov_page = NULL; struct iovec *in_iov = NULL, *out_iov = NULL; - unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; - size_t in_size, out_size, transferred, c; + unsigned int in_iovs = 0, out_iovs = 0, max_pages; + size_t in_size, out_size, c; + ssize_t transferred; int err, i; struct iov_iter ii; + struct fuse_args_pages ap = {}; #if BITS_PER_LONG == 32 inarg.flags |= FUSE_IOCTL_32BIT; @@ -2598,11 +2723,13 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); err = -ENOMEM; - pages = kcalloc(fc->max_pages, sizeof(pages[0]), GFP_KERNEL); + ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs); iov_page = (struct iovec *) __get_free_page(GFP_KERNEL); - if (!pages || !iov_page) + if (!ap.pages || !iov_page) goto out; + fuse_page_descs_length_init(ap.descs, 0, fc->max_pages); + /* * If restricted, initialize IO parameters as encoded in @cmd. * RETRY from server is not allowed. @@ -2639,56 +2766,44 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -ENOMEM; if (max_pages > fc->max_pages) goto out; - while (num_pages < max_pages) { - pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); - if (!pages[num_pages]) + while (ap.num_pages < max_pages) { + ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!ap.pages[ap.num_pages]) goto out; - num_pages++; + ap.num_pages++; } - req = fuse_get_req(fc, num_pages); - if (IS_ERR(req)) { - err = PTR_ERR(req); - req = NULL; - goto out; - } - memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); - req->num_pages = num_pages; - fuse_page_descs_length_init(req, 0, req->num_pages); /* okay, let's send it to the client */ - req->in.h.opcode = FUSE_IOCTL; - req->in.h.nodeid = ff->nodeid; - req->in.numargs = 1; - req->in.args[0].size = sizeof(inarg); - req->in.args[0].value = &inarg; + ap.args.opcode = FUSE_IOCTL; + ap.args.nodeid = ff->nodeid; + ap.args.in_numargs = 1; + ap.args.in_args[0].size = sizeof(inarg); + ap.args.in_args[0].value = &inarg; if (in_size) { - req->in.numargs++; - req->in.args[1].size = in_size; - req->in.argpages = 1; + ap.args.in_numargs++; + ap.args.in_args[1].size = in_size; + ap.args.in_pages = true; err = -EFAULT; iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_from_iter(pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } } - req->out.numargs = 2; - req->out.args[0].size = sizeof(outarg); - req->out.args[0].value = &outarg; - req->out.args[1].size = out_size; - req->out.argpages = 1; - req->out.argvar = 1; + ap.args.out_numargs = 2; + ap.args.out_args[0].size = sizeof(outarg); + ap.args.out_args[0].value = &outarg; + ap.args.out_args[1].size = out_size; + ap.args.out_pages = true; + ap.args.out_argvar = true; - fuse_request_send(fc, req); - err = req->out.h.error; - transferred = req->out.args[1].size; - fuse_put_request(fc, req); - req = NULL; - if (err) + transferred = fuse_simple_request(fc, &ap.args); + err = transferred; + if (transferred < 0) goto out; /* did it ask for retry? */ @@ -2713,7 +2828,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_atomic(pages[0]); + vaddr = kmap_atomic(ap.pages[0]); err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); @@ -2741,19 +2856,17 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, err = -EFAULT; iov_iter_init(&ii, READ, out_iov, out_iovs, transferred); - for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= num_pages); i++) { - c = copy_page_to_iter(pages[i], 0, PAGE_SIZE, &ii); + for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) { + c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii); if (c != PAGE_SIZE && iov_iter_count(&ii)) goto out; } err = 0; out: - if (req) - fuse_put_request(fc, req); free_page((unsigned long) iov_page); - while (num_pages) - __free_page(pages[--num_pages]); - kfree(pages); + while (ap.num_pages) + __free_page(ap.pages[--ap.num_pages]); + kfree(ap.pages); return err ? err : outarg.result; } @@ -2861,14 +2974,14 @@ __poll_t fuse_file_poll(struct file *file, poll_table *wait) fuse_register_polled_file(fc, ff); } - args.in.h.opcode = FUSE_POLL; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_POLL; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) @@ -3076,11 +3189,11 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - args.in.h.opcode = FUSE_FALLOCATE; - args.in.h.nodeid = ff->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_FALLOCATE; + args.nodeid = ff->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_fallocate = 1; @@ -3168,14 +3281,14 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.in.h.opcode = FUSE_COPY_FILE_RANGE; - args.in.h.nodeid = ff_in->nodeid; - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.opcode = FUSE_COPY_FILE_RANGE; + args.nodeid = ff_in->nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_copy_file_range = 1; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 24dbca777775..d148188cfca4 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -47,9 +47,6 @@ /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 -/** Number of page pointers embedded in fuse_req */ -#define FUSE_REQ_INLINE_PAGES 1 - /** List of active connections */ extern struct list_head fuse_conn_list; @@ -164,17 +161,15 @@ enum { }; struct fuse_conn; +struct fuse_release_args; /** FUSE specific file data */ struct fuse_file { /** Fuse connection for this file */ struct fuse_conn *fc; - /* - * Request reserved for flush and release. - * Modified under relative fuse_inode::lock. - */ - struct fuse_req *reserved_req; + /* Argument space reserved for release */ + struct fuse_release_args *release_args; /** Kernel file handle guaranteed to be unique */ u64 kh; @@ -229,57 +224,12 @@ struct fuse_in_arg { const void *value; }; -/** The request input */ -struct fuse_in { - /** The request header */ - struct fuse_in_header h; - - /** True if the data for the last argument is in req->pages */ - unsigned argpages:1; - - /** Number of arguments */ - unsigned numargs; - - /** Array of arguments */ - struct fuse_in_arg args[3]; -}; - /** One output argument of a request */ struct fuse_arg { unsigned size; void *value; }; -/** The request output */ -struct fuse_out { - /** Header returned from userspace */ - struct fuse_out_header h; - - /* - * The following bitfields are not changed during the request - * processing - */ - - /** Last argument is variable length (can be shorter than - arg->size) */ - unsigned argvar:1; - - /** Last argument is a list of pages to copy data to */ - unsigned argpages:1; - - /** Zero partially or not copied pages */ - unsigned page_zeroing:1; - - /** Pages may be replaced with new ones */ - unsigned page_replace:1; - - /** Number or arguments */ - unsigned numargs; - - /** Array of arguments */ - struct fuse_arg args[2]; -}; - /** FUSE page descriptor */ struct fuse_page_desc { unsigned int length; @@ -287,20 +237,28 @@ struct fuse_page_desc { }; struct fuse_args { - struct { - struct { - uint32_t opcode; - uint64_t nodeid; - } h; - unsigned numargs; - struct fuse_in_arg args[3]; + uint64_t nodeid; + uint32_t opcode; + unsigned short in_numargs; + unsigned short out_numargs; + bool force:1; + bool noreply:1; + bool nocreds:1; + bool in_pages:1; + bool out_pages:1; + bool out_argvar:1; + bool page_zeroing:1; + bool page_replace:1; + struct fuse_in_arg in_args[3]; + struct fuse_arg out_args[2]; + void (*end)(struct fuse_conn *fc, struct fuse_args *args, int error); +}; - } in; - struct { - unsigned argvar:1; - unsigned numargs; - struct fuse_arg args[2]; - } out; +struct fuse_args_pages { + struct fuse_args args; + struct page **pages; + struct fuse_page_desc *descs; + unsigned int num_pages; }; #define FUSE_ARGS(args) struct fuse_args args = {} @@ -373,83 +331,79 @@ struct fuse_req { /** Entry on the interrupts list */ struct list_head intr_entry; + /* Input/output arguments */ + struct fuse_args *args; + /** refcount */ refcount_t count; /* Request flags, updated with test/set/clear_bit() */ unsigned long flags; - /** The request input */ - struct fuse_in in; + /* The request input header */ + struct { + struct fuse_in_header h; + } in; - /** The request output */ - struct fuse_out out; + /* The request output header */ + struct { + struct fuse_out_header h; + } out; /** Used to wake up the task waiting for completion of request*/ wait_queue_head_t waitq; - /** Data for asynchronous requests */ - union { - struct { - struct fuse_release_in in; - struct inode *inode; - } release; - struct fuse_init_in init_in; - struct fuse_init_out init_out; - struct cuse_init_in cuse_init_in; - struct { - struct fuse_read_in in; - u64 attr_ver; - } read; - struct { - struct fuse_write_in in; - struct fuse_write_out out; - struct fuse_req *next; - } write; - struct fuse_notify_retrieve_in retrieve_in; - } misc; - - /** page vector */ - struct page **pages; - - /** page-descriptor vector */ - struct fuse_page_desc *page_descs; - - /** size of the 'pages' array */ - unsigned max_pages; - - /** inline page vector */ - struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; - - /** inline page-descriptor vector */ - struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; - - /** number of pages in vector */ - unsigned num_pages; - - /** File used in the request (or NULL) */ - struct fuse_file *ff; +#if IS_ENABLED(CONFIG_VIRTIO_FS) + /** virtio-fs's physically contiguous buffer for in and out args */ + void *argbuf; +#endif +}; - /** Inode used in the request or NULL */ - struct inode *inode; +struct fuse_iqueue; - /** AIO control block */ - struct fuse_io_priv *io; +/** + * Input queue callbacks + * + * Input queue signalling is device-specific. For example, the /dev/fuse file + * uses fiq->waitq and fasync to wake processes that are waiting on queue + * readiness. These callbacks allow other device types to respond to input + * queue activity. + */ +struct fuse_iqueue_ops { + /** + * Signal that a forget has been queued + */ + void (*wake_forget_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); - /** Link on fi->writepages */ - struct list_head writepages_entry; + /** + * Signal that an INTERRUPT request has been queued + */ + void (*wake_interrupt_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); - /** Request completion callback */ - void (*end)(struct fuse_conn *, struct fuse_req *); + /** + * Signal that a request has been queued + */ + void (*wake_pending_and_unlock)(struct fuse_iqueue *fiq) + __releases(fiq->lock); - /** Request is stolen from fuse_file->reserved_req */ - struct file *stolen_file; + /** + * Clean up when fuse_iqueue is destroyed + */ + void (*release)(struct fuse_iqueue *fiq); }; +/** /dev/fuse input queue operations */ +extern const struct fuse_iqueue_ops fuse_dev_fiq_ops; + struct fuse_iqueue { /** Connection established */ unsigned connected; + /** Lock protecting accesses to members of this structure */ + spinlock_t lock; + /** Readers of the connection are waiting on this */ wait_queue_head_t waitq; @@ -471,6 +425,12 @@ struct fuse_iqueue { /** O_ASYNC requests */ struct fasync_struct *fasync; + + /** Device-specific callbacks */ + const struct fuse_iqueue_ops *ops; + + /** Device-specific state */ + void *priv; }; #define FUSE_PQ_HASH_BITS 8 @@ -504,6 +464,30 @@ struct fuse_dev { struct list_head entry; }; +struct fuse_fs_context { + int fd; + unsigned int rootmode; + kuid_t user_id; + kgid_t group_id; + bool is_bdev:1; + bool fd_present:1; + bool rootmode_present:1; + bool user_id_present:1; + bool group_id_present:1; + bool default_permissions:1; + bool allow_other:1; + bool destroy:1; + bool no_control:1; + bool no_force_umount:1; + bool no_mount_options:1; + unsigned int max_read; + unsigned int blksize; + const char *subtype; + + /* fuse_dev pointer to fill in, should contain NULL on entry */ + void **fudptr; +}; + /** * A Fuse connection. * @@ -584,9 +568,6 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; - /** waitq for reserved requests */ - wait_queue_head_t reserved_req_waitq; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -721,6 +702,21 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? */ unsigned no_copy_file_range:1; + /* Send DESTROY request */ + unsigned int destroy:1; + + /* Delete dentries that have gone stale */ + unsigned int delete_stale:1; + + /** Do not create entry in fusectl fs */ + unsigned int no_control:1; + + /** Do not allow MNT_FORCE umount */ + unsigned int no_force_umount:1; + + /* Do not show mount options */ + unsigned int no_mount_options:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -742,9 +738,6 @@ struct fuse_conn { /** Key for lock owner ID scrambling */ u32 scramble_key[4]; - /** Reserved request for the DESTROY message */ - struct fuse_req *destroy_req; - /** Version counter for attribute changes */ atomic64_t attr_version; @@ -820,14 +813,32 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, struct fuse_forget_link *fuse_alloc_forget(void); -/* Used by READDIRPLUS */ -void fuse_force_forget(struct file *file, u64 nodeid); +struct fuse_forget_link *fuse_dequeue_forget(struct fuse_iqueue *fiq, + unsigned int max, + unsigned int *countp); -/** +/* * Initialize READ or READDIR request */ -void fuse_read_fill(struct fuse_req *req, struct file *file, - loff_t pos, size_t count, int opcode); +struct fuse_io_args { + union { + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; + struct { + struct fuse_write_in in; + struct fuse_write_out out; + } write; + }; + struct fuse_args_pages ap; + struct fuse_io_priv *io; + struct fuse_file *ff; +}; + +void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, + size_t count, int opcode); + /** * Send OPEN or OPENDIR request @@ -900,61 +911,16 @@ int fuse_ctl_init(void); void __exit fuse_ctl_cleanup(void); /** - * Allocate a request - */ -struct fuse_req *fuse_request_alloc(unsigned npages); - -struct fuse_req *fuse_request_alloc_nofs(unsigned npages); - -bool fuse_req_realloc_pages(struct fuse_conn *fc, struct fuse_req *req, - gfp_t flags); - - -/** - * Free a request - */ -void fuse_request_free(struct fuse_req *req); - -/** - * Get a request, may fail with -ENOMEM, - * caller should specify # elements in req->pages[] explicitly - */ -struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); -struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc, - unsigned npages); - -/* - * Increment reference count on request - */ -void __fuse_get_request(struct fuse_req *req); - -/** - * Gets a requests for a file operation, always succeeds - */ -struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, - struct file *file); - -/** - * Decrement reference count of a request. If count goes to zero free - * the request. - */ -void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); - -/** - * Send a request (synchronous) - */ -void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); - -/** * Simple request sending that does request allocation and freeing */ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args); +int fuse_simple_background(struct fuse_conn *fc, struct fuse_args *args, + gfp_t gfp_flags); /** - * Send a request in the background + * End a finished request */ -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); -bool fuse_request_queue_background(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_end(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -980,15 +946,33 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** * Initialize fuse_conn */ -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns); +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv); /** * Release reference to fuse_conn */ void fuse_conn_put(struct fuse_conn *fc); -struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(void); +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); +void fuse_send_init(struct fuse_conn *fc); + +/** + * Fill in superblock and initialize fuse connection + * @sb: partially-initialized superblock to fill in + * @ctx: mount context + */ +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx); + +/** + * Disassociate fuse connection from superblock and kill the superblock + * + * Calls kill_anon_super(), do not use with bdev mounts. + */ +void fuse_kill_sb_anon(struct super_block *sb); /** * Add connection to control filesystem @@ -1093,4 +1077,15 @@ int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); /* readdir.c */ int fuse_readdir(struct file *file, struct dir_context *ctx); +/** + * Return the number of bytes in an arguments list + */ +unsigned int fuse_len_args(unsigned int numargs, struct fuse_arg *args); + +/** + * Get the next unique ID for a request + */ +u64 fuse_get_unique(struct fuse_iqueue *fiq); +void fuse_free_conn(struct fuse_conn *fc); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 4bb885b0f032..16aec32f7f3d 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -15,7 +15,8 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/moduleparam.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/statfs.h> #include <linux/random.h> #include <linux/sched.h> @@ -59,24 +60,13 @@ MODULE_PARM_DESC(max_user_congthresh, /** Congestion starts at 75% of maximum */ #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) -struct fuse_mount_data { - int fd; - unsigned rootmode; - kuid_t user_id; - kgid_t group_id; - unsigned fd_present:1; - unsigned rootmode_present:1; - unsigned user_id_present:1; - unsigned group_id_present:1; - unsigned default_permissions:1; - unsigned allow_other:1; - unsigned max_read; - unsigned blksize; -}; +#ifdef CONFIG_BLOCK +static struct file_system_type fuseblk_fs_type; +#endif struct fuse_forget_link *fuse_alloc_forget(void) { - return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL); + return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL_ACCOUNT); } static struct inode *fuse_alloc_inode(struct super_block *sb) @@ -374,19 +364,21 @@ void fuse_unlock_inode(struct inode *inode, bool locked) static void fuse_umount_begin(struct super_block *sb) { - fuse_abort_conn(get_fuse_conn_super(sb)); + struct fuse_conn *fc = get_fuse_conn_super(sb); + + if (!fc->no_force_umount) + fuse_abort_conn(fc); } static void fuse_send_destroy(struct fuse_conn *fc) { - struct fuse_req *req = fc->destroy_req; - if (req && fc->conn_init) { - fc->destroy_req = NULL; - req->in.h.opcode = FUSE_DESTROY; - __set_bit(FR_FORCE, &req->flags); - __clear_bit(FR_BACKGROUND, &req->flags); - fuse_request_send(fc, req); - fuse_put_request(fc, req); + if (fc->conn_init) { + FUSE_ARGS(args); + + args.opcode = FUSE_DESTROY; + args.force = true; + args.nocreds = true; + fuse_simple_request(fc, &args); } } @@ -430,12 +422,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) } memset(&outarg, 0, sizeof(outarg)); - args.in.numargs = 0; - args.in.h.opcode = FUSE_STATFS; - args.in.h.nodeid = get_node_id(d_inode(dentry)); - args.out.numargs = 1; - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.in_numargs = 0; + args.opcode = FUSE_STATFS; + args.nodeid = get_node_id(d_inode(dentry)); + args.out_numargs = 1; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) convert_fuse_statfs(buf, &outarg.st); @@ -443,6 +435,8 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) } enum { + OPT_SOURCE, + OPT_SUBTYPE, OPT_FD, OPT_ROOTMODE, OPT_USER_ID, @@ -454,111 +448,109 @@ enum { OPT_ERR }; -static const match_table_t tokens = { - {OPT_FD, "fd=%u"}, - {OPT_ROOTMODE, "rootmode=%o"}, - {OPT_USER_ID, "user_id=%u"}, - {OPT_GROUP_ID, "group_id=%u"}, - {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, - {OPT_ALLOW_OTHER, "allow_other"}, - {OPT_MAX_READ, "max_read=%u"}, - {OPT_BLKSIZE, "blksize=%u"}, - {OPT_ERR, NULL} +static const struct fs_parameter_spec fuse_param_specs[] = { + fsparam_string ("source", OPT_SOURCE), + fsparam_u32 ("fd", OPT_FD), + fsparam_u32oct ("rootmode", OPT_ROOTMODE), + fsparam_u32 ("user_id", OPT_USER_ID), + fsparam_u32 ("group_id", OPT_GROUP_ID), + fsparam_flag ("default_permissions", OPT_DEFAULT_PERMISSIONS), + fsparam_flag ("allow_other", OPT_ALLOW_OTHER), + fsparam_u32 ("max_read", OPT_MAX_READ), + fsparam_u32 ("blksize", OPT_BLKSIZE), + fsparam_string ("subtype", OPT_SUBTYPE), + {} +}; + +static const struct fs_parameter_description fuse_fs_parameters = { + .name = "fuse", + .specs = fuse_param_specs, }; -static int fuse_match_uint(substring_t *s, unsigned int *res) +static int fuse_parse_param(struct fs_context *fc, struct fs_parameter *param) { - int err = -ENOMEM; - char *buf = match_strdup(s); - if (buf) { - err = kstrtouint(buf, 10, res); - kfree(buf); + struct fs_parse_result result; + struct fuse_fs_context *ctx = fc->fs_private; + int opt; + + opt = fs_parse(fc, &fuse_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case OPT_SOURCE: + if (fc->source) + return invalf(fc, "fuse: Multiple sources specified"); + fc->source = param->string; + param->string = NULL; + break; + + case OPT_SUBTYPE: + if (ctx->subtype) + return invalf(fc, "fuse: Multiple subtypes specified"); + ctx->subtype = param->string; + param->string = NULL; + return 0; + + case OPT_FD: + ctx->fd = result.uint_32; + ctx->fd_present = 1; + break; + + case OPT_ROOTMODE: + if (!fuse_valid_type(result.uint_32)) + return invalf(fc, "fuse: Invalid rootmode"); + ctx->rootmode = result.uint_32; + ctx->rootmode_present = 1; + break; + + case OPT_USER_ID: + ctx->user_id = make_kuid(fc->user_ns, result.uint_32); + if (!uid_valid(ctx->user_id)) + return invalf(fc, "fuse: Invalid user_id"); + ctx->user_id_present = 1; + break; + + case OPT_GROUP_ID: + ctx->group_id = make_kgid(fc->user_ns, result.uint_32); + if (!gid_valid(ctx->group_id)) + return invalf(fc, "fuse: Invalid group_id"); + ctx->group_id_present = 1; + break; + + case OPT_DEFAULT_PERMISSIONS: + ctx->default_permissions = 1; + break; + + case OPT_ALLOW_OTHER: + ctx->allow_other = 1; + break; + + case OPT_MAX_READ: + ctx->max_read = result.uint_32; + break; + + case OPT_BLKSIZE: + if (!ctx->is_bdev) + return invalf(fc, "fuse: blksize only supported for fuseblk"); + ctx->blksize = result.uint_32; + break; + + default: + return -EINVAL; } - return err; + + return 0; } -static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev, - struct user_namespace *user_ns) +static void fuse_free_fc(struct fs_context *fc) { - char *p; - memset(d, 0, sizeof(struct fuse_mount_data)); - d->max_read = ~0; - d->blksize = FUSE_DEFAULT_BLKSIZE; - - while ((p = strsep(&opt, ",")) != NULL) { - int token; - int value; - unsigned uv; - substring_t args[MAX_OPT_ARGS]; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case OPT_FD: - if (match_int(&args[0], &value)) - return 0; - d->fd = value; - d->fd_present = 1; - break; - - case OPT_ROOTMODE: - if (match_octal(&args[0], &value)) - return 0; - if (!fuse_valid_type(value)) - return 0; - d->rootmode = value; - d->rootmode_present = 1; - break; - - case OPT_USER_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->user_id = make_kuid(user_ns, uv); - if (!uid_valid(d->user_id)) - return 0; - d->user_id_present = 1; - break; - - case OPT_GROUP_ID: - if (fuse_match_uint(&args[0], &uv)) - return 0; - d->group_id = make_kgid(user_ns, uv); - if (!gid_valid(d->group_id)) - return 0; - d->group_id_present = 1; - break; - - case OPT_DEFAULT_PERMISSIONS: - d->default_permissions = 1; - break; - - case OPT_ALLOW_OTHER: - d->allow_other = 1; - break; - - case OPT_MAX_READ: - if (match_int(&args[0], &value)) - return 0; - d->max_read = value; - break; - - case OPT_BLKSIZE: - if (!is_bdev || match_int(&args[0], &value)) - return 0; - d->blksize = value; - break; - - default: - return 0; - } - } + struct fuse_fs_context *ctx = fc->fs_private; - if (!d->fd_present || !d->rootmode_present || - !d->user_id_present || !d->group_id_present) - return 0; - - return 1; + if (ctx) { + kfree(ctx->subtype); + kfree(ctx); + } } static int fuse_show_options(struct seq_file *m, struct dentry *root) @@ -566,6 +558,9 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) struct super_block *sb = root->d_sb; struct fuse_conn *fc = get_fuse_conn_super(sb); + if (fc->no_mount_options) + return 0; + seq_printf(m, ",user_id=%u", from_kuid_munged(fc->user_ns, fc->user_id)); seq_printf(m, ",group_id=%u", from_kgid_munged(fc->user_ns, fc->group_id)); if (fc->default_permissions) @@ -579,14 +574,19 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void fuse_iqueue_init(struct fuse_iqueue *fiq) +static void fuse_iqueue_init(struct fuse_iqueue *fiq, + const struct fuse_iqueue_ops *ops, + void *priv) { memset(fiq, 0, sizeof(struct fuse_iqueue)); + spin_lock_init(&fiq->lock); init_waitqueue_head(&fiq->waitq); INIT_LIST_HEAD(&fiq->pending); INIT_LIST_HEAD(&fiq->interrupts); fiq->forget_list_tail = &fiq->forget_list_head; fiq->connected = 1; + fiq->ops = ops; + fiq->priv = priv; } static void fuse_pqueue_init(struct fuse_pqueue *fpq) @@ -600,7 +600,8 @@ static void fuse_pqueue_init(struct fuse_pqueue *fpq) fpq->connected = 1; } -void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) +void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns, + const struct fuse_iqueue_ops *fiq_ops, void *fiq_priv) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); @@ -609,8 +610,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct user_namespace *user_ns) refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); - init_waitqueue_head(&fc->reserved_req_waitq); - fuse_iqueue_init(&fc->iq); + fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); INIT_LIST_HEAD(&fc->devices); @@ -633,8 +633,10 @@ EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { if (refcount_dec_and_test(&fc->count)) { - if (fc->destroy_req) - fuse_request_free(fc->destroy_req); + struct fuse_iqueue *fiq = &fc->iq; + + if (fiq->ops->release) + fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); put_user_ns(fc->user_ns); fc->release(fc); @@ -822,9 +824,12 @@ static const struct super_operations fuse_super_operations = { static void sanitize_global_limit(unsigned *limit) { + /* + * The default maximum number of async requests is calculated to consume + * 1/2^13 of the total memory, assuming 392 bytes per request. + */ if (*limit == 0) - *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / - sizeof(struct fuse_req); + *limit = ((totalram_pages() << PAGE_SHIFT) >> 13) / 392; if (*limit >= 1 << 16) *limit = (1 << 16) - 1; @@ -870,11 +875,19 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) spin_unlock(&fc->bg_lock); } -static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) +struct fuse_init_args { + struct fuse_args args; + struct fuse_init_in in; + struct fuse_init_out out; +}; + +static void process_init_reply(struct fuse_conn *fc, struct fuse_args *args, + int error) { - struct fuse_init_out *arg = &req->misc.init_out; + struct fuse_init_args *ia = container_of(args, typeof(*ia), args); + struct fuse_init_out *arg = &ia->out; - if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) + if (error || arg->major != FUSE_KERNEL_VERSION) fc->conn_error = 1; else { unsigned long ra_pages; @@ -951,18 +964,23 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } + kfree(ia); + fuse_set_initialized(fc); wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) +void fuse_send_init(struct fuse_conn *fc) { - struct fuse_init_in *arg = &req->misc.init_in; + struct fuse_init_args *ia; + + ia = kzalloc(sizeof(*ia), GFP_KERNEL | __GFP_NOFAIL); - arg->major = FUSE_KERNEL_VERSION; - arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; - arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | + ia->in.major = FUSE_KERNEL_VERSION; + ia->in.minor = FUSE_KERNEL_MINOR_VERSION; + ia->in.max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; + ia->in.flags |= + FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | @@ -971,26 +989,32 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA; - req->in.h.opcode = FUSE_INIT; - req->in.numargs = 1; - req->in.args[0].size = sizeof(*arg); - req->in.args[0].value = arg; - req->out.numargs = 1; + ia->args.opcode = FUSE_INIT; + ia->args.in_numargs = 1; + ia->args.in_args[0].size = sizeof(ia->in); + ia->args.in_args[0].value = &ia->in; + ia->args.out_numargs = 1; /* Variable length argument used for backward compatibility with interface version < 7.5. Rest of init_out is zeroed by do_get_request(), so a short reply is not a problem */ - req->out.argvar = 1; - req->out.args[0].size = sizeof(struct fuse_init_out); - req->out.args[0].value = &req->misc.init_out; - req->end = process_init_reply; - fuse_request_send_background(fc, req); + ia->args.out_argvar = 1; + ia->args.out_args[0].size = sizeof(ia->out); + ia->args.out_args[0].value = &ia->out; + ia->args.force = true; + ia->args.nocreds = true; + ia->args.end = process_init_reply; + + if (fuse_simple_background(fc, &ia->args, GFP_KERNEL) != 0) + process_init_reply(fc, &ia->args, -ENOTCONN); } +EXPORT_SYMBOL_GPL(fuse_send_init); -static void fuse_free_conn(struct fuse_conn *fc) +void fuse_free_conn(struct fuse_conn *fc) { WARN_ON(!list_empty(&fc->devices)); kfree_rcu(fc, rcu); } +EXPORT_SYMBOL_GPL(fuse_free_conn); static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { @@ -1032,7 +1056,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) return 0; } -struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +struct fuse_dev *fuse_dev_alloc(void) { struct fuse_dev *fud; struct list_head *pq; @@ -1048,16 +1072,33 @@ struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) } fud->pq.processing = pq; - fud->fc = fuse_conn_get(fc); fuse_pqueue_init(&fud->pq); + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc); + +void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc) +{ + fud->fc = fuse_conn_get(fc); spin_lock(&fc->lock); list_add_tail(&fud->entry, &fc->devices); spin_unlock(&fc->lock); +} +EXPORT_SYMBOL_GPL(fuse_dev_install); + +struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + fud = fuse_dev_alloc(); + if (!fud) + return NULL; + + fuse_dev_install(fud, fc); return fud; } -EXPORT_SYMBOL_GPL(fuse_dev_alloc); +EXPORT_SYMBOL_GPL(fuse_dev_alloc_install); void fuse_dev_free(struct fuse_dev *fud) { @@ -1075,17 +1116,13 @@ void fuse_dev_free(struct fuse_dev *fud) } EXPORT_SYMBOL_GPL(fuse_dev_free); -static int fuse_fill_super(struct super_block *sb, void *data, int silent) +int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) { struct fuse_dev *fud; - struct fuse_conn *fc; + struct fuse_conn *fc = get_fuse_conn_super(sb); struct inode *root; - struct fuse_mount_data d; - struct file *file; struct dentry *root_dentry; - struct fuse_req *init_req; int err; - int is_bdev = sb->s_bdev != NULL; err = -EINVAL; if (sb->s_flags & SB_MANDLOCK) @@ -1093,19 +1130,19 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); - if (!parse_fuse_opt(data, &d, is_bdev, sb->s_user_ns)) - goto err; - - if (is_bdev) { + if (ctx->is_bdev) { #ifdef CONFIG_BLOCK err = -EINVAL; - if (!sb_set_blocksize(sb, d.blksize)) + if (!sb_set_blocksize(sb, ctx->blksize)) goto err; #endif } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; } + + sb->s_subtype = ctx->subtype; + ctx->subtype = NULL; sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_xattr = fuse_xattr_handlers; @@ -1116,19 +1153,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_user_ns != &init_user_ns) sb->s_iflags |= SB_I_UNTRUSTED_MOUNTER; - file = fget(d.fd); - err = -EINVAL; - if (!file) - goto err; - - /* - * Require mount to happen from the same user namespace which - * opened /dev/fuse to prevent potential attacks. - */ - if (file->f_op != &fuse_dev_operations || - file->f_cred->user_ns != sb->s_user_ns) - goto err_fput; - /* * If we are not in the initial user namespace posix * acls must be translated. @@ -1136,17 +1160,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_user_ns != &init_user_ns) sb->s_xattr = fuse_no_acl_xattr_handlers; - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) - goto err_fput; - - fuse_conn_init(fc, sb->s_user_ns); - fc->release = fuse_free_conn; - - fud = fuse_dev_alloc(fc); + fud = fuse_dev_alloc_install(fc); if (!fud) - goto err_put_conn; + goto err; fc->dev = sb->s_dev; fc->sb = sb; @@ -1159,17 +1175,18 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= SB_POSIXACL; - fc->default_permissions = d.default_permissions; - fc->allow_other = d.allow_other; - fc->user_id = d.user_id; - fc->group_id = d.group_id; - fc->max_read = max_t(unsigned, 4096, d.max_read); - - /* Used by get_root_inode() */ - sb->s_fs_info = fc; + fc->default_permissions = ctx->default_permissions; + fc->allow_other = ctx->allow_other; + fc->user_id = ctx->user_id; + fc->group_id = ctx->group_id; + fc->max_read = max_t(unsigned, 4096, ctx->max_read); + fc->destroy = ctx->destroy; + fc->no_control = ctx->no_control; + fc->no_force_umount = ctx->no_force_umount; + fc->no_mount_options = ctx->no_mount_options; err = -ENOMEM; - root = fuse_get_root_inode(sb, d.rootmode); + root = fuse_get_root_inode(sb, ctx->rootmode); sb->s_d_op = &fuse_root_dentry_operations; root_dentry = d_make_root(root); if (!root_dentry) @@ -1177,20 +1194,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) /* Root dentry doesn't have .d_revalidate */ sb->s_d_op = &fuse_dentry_operations; - init_req = fuse_request_alloc(0); - if (!init_req) - goto err_put_root; - __set_bit(FR_BACKGROUND, &init_req->flags); - - if (is_bdev) { - fc->destroy_req = fuse_request_alloc(0); - if (!fc->destroy_req) - goto err_free_init_req; - } - mutex_lock(&fuse_mutex); err = -EINVAL; - if (file->private_data) + if (*ctx->fudptr) goto err_unlock; err = fuse_ctl_add_conn(fc); @@ -1199,27 +1205,62 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - file->private_data = fud; + *ctx->fudptr = fud; mutex_unlock(&fuse_mutex); + return 0; + + err_unlock: + mutex_unlock(&fuse_mutex); + dput(root_dentry); + err_dev_free: + fuse_dev_free(fud); + err: + return err; +} +EXPORT_SYMBOL_GPL(fuse_fill_super_common); + +static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) +{ + struct fuse_fs_context *ctx = fsc->fs_private; + struct file *file; + int err; + struct fuse_conn *fc; + + err = -EINVAL; + file = fget(ctx->fd); + if (!file) + goto err; + + /* + * Require mount to happen from the same user namespace which + * opened /dev/fuse to prevent potential attacks. + */ + if ((file->f_op != &fuse_dev_operations) || + (file->f_cred->user_ns != sb->s_user_ns)) + goto err_fput; + ctx->fudptr = &file->private_data; + + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + err = -ENOMEM; + if (!fc) + goto err_fput; + + fuse_conn_init(fc, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); + fc->release = fuse_free_conn; + sb->s_fs_info = fc; + + err = fuse_fill_super_common(sb, ctx); + if (err) + goto err_put_conn; /* * atomic_dec_and_test() in fput() provides the necessary * memory barrier for file->private_data to be visible on all * CPUs after this */ fput(file); - - fuse_send_init(fc, init_req); - + fuse_send_init(get_fuse_conn_super(sb)); return 0; - err_unlock: - mutex_unlock(&fuse_mutex); - err_free_init_req: - fuse_request_free(init_req); - err_put_root: - dput(root_dentry); - err_dev_free: - fuse_dev_free(fud); err_put_conn: fuse_conn_put(fc); sb->s_fs_info = NULL; @@ -1229,11 +1270,52 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) return err; } -static struct dentry *fuse_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) +static int fuse_get_tree(struct fs_context *fc) { - return mount_nodev(fs_type, flags, raw_data, fuse_fill_super); + struct fuse_fs_context *ctx = fc->fs_private; + + if (!ctx->fd_present || !ctx->rootmode_present || + !ctx->user_id_present || !ctx->group_id_present) + return -EINVAL; + +#ifdef CONFIG_BLOCK + if (ctx->is_bdev) + return get_tree_bdev(fc, fuse_fill_super); +#endif + + return get_tree_nodev(fc, fuse_fill_super); +} + +static const struct fs_context_operations fuse_context_ops = { + .free = fuse_free_fc, + .parse_param = fuse_parse_param, + .get_tree = fuse_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +static int fuse_init_fs_context(struct fs_context *fc) +{ + struct fuse_fs_context *ctx; + + ctx = kzalloc(sizeof(struct fuse_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->max_read = ~0; + ctx->blksize = FUSE_DEFAULT_BLKSIZE; + +#ifdef CONFIG_BLOCK + if (fc->fs_type == &fuseblk_fs_type) { + ctx->is_bdev = true; + ctx->destroy = true; + } +#endif + + fc->fs_private = ctx; + fc->ops = &fuse_context_ops; + return 0; } static void fuse_sb_destroy(struct super_block *sb) @@ -1241,7 +1323,8 @@ static void fuse_sb_destroy(struct super_block *sb) struct fuse_conn *fc = get_fuse_conn_super(sb); if (fc) { - fuse_send_destroy(fc); + if (fc->destroy) + fuse_send_destroy(fc); fuse_abort_conn(fc); fuse_wait_aborted(fc); @@ -1252,29 +1335,24 @@ static void fuse_sb_destroy(struct super_block *sb) } } -static void fuse_kill_sb_anon(struct super_block *sb) +void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); } +EXPORT_SYMBOL_GPL(fuse_kill_sb_anon); static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, - .mount = fuse_mount, + .init_fs_context = fuse_init_fs_context, + .parameters = &fuse_fs_parameters, .kill_sb = fuse_kill_sb_anon, }; MODULE_ALIAS_FS("fuse"); #ifdef CONFIG_BLOCK -static struct dentry *fuse_mount_blk(struct file_system_type *fs_type, - int flags, const char *dev_name, - void *raw_data) -{ - return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super); -} - static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); @@ -1284,7 +1362,8 @@ static void fuse_kill_sb_blk(struct super_block *sb) static struct file_system_type fuseblk_fs_type = { .owner = THIS_MODULE, .name = "fuseblk", - .mount = fuse_mount_blk, + .init_fs_context = fuse_init_fs_context, + .parameters = &fuse_fs_parameters, .kill_sb = fuse_kill_sb_blk, .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE, }; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 574d03f8a573..5c38b9d84c6e 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -249,6 +249,27 @@ retry: return 0; } +static void fuse_force_forget(struct file *file, u64 nodeid) +{ + struct inode *inode = file_inode(file); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_forget_in inarg; + FUSE_ARGS(args); + + memset(&inarg, 0, sizeof(inarg)); + inarg.nlookup = 1; + args.opcode = FUSE_FORGET; + args.nodeid = nodeid; + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.force = true; + args.noreply = true; + + fuse_simple_request(fc, &args); + /* ignore errors */ +} + static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, struct dir_context *ctx, u64 attr_version) { @@ -295,62 +316,55 @@ static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { - int plus, err; - size_t nbytes; + int plus; + ssize_t res; struct page *page; struct inode *inode = file_inode(file); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req; + struct fuse_io_args ia = {}; + struct fuse_args_pages *ap = &ia.ap; + struct fuse_page_desc desc = { .length = PAGE_SIZE }; u64 attr_version = 0; bool locked; - req = fuse_get_req(fc, 1); - if (IS_ERR(req)) - return PTR_ERR(req); - page = alloc_page(GFP_KERNEL); - if (!page) { - fuse_put_request(fc, req); + if (!page) return -ENOMEM; - } plus = fuse_use_readdirplus(inode, ctx); - req->out.argpages = 1; - req->num_pages = 1; - req->pages[0] = page; - req->page_descs[0].length = PAGE_SIZE; + ap->args.out_pages = 1; + ap->num_pages = 1; + ap->pages = &page; + ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fc); - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIRPLUS); } else { - fuse_read_fill(req, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, + FUSE_READDIR); } locked = fuse_lock_inode(inode); - fuse_request_send(fc, req); + res = fuse_simple_request(fc, &ap->args); fuse_unlock_inode(inode, locked); - nbytes = req->out.args[0].size; - err = req->out.h.error; - fuse_put_request(fc, req); - if (!err) { - if (!nbytes) { + if (res >= 0) { + if (!res) { struct fuse_file *ff = file->private_data; if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - err = parse_dirplusfile(page_address(page), nbytes, + res = parse_dirplusfile(page_address(page), res, file, ctx, attr_version); } else { - err = parse_dirfile(page_address(page), nbytes, file, + res = parse_dirfile(page_address(page), res, file, ctx); } } __free_page(page); fuse_invalidate_atime(inode); - return err; + return res; } enum fuse_parse_result { @@ -372,11 +386,13 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, for (;;) { struct fuse_dirent *dirent = addr + offset; unsigned int nbytes = size - offset; - size_t reclen = FUSE_DIRENT_SIZE(dirent); + size_t reclen; if (nbytes < FUSE_NAME_OFFSET || !dirent->namelen) break; + reclen = FUSE_DIRENT_SIZE(dirent); /* derefs ->namelen */ + if (WARN_ON(dirent->namelen > FUSE_NAME_MAX)) return FOUND_ERR; if (WARN_ON(reclen > nbytes)) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c new file mode 100644 index 000000000000..a5c86048b96e --- /dev/null +++ b/fs/fuse/virtio_fs.c @@ -0,0 +1,1252 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * virtio-fs: Virtio Filesystem + * Copyright (C) 2018 Red Hat, Inc. + */ + +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/virtio.h> +#include <linux/virtio_fs.h> +#include <linux/delay.h> +#include <linux/fs_context.h> +#include <linux/highmem.h> +#include "fuse_i.h" + +/* List of virtio-fs device instances and a lock for the list. Also provides + * mutual exclusion in device removal and mounting path + */ +static DEFINE_MUTEX(virtio_fs_mutex); +static LIST_HEAD(virtio_fs_instances); + +enum { + VQ_HIPRIO, + VQ_REQUEST +}; + +/* Per-virtqueue state */ +struct virtio_fs_vq { + spinlock_t lock; + struct virtqueue *vq; /* protected by ->lock */ + struct work_struct done_work; + struct list_head queued_reqs; + struct list_head end_reqs; /* End these requests */ + struct delayed_work dispatch_work; + struct fuse_dev *fud; + bool connected; + long in_flight; + char name[24]; +} ____cacheline_aligned_in_smp; + +/* A virtio-fs device instance */ +struct virtio_fs { + struct kref refcount; + struct list_head list; /* on virtio_fs_instances */ + char *tag; + struct virtio_fs_vq *vqs; + unsigned int nvqs; /* number of virtqueues */ + unsigned int num_request_queues; /* number of request queues */ +}; + +struct virtio_fs_forget { + struct fuse_in_header ih; + struct fuse_forget_in arg; + /* This request can be temporarily queued on virt queue */ + struct list_head list; +}; + +static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, + struct fuse_req *req, bool in_flight); + +static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) +{ + struct virtio_fs *fs = vq->vdev->priv; + + return &fs->vqs[vq->index]; +} + +static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) +{ + return &vq_to_fsvq(vq)->fud->pq; +} + +/* Should be called with fsvq->lock held. */ +static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq) +{ + fsvq->in_flight++; +} + +/* Should be called with fsvq->lock held. */ +static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq) +{ + WARN_ON(fsvq->in_flight <= 0); + fsvq->in_flight--; +} + +static void release_virtio_fs_obj(struct kref *ref) +{ + struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); + + kfree(vfs->vqs); + kfree(vfs); +} + +/* Make sure virtiofs_mutex is held */ +static void virtio_fs_put(struct virtio_fs *fs) +{ + kref_put(&fs->refcount, release_virtio_fs_obj); +} + +static void virtio_fs_fiq_release(struct fuse_iqueue *fiq) +{ + struct virtio_fs *vfs = fiq->priv; + + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(vfs); + mutex_unlock(&virtio_fs_mutex); +} + +static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) +{ + WARN_ON(fsvq->in_flight < 0); + + /* Wait for in flight requests to finish.*/ + while (1) { + spin_lock(&fsvq->lock); + if (!fsvq->in_flight) { + spin_unlock(&fsvq->lock); + break; + } + spin_unlock(&fsvq->lock); + /* TODO use completion instead of timeout */ + usleep_range(1000, 2000); + } + + flush_work(&fsvq->done_work); + flush_delayed_work(&fsvq->dispatch_work); +} + +static void virtio_fs_drain_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + virtio_fs_drain_queue(fsvq); + } +} + +static void virtio_fs_start_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + spin_lock(&fsvq->lock); + fsvq->connected = true; + spin_unlock(&fsvq->lock); + } +} + +/* Add a new instance to the list or return -EEXIST if tag name exists*/ +static int virtio_fs_add_instance(struct virtio_fs *fs) +{ + struct virtio_fs *fs2; + bool duplicate = false; + + mutex_lock(&virtio_fs_mutex); + + list_for_each_entry(fs2, &virtio_fs_instances, list) { + if (strcmp(fs->tag, fs2->tag) == 0) + duplicate = true; + } + + if (!duplicate) + list_add_tail(&fs->list, &virtio_fs_instances); + + mutex_unlock(&virtio_fs_mutex); + + if (duplicate) + return -EEXIST; + return 0; +} + +/* Return the virtio_fs with a given tag, or NULL */ +static struct virtio_fs *virtio_fs_find_instance(const char *tag) +{ + struct virtio_fs *fs; + + mutex_lock(&virtio_fs_mutex); + + list_for_each_entry(fs, &virtio_fs_instances, list) { + if (strcmp(fs->tag, tag) == 0) { + kref_get(&fs->refcount); + goto found; + } + } + + fs = NULL; /* not found */ + +found: + mutex_unlock(&virtio_fs_mutex); + + return fs; +} + +static void virtio_fs_free_devs(struct virtio_fs *fs) +{ + unsigned int i; + + for (i = 0; i < fs->nvqs; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + if (!fsvq->fud) + continue; + + fuse_dev_free(fsvq->fud); + fsvq->fud = NULL; + } +} + +/* Read filesystem name from virtio config into fs->tag (must kfree()). */ +static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs) +{ + char tag_buf[sizeof_field(struct virtio_fs_config, tag)]; + char *end; + size_t len; + + virtio_cread_bytes(vdev, offsetof(struct virtio_fs_config, tag), + &tag_buf, sizeof(tag_buf)); + end = memchr(tag_buf, '\0', sizeof(tag_buf)); + if (end == tag_buf) + return -EINVAL; /* empty tag */ + if (!end) + end = &tag_buf[sizeof(tag_buf)]; + + len = end - tag_buf; + fs->tag = devm_kmalloc(&vdev->dev, len + 1, GFP_KERNEL); + if (!fs->tag) + return -ENOMEM; + memcpy(fs->tag, tag_buf, len); + fs->tag[len] = '\0'; + return 0; +} + +/* Work function for hiprio completion */ +static void virtio_fs_hiprio_done_work(struct work_struct *work) +{ + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + done_work); + struct virtqueue *vq = fsvq->vq; + + /* Free completed FUSE_FORGET requests */ + spin_lock(&fsvq->lock); + do { + unsigned int len; + void *req; + + virtqueue_disable_cb(vq); + + while ((req = virtqueue_get_buf(vq, &len)) != NULL) { + kfree(req); + dec_in_flight_req(fsvq); + } + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + spin_unlock(&fsvq->lock); +} + +static void virtio_fs_request_dispatch_work(struct work_struct *work) +{ + struct fuse_req *req; + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + dispatch_work.work); + struct fuse_conn *fc = fsvq->fud->fc; + int ret; + + pr_debug("virtio-fs: worker %s called.\n", __func__); + while (1) { + spin_lock(&fsvq->lock); + req = list_first_entry_or_null(&fsvq->end_reqs, struct fuse_req, + list); + if (!req) { + spin_unlock(&fsvq->lock); + break; + } + + list_del_init(&req->list); + spin_unlock(&fsvq->lock); + fuse_request_end(fc, req); + } + + /* Dispatch pending requests */ + while (1) { + spin_lock(&fsvq->lock); + req = list_first_entry_or_null(&fsvq->queued_reqs, + struct fuse_req, list); + if (!req) { + spin_unlock(&fsvq->lock); + return; + } + list_del_init(&req->list); + spin_unlock(&fsvq->lock); + + ret = virtio_fs_enqueue_req(fsvq, req, true); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + spin_lock(&fsvq->lock); + list_add_tail(&req->list, &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + spin_unlock(&fsvq->lock); + return; + } + req->out.h.error = ret; + spin_lock(&fsvq->lock); + dec_in_flight_req(fsvq); + spin_unlock(&fsvq->lock); + pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", + ret); + fuse_request_end(fc, req); + } + } +} + +static void virtio_fs_hiprio_dispatch_work(struct work_struct *work) +{ + struct virtio_fs_forget *forget; + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + dispatch_work.work); + struct virtqueue *vq = fsvq->vq; + struct scatterlist sg; + struct scatterlist *sgs[] = {&sg}; + bool notify; + int ret; + + pr_debug("virtio-fs: worker %s called.\n", __func__); + while (1) { + spin_lock(&fsvq->lock); + forget = list_first_entry_or_null(&fsvq->queued_reqs, + struct virtio_fs_forget, list); + if (!forget) { + spin_unlock(&fsvq->lock); + return; + } + + list_del(&forget->list); + if (!fsvq->connected) { + dec_in_flight_req(fsvq); + spin_unlock(&fsvq->lock); + kfree(forget); + continue; + } + + sg_init_one(&sg, forget, sizeof(*forget)); + + /* Enqueue the request */ + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later\n", + ret); + list_add_tail(&forget->list, + &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + dec_in_flight_req(fsvq); + kfree(forget); + } + spin_unlock(&fsvq->lock); + return; + } + + notify = virtqueue_kick_prepare(vq); + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + pr_debug("virtio-fs: worker %s dispatched one forget request.\n", + __func__); + } +} + +/* Allocate and copy args into req->argbuf */ +static int copy_args_to_argbuf(struct fuse_req *req) +{ + struct fuse_args *args = req->args; + unsigned int offset = 0; + unsigned int num_in; + unsigned int num_out; + unsigned int len; + unsigned int i; + + num_in = args->in_numargs - args->in_pages; + num_out = args->out_numargs - args->out_pages; + len = fuse_len_args(num_in, (struct fuse_arg *) args->in_args) + + fuse_len_args(num_out, args->out_args); + + req->argbuf = kmalloc(len, GFP_ATOMIC); + if (!req->argbuf) + return -ENOMEM; + + for (i = 0; i < num_in; i++) { + memcpy(req->argbuf + offset, + args->in_args[i].value, + args->in_args[i].size); + offset += args->in_args[i].size; + } + + return 0; +} + +/* Copy args out of and free req->argbuf */ +static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) +{ + unsigned int remaining; + unsigned int offset; + unsigned int num_in; + unsigned int num_out; + unsigned int i; + + remaining = req->out.h.len - sizeof(req->out.h); + num_in = args->in_numargs - args->in_pages; + num_out = args->out_numargs - args->out_pages; + offset = fuse_len_args(num_in, (struct fuse_arg *)args->in_args); + + for (i = 0; i < num_out; i++) { + unsigned int argsize = args->out_args[i].size; + + if (args->out_argvar && + i == args->out_numargs - 1 && + argsize > remaining) { + argsize = remaining; + } + + memcpy(args->out_args[i].value, req->argbuf + offset, argsize); + offset += argsize; + + if (i != args->out_numargs - 1) + remaining -= argsize; + } + + /* Store the actual size of the variable-length arg */ + if (args->out_argvar) + args->out_args[args->out_numargs - 1].size = remaining; + + kfree(req->argbuf); + req->argbuf = NULL; +} + +/* Work function for request completion */ +static void virtio_fs_requests_done_work(struct work_struct *work) +{ + struct virtio_fs_vq *fsvq = container_of(work, struct virtio_fs_vq, + done_work); + struct fuse_pqueue *fpq = &fsvq->fud->pq; + struct fuse_conn *fc = fsvq->fud->fc; + struct virtqueue *vq = fsvq->vq; + struct fuse_req *req; + struct fuse_args_pages *ap; + struct fuse_req *next; + struct fuse_args *args; + unsigned int len, i, thislen; + struct page *page; + LIST_HEAD(reqs); + + /* Collect completed requests off the virtqueue */ + spin_lock(&fsvq->lock); + do { + virtqueue_disable_cb(vq); + + while ((req = virtqueue_get_buf(vq, &len)) != NULL) { + spin_lock(&fpq->lock); + list_move_tail(&req->list, &reqs); + spin_unlock(&fpq->lock); + } + } while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq))); + spin_unlock(&fsvq->lock); + + /* End requests */ + list_for_each_entry_safe(req, next, &reqs, list) { + /* + * TODO verify that server properly follows FUSE protocol + * (oh.uniq, oh.len) + */ + args = req->args; + copy_args_from_argbuf(args, req); + + if (args->out_pages && args->page_zeroing) { + len = args->out_args[args->out_numargs - 1].size; + ap = container_of(args, typeof(*ap), args); + for (i = 0; i < ap->num_pages; i++) { + thislen = ap->descs[i].length; + if (len < thislen) { + WARN_ON(ap->descs[i].offset); + page = ap->pages[i]; + zero_user_segment(page, len, thislen); + len = 0; + } else { + len -= thislen; + } + } + } + + spin_lock(&fpq->lock); + clear_bit(FR_SENT, &req->flags); + list_del_init(&req->list); + spin_unlock(&fpq->lock); + + fuse_request_end(fc, req); + spin_lock(&fsvq->lock); + dec_in_flight_req(fsvq); + spin_unlock(&fsvq->lock); + } +} + +/* Virtqueue interrupt handler */ +static void virtio_fs_vq_done(struct virtqueue *vq) +{ + struct virtio_fs_vq *fsvq = vq_to_fsvq(vq); + + dev_dbg(&vq->vdev->dev, "%s %s\n", __func__, fsvq->name); + + schedule_work(&fsvq->done_work); +} + +/* Initialize virtqueues */ +static int virtio_fs_setup_vqs(struct virtio_device *vdev, + struct virtio_fs *fs) +{ + struct virtqueue **vqs; + vq_callback_t **callbacks; + const char **names; + unsigned int i; + int ret = 0; + + virtio_cread(vdev, struct virtio_fs_config, num_request_queues, + &fs->num_request_queues); + if (fs->num_request_queues == 0) + return -EINVAL; + + fs->nvqs = 1 + fs->num_request_queues; + fs->vqs = kcalloc(fs->nvqs, sizeof(fs->vqs[VQ_HIPRIO]), GFP_KERNEL); + if (!fs->vqs) + return -ENOMEM; + + vqs = kmalloc_array(fs->nvqs, sizeof(vqs[VQ_HIPRIO]), GFP_KERNEL); + callbacks = kmalloc_array(fs->nvqs, sizeof(callbacks[VQ_HIPRIO]), + GFP_KERNEL); + names = kmalloc_array(fs->nvqs, sizeof(names[VQ_HIPRIO]), GFP_KERNEL); + if (!vqs || !callbacks || !names) { + ret = -ENOMEM; + goto out; + } + + callbacks[VQ_HIPRIO] = virtio_fs_vq_done; + snprintf(fs->vqs[VQ_HIPRIO].name, sizeof(fs->vqs[VQ_HIPRIO].name), + "hiprio"); + names[VQ_HIPRIO] = fs->vqs[VQ_HIPRIO].name; + INIT_WORK(&fs->vqs[VQ_HIPRIO].done_work, virtio_fs_hiprio_done_work); + INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].queued_reqs); + INIT_LIST_HEAD(&fs->vqs[VQ_HIPRIO].end_reqs); + INIT_DELAYED_WORK(&fs->vqs[VQ_HIPRIO].dispatch_work, + virtio_fs_hiprio_dispatch_work); + spin_lock_init(&fs->vqs[VQ_HIPRIO].lock); + + /* Initialize the requests virtqueues */ + for (i = VQ_REQUEST; i < fs->nvqs; i++) { + spin_lock_init(&fs->vqs[i].lock); + INIT_WORK(&fs->vqs[i].done_work, virtio_fs_requests_done_work); + INIT_DELAYED_WORK(&fs->vqs[i].dispatch_work, + virtio_fs_request_dispatch_work); + INIT_LIST_HEAD(&fs->vqs[i].queued_reqs); + INIT_LIST_HEAD(&fs->vqs[i].end_reqs); + snprintf(fs->vqs[i].name, sizeof(fs->vqs[i].name), + "requests.%u", i - VQ_REQUEST); + callbacks[i] = virtio_fs_vq_done; + names[i] = fs->vqs[i].name; + } + + ret = virtio_find_vqs(vdev, fs->nvqs, vqs, callbacks, names, NULL); + if (ret < 0) + goto out; + + for (i = 0; i < fs->nvqs; i++) + fs->vqs[i].vq = vqs[i]; + + virtio_fs_start_all_queues(fs); +out: + kfree(names); + kfree(callbacks); + kfree(vqs); + if (ret) + kfree(fs->vqs); + return ret; +} + +/* Free virtqueues (device must already be reset) */ +static void virtio_fs_cleanup_vqs(struct virtio_device *vdev, + struct virtio_fs *fs) +{ + vdev->config->del_vqs(vdev); +} + +static int virtio_fs_probe(struct virtio_device *vdev) +{ + struct virtio_fs *fs; + int ret; + + fs = kzalloc(sizeof(*fs), GFP_KERNEL); + if (!fs) + return -ENOMEM; + kref_init(&fs->refcount); + vdev->priv = fs; + + ret = virtio_fs_read_tag(vdev, fs); + if (ret < 0) + goto out; + + ret = virtio_fs_setup_vqs(vdev, fs); + if (ret < 0) + goto out; + + /* TODO vq affinity */ + + /* Bring the device online in case the filesystem is mounted and + * requests need to be sent before we return. + */ + virtio_device_ready(vdev); + + ret = virtio_fs_add_instance(fs); + if (ret < 0) + goto out_vqs; + + return 0; + +out_vqs: + vdev->config->reset(vdev); + virtio_fs_cleanup_vqs(vdev, fs); + +out: + vdev->priv = NULL; + kfree(fs); + return ret; +} + +static void virtio_fs_stop_all_queues(struct virtio_fs *fs) +{ + struct virtio_fs_vq *fsvq; + int i; + + for (i = 0; i < fs->nvqs; i++) { + fsvq = &fs->vqs[i]; + spin_lock(&fsvq->lock); + fsvq->connected = false; + spin_unlock(&fsvq->lock); + } +} + +static void virtio_fs_remove(struct virtio_device *vdev) +{ + struct virtio_fs *fs = vdev->priv; + + mutex_lock(&virtio_fs_mutex); + /* This device is going away. No one should get new reference */ + list_del_init(&fs->list); + virtio_fs_stop_all_queues(fs); + virtio_fs_drain_all_queues(fs); + vdev->config->reset(vdev); + virtio_fs_cleanup_vqs(vdev, fs); + + vdev->priv = NULL; + /* Put device reference on virtio_fs object */ + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); +} + +#ifdef CONFIG_PM_SLEEP +static int virtio_fs_freeze(struct virtio_device *vdev) +{ + /* TODO need to save state here */ + pr_warn("virtio-fs: suspend/resume not yet supported\n"); + return -EOPNOTSUPP; +} + +static int virtio_fs_restore(struct virtio_device *vdev) +{ + /* TODO need to restore state here */ + return 0; +} +#endif /* CONFIG_PM_SLEEP */ + +const static struct virtio_device_id id_table[] = { + { VIRTIO_ID_FS, VIRTIO_DEV_ANY_ID }, + {}, +}; + +const static unsigned int feature_table[] = {}; + +static struct virtio_driver virtio_fs_driver = { + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .feature_table = feature_table, + .feature_table_size = ARRAY_SIZE(feature_table), + .probe = virtio_fs_probe, + .remove = virtio_fs_remove, +#ifdef CONFIG_PM_SLEEP + .freeze = virtio_fs_freeze, + .restore = virtio_fs_restore, +#endif +}; + +static void virtio_fs_wake_forget_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + struct fuse_forget_link *link; + struct virtio_fs_forget *forget; + struct scatterlist sg; + struct scatterlist *sgs[] = {&sg}; + struct virtio_fs *fs; + struct virtqueue *vq; + struct virtio_fs_vq *fsvq; + bool notify; + u64 unique; + int ret; + + link = fuse_dequeue_forget(fiq, 1, NULL); + unique = fuse_get_unique(fiq); + + fs = fiq->priv; + fsvq = &fs->vqs[VQ_HIPRIO]; + spin_unlock(&fiq->lock); + + /* Allocate a buffer for the request */ + forget = kmalloc(sizeof(*forget), GFP_NOFS | __GFP_NOFAIL); + + forget->ih = (struct fuse_in_header){ + .opcode = FUSE_FORGET, + .nodeid = link->forget_one.nodeid, + .unique = unique, + .len = sizeof(*forget), + }; + forget->arg = (struct fuse_forget_in){ + .nlookup = link->forget_one.nlookup, + }; + + sg_init_one(&sg, forget, sizeof(*forget)); + + /* Enqueue the request */ + spin_lock(&fsvq->lock); + + if (!fsvq->connected) { + kfree(forget); + spin_unlock(&fsvq->lock); + goto out; + } + + vq = fsvq->vq; + dev_dbg(&vq->vdev->dev, "%s\n", __func__); + + ret = virtqueue_add_sgs(vq, sgs, 1, 0, forget, GFP_ATOMIC); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Will try later.\n", + ret); + list_add_tail(&forget->list, &fsvq->queued_reqs); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + inc_in_flight_req(fsvq); + } else { + pr_debug("virtio-fs: Could not queue FORGET: err=%d. Dropping it.\n", + ret); + kfree(forget); + } + spin_unlock(&fsvq->lock); + goto out; + } + + inc_in_flight_req(fsvq); + notify = virtqueue_kick_prepare(vq); + + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); +out: + kfree(link); +} + +static void virtio_fs_wake_interrupt_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + /* + * TODO interrupts. + * + * Normal fs operations on a local filesystems aren't interruptible. + * Exceptions are blocking lock operations; for example fcntl(F_SETLKW) + * with shared lock between host and guest. + */ + spin_unlock(&fiq->lock); +} + +/* Return the number of scatter-gather list elements required */ +static unsigned int sg_count_fuse_req(struct fuse_req *req) +{ + struct fuse_args *args = req->args; + struct fuse_args_pages *ap = container_of(args, typeof(*ap), args); + unsigned int total_sgs = 1 /* fuse_in_header */; + + if (args->in_numargs - args->in_pages) + total_sgs += 1; + + if (args->in_pages) + total_sgs += ap->num_pages; + + if (!test_bit(FR_ISREPLY, &req->flags)) + return total_sgs; + + total_sgs += 1 /* fuse_out_header */; + + if (args->out_numargs - args->out_pages) + total_sgs += 1; + + if (args->out_pages) + total_sgs += ap->num_pages; + + return total_sgs; +} + +/* Add pages to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_pages(struct scatterlist *sg, + struct page **pages, + struct fuse_page_desc *page_descs, + unsigned int num_pages, + unsigned int total_len) +{ + unsigned int i; + unsigned int this_len; + + for (i = 0; i < num_pages && total_len; i++) { + sg_init_table(&sg[i], 1); + this_len = min(page_descs[i].length, total_len); + sg_set_page(&sg[i], pages[i], this_len, page_descs[i].offset); + total_len -= this_len; + } + + return i; +} + +/* Add args to scatter-gather list and return number of elements used */ +static unsigned int sg_init_fuse_args(struct scatterlist *sg, + struct fuse_req *req, + struct fuse_arg *args, + unsigned int numargs, + bool argpages, + void *argbuf, + unsigned int *len_used) +{ + struct fuse_args_pages *ap = container_of(req->args, typeof(*ap), args); + unsigned int total_sgs = 0; + unsigned int len; + + len = fuse_len_args(numargs - argpages, args); + if (len) + sg_init_one(&sg[total_sgs++], argbuf, len); + + if (argpages) + total_sgs += sg_init_fuse_pages(&sg[total_sgs], + ap->pages, ap->descs, + ap->num_pages, + args[numargs - 1].size); + + if (len_used) + *len_used = len; + + return total_sgs; +} + +/* Add a request to a virtqueue and kick the device */ +static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, + struct fuse_req *req, bool in_flight) +{ + /* requests need at least 4 elements */ + struct scatterlist *stack_sgs[6]; + struct scatterlist stack_sg[ARRAY_SIZE(stack_sgs)]; + struct scatterlist **sgs = stack_sgs; + struct scatterlist *sg = stack_sg; + struct virtqueue *vq; + struct fuse_args *args = req->args; + unsigned int argbuf_used = 0; + unsigned int out_sgs = 0; + unsigned int in_sgs = 0; + unsigned int total_sgs; + unsigned int i; + int ret; + bool notify; + struct fuse_pqueue *fpq; + + /* Does the sglist fit on the stack? */ + total_sgs = sg_count_fuse_req(req); + if (total_sgs > ARRAY_SIZE(stack_sgs)) { + sgs = kmalloc_array(total_sgs, sizeof(sgs[0]), GFP_ATOMIC); + sg = kmalloc_array(total_sgs, sizeof(sg[0]), GFP_ATOMIC); + if (!sgs || !sg) { + ret = -ENOMEM; + goto out; + } + } + + /* Use a bounce buffer since stack args cannot be mapped */ + ret = copy_args_to_argbuf(req); + if (ret < 0) + goto out; + + /* Request elements */ + sg_init_one(&sg[out_sgs++], &req->in.h, sizeof(req->in.h)); + out_sgs += sg_init_fuse_args(&sg[out_sgs], req, + (struct fuse_arg *)args->in_args, + args->in_numargs, args->in_pages, + req->argbuf, &argbuf_used); + + /* Reply elements */ + if (test_bit(FR_ISREPLY, &req->flags)) { + sg_init_one(&sg[out_sgs + in_sgs++], + &req->out.h, sizeof(req->out.h)); + in_sgs += sg_init_fuse_args(&sg[out_sgs + in_sgs], req, + args->out_args, args->out_numargs, + args->out_pages, + req->argbuf + argbuf_used, NULL); + } + + WARN_ON(out_sgs + in_sgs != total_sgs); + + for (i = 0; i < total_sgs; i++) + sgs[i] = &sg[i]; + + spin_lock(&fsvq->lock); + + if (!fsvq->connected) { + spin_unlock(&fsvq->lock); + ret = -ENOTCONN; + goto out; + } + + vq = fsvq->vq; + ret = virtqueue_add_sgs(vq, sgs, out_sgs, in_sgs, req, GFP_ATOMIC); + if (ret < 0) { + spin_unlock(&fsvq->lock); + goto out; + } + + /* Request successfully sent. */ + fpq = &fsvq->fud->pq; + spin_lock(&fpq->lock); + list_add_tail(&req->list, fpq->processing); + spin_unlock(&fpq->lock); + set_bit(FR_SENT, &req->flags); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + + if (!in_flight) + inc_in_flight_req(fsvq); + notify = virtqueue_kick_prepare(vq); + + spin_unlock(&fsvq->lock); + + if (notify) + virtqueue_notify(vq); + +out: + if (ret < 0 && req->argbuf) { + kfree(req->argbuf); + req->argbuf = NULL; + } + if (sgs != stack_sgs) { + kfree(sgs); + kfree(sg); + } + + return ret; +} + +static void virtio_fs_wake_pending_and_unlock(struct fuse_iqueue *fiq) +__releases(fiq->lock) +{ + unsigned int queue_id = VQ_REQUEST; /* TODO multiqueue */ + struct virtio_fs *fs; + struct fuse_req *req; + struct virtio_fs_vq *fsvq; + int ret; + + WARN_ON(list_empty(&fiq->pending)); + req = list_last_entry(&fiq->pending, struct fuse_req, list); + clear_bit(FR_PENDING, &req->flags); + list_del_init(&req->list); + WARN_ON(!list_empty(&fiq->pending)); + spin_unlock(&fiq->lock); + + fs = fiq->priv; + + pr_debug("%s: opcode %u unique %#llx nodeid %#llx in.len %u out.len %u\n", + __func__, req->in.h.opcode, req->in.h.unique, + req->in.h.nodeid, req->in.h.len, + fuse_len_args(req->args->out_numargs, req->args->out_args)); + + fsvq = &fs->vqs[queue_id]; + ret = virtio_fs_enqueue_req(fsvq, req, false); + if (ret < 0) { + if (ret == -ENOMEM || ret == -ENOSPC) { + /* + * Virtqueue full. Retry submission from worker + * context as we might be holding fc->bg_lock. + */ + spin_lock(&fsvq->lock); + list_add_tail(&req->list, &fsvq->queued_reqs); + inc_in_flight_req(fsvq); + schedule_delayed_work(&fsvq->dispatch_work, + msecs_to_jiffies(1)); + spin_unlock(&fsvq->lock); + return; + } + req->out.h.error = ret; + pr_err("virtio-fs: virtio_fs_enqueue_req() failed %d\n", ret); + + /* Can't end request in submission context. Use a worker */ + spin_lock(&fsvq->lock); + list_add_tail(&req->list, &fsvq->end_reqs); + schedule_delayed_work(&fsvq->dispatch_work, 0); + spin_unlock(&fsvq->lock); + return; + } +} + +const static struct fuse_iqueue_ops virtio_fs_fiq_ops = { + .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, + .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, + .wake_pending_and_unlock = virtio_fs_wake_pending_and_unlock, + .release = virtio_fs_fiq_release, +}; + +static int virtio_fs_fill_super(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct virtio_fs *fs = fc->iq.priv; + unsigned int i; + int err; + struct fuse_fs_context ctx = { + .rootmode = S_IFDIR, + .default_permissions = 1, + .allow_other = 1, + .max_read = UINT_MAX, + .blksize = 512, + .destroy = true, + .no_control = true, + .no_force_umount = true, + .no_mount_options = true, + }; + + mutex_lock(&virtio_fs_mutex); + + /* After holding mutex, make sure virtiofs device is still there. + * Though we are holding a reference to it, drive ->remove might + * still have cleaned up virtual queues. In that case bail out. + */ + err = -EINVAL; + if (list_empty(&fs->list)) { + pr_info("virtio-fs: tag <%s> not found\n", fs->tag); + goto err; + } + + err = -ENOMEM; + /* Allocate fuse_dev for hiprio and notification queues */ + for (i = 0; i < VQ_REQUEST; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + fsvq->fud = fuse_dev_alloc(); + if (!fsvq->fud) + goto err_free_fuse_devs; + } + + ctx.fudptr = (void **)&fs->vqs[VQ_REQUEST].fud; + err = fuse_fill_super_common(sb, &ctx); + if (err < 0) + goto err_free_fuse_devs; + + fc = fs->vqs[VQ_REQUEST].fud->fc; + + for (i = 0; i < fs->nvqs; i++) { + struct virtio_fs_vq *fsvq = &fs->vqs[i]; + + if (i == VQ_REQUEST) + continue; /* already initialized */ + fuse_dev_install(fsvq->fud, fc); + } + + /* Previous unmount will stop all queues. Start these again */ + virtio_fs_start_all_queues(fs); + fuse_send_init(fc); + mutex_unlock(&virtio_fs_mutex); + return 0; + +err_free_fuse_devs: + virtio_fs_free_devs(fs); +err: + mutex_unlock(&virtio_fs_mutex); + return err; +} + +static void virtio_kill_sb(struct super_block *sb) +{ + struct fuse_conn *fc = get_fuse_conn_super(sb); + struct virtio_fs *vfs; + struct virtio_fs_vq *fsvq; + + /* If mount failed, we can still be called without any fc */ + if (!fc) + return fuse_kill_sb_anon(sb); + + vfs = fc->iq.priv; + fsvq = &vfs->vqs[VQ_HIPRIO]; + + /* Stop forget queue. Soon destroy will be sent */ + spin_lock(&fsvq->lock); + fsvq->connected = false; + spin_unlock(&fsvq->lock); + virtio_fs_drain_all_queues(vfs); + + fuse_kill_sb_anon(sb); + + /* fuse_kill_sb_anon() must have sent destroy. Stop all queues + * and drain one more time and free fuse devices. Freeing fuse + * devices will drop their reference on fuse_conn and that in + * turn will drop its reference on virtio_fs object. + */ + virtio_fs_stop_all_queues(vfs); + virtio_fs_drain_all_queues(vfs); + virtio_fs_free_devs(vfs); +} + +static int virtio_fs_test_super(struct super_block *sb, + struct fs_context *fsc) +{ + struct fuse_conn *fc = fsc->s_fs_info; + + return fc->iq.priv == get_fuse_conn_super(sb)->iq.priv; +} + +static int virtio_fs_set_super(struct super_block *sb, + struct fs_context *fsc) +{ + int err; + + err = get_anon_bdev(&sb->s_dev); + if (!err) + fuse_conn_get(fsc->s_fs_info); + + return err; +} + +static int virtio_fs_get_tree(struct fs_context *fsc) +{ + struct virtio_fs *fs; + struct super_block *sb; + struct fuse_conn *fc; + int err; + + /* This gets a reference on virtio_fs object. This ptr gets installed + * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() + * to drop the reference to this object. + */ + fs = virtio_fs_find_instance(fsc->source); + if (!fs) { + pr_info("virtio-fs: tag <%s> not found\n", fsc->source); + return -EINVAL; + } + + fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); + if (!fc) { + mutex_lock(&virtio_fs_mutex); + virtio_fs_put(fs); + mutex_unlock(&virtio_fs_mutex); + return -ENOMEM; + } + + fuse_conn_init(fc, get_user_ns(current_user_ns()), &virtio_fs_fiq_ops, + fs); + fc->release = fuse_free_conn; + fc->delete_stale = true; + + fsc->s_fs_info = fc; + sb = sget_fc(fsc, virtio_fs_test_super, virtio_fs_set_super); + fuse_conn_put(fc); + if (IS_ERR(sb)) + return PTR_ERR(sb); + + if (!sb->s_root) { + err = virtio_fs_fill_super(sb); + if (err) { + deactivate_locked_super(sb); + return err; + } + + sb->s_flags |= SB_ACTIVE; + } + + WARN_ON(fsc->root); + fsc->root = dget(sb->s_root); + return 0; +} + +static const struct fs_context_operations virtio_fs_context_ops = { + .get_tree = virtio_fs_get_tree, +}; + +static int virtio_fs_init_fs_context(struct fs_context *fsc) +{ + fsc->ops = &virtio_fs_context_ops; + return 0; +} + +static struct file_system_type virtio_fs_type = { + .owner = THIS_MODULE, + .name = "virtiofs", + .init_fs_context = virtio_fs_init_fs_context, + .kill_sb = virtio_kill_sb, +}; + +static int __init virtio_fs_init(void) +{ + int ret; + + ret = register_virtio_driver(&virtio_fs_driver); + if (ret < 0) + return ret; + + ret = register_filesystem(&virtio_fs_type); + if (ret < 0) { + unregister_virtio_driver(&virtio_fs_driver); + return ret; + } + + return 0; +} +module_init(virtio_fs_init); + +static void __exit virtio_fs_exit(void) +{ + unregister_filesystem(&virtio_fs_type); + unregister_virtio_driver(&virtio_fs_driver); +} +module_exit(virtio_fs_exit); + +MODULE_AUTHOR("Stefan Hajnoczi <stefanha@redhat.com>"); +MODULE_DESCRIPTION("Virtio Filesystem"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_FS(KBUILD_MODNAME); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 433717640f78..20d052e08b3b 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -25,15 +25,15 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; - args.in.h.opcode = FUSE_SETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 3; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; - args.in.args[2].size = size; - args.in.args[2].value = value; + args.opcode = FUSE_SETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 3; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; + args.in_args[2].size = size; + args.in_args[2].value = value; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_setxattr = 1; @@ -60,22 +60,22 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - args.in.h.opcode = FUSE_GETXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 2; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; - args.in.args[1].size = strlen(name) + 1; - args.in.args[1].value = name; + args.opcode = FUSE_GETXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 2; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; + args.in_args[1].size = strlen(name) + 1; + args.in_args[1].value = name; /* This is really two different operations rolled into one */ - args.out.numargs = 1; + args.out_numargs = 1; if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = value; + args.out_argvar = true; + args.out_args[0].size = size; + args.out_args[0].value = value; } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; } ret = fuse_simple_request(fc, &args); if (!ret && !size) @@ -121,20 +121,20 @@ ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) memset(&inarg, 0, sizeof(inarg)); inarg.size = size; - args.in.h.opcode = FUSE_LISTXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = sizeof(inarg); - args.in.args[0].value = &inarg; + args.opcode = FUSE_LISTXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = sizeof(inarg); + args.in_args[0].value = &inarg; /* This is really two different operations rolled into one */ - args.out.numargs = 1; + args.out_numargs = 1; if (size) { - args.out.argvar = 1; - args.out.args[0].size = size; - args.out.args[0].value = list; + args.out_argvar = true; + args.out_args[0].size = size; + args.out_args[0].value = list; } else { - args.out.args[0].size = sizeof(outarg); - args.out.args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; } ret = fuse_simple_request(fc, &args); if (!ret && !size) @@ -157,11 +157,11 @@ int fuse_removexattr(struct inode *inode, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - args.in.h.opcode = FUSE_REMOVEXATTR; - args.in.h.nodeid = get_node_id(inode); - args.in.numargs = 1; - args.in.args[0].size = strlen(name) + 1; - args.in.args[0].value = name; + args.opcode = FUSE_REMOVEXATTR; + args.nodeid = get_node_id(inode); + args.in_numargs = 1; + args.in_args[0].size = strlen(name) + 1; + args.in_args[0].value = name; err = fuse_simple_request(fc, &args); if (err == -ENOSYS) { fc->no_removexattr = 1; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 6b450065b9d5..5f89c515f5bb 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -584,10 +584,10 @@ struct gfs2_args { unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ unsigned int ar_loccookie:1; /* use location based readdir cookies */ - int ar_commit; /* Commit interval */ - int ar_statfs_quantum; /* The fast statfs interval */ - int ar_quota_quantum; /* The quota interval */ - int ar_statfs_percent; /* The % change to force sync */ + s32 ar_commit; /* Commit interval */ + s32 ar_statfs_quantum; /* The fast statfs interval */ + s32 ar_quota_quantum; /* The quota interval */ + s32 ar_statfs_percent; /* The % change to force sync */ }; struct gfs2_tune { diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index f3fd5cd9d43f..18daf494abab 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -21,6 +21,7 @@ #include <linux/lockdep.h> #include <linux/module.h> #include <linux/backing-dev.h> +#include <linux/fs_parser.h> #include "gfs2.h" #include "incore.h" @@ -1031,16 +1032,17 @@ void gfs2_online_uevent(struct gfs2_sbd *sdp) } /** - * fill_super - Read in superblock + * gfs2_fill_super - Read in superblock * @sb: The VFS superblock - * @data: Mount options + * @args: Mount options * @silent: Don't complain if it's not a GFS2 filesystem * - * Returns: errno + * Returns: -errno */ - -static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent) +static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) { + struct gfs2_args *args = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; int error; @@ -1205,161 +1207,418 @@ fail_debug: return error; } -static int set_gfs2_super(struct super_block *s, void *data) +/** + * gfs2_get_tree - Get the GFS2 superblock and root directory + * @fc: The filesystem context + * + * Returns: 0 or -errno on error + */ +static int gfs2_get_tree(struct fs_context *fc) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_bdi); + struct gfs2_args *args = fc->fs_private; + struct gfs2_sbd *sdp; + int error; + + error = get_tree_bdev(fc, gfs2_fill_super); + if (error) + return error; + + sdp = fc->root->d_sb->s_fs_info; + dput(fc->root); + if (args->ar_meta) + fc->root = dget(sdp->sd_master_dir); + else + fc->root = dget(sdp->sd_root_dir); return 0; } -static int test_gfs2_super(struct super_block *s, void *ptr) +static void gfs2_fc_free(struct fs_context *fc) { - struct block_device *bdev = ptr; - return (bdev == s->s_bdev); + struct gfs2_args *args = fc->fs_private; + + kfree(args); } -/** - * gfs2_mount - Get the GFS2 superblock - * @fs_type: The GFS2 filesystem type - * @flags: Mount flags - * @dev_name: The name of the device - * @data: The mount arguments - * - * Q. Why not use get_sb_bdev() ? - * A. We need to select one of two root directories to mount, independent - * of whether this is the initial, or subsequent, mount of this sb - * - * Returns: 0 or -ve on error - */ +enum gfs2_param { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_upgrade, + Opt_acl, + Opt_quota, + Opt_suiddir, + Opt_data, + Opt_meta, + Opt_discard, + Opt_commit, + Opt_errors, + Opt_statfs_quantum, + Opt_statfs_percent, + Opt_quota_quantum, + Opt_barrier, + Opt_rgrplvb, + Opt_loccookie, +}; + +enum opt_quota { + Opt_quota_unset = 0, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, +}; + +static const unsigned int opt_quota_values[] = { + [Opt_quota_off] = GFS2_QUOTA_OFF, + [Opt_quota_account] = GFS2_QUOTA_ACCOUNT, + [Opt_quota_on] = GFS2_QUOTA_ON, +}; + +enum opt_data { + Opt_data_writeback = GFS2_DATA_WRITEBACK, + Opt_data_ordered = GFS2_DATA_ORDERED, +}; + +enum opt_errors { + Opt_errors_withdraw = GFS2_ERRORS_WITHDRAW, + Opt_errors_panic = GFS2_ERRORS_PANIC, +}; + +static const struct fs_parameter_spec gfs2_param_specs[] = { + fsparam_string ("lockproto", Opt_lockproto), + fsparam_string ("locktable", Opt_locktable), + fsparam_string ("hostdata", Opt_hostdata), + fsparam_flag ("spectator", Opt_spectator), + fsparam_flag ("norecovery", Opt_spectator), + fsparam_flag ("ignore_local_fs", Opt_ignore_local_fs), + fsparam_flag ("localflocks", Opt_localflocks), + fsparam_flag ("localcaching", Opt_localcaching), + fsparam_flag_no("debug", Opt_debug), + fsparam_flag ("upgrade", Opt_upgrade), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("suiddir", Opt_suiddir), + fsparam_enum ("data", Opt_data), + fsparam_flag ("meta", Opt_meta), + fsparam_flag_no("discard", Opt_discard), + fsparam_s32 ("commit", Opt_commit), + fsparam_enum ("errors", Opt_errors), + fsparam_s32 ("statfs_quantum", Opt_statfs_quantum), + fsparam_s32 ("statfs_percent", Opt_statfs_percent), + fsparam_s32 ("quota_quantum", Opt_quota_quantum), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag_no("rgrplvb", Opt_rgrplvb), + fsparam_flag_no("loccookie", Opt_loccookie), + /* quota can be a flag or an enum so it gets special treatment */ + __fsparam(fs_param_is_enum, "quota", Opt_quota, fs_param_neg_with_no|fs_param_v_optional), + {} +}; -static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static const struct fs_parameter_enum gfs2_param_enums[] = { + { Opt_quota, "off", Opt_quota_off }, + { Opt_quota, "account", Opt_quota_account }, + { Opt_quota, "on", Opt_quota_on }, + { Opt_data, "writeback", Opt_data_writeback }, + { Opt_data, "ordered", Opt_data_ordered }, + { Opt_errors, "withdraw", Opt_errors_withdraw }, + { Opt_errors, "panic", Opt_errors_panic }, + {} +}; + +const struct fs_parameter_description gfs2_fs_parameters = { + .name = "gfs2", + .specs = gfs2_param_specs, + .enums = gfs2_param_enums, +}; + +/* Parse a single mount parameter */ +static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct block_device *bdev; - struct super_block *s; - fmode_t mode = FMODE_READ | FMODE_EXCL; - int error; - struct gfs2_args args; - struct gfs2_sbd *sdp; + struct gfs2_args *args = fc->fs_private; + struct fs_parse_result result; + int o; + + o = fs_parse(fc, &gfs2_fs_parameters, param, &result); + if (o < 0) + return o; + + switch (o) { + case Opt_lockproto: + strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + /* Retained for backwards compat only */ + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + /* Retained for backwards compat only */ + break; + case Opt_debug: + if (result.boolean && args->ar_errors == GFS2_ERRORS_PANIC) + return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive"); + args->ar_debug = result.boolean; + break; + case Opt_upgrade: + /* Retained for backwards compat only */ + break; + case Opt_acl: + args->ar_posix_acl = result.boolean; + break; + case Opt_quota: + /* The quota option can be a flag or an enum. A non-zero int_32 + result means that we have an enum index. Otherwise we have + to rely on the 'negated' flag to tell us whether 'quota' or + 'noquota' was specified. */ + if (result.negated) + args->ar_quota = GFS2_QUOTA_OFF; + else if (result.int_32 > 0) + args->ar_quota = opt_quota_values[result.int_32]; + else + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = result.boolean; + break; + case Opt_data: + /* The uint_32 result maps directly to GFS2_DATA_* */ + args->ar_data = result.uint_32; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = result.boolean; + break; + case Opt_commit: + if (result.int_32 <= 0) + return invalf(fc, "gfs2: commit mount option requires a positive numeric argument"); + args->ar_commit = result.int_32; + break; + case Opt_statfs_quantum: + if (result.int_32 < 0) + return invalf(fc, "gfs2: statfs_quantum mount option requires a non-negative numeric argument"); + args->ar_statfs_quantum = result.int_32; + break; + case Opt_quota_quantum: + if (result.int_32 <= 0) + return invalf(fc, "gfs2: quota_quantum mount option requires a positive numeric argument"); + args->ar_quota_quantum = result.int_32; + break; + case Opt_statfs_percent: + if (result.int_32 < 0 || result.int_32 > 100) + return invalf(fc, "gfs2: statfs_percent mount option requires a numeric argument between 0 and 100"); + args->ar_statfs_percent = result.int_32; + break; + case Opt_errors: + if (args->ar_debug && result.uint_32 == GFS2_ERRORS_PANIC) + return invalf(fc, "gfs2: -o debug and -o errors=panic are mutually exclusive"); + args->ar_errors = result.uint_32; + break; + case Opt_barrier: + args->ar_nobarrier = result.boolean; + break; + case Opt_rgrplvb: + args->ar_rgrplvb = result.boolean; + break; + case Opt_loccookie: + args->ar_loccookie = result.boolean; + break; + default: + return invalf(fc, "gfs2: invalid mount option: %s", param->key); + } + return 0; +} - if (!(flags & SB_RDONLY)) - mode |= FMODE_WRITE; +static int gfs2_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args *oldargs = &sdp->sd_args; + struct gfs2_args *newargs = fc->fs_private; + struct gfs2_tune *gt = &sdp->sd_tune; + int error = 0; - bdev = blkdev_get_by_path(dev_name, mode, fs_type); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + sync_filesystem(sb); - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = -EBUSY; - goto error_bdev; + spin_lock(>->gt_spin); + oldargs->ar_commit = gt->gt_logd_secs; + oldargs->ar_quota_quantum = gt->gt_quota_quantum; + if (gt->gt_statfs_slow) + oldargs->ar_statfs_quantum = 0; + else + oldargs->ar_statfs_quantum = gt->gt_statfs_quantum; + spin_unlock(>->gt_spin); + + if (strcmp(newargs->ar_lockproto, oldargs->ar_lockproto)) { + errorf(fc, "gfs2: reconfiguration of locking protocol not allowed"); + return -EINVAL; } - s = sget(fs_type, test_gfs2_super, set_gfs2_super, flags, bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = PTR_ERR(s); - if (IS_ERR(s)) - goto error_bdev; - - if (s->s_root) { - /* - * s_umount nests inside bd_mutex during - * __invalidate_device(). blkdev_put() acquires - * bd_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. - */ - up_write(&s->s_umount); - blkdev_put(bdev, mode); - down_write(&s->s_umount); - } else { - /* s_mode must be set before deactivate_locked_super calls */ - s->s_mode = mode; + if (strcmp(newargs->ar_locktable, oldargs->ar_locktable)) { + errorf(fc, "gfs2: reconfiguration of lock table not allowed"); + return -EINVAL; } + if (strcmp(newargs->ar_hostdata, oldargs->ar_hostdata)) { + errorf(fc, "gfs2: reconfiguration of host data not allowed"); + return -EINVAL; + } + if (newargs->ar_spectator != oldargs->ar_spectator) { + errorf(fc, "gfs2: reconfiguration of spectator mode not allowed"); + return -EINVAL; + } + if (newargs->ar_localflocks != oldargs->ar_localflocks) { + errorf(fc, "gfs2: reconfiguration of localflocks not allowed"); + return -EINVAL; + } + if (newargs->ar_meta != oldargs->ar_meta) { + errorf(fc, "gfs2: switching between gfs2 and gfs2meta not allowed"); + return -EINVAL; + } + if (oldargs->ar_spectator) + fc->sb_flags |= SB_RDONLY; + + if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { + error = gfs2_make_fs_ro(sdp); + if (error) + errorf(fc, "gfs2: unable to remount read-only"); + } else { + error = gfs2_make_fs_rw(sdp); + if (error) + errorf(fc, "gfs2: unable to remount read-write"); + } + } + sdp->sd_args = *newargs; - memset(&args, 0, sizeof(args)); - args.ar_quota = GFS2_QUOTA_DEFAULT; - args.ar_data = GFS2_DATA_DEFAULT; - args.ar_commit = 30; - args.ar_statfs_quantum = 30; - args.ar_quota_quantum = 60; - args.ar_errors = GFS2_ERRORS_DEFAULT; - - error = gfs2_mount_args(&args, data); - if (error) { - pr_warn("can't parse mount arguments\n"); - goto error_super; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; + if (sdp->sd_args.ar_nobarrier) + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + else + clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); + spin_lock(>->gt_spin); + gt->gt_logd_secs = newargs->ar_commit; + gt->gt_quota_quantum = newargs->ar_quota_quantum; + if (newargs->ar_statfs_quantum) { + gt->gt_statfs_slow = 0; + gt->gt_statfs_quantum = newargs->ar_statfs_quantum; + } + else { + gt->gt_statfs_slow = 1; + gt->gt_statfs_quantum = 30; } + spin_unlock(>->gt_spin); + + gfs2_online_uevent(sdp); + return error; +} + +static const struct fs_context_operations gfs2_context_ops = { + .free = gfs2_fc_free, + .parse_param = gfs2_parse_param, + .get_tree = gfs2_get_tree, + .reconfigure = gfs2_reconfigure, +}; + +/* Set up the filesystem mount context */ +static int gfs2_init_fs_context(struct fs_context *fc) +{ + struct gfs2_args *args; + + args = kmalloc(sizeof(*args), GFP_KERNEL); + if (args == NULL) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct gfs2_sbd *sdp = fc->root->d_sb->s_fs_info; - if (s->s_root) { - error = -EBUSY; - if ((flags ^ s->s_flags) & SB_RDONLY) - goto error_super; + *args = sdp->sd_args; } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0); - if (error) - goto error_super; - s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; + memset(args, 0, sizeof(*args)); + args->ar_quota = GFS2_QUOTA_DEFAULT; + args->ar_data = GFS2_DATA_DEFAULT; + args->ar_commit = 30; + args->ar_statfs_quantum = 30; + args->ar_quota_quantum = 60; + args->ar_errors = GFS2_ERRORS_DEFAULT; } - - sdp = s->s_fs_info; - if (args.ar_meta) - return dget(sdp->sd_master_dir); - else - return dget(sdp->sd_root_dir); - -error_super: - deactivate_locked_super(s); - return ERR_PTR(error); -error_bdev: - blkdev_put(bdev, mode); - return ERR_PTR(error); + fc->fs_private = args; + fc->ops = &gfs2_context_ops; + return 0; } -static int set_meta_super(struct super_block *s, void *ptr) +static int set_meta_super(struct super_block *s, struct fs_context *fc) { return -EINVAL; } -static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int test_meta_super(struct super_block *s, struct fs_context *fc) +{ + return (fc->sget_key == s->s_bdev); +} + +static int gfs2_meta_get_tree(struct fs_context *fc) { struct super_block *s; struct gfs2_sbd *sdp; struct path path; int error; - if (!dev_name || !*dev_name) - return ERR_PTR(-EINVAL); + if (!fc->source || !*fc->source) + return -EINVAL; - error = kern_path(dev_name, LOOKUP_FOLLOW, &path); + error = kern_path(fc->source, LOOKUP_FOLLOW, &path); if (error) { pr_warn("path_lookup on %s returned error %d\n", - dev_name, error); - return ERR_PTR(error); + fc->source, error); + return error; } - s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, - path.dentry->d_sb->s_bdev); + fc->fs_type = &gfs2_fs_type; + fc->sget_key = path.dentry->d_sb->s_bdev; + s = sget_fc(fc, test_meta_super, set_meta_super); path_put(&path); if (IS_ERR(s)) { pr_warn("gfs2 mount does not exist\n"); - return ERR_CAST(s); + return PTR_ERR(s); } - if ((flags ^ s->s_flags) & SB_RDONLY) { + if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); - return ERR_PTR(-EBUSY); + return -EBUSY; } sdp = s->s_fs_info; - return dget(sdp->sd_master_dir); + fc->root = dget(sdp->sd_master_dir); + return 0; +} + +static const struct fs_context_operations gfs2_meta_context_ops = { + .free = gfs2_fc_free, + .get_tree = gfs2_meta_get_tree, +}; + +static int gfs2_meta_init_fs_context(struct fs_context *fc) +{ + int ret = gfs2_init_fs_context(fc); + + if (ret) + return ret; + + fc->ops = &gfs2_meta_context_ops; + return 0; } static void gfs2_kill_sb(struct super_block *sb) @@ -1383,7 +1642,8 @@ static void gfs2_kill_sb(struct super_block *sb) struct file_system_type gfs2_fs_type = { .name = "gfs2", .fs_flags = FS_REQUIRES_DEV, - .mount = gfs2_mount, + .init_fs_context = gfs2_init_fs_context, + .parameters = &gfs2_fs_parameters, .kill_sb = gfs2_kill_sb, .owner = THIS_MODULE, }; @@ -1392,7 +1652,7 @@ MODULE_ALIAS_FS("gfs2"); struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", .fs_flags = FS_REQUIRES_DEV, - .mount = gfs2_mount_meta, + .init_fs_context = gfs2_meta_init_fs_context, .owner = THIS_MODULE, }; MODULE_ALIAS_FS("gfs2meta"); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 644c70ae09f7..5fa1eec4fb4f 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -44,258 +44,6 @@ #include "xattr.h" #include "lops.h" -#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) - -enum { - Opt_lockproto, - Opt_locktable, - Opt_hostdata, - Opt_spectator, - Opt_ignore_local_fs, - Opt_localflocks, - Opt_localcaching, - Opt_debug, - Opt_nodebug, - Opt_upgrade, - Opt_acl, - Opt_noacl, - Opt_quota_off, - Opt_quota_account, - Opt_quota_on, - Opt_quota, - Opt_noquota, - Opt_suiddir, - Opt_nosuiddir, - Opt_data_writeback, - Opt_data_ordered, - Opt_meta, - Opt_discard, - Opt_nodiscard, - Opt_commit, - Opt_err_withdraw, - Opt_err_panic, - Opt_statfs_quantum, - Opt_statfs_percent, - Opt_quota_quantum, - Opt_barrier, - Opt_nobarrier, - Opt_rgrplvb, - Opt_norgrplvb, - Opt_loccookie, - Opt_noloccookie, - Opt_error, -}; - -static const match_table_t tokens = { - {Opt_lockproto, "lockproto=%s"}, - {Opt_locktable, "locktable=%s"}, - {Opt_hostdata, "hostdata=%s"}, - {Opt_spectator, "spectator"}, - {Opt_spectator, "norecovery"}, - {Opt_ignore_local_fs, "ignore_local_fs"}, - {Opt_localflocks, "localflocks"}, - {Opt_localcaching, "localcaching"}, - {Opt_debug, "debug"}, - {Opt_nodebug, "nodebug"}, - {Opt_upgrade, "upgrade"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_quota_off, "quota=off"}, - {Opt_quota_account, "quota=account"}, - {Opt_quota_on, "quota=on"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_suiddir, "suiddir"}, - {Opt_nosuiddir, "nosuiddir"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_meta, "meta"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_commit, "commit=%d"}, - {Opt_err_withdraw, "errors=withdraw"}, - {Opt_err_panic, "errors=panic"}, - {Opt_statfs_quantum, "statfs_quantum=%d"}, - {Opt_statfs_percent, "statfs_percent=%d"}, - {Opt_quota_quantum, "quota_quantum=%d"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_rgrplvb, "rgrplvb"}, - {Opt_norgrplvb, "norgrplvb"}, - {Opt_loccookie, "loccookie"}, - {Opt_noloccookie, "noloccookie"}, - {Opt_error, NULL} -}; - -/** - * gfs2_mount_args - Parse mount options - * @args: The structure into which the parsed options will be written - * @options: The options to parse - * - * Return: errno - */ - -int gfs2_mount_args(struct gfs2_args *args, char *options) -{ - char *o; - int token; - substring_t tmp[MAX_OPT_ARGS]; - int rv; - - /* Split the options into tokens with the "," character and - process them */ - - while (1) { - o = strsep(&options, ","); - if (o == NULL) - break; - if (*o == '\0') - continue; - - token = match_token(o, tokens, tmp); - switch (token) { - case Opt_lockproto: - match_strlcpy(args->ar_lockproto, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_locktable: - match_strlcpy(args->ar_locktable, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_hostdata: - match_strlcpy(args->ar_hostdata, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_spectator: - args->ar_spectator = 1; - break; - case Opt_ignore_local_fs: - /* Retained for backwards compat only */ - break; - case Opt_localflocks: - args->ar_localflocks = 1; - break; - case Opt_localcaching: - /* Retained for backwards compat only */ - break; - case Opt_debug: - if (args->ar_errors == GFS2_ERRORS_PANIC) { - pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); - return -EINVAL; - } - args->ar_debug = 1; - break; - case Opt_nodebug: - args->ar_debug = 0; - break; - case Opt_upgrade: - /* Retained for backwards compat only */ - break; - case Opt_acl: - args->ar_posix_acl = 1; - break; - case Opt_noacl: - args->ar_posix_acl = 0; - break; - case Opt_quota_off: - case Opt_noquota: - args->ar_quota = GFS2_QUOTA_OFF; - break; - case Opt_quota_account: - args->ar_quota = GFS2_QUOTA_ACCOUNT; - break; - case Opt_quota_on: - case Opt_quota: - args->ar_quota = GFS2_QUOTA_ON; - break; - case Opt_suiddir: - args->ar_suiddir = 1; - break; - case Opt_nosuiddir: - args->ar_suiddir = 0; - break; - case Opt_data_writeback: - args->ar_data = GFS2_DATA_WRITEBACK; - break; - case Opt_data_ordered: - args->ar_data = GFS2_DATA_ORDERED; - break; - case Opt_meta: - args->ar_meta = 1; - break; - case Opt_discard: - args->ar_discard = 1; - break; - case Opt_nodiscard: - args->ar_discard = 0; - break; - case Opt_commit: - rv = match_int(&tmp[0], &args->ar_commit); - if (rv || args->ar_commit <= 0) { - pr_warn("commit mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_statfs_quantum: - rv = match_int(&tmp[0], &args->ar_statfs_quantum); - if (rv || args->ar_statfs_quantum < 0) { - pr_warn("statfs_quantum mount option requires a non-negative numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_quota_quantum: - rv = match_int(&tmp[0], &args->ar_quota_quantum); - if (rv || args->ar_quota_quantum <= 0) { - pr_warn("quota_quantum mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_statfs_percent: - rv = match_int(&tmp[0], &args->ar_statfs_percent); - if (rv || args->ar_statfs_percent < 0 || - args->ar_statfs_percent > 100) { - pr_warn("statfs_percent mount option requires a numeric argument between 0 and 100\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_err_withdraw: - args->ar_errors = GFS2_ERRORS_WITHDRAW; - break; - case Opt_err_panic: - if (args->ar_debug) { - pr_warn("-o debug and -o errors=panic are mutually exclusive\n"); - return -EINVAL; - } - args->ar_errors = GFS2_ERRORS_PANIC; - break; - case Opt_barrier: - args->ar_nobarrier = 0; - break; - case Opt_nobarrier: - args->ar_nobarrier = 1; - break; - case Opt_rgrplvb: - args->ar_rgrplvb = 1; - break; - case Opt_norgrplvb: - args->ar_rgrplvb = 0; - break; - case Opt_loccookie: - args->ar_loccookie = 1; - break; - case Opt_noloccookie: - args->ar_loccookie = 0; - break; - case Opt_error: - default: - pr_warn("invalid mount option: %s\n", o); - return -EINVAL; - } - } - - return 0; -} - /** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock @@ -847,7 +595,7 @@ out: * Returns: errno */ -static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { struct gfs2_holder freeze_gh; int error; @@ -1227,84 +975,6 @@ static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) } /** - * gfs2_remount_fs - called when the FS is remounted - * @sb: the filesystem - * @flags: the remount flags - * @data: extra data passed in (not used right now) - * - * Returns: errno - */ - -static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_args args = sdp->sd_args; /* Default to current settings */ - struct gfs2_tune *gt = &sdp->sd_tune; - int error; - - sync_filesystem(sb); - - spin_lock(>->gt_spin); - args.ar_commit = gt->gt_logd_secs; - args.ar_quota_quantum = gt->gt_quota_quantum; - if (gt->gt_statfs_slow) - args.ar_statfs_quantum = 0; - else - args.ar_statfs_quantum = gt->gt_statfs_quantum; - spin_unlock(>->gt_spin); - error = gfs2_mount_args(&args, data); - if (error) - return error; - - /* Not allowed to change locking details */ - if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || - strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || - strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) - return -EINVAL; - - /* Some flags must not be changed */ - if (args_neq(&args, &sdp->sd_args, spectator) || - args_neq(&args, &sdp->sd_args, localflocks) || - args_neq(&args, &sdp->sd_args, meta)) - return -EINVAL; - - if (sdp->sd_args.ar_spectator) - *flags |= SB_RDONLY; - - if ((sb->s_flags ^ *flags) & SB_RDONLY) { - if (*flags & SB_RDONLY) - error = gfs2_make_fs_ro(sdp); - else - error = gfs2_make_fs_rw(sdp); - } - - sdp->sd_args = args; - if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= SB_POSIXACL; - else - sb->s_flags &= ~SB_POSIXACL; - if (sdp->sd_args.ar_nobarrier) - set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - else - clear_bit(SDF_NOBARRIERS, &sdp->sd_flags); - spin_lock(>->gt_spin); - gt->gt_logd_secs = args.ar_commit; - gt->gt_quota_quantum = args.ar_quota_quantum; - if (args.ar_statfs_quantum) { - gt->gt_statfs_slow = 0; - gt->gt_statfs_quantum = args.ar_statfs_quantum; - } - else { - gt->gt_statfs_slow = 1; - gt->gt_statfs_quantum = 30; - } - spin_unlock(>->gt_spin); - - gfs2_online_uevent(sdp); - return error; -} - -/** * gfs2_drop_inode - Drop an inode (test for remote unlink) * @inode: The inode to drop * @@ -1748,7 +1418,6 @@ const struct super_operations gfs2_super_ops = { .freeze_super = gfs2_freeze, .thaw_super = gfs2_unfreeze, .statfs = gfs2_statfs, - .remount_fs = gfs2_remount_fs, .drop_inode = gfs2_drop_inode, .show_options = gfs2_show_options, }; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 9d49eaadb9d9..b8bf811a1305 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -24,8 +24,6 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) extern void gfs2_jindex_free(struct gfs2_sbd *sdp); -extern int gfs2_mount_args(struct gfs2_args *args, char *data); - extern struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); extern int gfs2_jdesc_check(struct gfs2_jdesc *jd); @@ -33,6 +31,7 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); +extern int gfs2_make_fs_ro(struct gfs2_sbd *sdp); extern void gfs2_online_uevent(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, diff --git a/fs/inode.c b/fs/inode.c index 64bf28cf05cd..fef457a42882 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -181,6 +181,9 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping->flags = 0; mapping->wb_err = 0; atomic_set(&mapping->i_mmap_writable, 0); +#ifdef CONFIG_READ_ONLY_THP_FOR_FS + atomic_set(&mapping->nr_thps, 0); +#endif mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; diff --git a/fs/io_uring.c b/fs/io_uring.c index 0dadbdbead0f..f9a38998f2fc 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -197,9 +197,11 @@ struct io_ring_ctx { unsigned sq_entries; unsigned sq_mask; unsigned sq_thread_idle; + unsigned cached_sq_dropped; struct io_uring_sqe *sq_sqes; struct list_head defer_list; + struct list_head timeout_list; } ____cacheline_aligned_in_smp; /* IO offload */ @@ -211,11 +213,13 @@ struct io_ring_ctx { struct { unsigned cached_cq_tail; + atomic_t cached_cq_overflow; unsigned cq_entries; unsigned cq_mask; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; struct eventfd_ctx *cq_ev_fd; + atomic_t cq_timeouts; } ____cacheline_aligned_in_smp; struct io_rings *rings; @@ -283,6 +287,11 @@ struct io_poll_iocb { struct wait_queue_entry wait; }; +struct io_timeout { + struct file *file; + struct hrtimer timer; +}; + /* * NOTE! Each of the iocb union members has the file pointer * as the first entry in their struct definition. So you can @@ -294,6 +303,7 @@ struct io_kiocb { struct file *file; struct kiocb rw; struct io_poll_iocb poll; + struct io_timeout timeout; }; struct sqe_submit submit; @@ -313,6 +323,9 @@ struct io_kiocb { #define REQ_F_LINK_DONE 128 /* linked sqes done */ #define REQ_F_FAIL_LINK 256 /* fail rest of links */ #define REQ_F_SHADOW_DRAIN 512 /* link-drain shadow req */ +#define REQ_F_TIMEOUT 1024 /* timeout request */ +#define REQ_F_ISREG 2048 /* regular file */ +#define REQ_F_MUST_PUNT 4096 /* must be punted even for NONBLOCK */ u64 user_data; u32 result; u32 sequence; @@ -344,6 +357,8 @@ struct io_submit_state { }; static void io_sq_wq_submit_work(struct work_struct *work); +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res); static void __io_free_req(struct io_kiocb *req); static struct kmem_cache *req_cachep; @@ -400,27 +415,45 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->cancel_list); INIT_LIST_HEAD(&ctx->defer_list); + INIT_LIST_HEAD(&ctx->timeout_list); return ctx; } +static inline bool __io_sequence_defer(struct io_ring_ctx *ctx, + struct io_kiocb *req) +{ + return req->sequence != ctx->cached_cq_tail + ctx->cached_sq_dropped + + atomic_read(&ctx->cached_cq_overflow); +} + static inline bool io_sequence_defer(struct io_ring_ctx *ctx, struct io_kiocb *req) { if ((req->flags & (REQ_F_IO_DRAIN|REQ_F_IO_DRAINED)) != REQ_F_IO_DRAIN) return false; - return req->sequence != ctx->cached_cq_tail + ctx->rings->sq_dropped; + return __io_sequence_defer(ctx, req); } static struct io_kiocb *io_get_deferred_req(struct io_ring_ctx *ctx) { struct io_kiocb *req; - if (list_empty(&ctx->defer_list)) - return NULL; + req = list_first_entry_or_null(&ctx->defer_list, struct io_kiocb, list); + if (req && !io_sequence_defer(ctx, req)) { + list_del_init(&req->list); + return req; + } + + return NULL; +} - req = list_first_entry(&ctx->defer_list, struct io_kiocb, list); - if (!io_sequence_defer(ctx, req)) { +static struct io_kiocb *io_get_timeout_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + req = list_first_entry_or_null(&ctx->timeout_list, struct io_kiocb, list); + if (req && !__io_sequence_defer(ctx, req)) { list_del_init(&req->list); return req; } @@ -446,25 +479,50 @@ static void __io_commit_cqring(struct io_ring_ctx *ctx) static inline void io_queue_async_work(struct io_ring_ctx *ctx, struct io_kiocb *req) { - int rw; + int rw = 0; - switch (req->submit.sqe->opcode) { - case IORING_OP_WRITEV: - case IORING_OP_WRITE_FIXED: - rw = !(req->rw.ki_flags & IOCB_DIRECT); - break; - default: - rw = 0; - break; + if (req->submit.sqe) { + switch (req->submit.sqe->opcode) { + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + rw = !(req->rw.ki_flags & IOCB_DIRECT); + break; + } } queue_work(ctx->sqo_wq[rw], &req->work); } +static void io_kill_timeout(struct io_kiocb *req) +{ + int ret; + + ret = hrtimer_try_to_cancel(&req->timeout.timer); + if (ret != -1) { + atomic_inc(&req->ctx->cq_timeouts); + list_del(&req->list); + io_cqring_fill_event(req->ctx, req->user_data, 0); + __io_free_req(req); + } +} + +static void io_kill_timeouts(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req, *tmp; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(req, tmp, &ctx->timeout_list, list) + io_kill_timeout(req); + spin_unlock_irq(&ctx->completion_lock); +} + static void io_commit_cqring(struct io_ring_ctx *ctx) { struct io_kiocb *req; + while ((req = io_get_timeout_req(ctx)) != NULL) + io_kill_timeout(req); + __io_commit_cqring(ctx); while ((req = io_get_deferred_req(ctx)) != NULL) { @@ -512,9 +570,8 @@ static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, 0); } else { - unsigned overflow = READ_ONCE(ctx->rings->cq_overflow); - - WRITE_ONCE(ctx->rings->cq_overflow, overflow + 1); + WRITE_ONCE(ctx->rings->cq_overflow, + atomic_inc_return(&ctx->cached_cq_overflow)); } } @@ -541,14 +598,6 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 user_data, io_cqring_ev_posted(ctx); } -static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) -{ - percpu_ref_put_many(&ctx->refs, refs); - - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); -} - static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, struct io_submit_state *state) { @@ -596,7 +645,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, req->result = 0; return req; out: - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); return NULL; } @@ -604,7 +653,7 @@ static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) { if (*nr) { kmem_cache_free_bulk(req_cachep, *nr, reqs); - io_ring_drop_ctx_refs(ctx, *nr); + percpu_ref_put_many(&ctx->refs, *nr); *nr = 0; } } @@ -613,7 +662,7 @@ static void __io_free_req(struct io_kiocb *req) { if (req->file && !(req->flags & REQ_F_FIXED_FILE)) fput(req->file); - io_ring_drop_ctx_refs(req->ctx, 1); + percpu_ref_put(&req->ctx->refs); kmem_cache_free(req_cachep, req); } @@ -688,6 +737,14 @@ static unsigned io_cqring_events(struct io_rings *rings) return READ_ONCE(rings->cq.tail) - READ_ONCE(rings->cq.head); } +static inline unsigned int io_sqring_entries(struct io_ring_ctx *ctx) +{ + struct io_rings *rings = ctx->rings; + + /* make sure SQ entry isn't read before tail */ + return smp_load_acquire(&rings->sq.tail) - ctx->cached_sq_head; +} + /* * Find and free completed poll iocbs */ @@ -817,19 +874,11 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); } -static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, - long min) +static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) { - int iters, ret = 0; + int iters = 0, ret = 0; - /* - * We disallow the app entering submit/complete with polling, but we - * still need to lock the ring to prevent racing with polled issue - * that got punted to a workqueue. - */ - mutex_lock(&ctx->uring_lock); - - iters = 0; do { int tmin = 0; @@ -865,30 +914,45 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, ret = 0; } while (min && !*nr_events && !need_resched()); + return ret; +} + +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) +{ + int ret; + + /* + * We disallow the app entering submit/complete with polling, but we + * still need to lock the ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + ret = __io_iopoll_check(ctx, nr_events, min); mutex_unlock(&ctx->uring_lock); return ret; } -static void kiocb_end_write(struct kiocb *kiocb) +static void kiocb_end_write(struct io_kiocb *req) { - if (kiocb->ki_flags & IOCB_WRITE) { - struct inode *inode = file_inode(kiocb->ki_filp); + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (req->flags & REQ_F_ISREG) { + struct inode *inode = file_inode(req->file); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ - if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); } + file_end_write(req->file); } static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -900,7 +964,8 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); - kiocb_end_write(kiocb); + if (kiocb->ki_flags & IOCB_WRITE) + kiocb_end_write(req); if ((req->flags & REQ_F_LINK) && res != req->result) req->flags |= REQ_F_FAIL_LINK; @@ -1014,8 +1079,17 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, if (!req->file) return -EBADF; - if (force_nonblock && !io_file_supports_async(req->file)) - force_nonblock = false; + if (S_ISREG(file_inode(req->file)->i_mode)) + req->flags |= REQ_F_ISREG; + + /* + * If the file doesn't support async, mark it as REQ_F_MUST_PUNT so + * we know to async punt it even if it was opened O_NONBLOCK + */ + if (force_nonblock && !io_file_supports_async(req->file)) { + req->flags |= REQ_F_MUST_PUNT; + return -EAGAIN; + } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); @@ -1036,7 +1110,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, return ret; /* don't allow async punt if RWF_NOWAIT was requested */ - if (kiocb->ki_flags & IOCB_NOWAIT) + if ((kiocb->ki_flags & IOCB_NOWAIT) || + (req->file->f_flags & O_NONBLOCK)) req->flags |= REQ_F_NOWAIT; if (force_nonblock) @@ -1049,6 +1124,7 @@ static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, kiocb->ki_flags |= IOCB_HIPRI; kiocb->ki_complete = io_complete_rw_iopoll; + req->result = 0; } else { if (kiocb->ki_flags & IOCB_HIPRI) return -EINVAL; @@ -1248,6 +1324,51 @@ static void io_async_list_note(int rw, struct io_kiocb *req, size_t len) } } +/* + * For files that don't have ->read_iter() and ->write_iter(), handle them + * by looping over ->read() or ->write() manually. + */ +static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, + struct iov_iter *iter) +{ + ssize_t ret = 0; + + /* + * Don't support polled IO through this interface, and we can't + * support non-blocking either. For the latter, this just causes + * the kiocb to be handled from an async context. + */ + if (kiocb->ki_flags & IOCB_HIPRI) + return -EOPNOTSUPP; + if (kiocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + + while (iov_iter_count(iter)) { + struct iovec iovec = iov_iter_iovec(iter); + ssize_t nr; + + if (rw == READ) { + nr = file->f_op->read(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } else { + nr = file->f_op->write(file, iovec.iov_base, + iovec.iov_len, &kiocb->ki_pos); + } + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != iovec.iov_len) + break; + iov_iter_advance(iter, nr); + } + + return ret; +} + static int io_read(struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock) { @@ -1265,8 +1386,6 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - if (unlikely(!file->f_op->read_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); if (ret < 0) @@ -1281,7 +1400,11 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, if (!ret) { ssize_t ret2; - ret2 = call_read_iter(file, kiocb, &iter); + if (file->f_op->read_iter) + ret2 = call_read_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(READ, file, kiocb, &iter); + /* * In case of a short read, punt to async. This can happen * if we have data partially cached. Alternatively we can @@ -1290,7 +1413,9 @@ static int io_read(struct io_kiocb *req, const struct sqe_submit *s, * need async punt anyway, so it's more efficient to do it * here. */ - if (force_nonblock && ret2 > 0 && ret2 < read_size) + if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && + (req->flags & REQ_F_ISREG) && + ret2 > 0 && ret2 < read_size) ret2 = -EAGAIN; /* Catch -EAGAIN return for forced non-blocking submission */ if (!force_nonblock || ret2 != -EAGAIN) { @@ -1326,8 +1451,6 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, file = kiocb->ki_filp; if (unlikely(!(file->f_mode & FMODE_WRITE))) return -EBADF; - if (unlikely(!file->f_op->write_iter)) - return -EINVAL; ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); if (ret < 0) @@ -1357,7 +1480,7 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, * released so that it doesn't complain about the held lock when * we return to userspace. */ - if (S_ISREG(file_inode(file)->i_mode)) { + if (req->flags & REQ_F_ISREG) { __sb_start_write(file_inode(file)->i_sb, SB_FREEZE_WRITE, true); __sb_writers_release(file_inode(file)->i_sb, @@ -1365,7 +1488,10 @@ static int io_write(struct io_kiocb *req, const struct sqe_submit *s, } kiocb->ki_flags |= IOCB_WRITE; - ret2 = call_write_iter(file, kiocb, &iter); + if (file->f_op->write_iter) + ret2 = call_write_iter(file, kiocb, &iter); + else + ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); if (!force_nonblock || ret2 != -EAGAIN) { io_rw_done(kiocb, ret2); } else { @@ -1714,6 +1840,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!poll->file) return -EBADF; + req->submit.sqe = NULL; INIT_WORK(&req->work, io_poll_complete_work); events = READ_ONCE(sqe->poll_events); poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; @@ -1765,6 +1892,114 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) return ipt.error; } +static enum hrtimer_restart io_timeout_fn(struct hrtimer *timer) +{ + struct io_ring_ctx *ctx; + struct io_kiocb *req, *prev; + unsigned long flags; + + req = container_of(timer, struct io_kiocb, timeout.timer); + ctx = req->ctx; + atomic_inc(&ctx->cq_timeouts); + + spin_lock_irqsave(&ctx->completion_lock, flags); + /* + * Adjust the reqs sequence before the current one because it + * will consume a slot in the cq_ring and the the cq_tail pointer + * will be increased, otherwise other timeout reqs may return in + * advance without waiting for enough wait_nr. + */ + prev = req; + list_for_each_entry_continue_reverse(prev, &ctx->timeout_list, list) + prev->sequence++; + list_del(&req->list); + + io_cqring_fill_event(ctx, req->user_data, -ETIME); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + io_cqring_ev_posted(ctx); + + io_put_req(req); + return HRTIMER_NORESTART; +} + +static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + unsigned count; + struct io_ring_ctx *ctx = req->ctx; + struct list_head *entry; + struct timespec64 ts; + unsigned span = 0; + + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->flags || sqe->ioprio || sqe->buf_index || sqe->timeout_flags || + sqe->len != 1) + return -EINVAL; + + if (get_timespec64(&ts, u64_to_user_ptr(sqe->addr))) + return -EFAULT; + + /* + * sqe->off holds how many events that need to occur for this + * timeout event to be satisfied. + */ + count = READ_ONCE(sqe->off); + if (!count) + count = 1; + + req->sequence = ctx->cached_sq_head + count - 1; + /* reuse it to store the count */ + req->submit.sequence = count; + req->flags |= REQ_F_TIMEOUT; + + /* + * Insertion sort, ensuring the first entry in the list is always + * the one we need first. + */ + spin_lock_irq(&ctx->completion_lock); + list_for_each_prev(entry, &ctx->timeout_list) { + struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list); + unsigned nxt_sq_head; + long long tmp, tmp_nxt; + + /* + * Since cached_sq_head + count - 1 can overflow, use type long + * long to store it. + */ + tmp = (long long)ctx->cached_sq_head + count - 1; + nxt_sq_head = nxt->sequence - nxt->submit.sequence + 1; + tmp_nxt = (long long)nxt_sq_head + nxt->submit.sequence - 1; + + /* + * cached_sq_head may overflow, and it will never overflow twice + * once there is some timeout req still be valid. + */ + if (ctx->cached_sq_head < nxt_sq_head) + tmp += UINT_MAX; + + if (tmp > tmp_nxt) + break; + + /* + * Sequence of reqs after the insert one and itself should + * be adjusted because each timeout req consumes a slot. + */ + span++; + nxt->sequence++; + } + req->sequence -= span; + list_add(&req->list, entry); + spin_unlock_irq(&ctx->completion_lock); + + hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + req->timeout.timer.function = io_timeout_fn; + hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts), + HRTIMER_MODE_REL); + return 0; +} + static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -1842,6 +2077,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_RECVMSG: ret = io_recvmsg(req, s->sqe, force_nonblock); break; + case IORING_OP_TIMEOUT: + ret = io_timeout(req, s->sqe); + break; default: ret = -EINVAL; break; @@ -2090,21 +2328,25 @@ static int io_req_set_file(struct io_ring_ctx *ctx, const struct sqe_submit *s, } static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, bool force_nonblock) + struct sqe_submit *s) { int ret; - ret = __io_submit_sqe(ctx, req, s, force_nonblock); - if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { + ret = __io_submit_sqe(ctx, req, s, true); + + /* + * We async punt it if the file wasn't marked NOWAIT, or if the file + * doesn't support non-blocking read/write attempts + */ + if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) || + (req->flags & REQ_F_MUST_PUNT))) { struct io_uring_sqe *sqe_copy; - sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + sqe_copy = kmemdup(s->sqe, sizeof(*sqe_copy), GFP_KERNEL); if (sqe_copy) { struct async_list *list; - memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy)); s->sqe = sqe_copy; - memcpy(&req->submit, s, sizeof(*s)); list = io_async_list_from_sqe(ctx, s->sqe); if (!io_add_to_prev_work(list, req)) { @@ -2137,7 +2379,7 @@ static int __io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, } static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, bool force_nonblock) + struct sqe_submit *s) { int ret; @@ -2150,18 +2392,17 @@ static int io_queue_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } - return __io_queue_sqe(ctx, req, s, force_nonblock); + return __io_queue_sqe(ctx, req, s); } static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, - struct sqe_submit *s, struct io_kiocb *shadow, - bool force_nonblock) + struct sqe_submit *s, struct io_kiocb *shadow) { int ret; int need_submit = false; if (!shadow) - return io_queue_sqe(ctx, req, s, force_nonblock); + return io_queue_sqe(ctx, req, s); /* * Mark the first IO in link list as DRAIN, let all the following @@ -2173,6 +2414,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, if (ret) { if (ret != -EIOCBQUEUED) { io_free_req(req); + __io_free_req(shadow); io_cqring_add_event(ctx, s->sqe->user_data, ret); return 0; } @@ -2190,7 +2432,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, spin_unlock_irq(&ctx->completion_lock); if (need_submit) - return __io_queue_sqe(ctx, req, s, force_nonblock); + return __io_queue_sqe(ctx, req, s); return 0; } @@ -2198,8 +2440,7 @@ static int io_queue_link_head(struct io_ring_ctx *ctx, struct io_kiocb *req, #define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK) static void io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, - struct io_submit_state *state, struct io_kiocb **link, - bool force_nonblock) + struct io_submit_state *state, struct io_kiocb **link) { struct io_uring_sqe *sqe_copy; struct io_kiocb *req; @@ -2226,6 +2467,8 @@ err: return; } + req->user_data = s->sqe->user_data; + /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but @@ -2252,7 +2495,7 @@ err: INIT_LIST_HEAD(&req->link_list); *link = req; } else { - io_queue_sqe(ctx, req, s, force_nonblock); + io_queue_sqe(ctx, req, s); } } @@ -2332,12 +2575,13 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) /* drop invalid entries */ ctx->cached_sq_head++; - rings->sq_dropped++; + ctx->cached_sq_dropped++; + WRITE_ONCE(rings->sq_dropped, ctx->cached_sq_dropped); return false; } -static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, - unsigned int nr, bool has_user, bool mm_fault) +static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, + bool has_user, bool mm_fault) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; @@ -2351,40 +2595,48 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, } for (i = 0; i < nr; i++) { + struct sqe_submit s; + + if (!io_get_sqring(ctx, &s)) + break; + /* * If previous wasn't linked and we have a linked command, * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_link_head(ctx, link, &link->submit, shadow_req, - true); + io_queue_link_head(ctx, link, &link->submit, shadow_req); link = NULL; + shadow_req = NULL; } - prev_was_link = (sqes[i].sqe->flags & IOSQE_IO_LINK) != 0; + prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; - if (link && (sqes[i].sqe->flags & IOSQE_IO_DRAIN)) { + if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } - shadow_req->sequence = sqes[i].sequence; + shadow_req->sequence = s.sequence; } +out: if (unlikely(mm_fault)) { - io_cqring_add_event(ctx, sqes[i].sqe->user_data, + io_cqring_add_event(ctx, s.sqe->user_data, -EFAULT); } else { - sqes[i].has_user = has_user; - sqes[i].needs_lock = true; - sqes[i].needs_fixed_file = true; - io_submit_sqe(ctx, &sqes[i], statep, &link, true); + s.has_user = has_user; + s.needs_lock = true; + s.needs_fixed_file = true; + io_submit_sqe(ctx, &s, statep, &link); submitted++; } } if (link) - io_queue_link_head(ctx, link, &link->submit, shadow_req, true); + io_queue_link_head(ctx, link, &link->submit, shadow_req); if (statep) io_submit_state_end(&state); @@ -2393,7 +2645,6 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, static int io_sq_thread(void *data) { - struct sqe_submit sqes[IO_IOPOLL_BATCH]; struct io_ring_ctx *ctx = data; struct mm_struct *cur_mm = NULL; mm_segment_t old_fs; @@ -2408,14 +2659,27 @@ static int io_sq_thread(void *data) timeout = inflight = 0; while (!kthread_should_park()) { - bool all_fixed, mm_fault = false; - int i; + bool mm_fault = false; + unsigned int to_submit; if (inflight) { unsigned nr_events = 0; if (ctx->flags & IORING_SETUP_IOPOLL) { - io_iopoll_check(ctx, &nr_events, 0); + /* + * inflight is the count of the maximum possible + * entries we submitted, but it can be smaller + * if we dropped some of them. If we don't have + * poll entries available, then we know that we + * have nothing left to poll for. Reset the + * inflight count to zero in that case. + */ + mutex_lock(&ctx->uring_lock); + if (!list_empty(&ctx->poll_list)) + __io_iopoll_check(ctx, &nr_events, 0); + else + inflight = 0; + mutex_unlock(&ctx->uring_lock); } else { /* * Normal IO, just pretend everything completed. @@ -2429,14 +2693,15 @@ static int io_sq_thread(void *data) timeout = jiffies + ctx->sq_thread_idle; } - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + if (!to_submit) { /* * We're polling. If we're within the defined idle * period, then let us spin without work before going * to sleep. */ if (inflight || !time_after(jiffies, timeout)) { - cpu_relax(); + cond_resched(); continue; } @@ -2460,7 +2725,8 @@ static int io_sq_thread(void *data) /* make sure to read SQ tail after writing flags */ smp_mb(); - if (!io_get_sqring(ctx, &sqes[0])) { + to_submit = io_sqring_entries(ctx); + if (!to_submit) { if (kthread_should_park()) { finish_wait(&ctx->sqo_wait, &wait); break; @@ -2478,19 +2744,8 @@ static int io_sq_thread(void *data) ctx->rings->sq_flags &= ~IORING_SQ_NEED_WAKEUP; } - i = 0; - all_fixed = true; - do { - if (all_fixed && io_sqe_needs_user(sqes[i].sqe)) - all_fixed = false; - - i++; - if (i == ARRAY_SIZE(sqes)) - break; - } while (io_get_sqring(ctx, &sqes[i])); - /* Unless all new commands are FIXED regions, grab mm */ - if (!all_fixed && !cur_mm) { + if (!cur_mm) { mm_fault = !mmget_not_zero(ctx->sqo_mm); if (!mm_fault) { use_mm(ctx->sqo_mm); @@ -2498,8 +2753,9 @@ static int io_sq_thread(void *data) } } - inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL, - mm_fault); + to_submit = min(to_submit, ctx->sq_entries); + inflight += io_submit_sqes(ctx, to_submit, cur_mm != NULL, + mm_fault); /* Commit SQ ring head once we've consumed all SQEs */ io_commit_sqring(ctx); @@ -2516,8 +2772,7 @@ static int io_sq_thread(void *data) return 0; } -static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, - bool block_for_last) +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; struct io_kiocb *link = NULL; @@ -2531,7 +2786,6 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, } for (i = 0; i < to_submit; i++) { - bool force_nonblock = true; struct sqe_submit s; if (!io_get_sqring(ctx, &s)) @@ -2542,49 +2796,73 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, * that's the end of the chain. Submit the previous link. */ if (!prev_was_link && link) { - io_queue_link_head(ctx, link, &link->submit, shadow_req, - force_nonblock); + io_queue_link_head(ctx, link, &link->submit, shadow_req); link = NULL; + shadow_req = NULL; } prev_was_link = (s.sqe->flags & IOSQE_IO_LINK) != 0; if (link && (s.sqe->flags & IOSQE_IO_DRAIN)) { if (!shadow_req) { shadow_req = io_get_req(ctx, NULL); + if (unlikely(!shadow_req)) + goto out; shadow_req->flags |= (REQ_F_IO_DRAIN | REQ_F_SHADOW_DRAIN); refcount_dec(&shadow_req->refs); } shadow_req->sequence = s.sequence; } +out: s.has_user = true; s.needs_lock = false; s.needs_fixed_file = false; submit++; - - /* - * The caller will block for events after submit, submit the - * last IO non-blocking. This is either the only IO it's - * submitting, or it already submitted the previous ones. This - * improves performance by avoiding an async punt that we don't - * need to do. - */ - if (block_for_last && submit == to_submit) - force_nonblock = false; - - io_submit_sqe(ctx, &s, statep, &link, force_nonblock); + io_submit_sqe(ctx, &s, statep, &link); } - io_commit_sqring(ctx); if (link) - io_queue_link_head(ctx, link, &link->submit, shadow_req, - block_for_last); + io_queue_link_head(ctx, link, &link->submit, shadow_req); if (statep) io_submit_state_end(statep); + io_commit_sqring(ctx); + return submit; } +struct io_wait_queue { + struct wait_queue_entry wq; + struct io_ring_ctx *ctx; + unsigned to_wait; + unsigned nr_timeouts; +}; + +static inline bool io_should_wake(struct io_wait_queue *iowq) +{ + struct io_ring_ctx *ctx = iowq->ctx; + + /* + * Wake up if we have enough events, or if a timeout occured since we + * started waiting. For timeouts, we always want to return to userspace, + * regardless of event count. + */ + return io_cqring_events(ctx->rings) >= iowq->to_wait || + atomic_read(&ctx->cq_timeouts) != iowq->nr_timeouts; +} + +static int io_wake_function(struct wait_queue_entry *curr, unsigned int mode, + int wake_flags, void *key) +{ + struct io_wait_queue *iowq = container_of(curr, struct io_wait_queue, + wq); + + if (!io_should_wake(iowq)) + return -1; + + return autoremove_wake_function(curr, mode, wake_flags, key); +} + /* * Wait until events become available, if we don't already have some. The * application must reap them itself, as they reside on the shared cq ring. @@ -2592,6 +2870,15 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit, static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, const sigset_t __user *sig, size_t sigsz) { + struct io_wait_queue iowq = { + .wq = { + .private = current, + .func = io_wake_function, + .entry = LIST_HEAD_INIT(iowq.wq.entry), + }, + .ctx = ctx, + .to_wait = min_events, + }; struct io_rings *rings = ctx->rings; int ret; @@ -2611,7 +2898,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - ret = wait_event_interruptible(ctx->wait, io_cqring_events(rings) >= min_events); + ret = 0; + iowq.nr_timeouts = atomic_read(&ctx->cq_timeouts); + do { + prepare_to_wait_exclusive(&ctx->wait, &iowq.wq, + TASK_INTERRUPTIBLE); + if (io_should_wake(&iowq)) + break; + schedule(); + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + } while (1); + finish_wait(&ctx->wait, &iowq.wq); + restore_saved_sigmask_unless(ret == -ERESTARTSYS); if (ret == -ERESTARTSYS) ret = -EINTR; @@ -2682,8 +2983,12 @@ static void io_finish_async(struct io_ring_ctx *ctx) static void io_destruct_skb(struct sk_buff *skb) { struct io_ring_ctx *ctx = skb->sk->sk_user_data; + int i; + + for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++) + if (ctx->sqo_wq[i]) + flush_workqueue(ctx->sqo_wq[i]); - io_finish_async(ctx); unix_destruct_scm(skb); } @@ -3263,7 +3568,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) if (READ_ONCE(ctx->rings->sq.tail) - ctx->cached_sq_head != ctx->rings->sq_ring_entries) mask |= EPOLLOUT | EPOLLWRNORM; - if (READ_ONCE(ctx->rings->sq.head) != ctx->cached_cq_tail) + if (READ_ONCE(ctx->rings->cq.head) != ctx->cached_cq_tail) mask |= EPOLLIN | EPOLLRDNORM; return mask; @@ -3282,6 +3587,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_kill_timeouts(ctx); io_poll_remove_all(ctx); io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); @@ -3319,7 +3625,7 @@ static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) } page = virt_to_head_page(ptr); - if (sz > (PAGE_SIZE << compound_order(page))) + if (sz > page_size(page)) return -EINVAL; pfn = virt_to_phys(ptr) >> PAGE_SHIFT; @@ -3362,21 +3668,10 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, wake_up(&ctx->sqo_wait); submitted = to_submit; } else if (to_submit) { - bool block_for_last = false; - to_submit = min(to_submit, ctx->sq_entries); - /* - * Allow last submission to block in a series, IFF the caller - * asked to wait for events and we don't currently have - * enough. This potentially avoids an async punt. - */ - if (to_submit == min_complete && - io_cqring_events(ctx->rings) < min_complete) - block_for_last = true; - mutex_lock(&ctx->uring_lock); - submitted = io_ring_submit(ctx, to_submit, block_for_last); + submitted = io_ring_submit(ctx, to_submit); mutex_unlock(&ctx->uring_lock); } if (flags & IORING_ENTER_GETEVENTS) { @@ -3391,7 +3686,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, } } - io_ring_drop_ctx_refs(ctx, 1); + percpu_ref_put(&ctx->refs); out_fput: fdput(f); return submitted ? submitted : ret; @@ -3535,10 +3830,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret) goto err; - ret = io_uring_get_fd(ctx); - if (ret < 0) - goto err; - memset(&p->sq_off, 0, sizeof(p->sq_off)); p->sq_off.head = offsetof(struct io_rings, sq.head); p->sq_off.tail = offsetof(struct io_rings, sq.tail); @@ -3556,6 +3847,14 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) p->cq_off.overflow = offsetof(struct io_rings, cq_overflow); p->cq_off.cqes = offsetof(struct io_rings, cqes); + /* + * Install ring fd as the very last thing, so we don't risk someone + * having closed it before we finish setup + */ + ret = io_uring_get_fd(ctx); + if (ret < 0) + goto err; + p->features = IORING_FEAT_SINGLE_MMAP; return ret; err: diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 10517cea9682..1fc28c2da279 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -24,7 +24,7 @@ struct iomap_dio { struct kiocb *iocb; - iomap_dio_end_io_t *end_io; + const struct iomap_dio_ops *dops; loff_t i_size; loff_t size; atomic_t ref; @@ -72,18 +72,14 @@ static void iomap_dio_submit_bio(struct iomap_dio *dio, struct iomap *iomap, static ssize_t iomap_dio_complete(struct iomap_dio *dio) { + const struct iomap_dio_ops *dops = dio->dops; struct kiocb *iocb = dio->iocb; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; - ssize_t ret; + ssize_t ret = dio->error; - if (dio->end_io) { - ret = dio->end_io(iocb, - dio->error ? dio->error : dio->size, - dio->flags); - } else { - ret = dio->error; - } + if (dops && dops->end_io) + ret = dops->end_io(iocb, dio->size, ret, dio->flags); if (likely(!ret)) { ret = dio->size; @@ -101,9 +97,9 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) * one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... * - * And this page cache invalidation has to be after dio->end_io(), as - * some filesystems convert unwritten extents to real allocations in - * end_io() when necessary, otherwise a racing buffer read would cache + * And this page cache invalidation has to be after ->end_io(), as some + * filesystems convert unwritten extents to real allocations in + * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ if (!dio->error && @@ -396,7 +392,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, - const struct iomap_ops *ops, iomap_dio_end_io_t end_io) + const struct iomap_ops *ops, const struct iomap_dio_ops *dops) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -421,7 +417,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, atomic_set(&dio->ref, 1); dio->size = 0; dio->i_size = i_size_read(inode); - dio->end_io = end_io; + dio->dops = dops; dio->error = 0; dio->flags = 0; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 953990eb70a9..1c58859aa592 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -89,8 +89,6 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); -EXPORT_SYMBOL(jbd2_journal_inode_add_write); -EXPORT_SYMBOL(jbd2_journal_inode_add_wait); EXPORT_SYMBOL(jbd2_journal_inode_ranged_write); EXPORT_SYMBOL(jbd2_journal_inode_ranged_wait); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index afc06daee5bb..bee8498d7792 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2622,18 +2622,6 @@ done: return 0; } -int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) -{ - return jbd2_journal_file_inode(handle, jinode, - JI_WRITE_DATA | JI_WAIT_DATA, 0, LLONG_MAX); -} - -int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) -{ - return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA, 0, - LLONG_MAX); -} - int jbd2_journal_inode_ranged_write(handle_t *handle, struct jbd2_inode *jinode, loff_t start_byte, loff_t length) { diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index cbe70637c117..0e6406c4f362 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -163,13 +163,11 @@ static const struct export_operations jffs2_export_ops = { * Opt_rp_size: size of reserved pool in KiB */ enum { - Opt_source, Opt_override_compr, Opt_rp_size, }; static const struct fs_parameter_spec jffs2_param_specs[] = { - fsparam_string ("source", Opt_source), fsparam_enum ("compr", Opt_override_compr), fsparam_u32 ("rp_size", Opt_rp_size), {} diff --git a/fs/libfs.c b/fs/libfs.c index c9b2850c0f7c..1463b038ffc4 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,58 +89,45 @@ int dcache_dir_close(struct inode *inode, struct file *file) EXPORT_SYMBOL(dcache_dir_close); /* parent is locked at least shared */ -static struct dentry *next_positive(struct dentry *parent, - struct list_head *from, - int count) +/* + * Returns an element of siblings' list. + * We are looking for <count>th positive after <p>; if + * found, dentry is grabbed and returned to caller. + * If no such element exists, NULL is returned. + */ +static struct dentry *scan_positives(struct dentry *cursor, + struct list_head *p, + loff_t count, + struct dentry *last) { - unsigned *seq = &parent->d_inode->i_dir_seq, n; - struct dentry *res; - struct list_head *p; - bool skipped; - int i; + struct dentry *dentry = cursor->d_parent, *found = NULL; -retry: - i = count; - skipped = false; - n = smp_load_acquire(seq) & ~1; - res = NULL; - rcu_read_lock(); - for (p = from->next; p != &parent->d_subdirs; p = p->next) { + spin_lock(&dentry->d_lock); + while ((p = p->next) != &dentry->d_subdirs) { struct dentry *d = list_entry(p, struct dentry, d_child); - if (!simple_positive(d)) { - skipped = true; - } else if (!--i) { - res = d; - break; + // we must at least skip cursors, to avoid livelocks + if (d->d_flags & DCACHE_DENTRY_CURSOR) + continue; + if (simple_positive(d) && !--count) { + spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); + if (simple_positive(d)) + found = dget_dlock(d); + spin_unlock(&d->d_lock); + if (likely(found)) + break; + count = 1; + } + if (need_resched()) { + list_move(&cursor->d_child, p); + p = &cursor->d_child; + spin_unlock(&dentry->d_lock); + cond_resched(); + spin_lock(&dentry->d_lock); } } - rcu_read_unlock(); - if (skipped) { - smp_rmb(); - if (unlikely(*seq != n)) - goto retry; - } - return res; -} - -static void move_cursor(struct dentry *cursor, struct list_head *after) -{ - struct dentry *parent = cursor->d_parent; - unsigned n, *seq = &parent->d_inode->i_dir_seq; - spin_lock(&parent->d_lock); - for (;;) { - n = *seq; - if (!(n & 1) && cmpxchg(seq, n, n + 1) == n) - break; - cpu_relax(); - } - __list_del(cursor->d_child.prev, cursor->d_child.next); - if (after) - list_add(&cursor->d_child, after); - else - list_add_tail(&cursor->d_child, &parent->d_subdirs); - smp_store_release(seq, n + 2); - spin_unlock(&parent->d_lock); + spin_unlock(&dentry->d_lock); + dput(last); + return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) @@ -158,17 +145,25 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) return -EINVAL; } if (offset != file->f_pos) { + struct dentry *cursor = file->private_data; + struct dentry *to = NULL; + + inode_lock_shared(dentry->d_inode); + + if (offset > 2) + to = scan_positives(cursor, &dentry->d_subdirs, + offset - 2, NULL); + spin_lock(&dentry->d_lock); + if (to) + list_move(&cursor->d_child, &to->d_child); + else + list_del_init(&cursor->d_child); + spin_unlock(&dentry->d_lock); + dput(to); + file->f_pos = offset; - if (file->f_pos >= 2) { - struct dentry *cursor = file->private_data; - struct dentry *to; - loff_t n = file->f_pos - 2; - - inode_lock_shared(dentry->d_inode); - to = next_positive(dentry, &dentry->d_subdirs, n); - move_cursor(cursor, to ? &to->d_child : NULL); - inode_unlock_shared(dentry->d_inode); - } + + inode_unlock_shared(dentry->d_inode); } return offset; } @@ -190,25 +185,35 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; - struct list_head *p = &cursor->d_child; - struct dentry *next; - bool moved = false; + struct list_head *anchor = &dentry->d_subdirs; + struct dentry *next = NULL; + struct list_head *p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) - p = &dentry->d_subdirs; - while ((next = next_positive(dentry, p, 1)) != NULL) { + p = anchor; + else if (!list_empty(&cursor->d_child)) + p = &cursor->d_child; + else + return 0; + + while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, dt_type(d_inode(next)))) break; - moved = true; - p = &next->d_child; ctx->pos++; + p = &next->d_child; } - if (moved) - move_cursor(cursor, p); + spin_lock(&dentry->d_lock); + if (next) + list_move_tail(&cursor->d_child, &next->d_child); + else + list_del_init(&cursor->d_child); + spin_unlock(&dentry->d_lock); + dput(next); + return 0; } EXPORT_SYMBOL(dcache_readdir); @@ -468,8 +473,7 @@ EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes - * @available: See .write_end of address_space_operations - * @file: " + * @file: See .write_end of address_space_operations * @mapping: " * @pos: " * @len: " diff --git a/fs/locks.c b/fs/locks.c index a364ebc5cec3..6970f55daf54 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -212,6 +212,7 @@ struct file_lock_list_struct { static DEFINE_PER_CPU(struct file_lock_list_struct, file_lock_list); DEFINE_STATIC_PERCPU_RWSEM(file_rwsem); + /* * The blocked_hash is used to find POSIX lock loops for deadlock detection. * It is protected by blocked_lock_lock. @@ -1991,6 +1992,64 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(generic_setlease); +#if IS_ENABLED(CONFIG_SRCU) +/* + * Kernel subsystems can register to be notified on any attempt to set + * a new lease with the lease_notifier_chain. This is used by (e.g.) nfsd + * to close files that it may have cached when there is an attempt to set a + * conflicting lease. + */ +static struct srcu_notifier_head lease_notifier_chain; + +static inline void +lease_notifier_chain_init(void) +{ + srcu_init_notifier_head(&lease_notifier_chain); +} + +static inline void +setlease_notifier(long arg, struct file_lock *lease) +{ + if (arg != F_UNLCK) + srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); +} + +int lease_register_notifier(struct notifier_block *nb) +{ + return srcu_notifier_chain_register(&lease_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(lease_register_notifier); + +void lease_unregister_notifier(struct notifier_block *nb) +{ + srcu_notifier_chain_unregister(&lease_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +#else /* !IS_ENABLED(CONFIG_SRCU) */ +static inline void +lease_notifier_chain_init(void) +{ +} + +static inline void +setlease_notifier(long arg, struct file_lock *lease) +{ +} + +int lease_register_notifier(struct notifier_block *nb) +{ + return 0; +} +EXPORT_SYMBOL_GPL(lease_register_notifier); + +void lease_unregister_notifier(struct notifier_block *nb) +{ +} +EXPORT_SYMBOL_GPL(lease_unregister_notifier); + +#endif /* IS_ENABLED(CONFIG_SRCU) */ + /** * vfs_setlease - sets a lease on an open file * @filp: file pointer @@ -2011,6 +2070,8 @@ EXPORT_SYMBOL(generic_setlease); int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { + if (lease) + setlease_notifier(arg, *lease); if (filp->f_op->setlease) return filp->f_op->setlease(filp, arg, lease, priv); else @@ -2924,6 +2985,7 @@ static int __init filelock_init(void) INIT_HLIST_HEAD(&fll->hlist); } + lease_notifier_chain_init(); return 0; } core_initcall(filelock_init); diff --git a/fs/namespace.c b/fs/namespace.c index 93c043245c46..fe0e9e1410fe 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2802,8 +2802,6 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, put_filesystem(type); return -EINVAL; } - } else { - subtype = ""; } } @@ -3028,7 +3026,7 @@ void *copy_mount_options(const void __user * data) * the remainder of the page. */ /* copy_from_user cannot cross TASK_SIZE ! */ - size = TASK_SIZE - (unsigned long)data; + size = TASK_SIZE - (unsigned long)untagged_addr(data); if (size > PAGE_SIZE) size = PAGE_SIZE; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 071b90a45933..af549d70ec50 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -53,6 +53,16 @@ nfs4_is_valid_delegation(const struct nfs_delegation *delegation, return false; } +struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode) +{ + struct nfs_delegation *delegation; + + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (nfs4_is_valid_delegation(delegation, 0)) + return delegation; + return NULL; +} + static int nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) { @@ -1181,7 +1191,7 @@ bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode) if (delegation != NULL && nfs4_stateid_match_other(dst, &delegation->stateid)) { dst->seqid = delegation->stateid.seqid; - return ret; + ret = true; } rcu_read_unlock(); out: diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 9eb87ae4c982..8b14d441e699 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -68,6 +68,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred); bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode); +struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); int nfs4_check_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 0adfd8840110..e180033e35cf 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1669,10 +1669,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) #endif /* CONFIG_NFSV4 */ -/* - * Code common to create, mkdir, and mknod. - */ -int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, +struct dentry * +nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label) { @@ -1680,13 +1678,10 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct inode *dir = d_inode(parent); struct inode *inode; struct dentry *d; - int error = -EACCES; + int error; d_drop(dentry); - /* We may have been initialized further down */ - if (d_really_is_positive(dentry)) - goto out; if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); if (error) @@ -1702,18 +1697,32 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, } inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label); d = d_splice_alias(inode, dentry); - if (IS_ERR(d)) { - error = PTR_ERR(d); - goto out_error; - } - dput(d); out: dput(parent); - return 0; + return d; out_error: nfs_mark_for_revalidate(dir); - dput(parent); - return error; + d = ERR_PTR(error); + goto out; +} +EXPORT_SYMBOL_GPL(nfs_add_or_obtain); + +/* + * Code common to create, mkdir, and mknod. + */ +int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, + struct nfs_fattr *fattr, + struct nfs4_label *label) +{ + struct dentry *d; + + d = nfs_add_or_obtain(dentry, fhandle, fattr, label); + if (IS_ERR(d)) + return PTR_ERR(d); + + /* Callers don't care */ + dput(d); + return 0; } EXPORT_SYMBOL_GPL(nfs_instantiate); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 222d7115db71..040a50fd9bf3 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -64,13 +64,6 @@ static struct kmem_cache *nfs_direct_cachep; -/* - * This represents a set of asynchronous requests that we're waiting on - */ -struct nfs_direct_mirror { - ssize_t count; -}; - struct nfs_direct_req { struct kref kref; /* release manager */ @@ -84,9 +77,6 @@ struct nfs_direct_req { atomic_t io_count; /* i/os we're waiting for */ spinlock_t lock; /* protect completion state */ - struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; - int mirror_count; - loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ @@ -123,32 +113,42 @@ static inline int put_dreq(struct nfs_direct_req *dreq) } static void -nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) +nfs_direct_handle_truncated(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr, + ssize_t dreq_len) { - int i; - ssize_t count; + if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) || + test_bit(NFS_IOHDR_EOF, &hdr->flags))) + return; + if (dreq->max_count >= dreq_len) { + dreq->max_count = dreq_len; + if (dreq->count > dreq_len) + dreq->count = dreq_len; + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) + dreq->error = hdr->error; + else /* Clear outstanding error if this is EOF */ + dreq->error = 0; + } +} - WARN_ON_ONCE(dreq->count >= dreq->max_count); +static void +nfs_direct_count_bytes(struct nfs_direct_req *dreq, + const struct nfs_pgio_header *hdr) +{ + loff_t hdr_end = hdr->io_start + hdr->good_bytes; + ssize_t dreq_len = 0; - if (dreq->mirror_count == 1) { - dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; - dreq->count += hdr->good_bytes; - } else { - /* mirrored writes */ - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - /* update the dreq->count by finding the minimum agreed count from all - * mirrors */ - count = dreq->mirrors[0].count; + if (hdr_end > dreq->io_start) + dreq_len = hdr_end - dreq->io_start; - for (i = 1; i < dreq->mirror_count; i++) - count = min(count, dreq->mirrors[i].count); + nfs_direct_handle_truncated(dreq, hdr, dreq_len); - dreq->count = count; - } + if (dreq_len > dreq->max_count) + dreq_len = dreq->max_count; + + if (dreq->count < dreq_len) + dreq->count = dreq_len; } /* @@ -293,18 +293,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, cinfo->completion_ops = &nfs_direct_commit_completion_ops; } -static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq, - struct nfs_pageio_descriptor *pgio, - struct nfs_page *req) -{ - int mirror_count = 1; - - if (pgio->pg_ops->pg_get_mirror_count) - mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); - - dreq->mirror_count = mirror_count; -} - static inline struct nfs_direct_req *nfs_direct_req_alloc(void) { struct nfs_direct_req *dreq; @@ -319,7 +307,6 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) INIT_LIST_HEAD(&dreq->mds_cinfo.list); dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); - dreq->mirror_count = 1; spin_lock_init(&dreq->lock); return dreq; @@ -402,20 +389,12 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) struct nfs_direct_req *dreq = hdr->dreq; spin_lock(&dreq->lock); - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { spin_unlock(&dreq->lock); goto out_put; } - if (hdr->good_bytes != 0) - nfs_direct_good_bytes(dreq, hdr); - - if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) - dreq->error = 0; - + nfs_direct_count_bytes(dreq, hdr); spin_unlock(&dreq->lock); while (!list_empty(&hdr->pages)) { @@ -646,29 +625,22 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) LIST_HEAD(reqs); struct nfs_commit_info cinfo; LIST_HEAD(failed); - int i; nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); dreq->count = 0; + dreq->max_count = 0; + list_for_each_entry(req, &reqs, wb_list) + dreq->max_count += req->wb_bytes; dreq->verf.committed = NFS_INVALID_STABLE_HOW; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); - for (i = 0; i < dreq->mirror_count; i++) - dreq->mirrors[i].count = 0; get_dreq(dreq); nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - req = nfs_list_entry(reqs.next); - nfs_direct_setup_mirroring(dreq, &desc, req); - if (desc.pg_error < 0) { - list_splice_init(&reqs, &failed); - goto out_failed; - } - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { /* Bump the transmission count */ req->wb_nio++; @@ -686,7 +658,6 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) } nfs_pageio_complete(&desc); -out_failed: while (!list_empty(&failed)) { req = nfs_list_entry(failed.next); nfs_list_remove_request(req); @@ -791,17 +762,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) { spin_unlock(&dreq->lock); goto out_put; } + nfs_direct_count_bytes(dreq, hdr); if (hdr->good_bytes != 0) { - nfs_direct_good_bytes(dreq, hdr); if (nfs_write_need_commit(hdr)) { if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) request_commit = true; @@ -923,7 +890,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } - nfs_direct_setup_mirroring(dreq, &desc, req); if (desc.pg_error < 0) { nfs_free_request(req); result = desc.pg_error; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 3cb073c50fa6..c9b605f6c9cb 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1164,6 +1164,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .id = LAYOUT_NFSV4_1_FILES, .name = "LAYOUT_NFSV4_1_FILES", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTGET_ON_OPEN, .max_layoutget_response = 4096, /* 1 page or so... */ .alloc_layout_hdr = filelayout_alloc_layout_hdr, .free_layout_hdr = filelayout_free_layout_hdr, diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e64f810223be..447a3c17fa8e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -16,14 +16,6 @@ extern const struct export_operations nfs_export_ops; struct nfs_string; -/* Maximum number of readahead requests - * FIXME: this should really be a sysctl so that users may tune it to suit - * their needs. People that do NFS over a slow network, might for - * instance want to reduce it to something closer to 1 for improved - * interactive response. - */ -#define NFS_MAX_READAHEAD (RPC_DEF_SLOT_TABLE - 1) - static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index a3ad2d46fd42..9eb2f1a503ab 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -279,15 +279,17 @@ static struct nfs3_createdata *nfs3_alloc_createdata(void) return data; } -static int nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) +static struct dentry * +nfs3_do_create(struct inode *dir, struct dentry *dentry, struct nfs3_createdata *data) { int status; status = rpc_call_sync(NFS_CLIENT(dir), &data->msg, 0); nfs_post_op_update_inode(dir, data->res.dir_attr); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, NULL); - return status; + if (status != 0) + return ERR_PTR(status); + + return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr, NULL); } static void nfs3_free_createdata(struct nfs3_createdata *data) @@ -304,6 +306,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call create %pd\n", dentry); @@ -330,7 +333,8 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; for (;;) { - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != -ENOTSUPP) break; @@ -355,6 +359,9 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + /* When we created the file with exclusive semantics, make * sure we set the attributes afterwards. */ if (data->arg.create.createmode == NFS3_CREATE_EXCLUSIVE) { @@ -372,11 +379,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) - goto out_release_acls; + goto out_dput; } status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); +out_dput: + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -504,6 +513,7 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, unsigned int len, struct iattr *sattr) { struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; if (len > NFS3_MAXPATHLEN) @@ -522,7 +532,11 @@ nfs3_proc_symlink(struct inode *dir, struct dentry *dentry, struct page *page, data->arg.symlink.pathlen = len; data->arg.symlink.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + + if (status == 0) + dput(d_alias); nfs3_free_createdata(data); out: @@ -535,6 +549,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -553,12 +568,18 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) data->arg.mkdir.len = dentry->d_name.len; data->arg.mkdir.sattr = sattr; - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); + if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); @@ -660,6 +681,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, { struct posix_acl *default_acl, *acl; struct nfs3_createdata *data; + struct dentry *d_alias; int status = -ENOMEM; dprintk("NFS call mknod %pd %u:%u\n", dentry, @@ -698,12 +720,17 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, goto out; } - status = nfs3_do_create(dir, dentry, data); + d_alias = nfs3_do_create(dir, dentry, data); + status = PTR_ERR_OR_ZERO(d_alias); if (status != 0) goto out_release_acls; + if (d_alias) + dentry = d_alias; + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); + dput(d_alias); out_release_acls: posix_acl_release(acl); posix_acl_release(default_acl); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3564da1ba8a1..16b2e5cc3e94 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -491,8 +491,6 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, const struct nfs_lock_context *, nfs4_stateid *, const struct cred **); -extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst, - struct nfs4_state *state); extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state); @@ -574,6 +572,15 @@ static inline bool nfs4_stateid_is_newer(const nfs4_stateid *s1, const nfs4_stat return (s32)(be32_to_cpu(s1->seqid) - be32_to_cpu(s2->seqid)) > 0; } +static inline void nfs4_stateid_seqid_inc(nfs4_stateid *s1) +{ + u32 seqid = be32_to_cpu(s1->seqid); + + if (++seqid == 0) + ++seqid; + s1->seqid = cpu_to_be32(seqid); +} + static inline bool nfs4_valid_open_stateid(const struct nfs4_state *state) { return test_bit(NFS_STATE_RECOVERY_FAILED, &state->flags) == 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 1406858bae6c..caacf5e7f5e1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1073,14 +1073,26 @@ static const struct rpc_call_ops nfs40_call_sync_ops = { .rpc_call_done = nfs40_call_sync_done, }; +static int nfs4_call_sync_custom(struct rpc_task_setup *task_setup) +{ + int ret; + struct rpc_task *task; + + task = rpc_run_task(task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + + ret = task->tk_status; + rpc_put_task(task); + return ret; +} + static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res) { - int ret; - struct rpc_task *task; struct nfs_client *clp = server->nfs_client; struct nfs4_call_sync_data data = { .seq_server = server, @@ -1094,14 +1106,7 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, .callback_data = &data }; - task = rpc_run_task(&task_setup); - if (IS_ERR(task)) - ret = PTR_ERR(task); - else { - ret = task->tk_status; - rpc_put_task(task); - } - return ret; + return nfs4_call_sync_custom(&task_setup); } int nfs4_call_sync(struct rpc_clnt *clnt, @@ -1435,8 +1440,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode, return 0; if ((delegation->type & fmode) != fmode) return 0; - if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) - return 0; switch (claim) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_FH: @@ -1805,7 +1808,6 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) { struct nfs4_state *state = opendata->state; - struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; @@ -1822,7 +1824,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) } spin_unlock(&state->owner->so_lock); rcu_read_lock(); - delegation = rcu_dereference(nfsi->delegation); + delegation = nfs4_get_valid_delegation(state->inode); if (!can_open_delegated(delegation, fmode, claim)) { rcu_read_unlock(); break; @@ -2366,7 +2368,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->o_arg.open_flags, claim)) goto out_no_action; rcu_read_lock(); - delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); + delegation = nfs4_get_valid_delegation(data->state->inode); if (can_open_delegated(delegation, data->o_arg.fmode, claim)) goto unlock_no_action; rcu_read_unlock(); @@ -3308,6 +3310,75 @@ nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task) return pnfs_wait_on_layoutreturn(inode, task); } +/* + * Update the seqid of an open stateid + */ +static void nfs4_sync_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + int seq; + + for (;;) { + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + nfs4_stateid_copy(dst, &state->open_stateid); + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) < 0) + dst->seqid = seqid_open; + break; + } +} + +/* + * Update the seqid of an open stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, + struct nfs4_state *state) +{ + __be32 seqid_open; + u32 dst_seqid; + bool ret; + int seq; + + for (;;) { + ret = false; + if (!nfs4_valid_open_stateid(state)) + break; + seq = read_seqbegin(&state->seqlock); + if (!nfs4_state_match_open_stateid_other(state, dst)) { + if (read_seqretry(&state->seqlock, seq)) + continue; + break; + } + seqid_open = state->open_stateid.seqid; + if (read_seqretry(&state->seqlock, seq)) + continue; + + dst_seqid = be32_to_cpu(dst->seqid); + if ((s32)(dst_seqid - be32_to_cpu(seqid_open)) >= 0) + dst->seqid = cpu_to_be32(dst_seqid + 1); + else + dst->seqid = seqid_open; + ret = true; + break; + } + + return ret; +} + struct nfs4_closedata { struct inode *inode; struct nfs4_state *state; @@ -3358,32 +3429,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data) trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); /* Handle Layoutreturn errors */ - if (calldata->arg.lr_args && task->tk_status != 0) { - switch (calldata->res.lr_ret) { - default: - calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&calldata->arg.lr_args->stateid, - &calldata->arg.lr_args->range, - calldata->inode)) - goto lr_restart; - /* Fallthrough */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - calldata->arg.lr_args = NULL; - calldata->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, calldata->inode, + &calldata->arg.lr_args, + &calldata->res.lr_res, + &calldata->res.lr_ret) == -EAGAIN) + goto out_restart; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors @@ -3403,7 +3453,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) break; case -NFS4ERR_OLD_STATEID: /* Did we race with OPEN? */ - if (nfs4_refresh_open_stateid(&calldata->arg.stateid, + if (nfs4_refresh_open_old_stateid(&calldata->arg.stateid, state)) goto out_restart; goto out_release; @@ -3415,7 +3465,9 @@ static void nfs4_close_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - break; + if (calldata->arg.fmode == 0) + break; + /* Fallthrough */ default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); @@ -3430,8 +3482,6 @@ out_release: nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); return; -lr_restart: - calldata->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); @@ -3472,8 +3522,8 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) } else if (is_rdwr) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; - if (!nfs4_valid_open_stateid(state) || - !nfs4_refresh_open_stateid(&calldata->arg.stateid, state)) + nfs4_sync_open_stateid(&calldata->arg.stateid, state); + if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -6018,7 +6068,6 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, .rpc_resp = res, .rpc_cred = cred, }; - struct rpc_task *task; struct rpc_task_setup task_setup_data = { .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, @@ -6051,17 +6100,13 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_owner_id); - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = task->tk_status; + + status = nfs4_call_sync_custom(&task_setup_data); if (setclientid.sc_cred) { + kfree(clp->cl_acceptor); clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred); put_rpccred(setclientid.sc_cred); } - rpc_put_task(task); out: trace_nfs4_setclientid(clp, status); dprintk("NFS reply setclientid: %d\n", status); @@ -6129,32 +6174,11 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); /* Handle Layoutreturn errors */ - if (data->args.lr_args && task->tk_status != 0) { - switch(data->res.lr_ret) { - default: - data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; - break; - case 0: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - break; - case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&data->args.lr_args->stateid, - &data->args.lr_args->range, - data->inode)) - goto lr_restart; - /* Fallthrough */ - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_EXPIRED: - case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_UNKNOWN_LAYOUTTYPE: - case -NFS4ERR_WRONG_CRED: - data->args.lr_args = NULL; - data->res.lr_res = NULL; - goto lr_restart; - } - } + if (pnfs_roc_done(task, data->inode, + &data->args.lr_args, + &data->res.lr_res, + &data->res.lr_ret) == -EAGAIN) + goto out_restart; switch (task->tk_status) { case 0: @@ -6192,8 +6216,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } data->rpc_status = task->tk_status; return; -lr_restart: - data->res.lr_ret = 0; out_restart: task->tk_status = 0; rpc_restart_call_prepare(task); @@ -6386,6 +6408,42 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } +/* + * Update the seqid of a lock stateid after receiving + * NFS4ERR_OLD_STATEID + */ +static bool nfs4_refresh_lock_old_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret = false; + + spin_lock(&state->state_lock); + if (!nfs4_stateid_match_other(dst, &lsp->ls_stateid)) + goto out; + if (!nfs4_stateid_is_newer(&lsp->ls_stateid, dst)) + nfs4_stateid_seqid_inc(dst); + else + dst->seqid = lsp->ls_stateid.seqid; + ret = true; +out: + spin_unlock(&state->state_lock); + return ret; +} + +static bool nfs4_sync_lock_stateid(nfs4_stateid *dst, + struct nfs4_lock_state *lsp) +{ + struct nfs4_state *state = lsp->ls_state; + bool ret; + + spin_lock(&state->state_lock); + ret = !nfs4_stateid_match_other(dst, &lsp->ls_stateid); + nfs4_stateid_copy(dst, &lsp->ls_stateid); + spin_unlock(&state->state_lock); + return ret; +} + struct nfs4_unlockdata { struct nfs_locku_args arg; struct nfs_locku_res res; @@ -6403,7 +6461,8 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs_seqid *seqid) { struct nfs4_unlockdata *p; - struct inode *inode = lsp->ls_state->inode; + struct nfs4_state *state = lsp->ls_state; + struct inode *inode = state->inode; p = kzalloc(sizeof(*p), GFP_NOFS); if (p == NULL) @@ -6419,6 +6478,9 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); + spin_lock(&state->state_lock); + nfs4_stateid_copy(&p->arg.stateid, &lsp->ls_stateid); + spin_unlock(&state->state_lock); return p; } @@ -6457,10 +6519,14 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_msg.rpc_cred); /* Fall through */ case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &calldata->lsp->ls_stateid)) + if (nfs4_sync_lock_stateid(&calldata->arg.stateid, + calldata->lsp)) + rpc_restart_call_prepare(task); + break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_lock_old_stateid(&calldata->arg.stateid, + calldata->lsp)) rpc_restart_call_prepare(task); break; default: @@ -6483,7 +6549,6 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; - nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { /* Note: exit _without_ running nfs4_locku_done */ goto out_no_action; @@ -7645,6 +7710,8 @@ int nfs4_proc_fsid_present(struct inode *inode, const struct cred *cred) static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct nfs4_secinfo_flavors *flavors, bool use_integrity) { int status; + struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs_client *clp = NFS_SERVER(dir)->nfs_client; struct nfs4_secinfo_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -7657,26 +7724,37 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct .rpc_argp = &args, .rpc_resp = &res, }; - struct rpc_clnt *clnt = NFS_SERVER(dir)->client; + struct nfs4_call_sync_data data = { + .seq_server = NFS_SERVER(dir), + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = clnt, + .rpc_message = &msg, + .callback_ops = clp->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; if (use_integrity) { - clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient; - cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client); + clnt = clp->cl_rpcclient; + task_setup.rpc_client = clnt; + + cred = nfs4_get_clid_cred(clp); msg.rpc_cred = cred; } dprintk("NFS call secinfo %s\n", name->name); - nfs4_state_protect(NFS_SERVER(dir)->nfs_client, - NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_state_protect(clp, NFS_SP4_MACH_CRED_SECINFO, &clnt, &msg); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); - status = nfs4_call_sync(clnt, NFS_SERVER(dir), &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); dprintk("NFS reply secinfo: %d\n", status); put_cred(cred); - return status; } @@ -8344,7 +8422,6 @@ static const struct rpc_call_ops nfs4_get_lease_time_ops = { int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) { - struct rpc_task *task; struct nfs4_get_lease_time_args args; struct nfs4_get_lease_time_res res = { .lr_fsinfo = fsinfo, @@ -8366,17 +8443,9 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) .callback_data = &data, .flags = RPC_TASK_TIMEOUT, }; - int status; nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0, 1); - task = rpc_run_task(&task_setup); - - if (IS_ERR(task)) - return PTR_ERR(task); - - status = task->tk_status; - rpc_put_task(task); - return status; + return nfs4_call_sync_custom(&task_setup); } #ifdef CONFIG_NFS_V4_1 @@ -8845,7 +8914,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, const struct cred *cred) { struct nfs4_reclaim_complete_data *calldata; - struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RECLAIM_COMPLETE], .rpc_cred = cred, @@ -8854,7 +8922,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs4_reclaim_complete_call_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_NO_ROUND_ROBIN, + .flags = RPC_TASK_NO_ROUND_ROBIN, }; int status = -ENOMEM; @@ -8869,15 +8937,7 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, msg.rpc_argp = &calldata->arg; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) { - status = PTR_ERR(task); - goto out; - } - status = rpc_wait_for_completion_task(task); - if (status == 0) - status = task->tk_status; - rpc_put_task(task); + status = nfs4_call_sync_custom(&task_setup_data); out: dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -9103,10 +9163,19 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) if (!nfs41_sequence_process(task, &lrp->res.seq_res)) return; + /* + * Was there an RPC level error? Assume the call succeeded, + * and that we need to release the layout + */ + if (task->tk_rpc_status != 0 && RPC_WAS_SENT(task)) { + lrp->res.lrs_present = 0; + return; + } + server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { case -NFS4ERR_OLD_STATEID: - if (nfs4_layoutreturn_refresh_stateid(&lrp->args.stateid, + if (nfs4_layout_refresh_old_stateid(&lrp->args.stateid, &lrp->args.range, lrp->args.inode)) goto out_restart; @@ -9362,18 +9431,32 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle, .rpc_resp = &res, }; struct rpc_clnt *clnt = server->client; + struct nfs4_call_sync_data data = { + .seq_server = server, + .seq_args = &args.seq_args, + .seq_res = &res.seq_res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = server->nfs_client->cl_mvops->call_sync_ops, + .callback_data = &data, + .flags = RPC_TASK_NO_ROUND_ROBIN, + }; const struct cred *cred = NULL; int status; if (use_integrity) { clnt = server->nfs_client->cl_rpcclient; + task_setup.rpc_client = clnt; + cred = nfs4_get_clid_cred(server->nfs_client); msg.rpc_cred = cred; } dprintk("--> %s\n", __func__); - status = nfs4_call_sync(clnt, server, &msg, &args.seq_args, - &res.seq_res, RPC_TASK_NO_ROUND_ROBIN); + nfs4_init_sequence(&args.seq_args, &res.seq_res, 0, 0); + status = nfs4_call_sync_custom(&task_setup); dprintk("<-- %s status=%d\n", __func__, status); put_cred(cred); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index cad4e064b328..0c6d53dc3672 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1015,22 +1015,6 @@ out: return ret; } -bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) -{ - bool ret; - int seq; - - do { - ret = false; - seq = read_seqbegin(&state->seqlock); - if (nfs4_state_match_open_stateid_other(state, dst)) { - dst->seqid = state->open_stateid.seqid; - ret = true; - } - } while (read_seqretry(&state->seqlock, seq)); - return ret; -} - bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { bool ret; @@ -2095,8 +2079,10 @@ static int nfs4_try_migration(struct nfs_server *server, const struct cred *cred } status = nfs4_begin_drain_session(clp); - if (status != 0) - return status; + if (status != 0) { + result = status; + goto out; + } status = nfs4_replace_transport(server, locations); if (status != 0) { diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 46a8d636d151..ab07db0f07cd 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1174,7 +1174,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, } else *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } - if (bmval[2] & FATTR4_WORD2_SECURITY_LABEL) { + if (label && (bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) { *p++ = cpu_to_be32(label->lfs); *p++ = cpu_to_be32(label->pi); *p++ = cpu_to_be32(label->len); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4525d5acae38..bb80034a7661 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -359,9 +359,10 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* - * Update the seqid of a layout stateid + * Update the seqid of a layout stateid after receiving + * NFS4ERR_OLD_STATEID */ -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { @@ -377,7 +378,15 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, spin_lock(&inode->i_lock); lo = NFS_I(inode)->layout; - if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + if (lo && pnfs_layout_is_valid(lo) && + nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + /* Is our call using the most recent seqid? If so, bump it */ + if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) { + nfs4_stateid_seqid_inc(dst); + ret = true; + goto out; + } + /* Try to update the seqid to the most recent */ err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0); if (err != -EBUSY) { dst->seqid = lo->plh_stateid.seqid; @@ -385,6 +394,7 @@ bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, ret = true; } } +out: spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); return ret; @@ -1440,6 +1450,52 @@ out_noroc: return false; } +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + struct nfs4_layoutreturn_args *arg = *argpp; + int retval = -EAGAIN; + + if (!arg) + return 0; + /* Handle Layoutreturn errors */ + switch (*ret) { + case 0: + retval = 0; + break; + case -NFS4ERR_NOMATCHING_LAYOUT: + /* Was there an RPC level error? If not, retry */ + if (task->tk_rpc_status == 0) + break; + /* If the call was not sent, let caller handle it */ + if (!RPC_WAS_SENT(task)) + return 0; + /* + * Otherwise, assume the call succeeded and + * that we need to release the layout + */ + *ret = 0; + (*respp)->lrs_present = 0; + retval = 0; + break; + case -NFS4ERR_DELAY: + /* Let the caller handle the retry */ + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return 0; + case -NFS4ERR_OLD_STATEID: + if (!nfs4_layout_refresh_old_stateid(&arg->stateid, + &arg->range, inode)) + break; + *ret = -NFS4ERR_NOMATCHING_LAYOUT; + return -EAGAIN; + } + *argpp = NULL; + *respp = NULL; + return retval; +} + void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret) @@ -1449,10 +1505,15 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; - if (ret == 0) { - arg_stateid = &args->stateid; + switch (ret) { + case -NFS4ERR_NOMATCHING_LAYOUT: + break; + case 0: if (res->lrs_present) res_stateid = &res->stateid; + /* Fallthrough */ + default: + arg_stateid = &args->stateid; } pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, res_stateid); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index f15609c003d8..f8a38065c7e4 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -261,7 +261,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, bool is_recall); int pnfs_destroy_layouts_byclid(struct nfs_client *clp, bool is_recall); -bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); @@ -282,6 +282,10 @@ bool pnfs_roc(struct inode *ino, struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, const struct cred *cred); +int pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret); void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, int ret); @@ -701,6 +705,15 @@ pnfs_roc(struct inode *ino, return false; } +static inline int +pnfs_roc_done(struct rpc_task *task, struct inode *inode, + struct nfs4_layoutreturn_args **argpp, + struct nfs4_layoutreturn_res **respp, + int *ret) +{ + return 0; +} + static inline void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct nfs4_layoutreturn_res *res, @@ -785,7 +798,7 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) { } -static inline bool nfs4_layoutreturn_refresh_stateid(nfs4_stateid *dst, +static inline bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst, struct pnfs_layout_range *dst_range, struct inode *inode) { diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 19a76cfa8b1f..a84df7d63403 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2645,6 +2645,13 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); +static void nfs_set_readahead(struct backing_dev_info *bdi, + unsigned long iomax_pages) +{ + bdi->ra_pages = VM_READAHEAD_PAGES; + bdi->io_pages = iomax_pages; +} + struct dentry *nfs_fs_mount_common(struct nfs_server *server, int flags, const char *dev_name, struct nfs_mount_info *mount_info, @@ -2687,7 +2694,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, mntroot = ERR_PTR(error); goto error_splat_super; } - s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; + nfs_set_readahead(s->s_bdi, server->rpages); server->super = s; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 85ca49549b39..52cab65f91cf 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -786,7 +786,6 @@ static void nfs_inode_remove_request(struct nfs_page *req) struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; - atomic_long_dec(&nfsi->nrequests); if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { head = req->wb_head; @@ -799,8 +798,10 @@ static void nfs_inode_remove_request(struct nfs_page *req) spin_unlock(&mapping->private_lock); } - if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) + if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { nfs_release_request(req); + atomic_long_dec(&nfsi->nrequests); + } } static void diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index d25f6bbe7006..10cefb0c07c7 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -3,6 +3,7 @@ config NFSD tristate "NFS server support" depends on INET depends on FILE_LOCKING + depends on FSNOTIFY select LOCKD select SUNRPC select EXPORTFS @@ -147,7 +148,7 @@ config NFSD_V4_SECURITY_LABEL config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" - depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS + depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS && BROKEN help This option enables support for manually injecting faults into the NFS server. This is intended to be used for diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 2bfb58eefad1..6a40b1afe703 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -11,7 +11,8 @@ obj-$(CONFIG_NFSD) += nfsd.o nfsd-y += trace.o nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ - export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o + export.o auth.o lockd.o nfscache.o nfsxdr.o \ + stats.o filecache.o nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o diff --git a/fs/nfsd/acl.h b/fs/nfsd/acl.h index 4cd7c69a6cb9..ba14d2f4b64f 100644 --- a/fs/nfsd/acl.h +++ b/fs/nfsd/acl.h @@ -39,14 +39,6 @@ struct nfs4_acl; struct svc_fh; struct svc_rqst; -/* - * Maximum ACL we'll accept from a client; chosen (somewhat - * arbitrarily) so that kmalloc'ing the ACL shouldn't require a - * high-order allocation. This allows 204 ACEs on x86_64: - */ -#define NFS4_ACL_MAX ((PAGE_SIZE - sizeof(struct nfs4_acl)) \ - / sizeof(struct nfs4_ace)) - int nfs4_acl_bytes(int entries); int nfs4_acl_get_whotype(char *, u32); __be32 nfs4_acl_write_who(struct xdr_stream *xdr, int who); diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 66d4c55eb48e..9bbaa671c079 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -15,6 +15,7 @@ #include "blocklayoutxdr.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PNFS @@ -404,7 +405,7 @@ static void nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls) { struct nfs4_client *clp = ls->ls_stid.sc_client; - struct block_device *bdev = ls->ls_file->f_path.mnt->mnt_sb->s_bdev; + struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev; bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY, nfsd4_scsi_pr_key(clp), 0, true); diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index baa01956a5b3..15422c951fd1 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -22,6 +22,7 @@ #include "nfsfh.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -232,6 +233,17 @@ static struct cache_head *expkey_alloc(void) return NULL; } +static void expkey_flush(void) +{ + /* + * Take the nfsd_mutex here to ensure that the file cache is not + * destroyed while we're in the middle of flushing. + */ + mutex_lock(&nfsd_mutex); + nfsd_file_cache_purge(current->nsproxy->net_ns); + mutex_unlock(&nfsd_mutex); +} + static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, @@ -244,6 +256,7 @@ static const struct cache_detail svc_expkey_cache_template = { .init = expkey_init, .update = expkey_update, .alloc = expkey_alloc, + .flush = expkey_flush, }; static int diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c new file mode 100644 index 000000000000..ef55e9b1cd4e --- /dev/null +++ b/fs/nfsd/filecache.c @@ -0,0 +1,934 @@ +/* + * Open file cache. + * + * (c) 2015 - Jeff Layton <jeff.layton@primarydata.com> + */ + +#include <linux/hash.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/sched.h> +#include <linux/list_lru.h> +#include <linux/fsnotify_backend.h> +#include <linux/fsnotify.h> +#include <linux/seq_file.h> + +#include "vfs.h" +#include "nfsd.h" +#include "nfsfh.h" +#include "netns.h" +#include "filecache.h" +#include "trace.h" + +#define NFSDDBG_FACILITY NFSDDBG_FH + +/* FIXME: dynamically size this for the machine somehow? */ +#define NFSD_FILE_HASH_BITS 12 +#define NFSD_FILE_HASH_SIZE (1 << NFSD_FILE_HASH_BITS) +#define NFSD_LAUNDRETTE_DELAY (2 * HZ) + +#define NFSD_FILE_LRU_RESCAN (0) +#define NFSD_FILE_SHUTDOWN (1) +#define NFSD_FILE_LRU_THRESHOLD (4096UL) +#define NFSD_FILE_LRU_LIMIT (NFSD_FILE_LRU_THRESHOLD << 2) + +/* We only care about NFSD_MAY_READ/WRITE for this cache */ +#define NFSD_FILE_MAY_MASK (NFSD_MAY_READ|NFSD_MAY_WRITE) + +struct nfsd_fcache_bucket { + struct hlist_head nfb_head; + spinlock_t nfb_lock; + unsigned int nfb_count; + unsigned int nfb_maxcount; +}; + +static DEFINE_PER_CPU(unsigned long, nfsd_file_cache_hits); + +static struct kmem_cache *nfsd_file_slab; +static struct kmem_cache *nfsd_file_mark_slab; +static struct nfsd_fcache_bucket *nfsd_file_hashtbl; +static struct list_lru nfsd_file_lru; +static long nfsd_file_lru_flags; +static struct fsnotify_group *nfsd_file_fsnotify_group; +static atomic_long_t nfsd_filecache_count; +static struct delayed_work nfsd_filecache_laundrette; + +enum nfsd_file_laundrette_ctl { + NFSD_FILE_LAUNDRETTE_NOFLUSH = 0, + NFSD_FILE_LAUNDRETTE_MAY_FLUSH +}; + +static void +nfsd_file_schedule_laundrette(enum nfsd_file_laundrette_ctl ctl) +{ + long count = atomic_long_read(&nfsd_filecache_count); + + if (count == 0 || test_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags)) + return; + + /* Be more aggressive about scanning if over the threshold */ + if (count > NFSD_FILE_LRU_THRESHOLD) + mod_delayed_work(system_wq, &nfsd_filecache_laundrette, 0); + else + schedule_delayed_work(&nfsd_filecache_laundrette, NFSD_LAUNDRETTE_DELAY); + + if (ctl == NFSD_FILE_LAUNDRETTE_NOFLUSH) + return; + + /* ...and don't delay flushing if we're out of control */ + if (count >= NFSD_FILE_LRU_LIMIT) + flush_delayed_work(&nfsd_filecache_laundrette); +} + +static void +nfsd_file_slab_free(struct rcu_head *rcu) +{ + struct nfsd_file *nf = container_of(rcu, struct nfsd_file, nf_rcu); + + put_cred(nf->nf_cred); + kmem_cache_free(nfsd_file_slab, nf); +} + +static void +nfsd_file_mark_free(struct fsnotify_mark *mark) +{ + struct nfsd_file_mark *nfm = container_of(mark, struct nfsd_file_mark, + nfm_mark); + + kmem_cache_free(nfsd_file_mark_slab, nfm); +} + +static struct nfsd_file_mark * +nfsd_file_mark_get(struct nfsd_file_mark *nfm) +{ + if (!atomic_inc_not_zero(&nfm->nfm_ref)) + return NULL; + return nfm; +} + +static void +nfsd_file_mark_put(struct nfsd_file_mark *nfm) +{ + if (atomic_dec_and_test(&nfm->nfm_ref)) { + + fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group); + fsnotify_put_mark(&nfm->nfm_mark); + } +} + +static struct nfsd_file_mark * +nfsd_file_mark_find_or_create(struct nfsd_file *nf) +{ + int err; + struct fsnotify_mark *mark; + struct nfsd_file_mark *nfm = NULL, *new; + struct inode *inode = nf->nf_inode; + + do { + mutex_lock(&nfsd_file_fsnotify_group->mark_mutex); + mark = fsnotify_find_mark(&inode->i_fsnotify_marks, + nfsd_file_fsnotify_group); + if (mark) { + nfm = nfsd_file_mark_get(container_of(mark, + struct nfsd_file_mark, + nfm_mark)); + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + fsnotify_put_mark(mark); + if (likely(nfm)) + break; + } else + mutex_unlock(&nfsd_file_fsnotify_group->mark_mutex); + + /* allocate a new nfm */ + new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL); + if (!new) + return NULL; + fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group); + new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF; + atomic_set(&new->nfm_ref, 1); + + err = fsnotify_add_inode_mark(&new->nfm_mark, inode, 0); + + /* + * If the add was successful, then return the object. + * Otherwise, we need to put the reference we hold on the + * nfm_mark. The fsnotify code will take a reference and put + * it on failure, so we can't just free it directly. It's also + * not safe to call fsnotify_destroy_mark on it as the + * mark->group will be NULL. Thus, we can't let the nfm_ref + * counter drive the destruction at this point. + */ + if (likely(!err)) + nfm = new; + else + fsnotify_put_mark(&new->nfm_mark); + } while (unlikely(err == -EEXIST)); + + return nfm; +} + +static struct nfsd_file * +nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval, + struct net *net) +{ + struct nfsd_file *nf; + + nf = kmem_cache_alloc(nfsd_file_slab, GFP_KERNEL); + if (nf) { + INIT_HLIST_NODE(&nf->nf_node); + INIT_LIST_HEAD(&nf->nf_lru); + nf->nf_file = NULL; + nf->nf_cred = get_current_cred(); + nf->nf_net = net; + nf->nf_flags = 0; + nf->nf_inode = inode; + nf->nf_hashval = hashval; + atomic_set(&nf->nf_ref, 1); + nf->nf_may = may & NFSD_FILE_MAY_MASK; + if (may & NFSD_MAY_NOT_BREAK_LEASE) { + if (may & NFSD_MAY_WRITE) + __set_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags); + if (may & NFSD_MAY_READ) + __set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + } + nf->nf_mark = NULL; + trace_nfsd_file_alloc(nf); + } + return nf; +} + +static bool +nfsd_file_free(struct nfsd_file *nf) +{ + bool flush = false; + + trace_nfsd_file_put_final(nf); + if (nf->nf_mark) + nfsd_file_mark_put(nf->nf_mark); + if (nf->nf_file) { + get_file(nf->nf_file); + filp_close(nf->nf_file, NULL); + fput(nf->nf_file); + flush = true; + } + call_rcu(&nf->nf_rcu, nfsd_file_slab_free); + return flush; +} + +static bool +nfsd_file_check_writeback(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + struct address_space *mapping; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return false; + mapping = file->f_mapping; + return mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); +} + +static int +nfsd_file_check_write_error(struct nfsd_file *nf) +{ + struct file *file = nf->nf_file; + + if (!file || !(file->f_mode & FMODE_WRITE)) + return 0; + return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); +} + +static bool +nfsd_file_in_use(struct nfsd_file *nf) +{ + return nfsd_file_check_writeback(nf) || + nfsd_file_check_write_error(nf); +} + +static void +nfsd_file_do_unhash(struct nfsd_file *nf) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash(nf); + + if (nfsd_file_check_write_error(nf)) + nfsd_reset_boot_verifier(net_generic(nf->nf_net, nfsd_net_id)); + --nfsd_file_hashtbl[nf->nf_hashval].nfb_count; + hlist_del_rcu(&nf->nf_node); + if (!list_empty(&nf->nf_lru)) + list_lru_del(&nfsd_file_lru, &nf->nf_lru); + atomic_long_dec(&nfsd_filecache_count); +} + +static bool +nfsd_file_unhash(struct nfsd_file *nf) +{ + if (test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_do_unhash(nf); + return true; + } + return false; +} + +/* + * Return true if the file was unhashed. + */ +static bool +nfsd_file_unhash_and_release_locked(struct nfsd_file *nf, struct list_head *dispose) +{ + lockdep_assert_held(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + + trace_nfsd_file_unhash_and_release_locked(nf); + if (!nfsd_file_unhash(nf)) + return false; + /* keep final reference for nfsd_file_lru_dispose */ + if (atomic_add_unless(&nf->nf_ref, -1, 1)) + return true; + + list_add(&nf->nf_lru, dispose); + return true; +} + +static int +nfsd_file_put_noref(struct nfsd_file *nf) +{ + int count; + trace_nfsd_file_put(nf); + + count = atomic_dec_return(&nf->nf_ref); + if (!count) { + WARN_ON(test_bit(NFSD_FILE_HASHED, &nf->nf_flags)); + nfsd_file_free(nf); + } + return count; +} + +void +nfsd_file_put(struct nfsd_file *nf) +{ + bool is_hashed = test_bit(NFSD_FILE_HASHED, &nf->nf_flags) != 0; + bool unused = !nfsd_file_in_use(nf); + + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + if (nfsd_file_put_noref(nf) == 1 && is_hashed && unused) + nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_MAY_FLUSH); +} + +struct nfsd_file * +nfsd_file_get(struct nfsd_file *nf) +{ + if (likely(atomic_inc_not_zero(&nf->nf_ref))) + return nf; + return NULL; +} + +static void +nfsd_file_dispose_list(struct list_head *dispose) +{ + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + nfsd_file_put_noref(nf); + } +} + +static void +nfsd_file_dispose_list_sync(struct list_head *dispose) +{ + bool flush = false; + struct nfsd_file *nf; + + while(!list_empty(dispose)) { + nf = list_first_entry(dispose, struct nfsd_file, nf_lru); + list_del(&nf->nf_lru); + if (!atomic_dec_and_test(&nf->nf_ref)) + continue; + if (nfsd_file_free(nf)) + flush = true; + } + if (flush) + flush_delayed_fput(); +} + +/* + * Note this can deadlock with nfsd_file_cache_purge. + */ +static enum lru_status +nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, + spinlock_t *lock, void *arg) + __releases(lock) + __acquires(lock) +{ + struct list_head *head = arg; + struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); + + /* + * Do a lockless refcount check. The hashtable holds one reference, so + * we look to see if anything else has a reference, or if any have + * been put since the shrinker last ran. Those don't get unhashed and + * released. + * + * Note that in the put path, we set the flag and then decrement the + * counter. Here we check the counter and then test and clear the flag. + * That order is deliberate to ensure that we can do this locklessly. + */ + if (atomic_read(&nf->nf_ref) > 1) + goto out_skip; + + /* + * Don't throw out files that are still undergoing I/O or + * that have uncleared errors pending. + */ + if (nfsd_file_check_writeback(nf)) + goto out_skip; + + if (test_and_clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags)) + goto out_rescan; + + if (!test_and_clear_bit(NFSD_FILE_HASHED, &nf->nf_flags)) + goto out_skip; + + list_lru_isolate_move(lru, &nf->nf_lru, head); + return LRU_REMOVED; +out_rescan: + set_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags); +out_skip: + return LRU_SKIP; +} + +static void +nfsd_file_lru_dispose(struct list_head *head) +{ + while(!list_empty(head)) { + struct nfsd_file *nf = list_first_entry(head, + struct nfsd_file, nf_lru); + list_del_init(&nf->nf_lru); + spin_lock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_do_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[nf->nf_hashval].nfb_lock); + nfsd_file_put_noref(nf); + } +} + +static unsigned long +nfsd_file_lru_count(struct shrinker *s, struct shrink_control *sc) +{ + return list_lru_count(&nfsd_file_lru); +} + +static unsigned long +nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) +{ + LIST_HEAD(head); + unsigned long ret; + + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &head); + nfsd_file_lru_dispose(&head); + return ret; +} + +static struct shrinker nfsd_file_shrinker = { + .scan_objects = nfsd_file_lru_scan, + .count_objects = nfsd_file_lru_count, + .seeks = 1, +}; + +static void +__nfsd_file_close_inode(struct inode *inode, unsigned int hashval, + struct list_head *dispose) +{ + struct nfsd_file *nf; + struct hlist_node *tmp; + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + hlist_for_each_entry_safe(nf, tmp, &nfsd_file_hashtbl[hashval].nfb_head, nf_node) { + if (inode == nf->nf_inode) + nfsd_file_unhash_and_release_locked(nf, dispose); + } + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. Also ensure that any of the + * fputs also have their final __fput done as well. + */ +void +nfsd_file_close_inode_sync(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode_sync(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list_sync(&dispose); +} + +/** + * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * @inode: inode of the file to attempt to remove + * + * Walk the whole hash bucket, looking for any files that correspond to "inode". + * If any do, then unhash them and put the hashtable reference to them and + * destroy any that had their last reference put. + */ +static void +nfsd_file_close_inode(struct inode *inode) +{ + unsigned int hashval = (unsigned int)hash_long(inode->i_ino, + NFSD_FILE_HASH_BITS); + LIST_HEAD(dispose); + + __nfsd_file_close_inode(inode, hashval, &dispose); + trace_nfsd_file_close_inode(inode, hashval, !list_empty(&dispose)); + nfsd_file_dispose_list(&dispose); +} + +/** + * nfsd_file_delayed_close - close unused nfsd_files + * @work: dummy + * + * Walk the LRU list and close any entries that have not been used since + * the last scan. + * + * Note this can deadlock with nfsd_file_cache_purge. + */ +static void +nfsd_file_delayed_close(struct work_struct *work) +{ + LIST_HEAD(head); + + list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, &head, LONG_MAX); + + if (test_and_clear_bit(NFSD_FILE_LRU_RESCAN, &nfsd_file_lru_flags)) + nfsd_file_schedule_laundrette(NFSD_FILE_LAUNDRETTE_NOFLUSH); + + if (!list_empty(&head)) { + nfsd_file_lru_dispose(&head); + flush_delayed_fput(); + } +} + +static int +nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg, + void *data) +{ + struct file_lock *fl = data; + + /* Only close files for F_SETLEASE leases */ + if (fl->fl_flags & FL_LEASE) + nfsd_file_close_inode_sync(file_inode(fl->fl_file)); + return 0; +} + +static struct notifier_block nfsd_file_lease_notifier = { + .notifier_call = nfsd_file_lease_notifier_call, +}; + +static int +nfsd_file_fsnotify_handle_event(struct fsnotify_group *group, + struct inode *inode, + u32 mask, const void *data, int data_type, + const struct qstr *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) +{ + trace_nfsd_file_fsnotify_handle_event(inode, mask); + + /* Should be no marks on non-regular files */ + if (!S_ISREG(inode->i_mode)) { + WARN_ON_ONCE(1); + return 0; + } + + /* don't close files if this was not the last link */ + if (mask & FS_ATTRIB) { + if (inode->i_nlink) + return 0; + } + + nfsd_file_close_inode(inode); + return 0; +} + + +static const struct fsnotify_ops nfsd_file_fsnotify_ops = { + .handle_event = nfsd_file_fsnotify_handle_event, + .free_mark = nfsd_file_mark_free, +}; + +int +nfsd_file_cache_init(void) +{ + int ret = -ENOMEM; + unsigned int i; + + clear_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + if (nfsd_file_hashtbl) + return 0; + + nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE, + sizeof(*nfsd_file_hashtbl), GFP_KERNEL); + if (!nfsd_file_hashtbl) { + pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n"); + goto out_err; + } + + nfsd_file_slab = kmem_cache_create("nfsd_file", + sizeof(struct nfsd_file), 0, 0, NULL); + if (!nfsd_file_slab) { + pr_err("nfsd: unable to create nfsd_file_slab\n"); + goto out_err; + } + + nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark", + sizeof(struct nfsd_file_mark), 0, 0, NULL); + if (!nfsd_file_mark_slab) { + pr_err("nfsd: unable to create nfsd_file_mark_slab\n"); + goto out_err; + } + + + ret = list_lru_init(&nfsd_file_lru); + if (ret) { + pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret); + goto out_err; + } + + ret = register_shrinker(&nfsd_file_shrinker); + if (ret) { + pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); + goto out_lru; + } + + ret = lease_register_notifier(&nfsd_file_lease_notifier); + if (ret) { + pr_err("nfsd: unable to register lease notifier: %d\n", ret); + goto out_shrinker; + } + + nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops); + if (IS_ERR(nfsd_file_fsnotify_group)) { + pr_err("nfsd: unable to create fsnotify group: %ld\n", + PTR_ERR(nfsd_file_fsnotify_group)); + nfsd_file_fsnotify_group = NULL; + goto out_notifier; + } + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + INIT_HLIST_HEAD(&nfsd_file_hashtbl[i].nfb_head); + spin_lock_init(&nfsd_file_hashtbl[i].nfb_lock); + } + + INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_delayed_close); +out: + return ret; +out_notifier: + lease_unregister_notifier(&nfsd_file_lease_notifier); +out_shrinker: + unregister_shrinker(&nfsd_file_shrinker); +out_lru: + list_lru_destroy(&nfsd_file_lru); +out_err: + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; + goto out; +} + +/* + * Note this can deadlock with nfsd_file_lru_cb. + */ +void +nfsd_file_cache_purge(struct net *net) +{ + unsigned int i; + struct nfsd_file *nf; + struct hlist_node *next; + LIST_HEAD(dispose); + bool del; + + if (!nfsd_file_hashtbl) + return; + + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + struct nfsd_fcache_bucket *nfb = &nfsd_file_hashtbl[i]; + + spin_lock(&nfb->nfb_lock); + hlist_for_each_entry_safe(nf, next, &nfb->nfb_head, nf_node) { + if (net && nf->nf_net != net) + continue; + del = nfsd_file_unhash_and_release_locked(nf, &dispose); + + /* + * Deadlock detected! Something marked this entry as + * unhased, but hasn't removed it from the hash list. + */ + WARN_ON_ONCE(!del); + } + spin_unlock(&nfb->nfb_lock); + nfsd_file_dispose_list(&dispose); + } +} + +void +nfsd_file_cache_shutdown(void) +{ + LIST_HEAD(dispose); + + set_bit(NFSD_FILE_SHUTDOWN, &nfsd_file_lru_flags); + + lease_unregister_notifier(&nfsd_file_lease_notifier); + unregister_shrinker(&nfsd_file_shrinker); + /* + * make sure all callers of nfsd_file_lru_cb are done before + * calling nfsd_file_cache_purge + */ + cancel_delayed_work_sync(&nfsd_filecache_laundrette); + nfsd_file_cache_purge(NULL); + list_lru_destroy(&nfsd_file_lru); + rcu_barrier(); + fsnotify_put_group(nfsd_file_fsnotify_group); + nfsd_file_fsnotify_group = NULL; + kmem_cache_destroy(nfsd_file_slab); + nfsd_file_slab = NULL; + fsnotify_wait_marks_destroyed(); + kmem_cache_destroy(nfsd_file_mark_slab); + nfsd_file_mark_slab = NULL; + kfree(nfsd_file_hashtbl); + nfsd_file_hashtbl = NULL; +} + +static bool +nfsd_match_cred(const struct cred *c1, const struct cred *c2) +{ + int i; + + if (!uid_eq(c1->fsuid, c2->fsuid)) + return false; + if (!gid_eq(c1->fsgid, c2->fsgid)) + return false; + if (c1->group_info == NULL || c2->group_info == NULL) + return c1->group_info == c2->group_info; + if (c1->group_info->ngroups != c2->group_info->ngroups) + return false; + for (i = 0; i < c1->group_info->ngroups; i++) { + if (!gid_eq(c1->group_info->gid[i], c2->group_info->gid[i])) + return false; + } + return true; +} + +static struct nfsd_file * +nfsd_file_find_locked(struct inode *inode, unsigned int may_flags, + unsigned int hashval, struct net *net) +{ + struct nfsd_file *nf; + unsigned char need = may_flags & NFSD_FILE_MAY_MASK; + + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node) { + if ((need & nf->nf_may) != need) + continue; + if (nf->nf_inode != inode) + continue; + if (nf->nf_net != net) + continue; + if (!nfsd_match_cred(nf->nf_cred, current_cred())) + continue; + if (nfsd_file_get(nf) != NULL) + return nf; + } + return NULL; +} + +/** + * nfsd_file_is_cached - are there any cached open files for this fh? + * @inode: inode of the file to check + * + * Scan the hashtable for open files that match this fh. Returns true if there + * are any, and false if not. + */ +bool +nfsd_file_is_cached(struct inode *inode) +{ + bool ret = false; + struct nfsd_file *nf; + unsigned int hashval; + + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); + + rcu_read_lock(); + hlist_for_each_entry_rcu(nf, &nfsd_file_hashtbl[hashval].nfb_head, + nf_node) { + if (inode == nf->nf_inode) { + ret = true; + break; + } + } + rcu_read_unlock(); + trace_nfsd_file_is_cached(inode, hashval, (int)ret); + return ret; +} + +__be32 +nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **pnf) +{ + __be32 status; + struct net *net = SVC_NET(rqstp); + struct nfsd_file *nf, *new; + struct inode *inode; + unsigned int hashval; + + /* FIXME: skip this if fh_dentry is already set? */ + status = fh_verify(rqstp, fhp, S_IFREG, + may_flags|NFSD_MAY_OWNER_OVERRIDE); + if (status != nfs_ok) + return status; + + inode = d_inode(fhp->fh_dentry); + hashval = (unsigned int)hash_long(inode->i_ino, NFSD_FILE_HASH_BITS); +retry: + rcu_read_lock(); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); + rcu_read_unlock(); + if (nf) + goto wait_for_construction; + + new = nfsd_file_alloc(inode, may_flags, hashval, net); + if (!new) { + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, + NULL, nfserr_jukebox); + return nfserr_jukebox; + } + + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + nf = nfsd_file_find_locked(inode, may_flags, hashval, net); + if (nf == NULL) + goto open_file; + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + nfsd_file_slab_free(&new->nf_rcu); + +wait_for_construction: + wait_on_bit(&nf->nf_flags, NFSD_FILE_PENDING, TASK_UNINTERRUPTIBLE); + + /* Did construction of this file fail? */ + if (!test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { + nfsd_file_put_noref(nf); + goto retry; + } + + this_cpu_inc(nfsd_file_cache_hits); + + if (!(may_flags & NFSD_MAY_NOT_BREAK_LEASE)) { + bool write = (may_flags & NFSD_MAY_WRITE); + + if (test_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags) || + (test_bit(NFSD_FILE_BREAK_WRITE, &nf->nf_flags) && write)) { + status = nfserrno(nfsd_open_break_lease( + file_inode(nf->nf_file), may_flags)); + if (status == nfs_ok) { + clear_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags); + if (write) + clear_bit(NFSD_FILE_BREAK_WRITE, + &nf->nf_flags); + } + } + } +out: + if (status == nfs_ok) { + *pnf = nf; + } else { + nfsd_file_put(nf); + nf = NULL; + } + + trace_nfsd_file_acquire(rqstp, hashval, inode, may_flags, nf, status); + return status; +open_file: + nf = new; + /* Take reference for the hashtable */ + atomic_inc(&nf->nf_ref); + __set_bit(NFSD_FILE_HASHED, &nf->nf_flags); + __set_bit(NFSD_FILE_PENDING, &nf->nf_flags); + list_lru_add(&nfsd_file_lru, &nf->nf_lru); + hlist_add_head_rcu(&nf->nf_node, &nfsd_file_hashtbl[hashval].nfb_head); + ++nfsd_file_hashtbl[hashval].nfb_count; + nfsd_file_hashtbl[hashval].nfb_maxcount = max(nfsd_file_hashtbl[hashval].nfb_maxcount, + nfsd_file_hashtbl[hashval].nfb_count); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + atomic_long_inc(&nfsd_filecache_count); + + nf->nf_mark = nfsd_file_mark_find_or_create(nf); + if (nf->nf_mark) + status = nfsd_open_verified(rqstp, fhp, S_IFREG, + may_flags, &nf->nf_file); + else + status = nfserr_jukebox; + /* + * If construction failed, or we raced with a call to unlink() + * then unhash. + */ + if (status != nfs_ok || inode->i_nlink == 0) { + bool do_free; + spin_lock(&nfsd_file_hashtbl[hashval].nfb_lock); + do_free = nfsd_file_unhash(nf); + spin_unlock(&nfsd_file_hashtbl[hashval].nfb_lock); + if (do_free) + nfsd_file_put_noref(nf); + } + clear_bit_unlock(NFSD_FILE_PENDING, &nf->nf_flags); + smp_mb__after_atomic(); + wake_up_bit(&nf->nf_flags, NFSD_FILE_PENDING); + goto out; +} + +/* + * Note that fields may be added, removed or reordered in the future. Programs + * scraping this file for info should test the labels to ensure they're + * getting the correct field. + */ +static int nfsd_file_cache_stats_show(struct seq_file *m, void *v) +{ + unsigned int i, count = 0, longest = 0; + unsigned long hits = 0; + + /* + * No need for spinlocks here since we're not terribly interested in + * accuracy. We do take the nfsd_mutex simply to ensure that we + * don't end up racing with server shutdown + */ + mutex_lock(&nfsd_mutex); + if (nfsd_file_hashtbl) { + for (i = 0; i < NFSD_FILE_HASH_SIZE; i++) { + count += nfsd_file_hashtbl[i].nfb_count; + longest = max(longest, nfsd_file_hashtbl[i].nfb_count); + } + } + mutex_unlock(&nfsd_mutex); + + for_each_possible_cpu(i) + hits += per_cpu(nfsd_file_cache_hits, i); + + seq_printf(m, "total entries: %u\n", count); + seq_printf(m, "longest chain: %u\n", longest); + seq_printf(m, "cache hits: %lu\n", hits); + return 0; +} + +int nfsd_file_cache_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, nfsd_file_cache_stats_show, NULL); +} diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h new file mode 100644 index 000000000000..851d9abf54c2 --- /dev/null +++ b/fs/nfsd/filecache.h @@ -0,0 +1,61 @@ +#ifndef _FS_NFSD_FILECACHE_H +#define _FS_NFSD_FILECACHE_H + +#include <linux/fsnotify_backend.h> + +/* + * This is the fsnotify_mark container that nfsd attaches to the files that it + * is holding open. Note that we have a separate refcount here aside from the + * one in the fsnotify_mark. We only want a single fsnotify_mark attached to + * the inode, and for each nfsd_file to hold a reference to it. + * + * The fsnotify_mark is itself refcounted, but that's not sufficient to tell us + * how to put that reference. If there are still outstanding nfsd_files that + * reference the mark, then we would want to call fsnotify_put_mark on it. + * If there were not, then we'd need to call fsnotify_destroy_mark. Since we + * can't really tell the difference, we use the nfm_mark to keep track of how + * many nfsd_files hold references to the mark. When that counter goes to zero + * then we know to call fsnotify_destroy_mark on it. + */ +struct nfsd_file_mark { + struct fsnotify_mark nfm_mark; + atomic_t nfm_ref; +}; + +/* + * A representation of a file that has been opened by knfsd. These are hashed + * in the hashtable by inode pointer value. Note that this object doesn't + * hold a reference to the inode by itself, so the nf_inode pointer should + * never be dereferenced, only used for comparison. + */ +struct nfsd_file { + struct hlist_node nf_node; + struct list_head nf_lru; + struct rcu_head nf_rcu; + struct file *nf_file; + const struct cred *nf_cred; + struct net *nf_net; +#define NFSD_FILE_HASHED (0) +#define NFSD_FILE_PENDING (1) +#define NFSD_FILE_BREAK_READ (2) +#define NFSD_FILE_BREAK_WRITE (3) +#define NFSD_FILE_REFERENCED (4) + unsigned long nf_flags; + struct inode *nf_inode; + unsigned int nf_hashval; + atomic_t nf_ref; + unsigned char nf_may; + struct nfsd_file_mark *nf_mark; +}; + +int nfsd_file_cache_init(void); +void nfsd_file_cache_purge(struct net *); +void nfsd_file_cache_shutdown(void); +void nfsd_file_put(struct nfsd_file *nf); +struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); +void nfsd_file_close_inode_sync(struct inode *inode); +bool nfsd_file_is_cached(struct inode *inode); +__be32 nfsd_file_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, + unsigned int may_flags, struct nfsd_file **nfp); +int nfsd_file_cache_stats_open(struct inode *, struct file *); +#endif /* _FS_NFSD_FILECACHE_H */ diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index bdfe5bcb3dcd..9a4ef815fb8c 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -104,6 +104,7 @@ struct nfsd_net { /* Time of server startup */ struct timespec64 nfssvc_boot; + seqlock_t boot_lock; /* * Max number of connections this nfsd container will allow. Defaults @@ -179,4 +180,7 @@ struct nfsd_net { extern void nfsd_netns_free_versions(struct nfsd_net *nn); extern unsigned int nfsd_net_id; + +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn); +void nfsd_reset_boot_verifier(struct nfsd_net *nn); #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 9bc32af4e2da..cea68d8411ac 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -172,13 +172,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); - if (nfserr == 0) { - struct inode *inode = d_inode(resp->fh.fh_dentry); - resp->eof = nfsd_eof_on_read(cnt, resp->count, argp->offset, - inode->i_size); - } - + &resp->count, + &resp->eof); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index fcf31822c74c..86e5658651f1 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -27,6 +27,7 @@ static u32 nfs3_ftypes[] = { NF3SOCK, NF3BAD, NF3LNK, NF3BAD, }; + /* * XDR functions for basic NFS types */ @@ -751,14 +752,16 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_writeres *resp = rqstp->rq_resp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 verf[2]; p = encode_wcc_data(rqstp, p, &resp->fh); if (resp->status == 0) { *p++ = htonl(resp->count); *p++ = htonl(resp->committed); /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_nsec); + nfsd_copy_boot_verifier(verf, nn); + *p++ = verf[0]; + *p++ = verf[1]; } return xdr_ressize_check(rqstp, p); } @@ -1125,13 +1128,15 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) { struct nfsd3_commitres *resp = rqstp->rq_resp; struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 verf[2]; p = encode_wcc_data(rqstp, p, &resp->fh); /* Write verifier */ if (resp->status == 0) { /* unique identifier, y2038 overflow can be ignored */ - *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_nsec); + nfsd_copy_boot_verifier(verf, nn); + *p++ = verf[0]; + *p++ = verf[1]; } return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 397eb7820929..524111420b48 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -512,11 +512,9 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb != NULL) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } @@ -604,11 +602,10 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ @@ -663,11 +660,10 @@ static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status); } @@ -759,11 +755,10 @@ static int nfs4_xdr_dec_cb_offload(struct rpc_rqst *rqstp, if (unlikely(status)) return status; - if (cb) { - status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status || cb->cb_seq_status)) - return status; - } + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + return decode_cb_op_status(xdr, OP_CB_OFFLOAD, &cb->cb_status); } /* diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index a79e24b79095..2681c70283ce 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -169,8 +169,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid) spin_unlock(&fp->fi_lock); if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls) - vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls); - fput(ls->ls_file); + vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls); + nfsd_file_put(ls->ls_file); if (ls->ls_recalled) atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls); @@ -197,7 +197,7 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls) fl->fl_end = OFFSET_MAX; fl->fl_owner = ls; fl->fl_pid = current->tgid; - fl->fl_file = ls->ls_file; + fl->fl_file = ls->ls_file->nf_file; status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL); if (status) { @@ -236,13 +236,13 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, NFSPROC4_CLNT_CB_LAYOUT); if (parent->sc_type == NFS4_DELEG_STID) - ls->ls_file = get_file(fp->fi_deleg_file); + ls->ls_file = nfsd_file_get(fp->fi_deleg_file); else ls->ls_file = find_any_file(fp); BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { - fput(ls->ls_file); + nfsd_file_put(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; @@ -626,7 +626,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls) argv[0] = (char *)nfsd_recall_failed; argv[1] = addr_str; - argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id; + argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id; argv[3] = NULL; error = call_usermodehelper(nfsd_recall_failed, argv, envp, diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8beda999e134..4e3e77b76411 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -568,17 +568,11 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) { - __be32 verf[2]; - struct nfsd_net *nn = net_generic(net, nfsd_net_id); + __be32 *verf = (__be32 *)verifier->data; - /* - * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy. y2038 time_t overflow is - * irrelevant in this usage. - */ - verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; - memcpy(verifier->data, verf, sizeof(verifier->data)); + BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data)); + + nfsd_copy_boot_verifier(verf, net_generic(net, nfsd_net_id)); } static __be32 @@ -761,7 +755,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_read *read = &u->read; __be32 status; - read->rd_filp = NULL; + read->rd_nf = NULL; if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; @@ -782,7 +776,7 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* check stateid */ status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &read->rd_stateid, RD_STATE, - &read->rd_filp, &read->rd_tmp_file); + &read->rd_nf); if (status) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; @@ -798,8 +792,8 @@ out: static void nfsd4_read_release(union nfsd4_op_u *u) { - if (u->read.rd_filp) - fput(u->read.rd_filp); + if (u->read.rd_nf) + nfsd_file_put(u->read.rd_nf); trace_nfsd_read_done(u->read.rd_rqstp, u->read.rd_fhp, u->read.rd_offset, u->read.rd_length); } @@ -954,7 +948,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &setattr->sa_stateid, - WR_STATE, NULL, NULL); + WR_STATE, NULL); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -993,7 +987,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_write *write = &u->write; stateid_t *stateid = &write->wr_stateid; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; int nvecs; @@ -1005,7 +999,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, trace_nfsd_write_start(rqstp, &cstate->current_fh, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - stateid, WR_STATE, &filp, NULL); + stateid, WR_STATE, &nf); if (status) { dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; @@ -1018,10 +1012,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &write->wr_head, write->wr_buflen); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, + status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf->nf_file, write->wr_offset, rqstp->rq_vec, nvecs, &cnt, write->wr_how_written); - fput(filp); + nfsd_file_put(nf); write->wr_bytes_written = cnt; trace_nfsd_write_done(rqstp, &cstate->current_fh, @@ -1031,8 +1025,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, static __be32 nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, - stateid_t *src_stateid, struct file **src, - stateid_t *dst_stateid, struct file **dst) + stateid_t *src_stateid, struct nfsd_file **src, + stateid_t *dst_stateid, struct nfsd_file **dst) { __be32 status; @@ -1040,22 +1034,22 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return nfserr_nofilehandle; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, - src_stateid, RD_STATE, src, NULL); + src_stateid, RD_STATE, src); if (status) { dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); goto out; } status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - dst_stateid, WR_STATE, dst, NULL); + dst_stateid, WR_STATE, dst); if (status) { dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); goto out_put_src; } /* fix up for NFS-specific error code */ - if (!S_ISREG(file_inode(*src)->i_mode) || - !S_ISREG(file_inode(*dst)->i_mode)) { + if (!S_ISREG(file_inode((*src)->nf_file)->i_mode) || + !S_ISREG(file_inode((*dst)->nf_file)->i_mode)) { status = nfserr_wrong_type; goto out_put_dst; } @@ -1063,9 +1057,9 @@ nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, out: return status; out_put_dst: - fput(*dst); + nfsd_file_put(*dst); out_put_src: - fput(*src); + nfsd_file_put(*src); goto out; } @@ -1074,7 +1068,7 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_clone *clone = &u->clone; - struct file *src, *dst; + struct nfsd_file *src, *dst; __be32 status; status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src, @@ -1082,11 +1076,11 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; - status = nfsd4_clone_file_range(src, clone->cl_src_pos, - dst, clone->cl_dst_pos, clone->cl_count); + status = nfsd4_clone_file_range(src->nf_file, clone->cl_src_pos, + dst->nf_file, clone->cl_dst_pos, clone->cl_count); - fput(dst); - fput(src); + nfsd_file_put(dst); + nfsd_file_put(src); out: return status; } @@ -1176,8 +1170,9 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy) do { if (kthread_should_stop()) break; - bytes_copied = nfsd_copy_file_range(copy->file_src, src_pos, - copy->file_dst, dst_pos, bytes_total); + bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file, + src_pos, copy->nf_dst->nf_file, dst_pos, + bytes_total); if (bytes_copied <= 0) break; bytes_total -= bytes_copied; @@ -1204,8 +1199,8 @@ static __be32 nfsd4_do_copy(struct nfsd4_copy *copy, bool sync) status = nfs_ok; } - fput(copy->file_src); - fput(copy->file_dst); + nfsd_file_put(copy->nf_src); + nfsd_file_put(copy->nf_dst); return status; } @@ -1218,16 +1213,16 @@ static void dup_copy_fields(struct nfsd4_copy *src, struct nfsd4_copy *dst) memcpy(&dst->cp_res, &src->cp_res, sizeof(src->cp_res)); memcpy(&dst->fh, &src->fh, sizeof(src->fh)); dst->cp_clp = src->cp_clp; - dst->file_dst = get_file(src->file_dst); - dst->file_src = get_file(src->file_src); + dst->nf_dst = nfsd_file_get(src->nf_dst); + dst->nf_src = nfsd_file_get(src->nf_src); memcpy(&dst->cp_stateid, &src->cp_stateid, sizeof(src->cp_stateid)); } static void cleanup_async_copy(struct nfsd4_copy *copy) { nfs4_free_cp_state(copy); - fput(copy->file_dst); - fput(copy->file_src); + nfsd_file_put(copy->nf_dst); + nfsd_file_put(copy->nf_src); spin_lock(©->cp_clp->async_lock); list_del(©->copies); spin_unlock(©->cp_clp->async_lock); @@ -1264,8 +1259,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_copy *async_copy = NULL; status = nfsd4_verify_copy(rqstp, cstate, ©->cp_src_stateid, - ©->file_src, ©->cp_dst_stateid, - ©->file_dst); + ©->nf_src, ©->cp_dst_stateid, + ©->nf_dst); if (status) goto out; @@ -1347,21 +1342,21 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, - WR_STATE, &file, NULL); + WR_STATE, &nf); if (status != nfs_ok) { dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); return status; } - status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, + status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, nf->nf_file, fallocate->falloc_offset, fallocate->falloc_length, flags); - fput(file); + nfsd_file_put(nf); return status; } static __be32 @@ -1406,11 +1401,11 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_seek *seek = &u->seek; int whence; __be32 status; - struct file *file; + struct nfsd_file *nf; status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, - RD_STATE, &file, NULL); + RD_STATE, &nf); if (status) { dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); return status; @@ -1432,14 +1427,14 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Note: This call does change file->f_pos, but nothing in NFSD * should ever file->f_pos. */ - seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence); + seek->seek_pos = vfs_llseek(nf->nf_file, seek->seek_offset, whence); if (seek->seek_pos < 0) status = nfserrno(seek->seek_pos); - else if (seek->seek_pos >= i_size_read(file_inode(file))) + else if (seek->seek_pos >= i_size_read(file_inode(nf->nf_file))) seek->seek_eof = true; out: - fput(file); + nfsd_file_put(nf); return status; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 87679557d0d6..cdc75ad4438b 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -59,8 +59,13 @@ struct nfsd4_client_tracking_ops { void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); void (*grace_done)(struct nfsd_net *); + uint8_t version; + size_t msglen; }; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops; +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2; + /* Globals */ static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; @@ -173,6 +178,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, const char *dname, int len, struct nfsd_net *nn) { struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; struct nfs4_client_reclaim *crp; name.data = kmemdup(dname, len, GFP_KERNEL); @@ -182,7 +188,7 @@ __nfsd4_create_reclaim_record_grace(struct nfs4_client *clp, return; } name.len = len; - crp = nfs4_client_to_reclaim(name, nn); + crp = nfs4_client_to_reclaim(name, princhash, nn); if (!crp) { kfree(name.data); return; @@ -482,6 +488,7 @@ static int load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) { struct xdr_netobj name; + struct xdr_netobj princhash = { .len = 0, .data = NULL }; if (child->d_name.len != HEXDIR_LEN - 1) { printk("%s: illegal name %pd in recovery directory\n", @@ -496,7 +503,7 @@ load_recdir(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) goto out; } name.len = HEXDIR_LEN; - if (!nfs4_client_to_reclaim(name, nn)) + if (!nfs4_client_to_reclaim(name, princhash, nn)) kfree(name.data); out: return 0; @@ -718,6 +725,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .remove = nfsd4_remove_clid_dir, .check = nfsd4_check_legacy_client, .grace_done = nfsd4_recdir_purge_old, + .version = 1, + .msglen = 0, }; /* Globals */ @@ -731,25 +740,32 @@ struct cld_net { struct list_head cn_list; unsigned int cn_xid; bool cn_has_legacy; + struct crypto_shash *cn_tfm; }; struct cld_upcall { struct list_head cu_list; struct cld_net *cu_net; struct completion cu_done; - struct cld_msg cu_msg; + union { + struct cld_msg_hdr cu_hdr; + struct cld_msg cu_msg; + struct cld_msg_v2 cu_msg_v2; + } cu_u; }; static int -__cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +__cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) { int ret; struct rpc_pipe_msg msg; - struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_msg); + struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, cu_u); + struct nfsd_net *nn = net_generic(pipe->dentry->d_sb->s_fs_info, + nfsd_net_id); memset(&msg, 0, sizeof(msg)); msg.data = cmsg; - msg.len = sizeof(*cmsg); + msg.len = nn->client_tracking_ops->msglen; ret = rpc_queue_upcall(pipe, &msg); if (ret < 0) { @@ -765,7 +781,7 @@ out: } static int -cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) +cld_pipe_upcall(struct rpc_pipe *pipe, void *cmsg) { int ret; @@ -781,11 +797,11 @@ cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) } static ssize_t -__cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, +__cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg, struct nfsd_net *nn) { - uint8_t cmd; - struct xdr_netobj name; + uint8_t cmd, princhashlen; + struct xdr_netobj name, princhash = { .len = 0, .data = NULL }; uint16_t namelen; struct cld_net *cn = nn->cld_net; @@ -794,22 +810,48 @@ __cld_pipe_inprogress_downcall(const struct cld_msg __user *cmsg, return -EFAULT; } if (cmd == Cld_GraceStart) { - if (get_user(namelen, &cmsg->cm_u.cm_name.cn_len)) - return -EFAULT; - name.data = memdup_user(&cmsg->cm_u.cm_name.cn_id, namelen); - if (IS_ERR_OR_NULL(name.data)) - return -EFAULT; - name.len = namelen; + if (nn->client_tracking_ops->version >= 2) { + const struct cld_clntinfo __user *ci; + + ci = &cmsg->cm_u.cm_clntinfo; + if (get_user(namelen, &ci->cc_name.cn_len)) + return -EFAULT; + name.data = memdup_user(&ci->cc_name.cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + get_user(princhashlen, &ci->cc_princhash.cp_len); + if (princhashlen > 0) { + princhash.data = memdup_user( + &ci->cc_princhash.cp_data, + princhashlen); + if (IS_ERR_OR_NULL(princhash.data)) + return -EFAULT; + princhash.len = princhashlen; + } else + princhash.len = 0; + } else { + const struct cld_name __user *cnm; + + cnm = &cmsg->cm_u.cm_name; + if (get_user(namelen, &cnm->cn_len)) + return -EFAULT; + name.data = memdup_user(&cnm->cn_id, namelen); + if (IS_ERR_OR_NULL(name.data)) + return -EFAULT; + name.len = namelen; + } if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) { name.len = name.len - 5; memmove(name.data, name.data + 5, name.len); cn->cn_has_legacy = true; } - if (!nfs4_client_to_reclaim(name, nn)) { + if (!nfs4_client_to_reclaim(name, princhash, nn)) { kfree(name.data); + kfree(princhash.data); return -EFAULT; } - return sizeof(*cmsg); + return nn->client_tracking_ops->msglen; } return -EFAULT; } @@ -818,21 +860,22 @@ static ssize_t cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) { struct cld_upcall *tmp, *cup; - struct cld_msg __user *cmsg = (struct cld_msg __user *)src; + struct cld_msg_hdr __user *hdr = (struct cld_msg_hdr __user *)src; + struct cld_msg_v2 __user *cmsg = (struct cld_msg_v2 __user *)src; uint32_t xid; struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info, nfsd_net_id); struct cld_net *cn = nn->cld_net; int16_t status; - if (mlen != sizeof(*cmsg)) { + if (mlen != nn->client_tracking_ops->msglen) { dprintk("%s: got %zu bytes, expected %zu\n", __func__, mlen, - sizeof(*cmsg)); + nn->client_tracking_ops->msglen); return -EINVAL; } /* copy just the xid so we can try to find that */ - if (copy_from_user(&xid, &cmsg->cm_xid, sizeof(xid)) != 0) { + if (copy_from_user(&xid, &hdr->cm_xid, sizeof(xid)) != 0) { dprintk("%s: error when copying xid from userspace", __func__); return -EFAULT; } @@ -842,7 +885,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) * list (for -EINPROGRESS, we just want to make sure the xid is * valid, not remove the upcall from the list) */ - if (get_user(status, &cmsg->cm_status)) { + if (get_user(status, &hdr->cm_status)) { dprintk("%s: error when copying status from userspace", __func__); return -EFAULT; } @@ -851,7 +894,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) cup = NULL; spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (get_unaligned(&tmp->cu_msg.cm_xid) == xid) { + if (get_unaligned(&tmp->cu_u.cu_hdr.cm_xid) == xid) { cup = tmp; if (status != -EINPROGRESS) list_del_init(&cup->cu_list); @@ -869,7 +912,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) if (status == -EINPROGRESS) return __cld_pipe_inprogress_downcall(cmsg, nn); - if (copy_from_user(&cup->cu_msg, src, mlen) != 0) + if (copy_from_user(&cup->cu_u.cu_msg_v2, src, mlen) != 0) return -EFAULT; complete(&cup->cu_done); @@ -881,7 +924,7 @@ cld_pipe_destroy_msg(struct rpc_pipe_msg *msg) { struct cld_msg *cmsg = msg->data; struct cld_upcall *cup = container_of(cmsg, struct cld_upcall, - cu_msg); + cu_u.cu_msg); /* errno >= 0 means we got a downcall */ if (msg->errno >= 0) @@ -1007,14 +1050,17 @@ nfsd4_remove_cld_pipe(struct net *net) nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); + if (cn->cn_tfm) + crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } static struct cld_upcall * -alloc_cld_upcall(struct cld_net *cn) +alloc_cld_upcall(struct nfsd_net *nn) { struct cld_upcall *new, *tmp; + struct cld_net *cn = nn->cld_net; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) @@ -1024,20 +1070,20 @@ alloc_cld_upcall(struct cld_net *cn) restart_search: spin_lock(&cn->cn_lock); list_for_each_entry(tmp, &cn->cn_list, cu_list) { - if (tmp->cu_msg.cm_xid == cn->cn_xid) { + if (tmp->cu_u.cu_msg.cm_xid == cn->cn_xid) { cn->cn_xid++; spin_unlock(&cn->cn_lock); goto restart_search; } } init_completion(&new->cu_done); - new->cu_msg.cm_vers = CLD_UPCALL_VERSION; - put_unaligned(cn->cn_xid++, &new->cu_msg.cm_xid); + new->cu_u.cu_msg.cm_vers = nn->client_tracking_ops->version; + put_unaligned(cn->cn_xid++, &new->cu_u.cu_msg.cm_xid); new->cu_net = cn; list_add(&new->cu_list, &cn->cn_list); spin_unlock(&cn->cn_lock); - dprintk("%s: allocated xid %u\n", __func__, new->cu_msg.cm_xid); + dprintk("%s: allocated xid %u\n", __func__, new->cu_u.cu_msg.cm_xid); return new; } @@ -1066,20 +1112,20 @@ nfsd4_cld_create(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Create; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Create; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1092,6 +1138,75 @@ out_err: /* Ask daemon to create a new record */ static void +nfsd4_cld_create_v2(struct nfs4_client *clp) +{ + int ret; + struct cld_upcall *cup; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + struct cld_msg_v2 *cmsg; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + SHASH_DESC_ON_STACK(desc, tfm); + + /* Don't upcall if it's already stored */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + + cmsg = &cup->cu_u.cu_msg_v2; + cmsg->cm_cmd = Cld_Create; + cmsg->cm_u.cm_clntinfo.cc_name.cn_len = clp->cl_name.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_name.cn_id, clp->cl_name.data, + clp->cl_name.len); + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal) { + desc->tfm = tfm; + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) { + ret = -ENOMEM; + goto out; + } + ret = crypto_shash_digest(desc, principal, strlen(principal), + cksum.data); + shash_desc_zero(desc); + if (ret) { + kfree(cksum.data); + goto out; + } + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; + memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, + cksum.data, cksum.len); + kfree(cksum.data); + } else + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; + + ret = cld_pipe_upcall(cn->cn_pipe, cmsg); + if (!ret) { + ret = cmsg->cm_status; + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + +out: + free_cld_upcall(cup); +out_err: + if (ret) + pr_err("NFSD: Unable to create client record on stable storage: %d\n", + ret); +} + +/* Ask daemon to create a new record */ +static void nfsd4_cld_remove(struct nfs4_client *clp) { int ret; @@ -1103,20 +1218,20 @@ nfsd4_cld_remove(struct nfs4_client *clp) if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_Remove; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Remove; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1145,21 +1260,21 @@ nfsd4_cld_check_v0(struct nfs4_client *clp) if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) return 0; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { printk(KERN_ERR "NFSD: Unable to check client record on " "stable storage: %d\n", -ENOMEM); return -ENOMEM; } - cup->cu_msg.cm_cmd = Cld_Check; - cup->cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; - memcpy(cup->cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, + cup->cu_u.cu_msg.cm_cmd = Cld_Check; + cup->cu_u.cu_msg.cm_u.cm_name.cn_len = clp->cl_name.len; + memcpy(cup->cu_u.cu_msg.cm_u.cm_name.cn_id, clp->cl_name.data, clp->cl_name.len); - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) { - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } @@ -1217,22 +1332,95 @@ found: } static int +nfsd4_cld_check_v2(struct nfs4_client *clp) +{ + struct nfs4_client_reclaim *crp; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct cld_net *cn = nn->cld_net; + int status; + char dname[HEXDIR_LEN]; + struct xdr_netobj name; + struct crypto_shash *tfm = cn->cn_tfm; + struct xdr_netobj cksum; + char *principal = NULL; + SHASH_DESC_ON_STACK(desc, tfm); + + /* did we already find that this client is stable? */ + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; + + /* look for it in the reclaim hashtable otherwise */ + crp = nfsd4_find_reclaim_client(clp->cl_name, nn); + if (crp) + goto found; + + if (cn->cn_has_legacy) { + status = nfs4_make_rec_clidname(dname, &clp->cl_name); + if (status) + return -ENOENT; + + name.data = kmemdup(dname, HEXDIR_LEN, GFP_KERNEL); + if (!name.data) { + dprintk("%s: failed to allocate memory for name.data\n", + __func__); + return -ENOENT; + } + name.len = HEXDIR_LEN; + crp = nfsd4_find_reclaim_client(name, nn); + kfree(name.data); + if (crp) + goto found; + + } + return -ENOENT; +found: + if (crp->cr_princhash.len) { + if (clp->cl_cred.cr_raw_principal) + principal = clp->cl_cred.cr_raw_principal; + else if (clp->cl_cred.cr_principal) + principal = clp->cl_cred.cr_principal; + if (principal == NULL) + return -ENOENT; + desc->tfm = tfm; + cksum.len = crypto_shash_digestsize(tfm); + cksum.data = kmalloc(cksum.len, GFP_KERNEL); + if (cksum.data == NULL) + return -ENOENT; + status = crypto_shash_digest(desc, principal, strlen(principal), + cksum.data); + shash_desc_zero(desc); + if (status) { + kfree(cksum.data); + return -ENOENT; + } + if (memcmp(crp->cr_princhash.data, cksum.data, + crp->cr_princhash.len)) { + kfree(cksum.data); + return -ENOENT; + } + kfree(cksum.data); + } + crp->cr_clp = clp; + return 0; +} + +static int nfsd4_cld_grace_start(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceStart; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceStart; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1250,17 +1438,17 @@ nfsd4_cld_grace_done_v0(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceDone; - cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + cup->cu_u.cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1279,16 +1467,16 @@ nfsd4_cld_grace_done(struct nfsd_net *nn) struct cld_upcall *cup; struct cld_net *cn = nn->cld_net; - cup = alloc_cld_upcall(cn); + cup = alloc_cld_upcall(nn); if (!cup) { ret = -ENOMEM; goto out_err; } - cup->cu_msg.cm_cmd = Cld_GraceDone; - ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); + cup->cu_u.cu_msg.cm_cmd = Cld_GraceDone; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); if (!ret) - ret = cup->cu_msg.cm_status; + ret = cup->cu_u.cu_msg.cm_status; free_cld_upcall(cup); out_err: @@ -1337,6 +1525,53 @@ cld_running(struct nfsd_net *nn) } static int +nfsd4_cld_get_version(struct nfsd_net *nn) +{ + int ret = 0; + struct cld_upcall *cup; + struct cld_net *cn = nn->cld_net; + uint8_t version; + + cup = alloc_cld_upcall(nn); + if (!cup) { + ret = -ENOMEM; + goto out_err; + } + cup->cu_u.cu_msg.cm_cmd = Cld_GetVersion; + ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_u.cu_msg); + if (!ret) { + ret = cup->cu_u.cu_msg.cm_status; + if (ret) + goto out_free; + version = cup->cu_u.cu_msg.cm_u.cm_version; + dprintk("%s: userspace returned version %u\n", + __func__, version); + if (version < 1) + version = 1; + else if (version > CLD_UPCALL_VERSION) + version = CLD_UPCALL_VERSION; + + switch (version) { + case 1: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops; + break; + case 2: + nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v2; + break; + default: + break; + } + } +out_free: + free_cld_upcall(cup); +out_err: + if (ret) + dprintk("%s: Unable to get version from userspace: %d\n", + __func__, ret); + return ret; +} + +static int nfsd4_cld_tracking_init(struct net *net) { int status; @@ -1351,6 +1586,11 @@ nfsd4_cld_tracking_init(struct net *net) status = __nfsd4_init_cld_pipe(net); if (status) goto err_shutdown; + nn->cld_net->cn_tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(nn->cld_net->cn_tfm)) { + status = PTR_ERR(nn->cld_net->cn_tfm); + goto err_remove; + } /* * rpc pipe upcalls take 30 seconds to time out, so we don't want to @@ -1368,10 +1608,14 @@ nfsd4_cld_tracking_init(struct net *net) goto err_remove; } + status = nfsd4_cld_get_version(nn); + if (status == -EOPNOTSUPP) + pr_warn("NFSD: nfsdcld GetVersion upcall failed. Please upgrade nfsdcld.\n"); + status = nfsd4_cld_grace_start(nn); if (status) { if (status == -EOPNOTSUPP) - printk(KERN_WARNING "NFSD: Please upgrade nfsdcld.\n"); + pr_warn("NFSD: nfsdcld GraceStart upcall failed. Please upgrade nfsdcld.\n"); nfs4_release_reclaim(nn); goto err_remove; } else @@ -1403,6 +1647,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v0 = { .remove = nfsd4_cld_remove, .check = nfsd4_cld_check_v0, .grace_done = nfsd4_cld_grace_done_v0, + .version = 1, + .msglen = sizeof(struct cld_msg), }; /* For newer nfsdcld's */ @@ -1413,6 +1659,20 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .remove = nfsd4_cld_remove, .check = nfsd4_cld_check, .grace_done = nfsd4_cld_grace_done, + .version = 1, + .msglen = sizeof(struct cld_msg), +}; + +/* v2 create/check ops include the principal, if available */ +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = { + .init = nfsd4_cld_tracking_init, + .exit = nfsd4_cld_tracking_exit, + .create = nfsd4_cld_create_v2, + .remove = nfsd4_cld_remove, + .check = nfsd4_cld_check_v2, + .grace_done = nfsd4_cld_grace_done, + .version = 2, + .msglen = sizeof(struct cld_msg_v2), }; /* upcall via usermodehelper */ @@ -1760,6 +2020,8 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .remove = nfsd4_umh_cltrack_remove, .check = nfsd4_umh_cltrack_check, .grace_done = nfsd4_umh_cltrack_grace_done, + .version = 1, + .msglen = 0, }; int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 7857942c5ca6..c65aeaa812d4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -50,6 +50,7 @@ #include "netns.h" #include "pnfs.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -429,18 +430,18 @@ put_nfs4_file(struct nfs4_file *fi) } } -static struct file * +static struct nfsd_file * __nfs4_get_fd(struct nfs4_file *f, int oflag) { if (f->fi_fds[oflag]) - return get_file(f->fi_fds[oflag]); + return nfsd_file_get(f->fi_fds[oflag]); return NULL; } -static struct file * +static struct nfsd_file * find_writeable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); @@ -450,10 +451,10 @@ find_writeable_file_locked(struct nfs4_file *f) return ret; } -static struct file * +static struct nfsd_file * find_writeable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_writeable_file_locked(f); @@ -462,9 +463,10 @@ find_writeable_file(struct nfs4_file *f) return ret; } -static struct file *find_readable_file_locked(struct nfs4_file *f) +static struct nfsd_file * +find_readable_file_locked(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; lockdep_assert_held(&f->fi_lock); @@ -474,10 +476,10 @@ static struct file *find_readable_file_locked(struct nfs4_file *f) return ret; } -static struct file * +static struct nfsd_file * find_readable_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = find_readable_file_locked(f); @@ -486,10 +488,10 @@ find_readable_file(struct nfs4_file *f) return ret; } -struct file * +struct nfsd_file * find_any_file(struct nfs4_file *f) { - struct file *ret; + struct nfsd_file *ret; spin_lock(&f->fi_lock); ret = __nfs4_get_fd(f, O_RDWR); @@ -590,17 +592,17 @@ static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) might_lock(&fp->fi_lock); if (atomic_dec_and_lock(&fp->fi_access[oflag], &fp->fi_lock)) { - struct file *f1 = NULL; - struct file *f2 = NULL; + struct nfsd_file *f1 = NULL; + struct nfsd_file *f2 = NULL; swap(f1, fp->fi_fds[oflag]); if (atomic_read(&fp->fi_access[1 - oflag]) == 0) swap(f2, fp->fi_fds[O_RDWR]); spin_unlock(&fp->fi_lock); if (f1) - fput(f1); + nfsd_file_put(f1); if (f2) - fput(f2); + nfsd_file_put(f2); } } @@ -933,25 +935,25 @@ nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid) static void put_deleg_file(struct nfs4_file *fp) { - struct file *filp = NULL; + struct nfsd_file *nf = NULL; spin_lock(&fp->fi_lock); if (--fp->fi_delegees == 0) - swap(filp, fp->fi_deleg_file); + swap(nf, fp->fi_deleg_file); spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); } static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file *filp = fp->fi_deleg_file; + struct nfsd_file *nf = fp->fi_deleg_file; WARN_ON_ONCE(!fp->fi_delegees); - vfs_setlease(filp, F_UNLCK, NULL, (void **)&dp); + vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } @@ -1289,11 +1291,14 @@ static void nfs4_free_lock_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); struct nfs4_lockowner *lo = lockowner(stp->st_stateowner); - struct file *file; + struct nfsd_file *nf; - file = find_any_file(stp->st_stid.sc_file); - if (file) - filp_close(file, (fl_owner_t)lo); + nf = find_any_file(stp->st_stid.sc_file); + if (nf) { + get_file(nf->nf_file); + filp_close(nf->nf_file, (fl_owner_t)lo); + nfsd_file_put(nf); + } nfs4_free_ol_stateid(stid); } @@ -1563,21 +1568,39 @@ static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) * re-negotiate active sessions and reduce their slot usage to make * room for new connections. For now we just fail the create session. */ -static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) +static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca, struct nfsd_net *nn) { u32 slotsize = slot_bytes(ca); u32 num = ca->maxreqs; unsigned long avail, total_avail; + unsigned int scale_factor; spin_lock(&nfsd_drc_lock); - total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + if (nfsd_drc_max_mem > nfsd_drc_mem_used) + total_avail = nfsd_drc_max_mem - nfsd_drc_mem_used; + else + /* We have handed out more space than we chose in + * set_max_drc() to allow. That isn't really a + * problem as long as that doesn't make us think we + * have lots more due to integer overflow. + */ + total_avail = 0; avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, total_avail); /* - * Never use more than a third of the remaining memory, - * unless it's the only way to give this client a slot: + * Never use more than a fraction of the remaining memory, + * unless it's the only way to give this client a slot. + * The chosen fraction is either 1/8 or 1/number of threads, + * whichever is smaller. This ensures there are adequate + * slots to support multiple clients per thread. + * Give the client one slot even if that would require + * over-allocation--it is better than failure. */ - avail = clamp_t(unsigned long, avail, slotsize, total_avail/3); + scale_factor = max_t(unsigned int, 8, nn->nfsd_serv->sv_nrthreads); + + avail = clamp_t(unsigned long, avail, slotsize, + total_avail/scale_factor); num = min_t(int, num, avail / slotsize); + num = max_t(int, num, 1); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -2323,9 +2346,9 @@ static void states_stop(struct seq_file *s, void *v) spin_unlock(&clp->cl_lock); } -static void nfs4_show_superblock(struct seq_file *s, struct file *f) +static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f) { - struct inode *inode = file_inode(f); + struct inode *inode = f->nf_inode; seq_printf(s, "superblock: \"%02x:%02x:%ld\"", MAJOR(inode->i_sb->s_dev), @@ -2343,7 +2366,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; struct nfs4_stateowner *oo; unsigned int access, deny; @@ -2370,7 +2393,7 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - fput(file); + nfsd_file_put(file); return 0; } @@ -2379,7 +2402,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_ol_stateid *ols; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; struct nfs4_stateowner *oo; ols = openlockstateid(st); @@ -2401,7 +2424,7 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st) seq_printf(s, ", "); nfs4_show_owner(s, oo); seq_printf(s, " }\n"); - fput(file); + nfsd_file_put(file); return 0; } @@ -2410,7 +2433,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_delegation *ds; struct nfs4_file *nf; - struct file *file; + struct nfsd_file *file; ds = delegstateid(st); nf = st->sc_file; @@ -2433,7 +2456,7 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st) static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st) { struct nfs4_layout_stateid *ls; - struct file *file; + struct nfsd_file *file; ls = container_of(st, struct nfs4_layout_stateid, ls_stid); file = ls->ls_file; @@ -3169,10 +3192,10 @@ static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs *ca, struct nfs * performance. When short on memory we therefore prefer to * decrease number of slots instead of their size. Clients that * request larger slots than they need will get poor results: + * Note that we always allow at least one slot, because our + * accounting is soft and provides no guarantees either way. */ - ca->maxreqs = nfsd4_get_drc_mem(ca); - if (!ca->maxreqs) - return nfserr_jukebox; + ca->maxreqs = nfsd4_get_drc_mem(ca, nn); return nfs_ok; } @@ -4651,7 +4674,7 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { - struct file *filp = NULL; + struct nfsd_file *nf = NULL; __be32 status; int oflag = nfs4_access_to_omode(open->op_share_access); int access = nfs4_access_to_access(open->op_share_access); @@ -4687,18 +4710,18 @@ static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp, if (!fp->fi_fds[oflag]) { spin_unlock(&fp->fi_lock); - status = nfsd_open(rqstp, cur_fh, S_IFREG, access, &filp); + status = nfsd_file_acquire(rqstp, cur_fh, access, &nf); if (status) goto out_put_access; spin_lock(&fp->fi_lock); if (!fp->fi_fds[oflag]) { - fp->fi_fds[oflag] = filp; - filp = NULL; + fp->fi_fds[oflag] = nf; + nf = NULL; } } spin_unlock(&fp->fi_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); status = nfsd4_truncate(rqstp, cur_fh, open); if (status) @@ -4767,7 +4790,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp, fl->fl_end = OFFSET_MAX; fl->fl_owner = (fl_owner_t)dp; fl->fl_pid = current->tgid; - fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file; + fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file; return fl; } @@ -4777,7 +4800,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, { int status = 0; struct nfs4_delegation *dp; - struct file *filp; + struct nfsd_file *nf; struct file_lock *fl; /* @@ -4788,8 +4811,8 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - filp = find_readable_file(fp); - if (!filp) { + nf = find_readable_file(fp); + if (!nf) { /* We should always have a readable file here */ WARN_ON_ONCE(1); return ERR_PTR(-EBADF); @@ -4799,17 +4822,17 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (nfs4_delegation_exists(clp, fp)) status = -EAGAIN; else if (!fp->fi_deleg_file) { - fp->fi_deleg_file = filp; + fp->fi_deleg_file = nf; /* increment early to prevent fi_deleg_file from being * cleared */ fp->fi_delegees = 1; - filp = NULL; + nf = NULL; } else fp->fi_delegees++; spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (status) return ERR_PTR(status); @@ -4822,7 +4845,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (!fl) goto out_clnt_odstate; - status = vfs_setlease(fp->fi_deleg_file, fl->fl_type, &fl, NULL); + status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL); if (fl) locks_free_lock(fl); if (status) @@ -4842,7 +4865,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, return dp; out_unlock: - vfs_setlease(fp->fi_deleg_file, F_UNLCK, NULL, (void **)&dp); + vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp); out_clnt_odstate: put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); @@ -5513,7 +5536,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, return nfs_ok; } -static struct file * +static struct nfsd_file * nfs4_find_file(struct nfs4_stid *s, int flags) { if (!s) @@ -5523,7 +5546,7 @@ nfs4_find_file(struct nfs4_stid *s, int flags) case NFS4_DELEG_STID: if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) return NULL; - return get_file(s->sc_file->fi_deleg_file); + return nfsd_file_get(s->sc_file->fi_deleg_file); case NFS4_OPEN_STID: case NFS4_LOCK_STID: if (flags & RD_STATE) @@ -5549,32 +5572,28 @@ nfs4_check_olstateid(struct nfs4_ol_stateid *ols, int flags) static __be32 nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, - struct file **filpp, bool *tmp_file, int flags) + struct nfsd_file **nfp, int flags) { int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; - struct file *file; + struct nfsd_file *nf; __be32 status; - file = nfs4_find_file(s, flags); - if (file) { + nf = nfs4_find_file(s, flags); + if (nf) { status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, acc | NFSD_MAY_OWNER_OVERRIDE); if (status) { - fput(file); - return status; + nfsd_file_put(nf); + goto out; } - - *filpp = file; } else { - status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp); + status = nfsd_file_acquire(rqstp, fhp, acc, &nf); if (status) return status; - - if (tmp_file) - *tmp_file = true; } - - return 0; + *nfp = nf; +out: + return status; } /* @@ -5583,7 +5602,7 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file) + stateid_t *stateid, int flags, struct nfsd_file **nfp) { struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); @@ -5591,10 +5610,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfs4_stid *s = NULL; __be32 status; - if (filpp) - *filpp = NULL; - if (tmp_file) - *tmp_file = false; + if (nfp) + *nfp = NULL; if (grace_disallows_io(net, ino)) return nfserr_grace; @@ -5631,8 +5648,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, status = nfs4_check_fh(fhp, s); done: - if (!status && filpp) - status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags); + if (status == nfs_ok && nfp) + status = nfs4_check_file(rqstp, fhp, s, nfp, flags); out: if (s) nfs4_put_stid(s); @@ -6392,7 +6409,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *lock_stp = NULL; struct nfs4_ol_stateid *open_stp = NULL; struct nfs4_file *fp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; @@ -6474,8 +6491,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Fallthrough */ case NFS4_READ_LT: spin_lock(&fp->fi_lock); - filp = find_readable_file_locked(fp); - if (filp) + nf = find_readable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ); spin_unlock(&fp->fi_lock); fl_type = F_RDLCK; @@ -6486,8 +6503,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Fallthrough */ case NFS4_WRITE_LT: spin_lock(&fp->fi_lock); - filp = find_writeable_file_locked(fp); - if (filp) + nf = find_writeable_file_locked(fp); + if (nf) get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE); spin_unlock(&fp->fi_lock); fl_type = F_WRLCK; @@ -6497,7 +6514,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; } - if (!filp) { + if (!nf) { status = nfserr_openmode; goto out; } @@ -6513,7 +6530,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock->fl_type = fl_type; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; + file_lock->fl_file = nf->nf_file; file_lock->fl_flags = fl_flags; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = lock->lk_offset; @@ -6535,7 +6552,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, spin_unlock(&nn->blocked_locks_lock); } - err = vfs_lock_file(filp, F_SETLK, file_lock, conflock); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, conflock); switch (err) { case 0: /* success! */ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid); @@ -6570,8 +6587,8 @@ out: } free_blocked_lock(nbl); } - if (filp) - fput(filp); + if (nf) + nfsd_file_put(nf); if (lock_stp) { /* Bump seqid manually if the 4.0 replay owner is openowner */ if (cstate->replay_owner && @@ -6606,11 +6623,11 @@ out: */ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { - struct file *file; - __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + struct nfsd_file *nf; + __be32 err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (!err) { - err = nfserrno(vfs_test_lock(file, lock)); - fput(file); + err = nfserrno(vfs_test_lock(nf->nf_file, lock)); + nfsd_file_put(nf); } return err; } @@ -6698,7 +6715,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct nfsd4_locku *locku = &u->locku; struct nfs4_ol_stateid *stp; - struct file *filp = NULL; + struct nfsd_file *nf = NULL; struct file_lock *file_lock = NULL; __be32 status; int err; @@ -6716,8 +6733,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &stp, nn); if (status) goto out; - filp = find_any_file(stp->st_stid.sc_file); - if (!filp) { + nf = find_any_file(stp->st_stid.sc_file); + if (!nf) { status = nfserr_lock_range; goto put_stateid; } @@ -6725,13 +6742,13 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); status = nfserr_jukebox; - goto fput; + goto put_file; } file_lock->fl_type = F_UNLCK; file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); file_lock->fl_pid = current->tgid; - file_lock->fl_file = filp; + file_lock->fl_file = nf->nf_file; file_lock->fl_flags = FL_POSIX; file_lock->fl_lmops = &nfsd_posix_mng_ops; file_lock->fl_start = locku->lu_offset; @@ -6740,14 +6757,14 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, locku->lu_length); nfs4_transform_lock_offset(file_lock); - err = vfs_lock_file(filp, F_SETLK, file_lock, NULL); + err = vfs_lock_file(nf->nf_file, F_SETLK, file_lock, NULL); if (err) { dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n"); goto out_nfserr; } nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid); -fput: - fput(filp); +put_file: + nfsd_file_put(nf); put_stateid: mutex_unlock(&stp->st_mutex); nfs4_put_stid(&stp->st_stid); @@ -6759,7 +6776,7 @@ out: out_nfserr: status = nfserrno(err); - goto fput; + goto put_file; } /* @@ -6772,17 +6789,17 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) { struct file_lock *fl; int status = false; - struct file *filp = find_any_file(fp); + struct nfsd_file *nf = find_any_file(fp); struct inode *inode; struct file_lock_context *flctx; - if (!filp) { + if (!nf) { /* Any valid lock stateid should have some sort of access */ WARN_ON_ONCE(1); return status; } - inode = locks_inode(filp); + inode = locks_inode(nf->nf_file); flctx = inode->i_flctx; if (flctx && !list_empty_careful(&flctx->flc_posix)) { @@ -6795,7 +6812,7 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) } spin_unlock(&flctx->flc_lock); } - fput(filp); + nfsd_file_put(nf); return status; } @@ -6888,7 +6905,8 @@ nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn) * will be freed in nfs4_remove_reclaim_record in the normal case). */ struct nfs4_client_reclaim * -nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn) +nfs4_client_to_reclaim(struct xdr_netobj name, struct xdr_netobj princhash, + struct nfsd_net *nn) { unsigned int strhashval; struct nfs4_client_reclaim *crp; @@ -6901,6 +6919,8 @@ nfs4_client_to_reclaim(struct xdr_netobj name, struct nfsd_net *nn) list_add(&crp->cr_strhash, &nn->reclaim_str_hashtbl[strhashval]); crp->cr_name.data = name.data; crp->cr_name.len = name.len; + crp->cr_princhash.data = princhash.data; + crp->cr_princhash.len = princhash.len; crp->cr_clp = NULL; nn->reclaim_str_hashtbl_size++; } @@ -6912,6 +6932,7 @@ nfs4_remove_reclaim_record(struct nfs4_client_reclaim *crp, struct nfsd_net *nn) { list_del(&crp->cr_strhash); kfree(crp->cr_name.data); + kfree(crp->cr_princhash.data); kfree(crp); nn->reclaim_str_hashtbl_size--; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 442811809f3d..533d0fc3c96b 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -49,6 +49,7 @@ #include "cache.h" #include "netns.h" #include "pnfs.h" +#include "filecache.h" #ifdef CONFIG_NFSD_V4_SECURITY_LABEL #include <linux/security.h> @@ -203,6 +204,13 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) return p; } +static unsigned int compoundargs_bytes_left(struct nfsd4_compoundargs *argp) +{ + unsigned int this = (char *)argp->end - (char *)argp->p; + + return this + argp->pagelen; +} + static int zero_clientid(clientid_t *clid) { return (clid->cl_boot == 0) && (clid->cl_id == 0); @@ -211,10 +219,10 @@ static int zero_clientid(clientid_t *clid) /** * svcxdr_tmpalloc - allocate memory to be freed after compound processing * @argp: NFSv4 compound argument structure - * @p: pointer to be freed (with kfree()) + * @len: length of buffer to allocate * - * Marks @p to be freed when processing the compound operation - * described in @argp finishes. + * Allocates a buffer of size @len to be freed when processing the compound + * operation described in @argp finishes. */ static void * svcxdr_tmpalloc(struct nfsd4_compoundargs *argp, u32 len) @@ -347,7 +355,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, READ_BUF(4); len += 4; nace = be32_to_cpup(p++); - if (nace > NFS4_ACL_MAX) + if (nace > compoundargs_bytes_left(argp)/20) + /* + * Even with 4-byte names there wouldn't be + * space for that many aces; something fishy is + * going on: + */ return nfserr_fbig; *acl = svcxdr_tmpalloc(argp, nfs4_acl_bytes(nace)); @@ -1418,7 +1431,6 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, struct nfsd4_create_session *sess) { DECODE_HEAD; - u32 dummy; READ_BUF(16); COPYMEM(&sess->clientid, 8); @@ -1427,7 +1439,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Fore channel attrs */ READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + p++; /* headerpadsz is always 0 */ sess->fore_channel.maxreq_sz = be32_to_cpup(p++); sess->fore_channel.maxresp_sz = be32_to_cpup(p++); sess->fore_channel.maxresp_cached = be32_to_cpup(p++); @@ -1444,7 +1456,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp, /* Back channel attrs */ READ_BUF(28); - dummy = be32_to_cpup(p++); /* headerpadsz is always 0 */ + p++; /* headerpadsz is always 0 */ sess->back_channel.maxreq_sz = be32_to_cpup(p++); sess->back_channel.maxresp_sz = be32_to_cpup(p++); sess->back_channel.maxresp_cached = be32_to_cpup(p++); @@ -1736,7 +1748,6 @@ static __be32 nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) { DECODE_HEAD; - unsigned int tmp; status = nfsd4_decode_stateid(argp, ©->cp_src_stateid); if (status) @@ -1751,7 +1762,7 @@ nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy) p = xdr_decode_hyper(p, ©->cp_count); p++; /* ca_consecutive: we always do consecutive copies */ copy->cp_synchronous = be32_to_cpup(p++); - tmp = be32_to_cpup(p); /* Source server list not supported */ + /* tmp = be32_to_cpup(p); Source server list not supported */ DECODE_TAIL; } @@ -3217,9 +3228,8 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ if (!p) return nfserr_resource; encode_cinfo(p, &create->cr_cinfo); - nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0], + return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], create->cr_bmval[1], create->cr_bmval[2]); - return 0; } static __be32 @@ -3462,7 +3472,7 @@ static __be32 nfsd4_encode_splice_read( len = maxcount; nfserr = nfsd_splice_read(read->rd_rqstp, read->rd_fhp, - file, read->rd_offset, &maxcount); + file, read->rd_offset, &maxcount, &eof); read->rd_length = maxcount; if (nfserr) { /* @@ -3474,9 +3484,6 @@ static __be32 nfsd4_encode_splice_read( return nfserr; } - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - *(p++) = htonl(eof); *(p++) = htonl(maxcount); @@ -3547,15 +3554,13 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, len = maxcount; nfserr = nfsd_readv(resp->rqstp, read->rd_fhp, file, read->rd_offset, - resp->rqstp->rq_vec, read->rd_vlen, &maxcount); + resp->rqstp->rq_vec, read->rd_vlen, &maxcount, + &eof); read->rd_length = maxcount; if (nfserr) return nfserr; xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); - eof = nfsd_eof_on_read(len, maxcount, read->rd_offset, - d_inode(read->rd_fhp->fh_dentry)->i_size); - tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); tmp = htonl(maxcount); @@ -3574,11 +3579,14 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, { unsigned long maxcount; struct xdr_stream *xdr = &resp->xdr; - struct file *file = read->rd_filp; + struct file *file; int starting_len = xdr->buf->len; - struct raparms *ra = NULL; __be32 *p; + if (nfserr) + return nfserr; + file = read->rd_nf->nf_file; + p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); @@ -3596,18 +3604,12 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, (xdr->buf->buflen - xdr->buf->len)); maxcount = min_t(unsigned long, maxcount, read->rd_length); - if (read->rd_tmp_file) - ra = nfsd_init_raparms(file); - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); else nfserr = nfsd4_encode_readv(resp, read, file, maxcount); - if (ra) - nfsd_put_raparams(file, ra); - if (nfserr) xdr_truncate_encode(xdr, starting_len); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2c215171c0eb..11b42c523f04 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1476,6 +1476,7 @@ static __net_init int nfsd_init_net(struct net *net) atomic_set(&nn->ntf_refcnt, 0); init_waitqueue_head(&nn->ntf_wq); + seqlock_init(&nn->boot_lock); mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL); if (IS_ERR(mnt)) { diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0d20fd161225..c83ddac22f38 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -172,6 +172,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) struct nfsd_readargs *argp = rqstp->rq_argp; struct nfsd_readres *resp = rqstp->rq_resp; __be32 nfserr; + u32 eof; dprintk("nfsd: READ %s %d bytes at %d\n", SVCFH_fmt(&argp->fh), @@ -195,7 +196,8 @@ nfsd_proc_read(struct svc_rqst *rqstp) nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, argp->vlen, - &resp->count); + &resp->count, + &eof); if (nfserr) return nfserr; return fh_getattr(&resp->fh, &resp->stat); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 18d94ea984ba..fdf7ed4bd5dd 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -27,6 +27,7 @@ #include "cache.h" #include "vfs.h" #include "netns.h" +#include "filecache.h" #define NFSDDBG_FACILITY NFSDDBG_SVC @@ -313,22 +314,17 @@ static int nfsd_startup_generic(int nrservs) if (nfsd_users++) return 0; - /* - * Readahead param cache - will no-op if it already exists. - * (Note therefore results will be suboptimal if number of - * threads is modified after nfsd start.) - */ - ret = nfsd_racache_init(2*nrservs); + ret = nfsd_file_cache_init(); if (ret) goto dec_users; ret = nfs4_state_start(); if (ret) - goto out_racache; + goto out_file_cache; return 0; -out_racache: - nfsd_racache_shutdown(); +out_file_cache: + nfsd_file_cache_shutdown(); dec_users: nfsd_users--; return ret; @@ -340,7 +336,7 @@ static void nfsd_shutdown_generic(void) return; nfs4_state_shutdown(); - nfsd_racache_shutdown(); + nfsd_file_cache_shutdown(); } static bool nfsd_needs_lockd(struct nfsd_net *nn) @@ -348,6 +344,35 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn) return nfsd_vers(nn, 2, NFSD_TEST) || nfsd_vers(nn, 3, NFSD_TEST); } +void nfsd_copy_boot_verifier(__be32 verf[2], struct nfsd_net *nn) +{ + int seq = 0; + + do { + read_seqbegin_or_lock(&nn->boot_lock, &seq); + /* + * This is opaque to client, so no need to byte-swap. Use + * __force to keep sparse happy. y2038 time_t overflow is + * irrelevant in this usage + */ + verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; + } while (need_seqretry(&nn->boot_lock, seq)); + done_seqretry(&nn->boot_lock, seq); +} + +static void nfsd_reset_boot_verifier_locked(struct nfsd_net *nn) +{ + ktime_get_real_ts64(&nn->nfssvc_boot); +} + +void nfsd_reset_boot_verifier(struct nfsd_net *nn) +{ + write_seqlock(&nn->boot_lock); + nfsd_reset_boot_verifier_locked(nn); + write_sequnlock(&nn->boot_lock); +} + static int nfsd_startup_net(int nrservs, struct net *net, const struct cred *cred) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); @@ -391,6 +416,7 @@ static void nfsd_shutdown_net(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + nfsd_file_cache_purge(net); nfs4_state_shutdown_net(net); if (nn->lockd_up) { lockd_down(net); @@ -599,7 +625,7 @@ int nfsd_create_serv(struct net *net) #endif } atomic_inc(&nn->ntf_refcnt); - ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */ + nfsd_reset_boot_verifier(nn); return 0; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 5dbd16946e8e..46f56afb6cb8 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -378,6 +378,7 @@ struct nfs4_client_reclaim { struct list_head cr_strhash; /* hash by cr_name */ struct nfs4_client *cr_clp; /* pointer to associated clp */ struct xdr_netobj cr_name; /* recovery dir name */ + struct xdr_netobj cr_princhash; }; /* A reasonable value for REPLAY_ISIZE was estimated as follows: @@ -506,7 +507,7 @@ struct nfs4_file { }; struct list_head fi_clnt_odstate; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ - struct file * fi_fds[3]; + struct nfsd_file *fi_fds[3]; /* * Each open or lock stateid contributes 0-4 to the counts * below depending on which bits are set in st_access_bitmap: @@ -516,7 +517,7 @@ struct nfs4_file { */ atomic_t fi_access[2]; u32 fi_share_deny; - struct file *fi_deleg_file; + struct nfsd_file *fi_deleg_file; int fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; @@ -565,7 +566,7 @@ struct nfs4_layout_stateid { spinlock_t ls_lock; struct list_head ls_layouts; u32 ls_layout_type; - struct file *ls_file; + struct nfsd_file *ls_file; struct nfsd4_callback ls_recall; stateid_t ls_recall_sid; bool ls_recalled; @@ -616,7 +617,7 @@ struct nfsd4_copy; extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct svc_fh *fhp, - stateid_t *stateid, int flags, struct file **filp, bool *tmp_file); + stateid_t *stateid, int flags, struct nfsd_file **filp); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); @@ -645,7 +646,7 @@ extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); extern void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp); extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name, - struct nfsd_net *nn); + struct xdr_netobj princhash, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn); struct nfs4_file *find_file(struct knfsd_fh *fh); @@ -657,7 +658,7 @@ static inline void get_nfs4_file(struct nfs4_file *fi) { refcount_inc(&fi->fi_ref); } -struct file *find_any_file(struct nfs4_file *f); +struct nfsd_file *find_any_file(struct nfs4_file *f); /* grace period management */ void nfsd4_end_grace(struct nfsd_net *nn); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 80933e4334d8..ffc78a0e28b2 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -126,6 +126,8 @@ DEFINE_NFSD_ERR_EVENT(read_err); DEFINE_NFSD_ERR_EVENT(write_err); #include "state.h" +#include "filecache.h" +#include "vfs.h" DECLARE_EVENT_CLASS(nfsd_stateid_class, TP_PROTO(stateid_t *stp), @@ -164,6 +166,144 @@ DEFINE_STATEID_EVENT(layout_recall_done); DEFINE_STATEID_EVENT(layout_recall_fail); DEFINE_STATEID_EVENT(layout_recall_release); +#define show_nf_flags(val) \ + __print_flags(val, "|", \ + { 1 << NFSD_FILE_HASHED, "HASHED" }, \ + { 1 << NFSD_FILE_PENDING, "PENDING" }, \ + { 1 << NFSD_FILE_BREAK_READ, "BREAK_READ" }, \ + { 1 << NFSD_FILE_BREAK_WRITE, "BREAK_WRITE" }, \ + { 1 << NFSD_FILE_REFERENCED, "REFERENCED"}) + +/* FIXME: This should probably be fleshed out in the future. */ +#define show_nf_may(val) \ + __print_flags(val, "|", \ + { NFSD_MAY_READ, "READ" }, \ + { NFSD_MAY_WRITE, "WRITE" }, \ + { NFSD_MAY_NOT_BREAK_LEASE, "NOT_BREAK_LEASE" }) + +DECLARE_EVENT_CLASS(nfsd_file_class, + TP_PROTO(struct nfsd_file *nf), + TP_ARGS(nf), + TP_STRUCT__entry( + __field(unsigned int, nf_hashval) + __field(void *, nf_inode) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + ), + TP_fast_assign( + __entry->nf_hashval = nf->nf_hashval; + __entry->nf_inode = nf->nf_inode; + __entry->nf_ref = atomic_read(&nf->nf_ref); + __entry->nf_flags = nf->nf_flags; + __entry->nf_may = nf->nf_may; + __entry->nf_file = nf->nf_file; + ), + TP_printk("hash=0x%x inode=0x%p ref=%d flags=%s may=%s file=%p", + __entry->nf_hashval, + __entry->nf_inode, + __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), + __entry->nf_file) +) + +#define DEFINE_NFSD_FILE_EVENT(name) \ +DEFINE_EVENT(nfsd_file_class, name, \ + TP_PROTO(struct nfsd_file *nf), \ + TP_ARGS(nf)) + +DEFINE_NFSD_FILE_EVENT(nfsd_file_alloc); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put_final); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash); +DEFINE_NFSD_FILE_EVENT(nfsd_file_put); +DEFINE_NFSD_FILE_EVENT(nfsd_file_unhash_and_release_locked); + +TRACE_EVENT(nfsd_file_acquire, + TP_PROTO(struct svc_rqst *rqstp, unsigned int hash, + struct inode *inode, unsigned int may_flags, + struct nfsd_file *nf, __be32 status), + + TP_ARGS(rqstp, hash, inode, may_flags, nf, status), + + TP_STRUCT__entry( + __field(__be32, xid) + __field(unsigned int, hash) + __field(void *, inode) + __field(unsigned int, may_flags) + __field(int, nf_ref) + __field(unsigned long, nf_flags) + __field(unsigned char, nf_may) + __field(struct file *, nf_file) + __field(__be32, status) + ), + + TP_fast_assign( + __entry->xid = rqstp->rq_xid; + __entry->hash = hash; + __entry->inode = inode; + __entry->may_flags = may_flags; + __entry->nf_ref = nf ? atomic_read(&nf->nf_ref) : 0; + __entry->nf_flags = nf ? nf->nf_flags : 0; + __entry->nf_may = nf ? nf->nf_may : 0; + __entry->nf_file = nf ? nf->nf_file : NULL; + __entry->status = status; + ), + + TP_printk("xid=0x%x hash=0x%x inode=0x%p may_flags=%s ref=%d nf_flags=%s nf_may=%s nf_file=0x%p status=%u", + be32_to_cpu(__entry->xid), __entry->hash, __entry->inode, + show_nf_may(__entry->may_flags), __entry->nf_ref, + show_nf_flags(__entry->nf_flags), + show_nf_may(__entry->nf_may), __entry->nf_file, + be32_to_cpu(__entry->status)) +); + +DECLARE_EVENT_CLASS(nfsd_file_search_class, + TP_PROTO(struct inode *inode, unsigned int hash, int found), + TP_ARGS(inode, hash, found), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, hash) + __field(int, found) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->hash = hash; + __entry->found = found; + ), + TP_printk("hash=0x%x inode=0x%p found=%d", __entry->hash, + __entry->inode, __entry->found) +); + +#define DEFINE_NFSD_FILE_SEARCH_EVENT(name) \ +DEFINE_EVENT(nfsd_file_search_class, name, \ + TP_PROTO(struct inode *inode, unsigned int hash, int found), \ + TP_ARGS(inode, hash, found)) + +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode_sync); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_close_inode); +DEFINE_NFSD_FILE_SEARCH_EVENT(nfsd_file_is_cached); + +TRACE_EVENT(nfsd_file_fsnotify_handle_event, + TP_PROTO(struct inode *inode, u32 mask), + TP_ARGS(inode, mask), + TP_STRUCT__entry( + __field(struct inode *, inode) + __field(unsigned int, nlink) + __field(umode_t, mode) + __field(u32, mask) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->nlink = inode->i_nlink; + __entry->mode = inode->i_mode; + __entry->mask = mask; + ), + TP_printk("inode=0x%p nlink=%u mode=0%ho mask=0x%x", __entry->inode, + __entry->nlink, __entry->mode, __entry->mask) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index c85783e536d5..bd0a385df3fc 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -44,38 +44,11 @@ #include "nfsd.h" #include "vfs.h" +#include "filecache.h" #include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP - -/* - * This is a cache of readahead params that help us choose the proper - * readahead strategy. Initially, we set all readahead parameters to 0 - * and let the VFS handle things. - * If you increase the number of cached files very much, you'll need to - * add a hash table here. - */ -struct raparms { - struct raparms *p_next; - unsigned int p_count; - ino_t p_ino; - dev_t p_dev; - int p_set; - struct file_ra_state p_ra; - unsigned int p_hindex; -}; - -struct raparm_hbucket { - struct raparms *pb_head; - spinlock_t pb_lock; -} ____cacheline_aligned_in_smp; - -#define RAPARM_HASH_BITS 4 -#define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) -#define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) -static struct raparm_hbucket raparm_hash[RAPARM_HASH_SIZE]; - /* * Called from nfsd_lookup and encode_dirent. Check if we have crossed * a mount point. @@ -699,7 +672,7 @@ nfsd_access(struct svc_rqst *rqstp, struct svc_fh *fhp, u32 *access, u32 *suppor } #endif /* CONFIG_NFSD_V3 */ -static int nfsd_open_break_lease(struct inode *inode, int access) +int nfsd_open_break_lease(struct inode *inode, int access) { unsigned int mode; @@ -715,8 +688,8 @@ static int nfsd_open_break_lease(struct inode *inode, int access) * and additional flags. * N.B. After this call fhp needs an fh_put */ -__be32 -nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, +static __be32 +__nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { struct path path; @@ -726,25 +699,6 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, __be32 err; int host_err = 0; - validate_process_creds(); - - /* - * If we get here, then the client has already done an "open", - * and (hopefully) checked permission - so allow OWNER_OVERRIDE - * in case a chmod has now revoked permission. - * - * Arguably we should also allow the owner override for - * directories, but we never have and it doesn't seem to have - * caused anyone a problem. If we were to change this, note - * also that our filldir callbacks would need a variant of - * lookup_one_len that doesn't check permissions. - */ - if (type == S_IFREG) - may_flags |= NFSD_MAY_OWNER_OVERRIDE; - err = fh_verify(rqstp, fhp, type, may_flags); - if (err) - goto out; - path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; inode = d_inode(path.dentry); @@ -798,67 +752,46 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, out_nfserr: err = nfserrno(host_err); out: - validate_process_creds(); return err; } -struct raparms * -nfsd_init_raparms(struct file *file) +__be32 +nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) { - struct inode *inode = file_inode(file); - dev_t dev = inode->i_sb->s_dev; - ino_t ino = inode->i_ino; - struct raparms *ra, **rap, **frap = NULL; - int depth = 0; - unsigned int hash; - struct raparm_hbucket *rab; - - hash = jhash_2words(dev, ino, 0xfeedbeef) & RAPARM_HASH_MASK; - rab = &raparm_hash[hash]; - - spin_lock(&rab->pb_lock); - for (rap = &rab->pb_head; (ra = *rap); rap = &ra->p_next) { - if (ra->p_ino == ino && ra->p_dev == dev) - goto found; - depth++; - if (ra->p_count == 0) - frap = rap; - } - depth = nfsdstats.ra_size; - if (!frap) { - spin_unlock(&rab->pb_lock); - return NULL; - } - rap = frap; - ra = *frap; - ra->p_dev = dev; - ra->p_ino = ino; - ra->p_set = 0; - ra->p_hindex = hash; -found: - if (rap != &rab->pb_head) { - *rap = ra->p_next; - ra->p_next = rab->pb_head; - rab->pb_head = ra; - } - ra->p_count++; - nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; - spin_unlock(&rab->pb_lock); + __be32 err; - if (ra->p_set) - file->f_ra = ra->p_ra; - return ra; + validate_process_creds(); + /* + * If we get here, then the client has already done an "open", + * and (hopefully) checked permission - so allow OWNER_OVERRIDE + * in case a chmod has now revoked permission. + * + * Arguably we should also allow the owner override for + * directories, but we never have and it doesn't seem to have + * caused anyone a problem. If we were to change this, note + * also that our filldir callbacks would need a variant of + * lookup_one_len that doesn't check permissions. + */ + if (type == S_IFREG) + may_flags |= NFSD_MAY_OWNER_OVERRIDE; + err = fh_verify(rqstp, fhp, type, may_flags); + if (!err) + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + validate_process_creds(); + return err; } -void nfsd_put_raparams(struct file *file, struct raparms *ra) +__be32 +nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, + int may_flags, struct file **filp) { - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + __be32 err; - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); + validate_process_creds(); + err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + validate_process_creds(); + return err; } /* @@ -901,12 +834,23 @@ static int nfsd_direct_splice_actor(struct pipe_inode_info *pipe, return __splice_from_pipe(pipe, sd, nfsd_splice_actor); } +static u32 nfsd_eof_on_read(struct file *file, loff_t offset, ssize_t len, + size_t expected) +{ + if (expected != 0 && len == 0) + return 1; + if (offset+len >= i_size_read(file_inode(file))) + return 1; + return 0; +} + static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count, int host_err) + unsigned long *count, u32 *eof, ssize_t host_err) { if (host_err >= 0) { nfsdstats.io_read += host_err; + *eof = nfsd_eof_on_read(file, offset, host_err, *count); *count = host_err; fsnotify_access(file); trace_nfsd_read_io_done(rqstp, fhp, offset, *count); @@ -918,7 +862,8 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp, } __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file *file, loff_t offset, unsigned long *count) + struct file *file, loff_t offset, unsigned long *count, + u32 *eof) { struct splice_desc sd = { .len = 0, @@ -926,25 +871,27 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, .pos = offset, .u.data = rqstp, }; - int host_err; + ssize_t host_err; trace_nfsd_read_splice(rqstp, fhp, offset, *count); rqstp->rq_next_page = rqstp->rq_respages + 1; host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - struct kvec *vec, int vlen, unsigned long *count) + struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { struct iov_iter iter; - int host_err; + loff_t ppos = offset; + ssize_t host_err; trace_nfsd_read_vector(rqstp, fhp, offset, *count); iov_iter_kvec(&iter, READ, vec, vlen, *count); - host_err = vfs_iter_read(file, &iter, &offset, 0); - return nfsd_finish_read(rqstp, fhp, file, offset, count, host_err); + host_err = vfs_iter_read(file, &iter, &ppos, 0); + return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } /* @@ -1025,8 +972,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, nfsdstats.io_write += *cnt; fsnotify_modify(file); - if (stable && use_wgather) + if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); + if (host_err < 0) + nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp), + nfsd_net_id)); + } out_nfserr: if (host_err >= 0) { @@ -1047,27 +998,25 @@ out_nfserr: * N.B. After this call fhp needs an fh_put */ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, - loff_t offset, struct kvec *vec, int vlen, unsigned long *count) + loff_t offset, struct kvec *vec, int vlen, unsigned long *count, + u32 *eof) { + struct nfsd_file *nf; struct file *file; - struct raparms *ra; __be32 err; trace_nfsd_read_start(rqstp, fhp, offset, *count); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_READ, &nf); if (err) return err; - ra = nfsd_init_raparms(file); - + file = nf->nf_file; if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags)) - err = nfsd_splice_read(rqstp, fhp, file, offset, count); + err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof); else - err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count); + err = nfsd_readv(rqstp, fhp, file, offset, vec, vlen, count, eof); - if (ra) - nfsd_put_raparams(file, ra); - fput(file); + nfsd_file_put(nf); trace_nfsd_read_done(rqstp, fhp, offset, *count); @@ -1083,17 +1032,18 @@ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int stable) { - struct file *file = NULL; - __be32 err = 0; + struct nfsd_file *nf; + __be32 err; trace_nfsd_write_start(rqstp, fhp, offset, *cnt); - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); + err = nfsd_file_acquire(rqstp, fhp, NFSD_MAY_WRITE, &nf); if (err) goto out; - err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stable); - fput(file); + err = nfsd_vfs_write(rqstp, fhp, nf->nf_file, offset, vec, + vlen, cnt, stable); + nfsd_file_put(nf); out: trace_nfsd_write_done(rqstp, fhp, offset, *cnt); return err; @@ -1113,9 +1063,9 @@ __be32 nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long count) { - struct file *file; - loff_t end = LLONG_MAX; - __be32 err = nfserr_inval; + struct nfsd_file *nf; + loff_t end = LLONG_MAX; + __be32 err = nfserr_inval; if (offset < 0) goto out; @@ -1125,20 +1075,27 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; } - err = nfsd_open(rqstp, fhp, S_IFREG, - NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &file); + err = nfsd_file_acquire(rqstp, fhp, + NFSD_MAY_WRITE|NFSD_MAY_NOT_BREAK_LEASE, &nf); if (err) goto out; if (EX_ISSYNC(fhp->fh_export)) { - int err2 = vfs_fsync_range(file, offset, end, 0); + int err2 = vfs_fsync_range(nf->nf_file, offset, end, 0); - if (err2 != -EINVAL) - err = nfserrno(err2); - else + switch (err2) { + case 0: + break; + case -EINVAL: err = nfserr_notsupp; + break; + default: + err = nfserrno(err2); + nfsd_reset_boot_verifier(net_generic(nf->nf_net, + nfsd_net_id)); + } } - fput(file); + nfsd_file_put(nf); out: return err; } @@ -1659,6 +1616,26 @@ out_nfserr: goto out_unlock; } +static void +nfsd_close_cached_files(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + nfsd_file_close_inode_sync(inode); +} + +static bool +nfsd_has_cached_files(struct dentry *dentry) +{ + bool ret = false; + struct inode *inode = d_inode(dentry); + + if (inode && S_ISREG(inode->i_mode)) + ret = nfsd_file_is_cached(inode); + return ret; +} + /* * Rename a file * N.B. After this call _both_ ffhp and tfhp need an fh_put @@ -1671,6 +1648,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, struct inode *fdir, *tdir; __be32 err; int host_err; + bool has_cached = false; err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) @@ -1689,6 +1667,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; +retry: host_err = fh_want_write(ffhp); if (host_err) { err = nfserrno(host_err); @@ -1728,11 +1707,16 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) goto out_dput_new; - host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); - if (!host_err) { - host_err = commit_metadata(tfhp); - if (!host_err) - host_err = commit_metadata(ffhp); + if (nfsd_has_cached_files(ndentry)) { + has_cached = true; + goto out_dput_old; + } else { + host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); + if (!host_err) { + host_err = commit_metadata(tfhp); + if (!host_err) + host_err = commit_metadata(ffhp); + } } out_dput_new: dput(ndentry); @@ -1745,12 +1729,26 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, * as that would do the wrong thing if the two directories * were the same, so again we do it by hand. */ - fill_post_wcc(ffhp); - fill_post_wcc(tfhp); + if (!has_cached) { + fill_post_wcc(ffhp); + fill_post_wcc(tfhp); + } unlock_rename(tdentry, fdentry); ffhp->fh_locked = tfhp->fh_locked = false; fh_drop_write(ffhp); + /* + * If the target dentry has cached open files, then we need to try to + * close them prior to doing the rename. Flushing delayed fput + * shouldn't be done with locks held however, so we delay it until this + * point and then reattempt the whole shebang. + */ + if (has_cached) { + has_cached = false; + nfsd_close_cached_files(ndentry); + dput(ndentry); + goto retry; + } out: return err; } @@ -1797,10 +1795,13 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = d_inode(rdentry)->i_mode & S_IFMT; - if (type != S_IFDIR) + if (type != S_IFDIR) { + nfsd_close_cached_files(rdentry); host_err = vfs_unlink(dirp, rdentry, NULL); - else + } else { host_err = vfs_rmdir(dirp, rdentry); + } + if (!host_err) host_err = commit_metadata(fhp); dput(rdentry); @@ -2074,63 +2075,3 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return err? nfserrno(err) : 0; } - -void -nfsd_racache_shutdown(void) -{ - struct raparms *raparm, *last_raparm; - unsigned int i; - - dprintk("nfsd: freeing readahead buffers.\n"); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - raparm = raparm_hash[i].pb_head; - while(raparm) { - last_raparm = raparm; - raparm = raparm->p_next; - kfree(last_raparm); - } - raparm_hash[i].pb_head = NULL; - } -} -/* - * Initialize readahead param cache - */ -int -nfsd_racache_init(int cache_size) -{ - int i; - int j = 0; - int nperbucket; - struct raparms **raparm = NULL; - - - if (raparm_hash[0].pb_head) - return 0; - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - nperbucket = max(2, nperbucket); - cache_size = nperbucket * RAPARM_HASH_SIZE; - - dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - - for (i = 0; i < RAPARM_HASH_SIZE; i++) { - spin_lock_init(&raparm_hash[i].pb_lock); - - raparm = &raparm_hash[i].pb_head; - for (j = 0; j < nperbucket; j++) { - *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); - if (!*raparm) - goto out_nomem; - raparm = &(*raparm)->p_next; - } - *raparm = NULL; - } - - nfsdstats.ra_size = cache_size; - return 0; - -out_nomem: - dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); - nfsd_racache_shutdown(); - return -ENOMEM; -} diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index db351247892d..a13fd9d7e1f5 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -40,8 +40,6 @@ typedef int (*nfsd_filldir_t)(void *, const char *, int, loff_t, u64, unsigned); /* nfsd/vfs.c */ -int nfsd_racache_init(int); -void nfsd_racache_shutdown(void); int nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, struct svc_export **expp); __be32 nfsd_lookup(struct svc_rqst *, struct svc_fh *, @@ -75,18 +73,23 @@ __be32 do_nfsd_create(struct svc_rqst *, struct svc_fh *, __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, loff_t, unsigned long); #endif /* CONFIG_NFSD_V3 */ +int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -struct raparms; +__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, umode_t, + int, struct file **); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_readv(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long *count); + unsigned long *count, + u32 *eof); __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, - loff_t, struct kvec *, int, unsigned long *); + loff_t, struct kvec *, int, unsigned long *, + u32 *eof); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *, int); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, @@ -115,9 +118,6 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); -struct raparms *nfsd_init_raparms(struct file *file); -void nfsd_put_raparams(struct file *file, struct raparms *ra); - static inline int fh_want_write(struct svc_fh *fh) { int ret; @@ -152,23 +152,4 @@ static inline int nfsd_create_is_exclusive(int createmode) || createmode == NFS4_CREATE_EXCLUSIVE4_1; } -static inline bool nfsd_eof_on_read(long requested, long read, - loff_t offset, loff_t size) -{ - /* We assume a short read means eof: */ - if (requested > read) - return true; - /* - * A non-short read might also reach end of file. The spec - * still requires us to set eof in that case. - * - * Further operations may have modified the file size since - * the read, so the following check is not atomic with the read. - * We've only seen that cause a problem for a client in the case - * where the read returned a count of 0 without setting eof. - * That case was fixed by the addition of the above check. - */ - return (offset + read >= size); -} - #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h index 2cb29e961a76..99ff9f403ff1 100644 --- a/fs/nfsd/xdr3.h +++ b/fs/nfsd/xdr3.h @@ -151,7 +151,7 @@ struct nfsd3_readres { __be32 status; struct svc_fh fh; unsigned long count; - int eof; + __u32 eof; }; struct nfsd3_writeres { diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index d64c870f998a..f4737d66ee98 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -273,15 +273,14 @@ struct nfsd4_open_downgrade { struct nfsd4_read { - stateid_t rd_stateid; /* request */ - u64 rd_offset; /* request */ - u32 rd_length; /* request */ - int rd_vlen; - struct file *rd_filp; - bool rd_tmp_file; + stateid_t rd_stateid; /* request */ + u64 rd_offset; /* request */ + u32 rd_length; /* request */ + int rd_vlen; + struct nfsd_file *rd_nf; - struct svc_rqst *rd_rqstp; /* response */ - struct svc_fh * rd_fhp; /* response */ + struct svc_rqst *rd_rqstp; /* response */ + struct svc_fh *rd_fhp; /* response */ }; struct nfsd4_readdir { @@ -538,8 +537,8 @@ struct nfsd4_copy { struct nfs4_client *cp_clp; - struct file *file_src; - struct file *file_dst; + struct nfsd_file *nf_src; + struct nfsd_file *nf_dst; stateid_t cp_stateid; diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 5a00121fb219..f3462828a0e2 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -54,8 +54,6 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(&sb->s_fsnotify_marks); } -/* Wait until all marks queued for destruction are destroyed */ -extern void fsnotify_wait_marks_destroyed(void); /* * update the dentry->d_flags of all of inode's children to indicate if inode cares diff --git a/fs/notify/group.c b/fs/notify/group.c index 0391190305cc..133f723aca07 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -108,6 +108,7 @@ void fsnotify_put_group(struct fsnotify_group *group) if (refcount_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } +EXPORT_SYMBOL_GPL(fsnotify_put_group); /* * Create a new fsnotify_group and hold a reference for the group returned. @@ -137,6 +138,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } +EXPORT_SYMBOL_GPL(fsnotify_alloc_group); int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 99ddd126f6f0..1d96216dffd1 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -276,6 +276,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) queue_delayed_work(system_unbound_wq, &reaper_work, FSNOTIFY_REAPER_DELAY); } +EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* * Get mark reference when we found the mark via lockless traversal of object @@ -430,6 +431,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); fsnotify_free_mark(mark); } +EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); /* * Sorting function for lists of fsnotify marks. @@ -685,6 +687,7 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, mutex_unlock(&group->mark_mutex); return ret; } +EXPORT_SYMBOL_GPL(fsnotify_add_mark); /* * Given a list of marks, find the mark associated with given group. If found @@ -711,6 +714,7 @@ struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, spin_unlock(&conn->lock); return NULL; } +EXPORT_SYMBOL_GPL(fsnotify_find_mark); /* Clear any marks in a group with given type mask */ void fsnotify_clear_marks_by_group(struct fsnotify_group *group, @@ -809,6 +813,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, mark->group = group; WRITE_ONCE(mark->connector, NULL); } +EXPORT_SYMBOL_GPL(fsnotify_init_mark); /* * Destroy all marks in destroy_list, waits for SRCU period to finish before @@ -837,3 +842,4 @@ void fsnotify_wait_marks_destroyed(void) { flush_delayed_work(&reaper_work); } +EXPORT_SYMBOL_GPL(fsnotify_wait_marks_destroyed); diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 20c841a906f2..3aac5c917afe 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -71,7 +71,7 @@ static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni) } /* Read, map, and pin the page. */ page = ntfs_map_page(mft_vi->i_mapping, index); - if (likely(!IS_ERR(page))) { + if (!IS_ERR(page)) { /* Catch multi sector transfer fixup errors. */ if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) + ofs)))) { @@ -154,7 +154,7 @@ MFT_RECORD *map_mft_record(ntfs_inode *ni) mutex_lock(&ni->mrec_lock); m = map_mft_record_page(ni); - if (likely(!IS_ERR(m))) + if (!IS_ERR(m)) return m; mutex_unlock(&ni->mrec_lock); @@ -271,7 +271,7 @@ MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, m = map_mft_record(ni); /* map_mft_record() has incremented this on success. */ atomic_dec(&ni->count); - if (likely(!IS_ERR(m))) { + if (!IS_ERR(m)) { /* Verify the sequence number. */ if (likely(le16_to_cpu(m->sequence_number) == seq_no)) { ntfs_debug("Done 1."); @@ -1303,7 +1303,7 @@ static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mftbmp_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mftbmp_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mftbmp_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft bitmap attribute."); @@ -1734,7 +1734,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) read_unlock_irqrestore(&mft_ni->size_lock, flags); rl = ntfs_attr_find_vcn_nolock(mft_ni, (ll - 1) >> vol->cluster_size_bits, NULL); - if (unlikely(IS_ERR(rl) || !rl->length || rl->lcn < 0)) { + if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) { up_write(&mft_ni->runlist.lock); ntfs_error(vol->sb, "Failed to determine last allocated " "cluster of mft data attribute."); @@ -1776,7 +1776,7 @@ static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol) do { rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE, true); - if (likely(!IS_ERR(rl2))) + if (!IS_ERR(rl2)) break; if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) { ntfs_error(vol->sb, "Failed to allocate the minimal " diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 2d3cc9e3395d..4e6a44bc654c 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -115,7 +115,7 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, dent_ino = MREF(mref); ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino); dent_inode = ntfs_iget(vol->sb, dent_ino); - if (likely(!IS_ERR(dent_inode))) { + if (!IS_ERR(dent_inode)) { /* Consistency check. */ if (is_bad_inode(dent_inode) || MSEQNO(mref) == NTFS_I(dent_inode)->seq_no || diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c index 508744a93180..97932fb5179c 100644 --- a/fs/ntfs/runlist.c +++ b/fs/ntfs/runlist.c @@ -951,7 +951,7 @@ mpa_err: } /* Now combine the new and old runlists checking for overlaps. */ old_rl = ntfs_runlists_merge(old_rl, rl); - if (likely(!IS_ERR(old_rl))) + if (!IS_ERR(old_rl)) return old_rl; ntfs_free(rl); ntfs_error(vol->sb, "Failed to merge runlists."); diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 29621d40f448..7dc3bc604f78 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1475,7 +1475,7 @@ not_enabled: kfree(name); /* Get the inode. */ tmp_ino = ntfs_iget(vol->sb, MREF(mref)); - if (unlikely(IS_ERR(tmp_ino) || is_bad_inode(tmp_ino))) { + if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) { if (!IS_ERR(tmp_ino)) iput(tmp_ino); ntfs_error(vol->sb, "Failed to load $UsnJrnl."); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0c335b51043d..f9baefc76cf9 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5993,6 +5993,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct buffer_head *data_alloc_bh = NULL; struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; + struct ocfs2_journal *journal = osb->journal; BUG_ON(inode_trylock(tl_inode)); @@ -6013,6 +6014,20 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } + /* Appending truncate log(TA) and and flushing truncate log(TF) are + * two separated transactions. They can be both committed but not + * checkpointed. If crash occurs then, both two transaction will be + * replayed with several already released to global bitmap clusters. + * Then truncate log will be replayed resulting in cluster double free. + */ + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); + if (status < 0) { + mlog_errno(status); + goto out; + } + data_alloc_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT); @@ -6792,6 +6807,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, struct page *page, int zero, u64 *phys) { int ret, partial = 0; + loff_t start_byte = ((loff_t)page->index << PAGE_SHIFT) + from; + loff_t length = to - from; ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); if (ret) @@ -6811,7 +6828,8 @@ void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, if (ret < 0) mlog_errno(ret); else if (ocfs2_should_order_data(inode)) { - ret = ocfs2_jbd2_file_inode(handle, inode); + ret = ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); if (ret < 0) mlog_errno(ret); } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a4c905d6b575..9cd0a6815933 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -942,7 +942,8 @@ static void ocfs2_write_failure(struct inode *inode, if (tmppage && page_has_buffers(tmppage)) { if (ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(wc->w_handle, inode); + ocfs2_jbd2_inode_add_write(wc->w_handle, inode, + user_pos, user_len); block_commit_write(tmppage, from, to); } @@ -2023,8 +2024,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } if (page_has_buffers(tmppage)) { - if (handle && ocfs2_should_order_data(inode)) - ocfs2_jbd2_file_inode(handle, inode); + if (handle && ocfs2_should_order_data(inode)) { + loff_t start_byte = + ((loff_t)tmppage->index << PAGE_SHIFT) + + from; + loff_t length = to - from; + ocfs2_jbd2_inode_add_write(handle, inode, + start_byte, length); + } block_commit_write(tmppage, from, to); } } @@ -2042,7 +2049,8 @@ out_write_size: inode->i_mtime = inode->i_ctime = current_time(inode); di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - ocfs2_update_inode_fsync_trans(handle, inode, 1); + if (handle) + ocfs2_update_inode_fsync_trans(handle, inode, 1); } if (handle) ocfs2_journal_dirty(handle, wc->w_di_bh); @@ -2139,13 +2147,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, struct ocfs2_dio_write_ctxt *dwc = NULL; struct buffer_head *di_bh = NULL; u64 p_blkno; - loff_t pos = iblock << inode->i_sb->s_blocksize_bits; + unsigned int i_blkbits = inode->i_sb->s_blocksize_bits; + loff_t pos = iblock << i_blkbits; + sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits; unsigned len, total_len = bh_result->b_size; int ret = 0, first_get_block = 0; len = osb->s_clustersize - (pos & (osb->s_clustersize - 1)); len = min(total_len, len); + /* + * bh_result->b_size is count in get_more_blocks according to write + * "pos" and "end", we need map twice to return different buffer state: + * 1. area in file size, not set NEW; + * 2. area out file size, set NEW. + * + * iblock endblk + * |--------|---------|---------|--------- + * |<-------area in file------->| + */ + + if ((iblock <= endblk) && + ((iblock + ((len - 1) >> i_blkbits)) > endblk)) + len = (endblk - iblock + 1) << i_blkbits; + mlog(0, "get block of %lu at %llu:%u req %u\n", inode->i_ino, pos, len, total_len); @@ -2229,6 +2254,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, if (desc->c_needs_zero) set_buffer_new(bh_result); + if (iblock > endblk) + set_buffer_new(bh_result); + /* May sleep in end_io. It should not happen in a irq context. So defer * it to dio work queue. */ set_buffer_defer_completion(bh_result); diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c index 429e6a8359a5..eaf042feaf5e 100644 --- a/fs/ocfs2/blockcheck.c +++ b/fs/ocfs2/blockcheck.c @@ -231,14 +231,6 @@ static int blockcheck_u64_get(void *data, u64 *val) } DEFINE_SIMPLE_ATTRIBUTE(blockcheck_fops, blockcheck_u64_get, NULL, "%llu\n"); -static struct dentry *blockcheck_debugfs_create(const char *name, - struct dentry *parent, - u64 *value) -{ - return debugfs_create_file(name, S_IFREG | S_IRUSR, parent, value, - &blockcheck_fops); -} - static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) { if (stats) { @@ -250,16 +242,20 @@ static void ocfs2_blockcheck_debug_remove(struct ocfs2_blockcheck_stats *stats) static void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, struct dentry *parent) { - stats->b_debug_dir = debugfs_create_dir("blockcheck", parent); + struct dentry *dir; + + dir = debugfs_create_dir("blockcheck", parent); + stats->b_debug_dir = dir; + + debugfs_create_file("blocks_checked", S_IFREG | S_IRUSR, dir, + &stats->b_check_count, &blockcheck_fops); - blockcheck_debugfs_create("blocks_checked", stats->b_debug_dir, - &stats->b_check_count); + debugfs_create_file("checksums_failed", S_IFREG | S_IRUSR, dir, + &stats->b_failure_count, &blockcheck_fops); - blockcheck_debugfs_create("checksums_failed", stats->b_debug_dir, - &stats->b_failure_count); + debugfs_create_file("ecc_recoveries", S_IFREG | S_IRUSR, dir, + &stats->b_recover_count, &blockcheck_fops); - blockcheck_debugfs_create("ecc_recoveries", stats->b_debug_dir, - &stats->b_recover_count); } #else static inline void ocfs2_blockcheck_debug_install(struct ocfs2_blockcheck_stats *stats, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f1b613327ac8..a368350d4c27 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -225,10 +225,6 @@ struct o2hb_region { unsigned int hr_region_num; struct dentry *hr_debug_dir; - struct dentry *hr_debug_livenodes; - struct dentry *hr_debug_regnum; - struct dentry *hr_debug_elapsed_time; - struct dentry *hr_debug_pinned; struct o2hb_debug_buf *hr_db_livenodes; struct o2hb_debug_buf *hr_db_regnum; struct o2hb_debug_buf *hr_db_elapsed_time; @@ -1394,21 +1390,20 @@ void o2hb_exit(void) kfree(o2hb_db_failedregions); } -static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, - struct o2hb_debug_buf **db, int db_len, - int type, int size, int len, void *data) +static void o2hb_debug_create(const char *name, struct dentry *dir, + struct o2hb_debug_buf **db, int db_len, int type, + int size, int len, void *data) { *db = kmalloc(db_len, GFP_KERNEL); if (!*db) - return NULL; + return; (*db)->db_type = type; (*db)->db_size = size; (*db)->db_len = len; (*db)->db_data = data; - return debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, - &o2hb_debug_fops); + debugfs_create_file(name, S_IFREG|S_IRUSR, dir, *db, &o2hb_debug_fops); } static void o2hb_debug_init(void) @@ -1525,11 +1520,7 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slots); - debugfs_remove(reg->hr_debug_livenodes); - debugfs_remove(reg->hr_debug_regnum); - debugfs_remove(reg->hr_debug_elapsed_time); - debugfs_remove(reg->hr_debug_pinned); - debugfs_remove(reg->hr_debug_dir); + debugfs_remove_recursive(reg->hr_debug_dir); kfree(reg->hr_db_livenodes); kfree(reg->hr_db_regnum); kfree(reg->hr_db_elapsed_time); @@ -1988,69 +1979,33 @@ static struct o2hb_heartbeat_group *to_o2hb_heartbeat_group(struct config_group : NULL; } -static int o2hb_debug_region_init(struct o2hb_region *reg, struct dentry *dir) +static void o2hb_debug_region_init(struct o2hb_region *reg, + struct dentry *parent) { - int ret = -ENOMEM; + struct dentry *dir; - reg->hr_debug_dir = - debugfs_create_dir(config_item_name(®->hr_item), dir); - if (!reg->hr_debug_dir) { - mlog_errno(ret); - goto bail; - } + dir = debugfs_create_dir(config_item_name(®->hr_item), parent); + reg->hr_debug_dir = dir; - reg->hr_debug_livenodes = - o2hb_debug_create(O2HB_DEBUG_LIVENODES, - reg->hr_debug_dir, - &(reg->hr_db_livenodes), - sizeof(*(reg->hr_db_livenodes)), - O2HB_DB_TYPE_REGION_LIVENODES, - sizeof(reg->hr_live_node_bitmap), - O2NM_MAX_NODES, reg); - if (!reg->hr_debug_livenodes) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_LIVENODES, dir, &(reg->hr_db_livenodes), + sizeof(*(reg->hr_db_livenodes)), + O2HB_DB_TYPE_REGION_LIVENODES, + sizeof(reg->hr_live_node_bitmap), O2NM_MAX_NODES, + reg); - reg->hr_debug_regnum = - o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, - reg->hr_debug_dir, - &(reg->hr_db_regnum), - sizeof(*(reg->hr_db_regnum)), - O2HB_DB_TYPE_REGION_NUMBER, - 0, O2NM_MAX_NODES, reg); - if (!reg->hr_debug_regnum) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_NUMBER, dir, &(reg->hr_db_regnum), + sizeof(*(reg->hr_db_regnum)), + O2HB_DB_TYPE_REGION_NUMBER, 0, O2NM_MAX_NODES, reg); - reg->hr_debug_elapsed_time = - o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, - reg->hr_debug_dir, - &(reg->hr_db_elapsed_time), - sizeof(*(reg->hr_db_elapsed_time)), - O2HB_DB_TYPE_REGION_ELAPSED_TIME, - 0, 0, reg); - if (!reg->hr_debug_elapsed_time) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_ELAPSED_TIME, dir, + &(reg->hr_db_elapsed_time), + sizeof(*(reg->hr_db_elapsed_time)), + O2HB_DB_TYPE_REGION_ELAPSED_TIME, 0, 0, reg); - reg->hr_debug_pinned = - o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, - reg->hr_debug_dir, - &(reg->hr_db_pinned), - sizeof(*(reg->hr_db_pinned)), - O2HB_DB_TYPE_REGION_PINNED, - 0, 0, reg); - if (!reg->hr_debug_pinned) { - mlog_errno(ret); - goto bail; - } + o2hb_debug_create(O2HB_DEBUG_REGION_PINNED, dir, &(reg->hr_db_pinned), + sizeof(*(reg->hr_db_pinned)), + O2HB_DB_TYPE_REGION_PINNED, 0, 0, reg); - ret = 0; -bail: - return ret; } static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *group, @@ -2106,11 +2061,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g if (ret) goto unregister_handler; - ret = o2hb_debug_region_init(reg, o2hb_debug_dir); - if (ret) { - config_item_put(®->hr_item); - goto unregister_handler; - } + o2hb_debug_region_init(reg, o2hb_debug_dir); return ®->hr_item; diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 784426dee56c..bdef72c0f099 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3636,7 +3636,7 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, int i, j, num_used; u32 major_hash; struct ocfs2_dx_leaf *orig_dx_leaf, *new_dx_leaf; - struct ocfs2_dx_entry_list *orig_list, *new_list, *tmp_list; + struct ocfs2_dx_entry_list *orig_list, *tmp_list; struct ocfs2_dx_entry *dx_entry; tmp_list = &tmp_dx_leaf->dl_list; @@ -3645,7 +3645,6 @@ static void ocfs2_dx_dir_transfer_leaf(struct inode *dir, u32 split_hash, orig_dx_leaf = (struct ocfs2_dx_leaf *) orig_dx_leaves[i]->b_data; orig_list = &orig_dx_leaf->dl_list; new_dx_leaf = (struct ocfs2_dx_leaf *) new_dx_leaves[i]->b_data; - new_list = &new_dx_leaf->dl_list; num_used = le16_to_cpu(orig_list->de_num_used); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 69a429b625cc..aaf24548b02a 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -142,7 +142,6 @@ struct dlm_ctxt atomic_t res_tot_count; atomic_t res_cur_count; - struct dlm_debug_ctxt *dlm_debug_ctxt; struct dentry *dlm_debugfs_subroot; /* NOTE: Next three are protected by dlm_domain_lock */ diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index a4b58ba99927..4d0b452012b2 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -853,67 +853,34 @@ static const struct file_operations debug_state_fops = { /* files in subroot */ void dlm_debug_init(struct dlm_ctxt *dlm) { - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - /* for dumping dlm_ctxt */ - dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_state_fops); + debugfs_create_file(DLM_DEBUGFS_DLM_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_state_fops); /* for dumping lockres */ - dc->debug_lockres_dentry = - debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_lockres_fops); + debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_lockres_fops); /* for dumping mles */ - dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_mle_fops); + debugfs_create_file(DLM_DEBUGFS_MLE_STATE, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, &debug_mle_fops); /* for dumping lockres on the purge list */ - dc->debug_purgelist_dentry = - debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, - S_IFREG|S_IRUSR, - dlm->dlm_debugfs_subroot, - dlm, &debug_purgelist_fops); -} - -void dlm_debug_shutdown(struct dlm_ctxt *dlm) -{ - struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt; - - if (dc) { - debugfs_remove(dc->debug_purgelist_dentry); - debugfs_remove(dc->debug_mle_dentry); - debugfs_remove(dc->debug_lockres_dentry); - debugfs_remove(dc->debug_state_dentry); - kfree(dc); - dc = NULL; - } + debugfs_create_file(DLM_DEBUGFS_PURGE_LIST, S_IFREG|S_IRUSR, + dlm->dlm_debugfs_subroot, dlm, + &debug_purgelist_fops); } /* subroot - domain dir */ -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { - dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt), - GFP_KERNEL); - if (!dlm->dlm_debug_ctxt) { - mlog_errno(-ENOMEM); - return -ENOMEM; - } - dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name, dlm_debugfs_root); - return 0; } void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { - debugfs_remove(dlm->dlm_debugfs_subroot); + debugfs_remove_recursive(dlm->dlm_debugfs_subroot); } /* debugfs root */ diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h index 7d0c7c9013ce..f8fd8680a4b6 100644 --- a/fs/ocfs2/dlm/dlmdebug.h +++ b/fs/ocfs2/dlm/dlmdebug.h @@ -14,13 +14,6 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle); #ifdef CONFIG_DEBUG_FS -struct dlm_debug_ctxt { - struct dentry *debug_state_dentry; - struct dentry *debug_lockres_dentry; - struct dentry *debug_mle_dentry; - struct dentry *debug_purgelist_dentry; -}; - struct debug_lockres { int dl_len; char *dl_buf; @@ -29,9 +22,8 @@ struct debug_lockres { }; void dlm_debug_init(struct dlm_ctxt *dlm); -void dlm_debug_shutdown(struct dlm_ctxt *dlm); -int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); +void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm); void dlm_create_debugfs_root(void); @@ -42,12 +34,8 @@ void dlm_destroy_debugfs_root(void); static inline void dlm_debug_init(struct dlm_ctxt *dlm) { } -static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm) -{ -} -static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) +static inline void dlm_create_debugfs_subroot(struct dlm_ctxt *dlm) { - return 0; } static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm) { diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 7338b5d4647c..ee6f459f9770 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -387,7 +387,6 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1938,7 +1937,6 @@ bail: if (status) { dlm_unregister_domain_handlers(dlm); - dlm_debug_shutdown(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); dlm_destroy_dlm_worker(dlm); @@ -1992,9 +1990,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->key = key; dlm->node_num = o2nm_this_node(); - ret = dlm_create_debugfs_subroot(dlm); - if (ret < 0) - goto leave; + dlm_create_debugfs_subroot(dlm); spin_lock_init(&dlm->spinlock); spin_lock_init(&dlm->master_lock); @@ -2056,6 +2052,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, mlog(0, "context init: refcount %u\n", kref_read(&dlm->dlm_refs)); + ret = 0; leave: if (ret < 0 && dlm) { if (dlm->master_hash) diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index e78657742bd8..3883633e82eb 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -90,7 +90,8 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, enum dlm_status status; int actions = 0; int in_use; - u8 owner; + u8 owner; + int recovery_wait = 0; mlog(0, "master_node = %d, valblk = %d\n", master_node, flags & LKM_VALBLK); @@ -193,9 +194,12 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, } if (flags & LKM_CANCEL) lock->cancel_pending = 0; - else - lock->unlock_pending = 0; - + else { + if (!lock->unlock_pending) + recovery_wait = 1; + else + lock->unlock_pending = 0; + } } /* get an extra ref on lock. if we are just switching @@ -229,6 +233,17 @@ leave: spin_unlock(&res->spinlock); wake_up(&res->wq); + if (recovery_wait) { + spin_lock(&res->spinlock); + /* Unlock request will directly succeed after owner dies, + * and the lock is already removed from grant list. We have to + * wait for RECOVERING done or we miss the chance to purge it + * since the removement is much faster than RECOVERING proc. + */ + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_RECOVERING); + spin_unlock(&res->spinlock); + } + /* let the caller's final dlm_lock_put handle the actual kfree */ if (actions & DLM_UNLOCK_FREE_LOCK) { /* this should always be coupled with list removal */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 14207234fa3d..6e774c5ea13b 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2508,9 +2508,7 @@ bail: ocfs2_inode_unlock(inode, ex); } - if (local_bh) - brelse(local_bh); - + brelse(local_bh); return status; } @@ -2593,8 +2591,7 @@ int ocfs2_inode_lock_atime(struct inode *inode, *level = 1; if (ocfs2_should_update_atime(inode, vfsmnt)) ocfs2_update_inode_atime(inode, bh); - if (bh) - brelse(bh); + brelse(bh); } else *level = 0; @@ -3012,8 +3009,6 @@ struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void) kref_init(&dlm_debug->d_refcnt); INIT_LIST_HEAD(&dlm_debug->d_lockres_tracking); - dlm_debug->d_locking_state = NULL; - dlm_debug->d_locking_filter = NULL; dlm_debug->d_filter_secs = 0; out: return dlm_debug; @@ -3282,27 +3277,19 @@ static void ocfs2_dlm_init_debug(struct ocfs2_super *osb) { struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - dlm_debug->d_locking_state = debugfs_create_file("locking_state", - S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_dlm_debug_fops); + debugfs_create_file("locking_state", S_IFREG|S_IRUSR, + osb->osb_debug_root, osb, &ocfs2_dlm_debug_fops); - dlm_debug->d_locking_filter = debugfs_create_u32("locking_filter", - 0600, - osb->osb_debug_root, - &dlm_debug->d_filter_secs); + debugfs_create_u32("locking_filter", 0600, osb->osb_debug_root, + &dlm_debug->d_filter_secs); } static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb) { struct ocfs2_dlm_debug *dlm_debug = osb->osb_dlm_debug; - if (dlm_debug) { - debugfs_remove(dlm_debug->d_locking_state); - debugfs_remove(dlm_debug->d_locking_filter); + if (dlm_debug) ocfs2_put_dlm_debug(dlm_debug); - } } int ocfs2_dlm_init(struct ocfs2_super *osb) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e66a249fe07c..e3e2d1b2af51 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -590,8 +590,7 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, *extent_flags = rec->e_flags; } out: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); return ret; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 4435df3e5adb..9876db52913a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -706,7 +706,9 @@ leave: * Thus, we need to explicitly order the zeroed pages. */ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, - struct buffer_head *di_bh) + struct buffer_head *di_bh, + loff_t start_byte, + loff_t length) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); handle_t *handle = NULL; @@ -722,7 +724,7 @@ static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode, goto out; } - ret = ocfs2_jbd2_file_inode(handle, inode); + ret = ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret < 0) { mlog_errno(ret); goto out; @@ -761,7 +763,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); - handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); + handle = ocfs2_zero_start_ordered_transaction(inode, di_bh, + abs_from, + abs_to - abs_from); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -1226,6 +1230,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) transfer_to[USRQUOTA] = dqget(sb, make_kqid_uid(attr->ia_uid)); if (IS_ERR(transfer_to[USRQUOTA])) { status = PTR_ERR(transfer_to[USRQUOTA]); + transfer_to[USRQUOTA] = NULL; goto bail_unlock; } } @@ -1235,6 +1240,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) transfer_to[GRPQUOTA] = dqget(sb, make_kqid_gid(attr->ia_gid)); if (IS_ERR(transfer_to[GRPQUOTA])) { status = PTR_ERR(transfer_to[GRPQUOTA]); + transfer_to[GRPQUOTA] = NULL; goto bail_unlock; } } @@ -2092,54 +2098,89 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos) return 0; } -static int ocfs2_prepare_inode_for_refcount(struct inode *inode, - struct file *file, - loff_t pos, size_t count, - int *meta_level) +static int ocfs2_inode_lock_for_extent_tree(struct inode *inode, + struct buffer_head **di_bh, + int meta_level, + int overwrite_io, + int write_sem, + int wait) { - int ret; - struct buffer_head *di_bh = NULL; - u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; - u32 clusters = - ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + int ret = 0; - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret) { - mlog_errno(ret); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, meta_level); + else + ret = ocfs2_try_inode_lock(inode, + overwrite_io ? NULL : di_bh, meta_level); + if (ret < 0) goto out; + + if (wait) { + if (write_sem) + down_write(&OCFS2_I(inode)->ip_alloc_sem); + else + down_read(&OCFS2_I(inode)->ip_alloc_sem); + } else { + if (write_sem) + ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem); + else + ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem); + + if (!ret) { + ret = -EAGAIN; + goto out_unlock; + } } - *meta_level = 1; + return ret; - ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); - if (ret) - mlog_errno(ret); +out_unlock: + brelse(*di_bh); + ocfs2_inode_unlock(inode, meta_level); out: - brelse(di_bh); return ret; } +static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode, + struct buffer_head **di_bh, + int meta_level, + int write_sem) +{ + if (write_sem) + up_write(&OCFS2_I(inode)->ip_alloc_sem); + else + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + brelse(*di_bh); + *di_bh = NULL; + + if (meta_level >= 0) + ocfs2_inode_unlock(inode, meta_level); +} + static int ocfs2_prepare_inode_for_write(struct file *file, loff_t pos, size_t count, int wait) { int ret = 0, meta_level = 0, overwrite_io = 0; + int write_sem = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); struct buffer_head *di_bh = NULL; - loff_t end; + u32 cpos; + u32 clusters; /* * We start with a read level meta lock and only jump to an ex * if we need to make modifications here. */ for(;;) { - if (wait) - ret = ocfs2_inode_lock(inode, NULL, meta_level); - else - ret = ocfs2_try_inode_lock(inode, - overwrite_io ? NULL : &di_bh, meta_level); + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, + overwrite_io, + write_sem, + wait); if (ret < 0) { - meta_level = -1; if (ret != -EAGAIN) mlog_errno(ret); goto out; @@ -2151,15 +2192,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file, */ if (!wait && !overwrite_io) { overwrite_io = 1; - if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { - ret = -EAGAIN; - goto out_unlock; - } ret = ocfs2_overwrite_io(inode, di_bh, pos, count); - brelse(di_bh); - di_bh = NULL; - up_read(&OCFS2_I(inode)->ip_alloc_sem); if (ret < 0) { if (ret != -EAGAIN) mlog_errno(ret); @@ -2178,7 +2212,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * set inode->i_size at the end of a write. */ if (should_remove_suid(dentry)) { if (meta_level == 0) { - ocfs2_inode_unlock(inode, meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); meta_level = 1; continue; } @@ -2190,22 +2227,34 @@ static int ocfs2_prepare_inode_for_write(struct file *file, } } - end = pos + count; - ret = ocfs2_check_range_for_refcount(inode, pos, count); if (ret == 1) { - ocfs2_inode_unlock(inode, meta_level); - meta_level = -1; - - ret = ocfs2_prepare_inode_for_refcount(inode, - file, - pos, - count, - &meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); + ret = ocfs2_inode_lock_for_extent_tree(inode, + &di_bh, + meta_level, + overwrite_io, + 1, + wait); + write_sem = 1; + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out; + } + + cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; + clusters = + ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos; + ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX); } if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out_unlock; } @@ -2216,10 +2265,10 @@ out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, pos, count, wait); - brelse(di_bh); - - if (meta_level >= 0) - ocfs2_inode_unlock(inode, meta_level); + ocfs2_inode_unlock_for_extent_tree(inode, + &di_bh, + meta_level, + write_sem); out: return ret; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7ad9d6590818..7c9dfd50c1c1 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -534,7 +534,7 @@ static int ocfs2_read_locked_inode(struct inode *inode, */ mlog_bug_on_msg(!!(fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) != !!(args->fi_flags & OCFS2_FI_FLAG_SYSFILE), - "Inode %llu: system file state is ambigous\n", + "Inode %llu: system file state is ambiguous\n", (unsigned long long)args->fi_blkno); if (S_ISCHR(le16_to_cpu(fe->i_mode)) || diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index d6f7b299eb23..efeea208fdeb 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -283,7 +283,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, if (inode_alloc) inode_lock(inode_alloc); - if (o2info_coherent(&fi->ifi_req)) { + if (inode_alloc && o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 930e3d388579..699a560efbb0 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -217,7 +217,8 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb) /* At this point, we know that no more recovery threads can be * launched, so wait for any recovery completion work to * complete. */ - flush_workqueue(osb->ocfs2_wq); + if (osb->ocfs2_wq) + flush_workqueue(osb->ocfs2_wq); /* * Now that recovery is shut down, and the osb is about to be diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index c0fe6ed08ab1..3103ba7f97a2 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -144,7 +144,6 @@ static inline void ocfs2_ci_set_new(struct ocfs2_super *osb, void ocfs2_orphan_scan_init(struct ocfs2_super *osb); void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); -void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); void ocfs2_complete_recovery(struct work_struct *work); void ocfs2_wait_for_recovery(struct ocfs2_super *osb); @@ -232,8 +231,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * ocfs2_journal_access_*() unless you intend to * manage the checksum by hand. * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. - * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before - * the current handle commits. + * ocfs2_jbd2_inode_add_write - Mark an inode with range so that its data goes + * out before the current handle commits. */ /* You must always start_trans with a number of buffs > 0, but it's @@ -441,7 +440,7 @@ static inline int ocfs2_mknod_credits(struct super_block *sb, int is_dir, * previous dirblock update in the free list */ static inline int ocfs2_link_credits(struct super_block *sb) { - return 2*OCFS2_INODE_UPDATE_CREDITS + 4 + + return 2 * OCFS2_INODE_UPDATE_CREDITS + 4 + ocfs2_quota_trans_credits(sb); } @@ -575,37 +574,12 @@ static inline int ocfs2_calc_bg_discontig_credits(struct super_block *sb) return ocfs2_extent_recs_per_gd(sb); } -static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, - unsigned int clusters_to_del, - struct ocfs2_dinode *fe, - struct ocfs2_extent_list *last_el) +static inline int ocfs2_jbd2_inode_add_write(handle_t *handle, struct inode *inode, + loff_t start_byte, loff_t length) { - /* for dinode + all headers in this pass + update to next leaf */ - u16 next_free = le16_to_cpu(last_el->l_next_free_rec); - u16 tree_depth = le16_to_cpu(fe->id2.i_list.l_tree_depth); - int credits = 1 + tree_depth + 1; - int i; - - i = next_free - 1; - BUG_ON(i < 0); - - /* We may be deleting metadata blocks, so metadata alloc dinode + - one desc. block for each possible delete. */ - if (tree_depth && next_free == 1 && - ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del) - credits += 1 + tree_depth; - - /* update to the truncate log. */ - credits += OCFS2_TRUNCATE_LOG_UPDATE; - - credits += ocfs2_quota_trans_credits(sb); - - return credits; -} - -static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) -{ - return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); + return jbd2_journal_inode_ranged_write(handle, + &OCFS2_I(inode)->ip_jinode, + start_byte, length); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 158e5af767fd..720e9f94957e 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -377,7 +377,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) struct ocfs2_dinode *alloc = NULL; cancel_delayed_work(&osb->la_enable_wq); - flush_workqueue(osb->ocfs2_wq); + if (osb->ocfs2_wq) + flush_workqueue(osb->ocfs2_wq); if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 6f8e1c4fdb9c..8ea51cf27b97 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2486,7 +2486,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, struct inode *inode = NULL; struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); - struct ocfs2_dinode *di = NULL; handle_t *handle = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; struct buffer_head *parent_di_bh = NULL; @@ -2552,7 +2551,6 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, goto leave; } - di = (struct ocfs2_dinode *)new_di_bh->b_data; status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, &orphan_insert, orphan_dir, false); if (status < 0) { diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fddbbd60f434..9150cfa4df7d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -223,8 +223,6 @@ struct ocfs2_orphan_scan { struct ocfs2_dlm_debug { struct kref d_refcnt; - struct dentry *d_locking_state; - struct dentry *d_locking_filter; u32 d_filter_secs; struct list_head d_lockres_tracking; }; @@ -401,7 +399,6 @@ struct ocfs2_super struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; - struct dentry *osb_ctxt; wait_queue_head_t recovery_event; diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8b2f39506648..c81e86c62380 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1080,10 +1080,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->osb_debug_root = debugfs_create_dir(osb->uuid_str, ocfs2_debugfs_root); - osb->osb_ctxt = debugfs_create_file("fs_state", S_IFREG|S_IRUSR, - osb->osb_debug_root, - osb, - &ocfs2_osb_debug_fops); + debugfs_create_file("fs_state", S_IFREG|S_IRUSR, osb->osb_debug_root, + osb, &ocfs2_osb_debug_fops); if (ocfs2_meta_ecc(osb)) ocfs2_blockcheck_stats_debugfs_install( &osb->osb_ecc_stats, @@ -1861,8 +1859,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) kset_unregister(osb->osb_dev_kset); - debugfs_remove(osb->osb_ctxt); - /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); @@ -1918,7 +1914,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_dlm_shutdown(osb, hangup_needed); ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats); - debugfs_remove(osb->osb_debug_root); + debugfs_remove_recursive(osb->osb_debug_root); if (hangup_needed) ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str)); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 90c830e3758e..d8507972ee13 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1490,18 +1490,6 @@ static int ocfs2_xa_check_space(struct ocfs2_xa_loc *loc, return loc->xl_ops->xlo_check_space(loc, xi); } -static void ocfs2_xa_add_entry(struct ocfs2_xa_loc *loc, u32 name_hash) -{ - loc->xl_ops->xlo_add_entry(loc, name_hash); - loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); - /* - * We can't leave the new entry's xe_name_offset at zero or - * add_namevalue() will go nuts. We set it to the size of our - * storage so that it can never be less than any other entry. - */ - loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); -} - static void ocfs2_xa_add_namevalue(struct ocfs2_xa_loc *loc, struct ocfs2_xattr_info *xi) { @@ -2133,29 +2121,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc *loc, if (rc) goto out; - if (loc->xl_entry) { - if (ocfs2_xa_can_reuse_entry(loc, xi)) { - orig_value_size = loc->xl_entry->xe_value_size; - rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); - if (rc) - goto out; - goto alloc_value; - } + if (!loc->xl_entry) { + rc = -EINVAL; + goto out; + } - if (!ocfs2_xattr_is_local(loc->xl_entry)) { - orig_clusters = ocfs2_xa_value_clusters(loc); - rc = ocfs2_xa_value_truncate(loc, 0, ctxt); - if (rc) { - mlog_errno(rc); - ocfs2_xa_cleanup_value_truncate(loc, - "overwriting", - orig_clusters); - goto out; - } + if (ocfs2_xa_can_reuse_entry(loc, xi)) { + orig_value_size = loc->xl_entry->xe_value_size; + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); + if (rc) + goto out; + goto alloc_value; + } + + if (!ocfs2_xattr_is_local(loc->xl_entry)) { + orig_clusters = ocfs2_xa_value_clusters(loc); + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); + if (rc) { + mlog_errno(rc); + ocfs2_xa_cleanup_value_truncate(loc, + "overwriting", + orig_clusters); + goto out; } - ocfs2_xa_wipe_namevalue(loc); - } else - ocfs2_xa_add_entry(loc, name_hash); + } + ocfs2_xa_wipe_namevalue(loc); /* * If we get here, we have a blank entry. Fill it. We grow our diff --git a/fs/open.c b/fs/open.c index a59abe3c669a..b62f5c0923a8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -776,7 +776,7 @@ static int do_dentry_open(struct file *f, f->f_mode |= FMODE_ATOMIC_POS; f->f_op = fops_get(inode->i_fop); - if (unlikely(WARN_ON(!f->f_op))) { + if (WARN_ON(!f->f_op)) { error = -ENODEV; goto cleanup_all; } @@ -818,6 +818,14 @@ static int do_dentry_open(struct file *f, if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO) return -EINVAL; } + + /* + * XXX: Huge page cache doesn't support writing yet. Drop all page + * cache for this file before processing writes. + */ + if ((f->f_mode & FMODE_WRITE) && filemap_nr_thps(inode->i_mapping)) + truncate_pagecache(inode, 0); + return 0; cleanup_all: diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index f5834488b67d..e2ed8e08cc7a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -31,6 +31,7 @@ #include <linux/ioport.h> #include <linux/memory.h> #include <linux/sched/task.h> +#include <linux/security.h> #include <asm/sections.h> #include "internal.h" @@ -545,9 +546,14 @@ out: static int open_kcore(struct inode *inode, struct file *filp) { + int ret = security_locked_down(LOCKDOWN_KCORE); + if (!capable(CAP_SYS_RAWIO)) return -EPERM; + if (ret) + return ret; + filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!filp->private_data) return -ENOMEM; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 465ea0153b2a..8c1f1bb1a5ce 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -8,7 +8,6 @@ #include <linux/mmzone.h> #include <linux/proc_fs.h> #include <linux/percpu.h> -#include <linux/quicklist.h> #include <linux/seq_file.h> #include <linux/swap.h> #include <linux/vmstat.h> @@ -106,9 +105,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); -#ifdef CONFIG_QUICKLIST - show_val_kb(m, "Quicklists: ", quicklist_total_size()); -#endif show_val_kb(m, "NFS_Unstable: ", global_node_page_state(NR_UNSTABLE_NFS)); @@ -136,6 +132,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); + show_val_kb(m, "FileHugePages: ", + global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); + show_val_kb(m, "FilePmdMapped: ", + global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); #endif #ifdef CONFIG_CMA diff --git a/fs/proc/page.c b/fs/proc/page.c index 544d1ee15aee..7c952ee732e6 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -42,10 +42,12 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - else - ppage = NULL; + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. + */ + ppage = pfn_to_online_page(pfn); + if (!ppage || PageSlab(ppage) || page_has_type(ppage)) pcount = 0; else @@ -216,10 +218,11 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - else - ppage = NULL; + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. + */ + ppage = pfn_to_online_page(pfn); if (put_user(stable_page_flags(ppage), out)) { ret = -EFAULT; @@ -261,10 +264,11 @@ static ssize_t kpagecgroup_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - if (pfn_valid(pfn)) - ppage = pfn_to_page(pfn); - else - ppage = NULL; + /* + * TODO: ZONE_DEVICE support requires to identify + * memmaps that were actually initialized. + */ + ppage = pfn_to_online_page(pfn); if (ppage) ino = page_cgroup_ino(ppage); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bf43d1d60059..9442631fd4af 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -417,6 +417,7 @@ struct mem_size_stats { unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; + unsigned long file_thp; unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; @@ -461,7 +462,7 @@ static void smaps_page_accumulate(struct mem_size_stats *mss, static void smaps_account(struct mem_size_stats *mss, struct page *page, bool compound, bool young, bool dirty, bool locked) { - int i, nr = compound ? 1 << compound_order(page) : 1; + int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; /* @@ -588,7 +589,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, else if (is_zone_device_page(page)) /* pass */; else - VM_BUG_ON_PAGE(1, page); + mss->file_thp += HPAGE_PMD_SIZE; smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd), locked); } #else @@ -809,6 +810,7 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, SEQ_PUT_DEC(" kB\nLazyFree: ", mss->lazyfree); SEQ_PUT_DEC(" kB\nAnonHugePages: ", mss->anonymous_thp); SEQ_PUT_DEC(" kB\nShmemPmdMapped: ", mss->shmem_thp); + SEQ_PUT_DEC(" kB\nFilePmdMapped: ", mss->file_thp); SEQ_PUT_DEC(" kB\nShared_Hugetlb: ", mss->shared_hugetlb); seq_put_decimal_ull_width(m, " kB\nPrivate_Hugetlb: ", mss->private_hugetlb >> 10, 7); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index e16fb8f2049e..273ee82d8aa9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -88,7 +88,7 @@ static inline void mangle(struct seq_file *m, const char *s) static void show_type(struct seq_file *m, struct super_block *sb) { mangle(m, sb->s_type->name); - if (sb->s_subtype && sb->s_subtype[0]) { + if (sb->s_subtype) { seq_putc(m, '.'); mangle(m, sb->s_subtype); } diff --git a/fs/readdir.c b/fs/readdir.c index 2f6a4534e0df..d26d5ea4de7b 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -20,9 +20,23 @@ #include <linux/syscalls.h> #include <linux/unistd.h> #include <linux/compat.h> - #include <linux/uaccess.h> +#include <asm/unaligned.h> + +/* + * Note the "unsafe_put_user() semantics: we goto a + * label for errors. + */ +#define unsafe_copy_dirent_name(_dst, _src, _len, label) do { \ + char __user *dst = (_dst); \ + const char *src = (_src); \ + size_t len = (_len); \ + unsafe_put_user(0, dst+len, label); \ + unsafe_copy_to_user(dst, src, len, label); \ +} while (0) + + int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); @@ -65,6 +79,40 @@ out: EXPORT_SYMBOL(iterate_dir); /* + * POSIX says that a dirent name cannot contain NULL or a '/'. + * + * It's not 100% clear what we should really do in this case. + * The filesystem is clearly corrupted, but returning a hard + * error means that you now don't see any of the other names + * either, so that isn't a perfect alternative. + * + * And if you return an error, what error do you use? Several + * filesystems seem to have decided on EUCLEAN being the error + * code for EFSCORRUPTED, and that may be the error to use. Or + * just EIO, which is perhaps more obvious to users. + * + * In order to see the other file names in the directory, the + * caller might want to make this a "soft" error: skip the + * entry, and return the error at the end instead. + * + * Note that this should likely do a "memchr(name, 0, len)" + * check too, since that would be filesystem corruption as + * well. However, that case can't actually confuse user space, + * which has to do a strlen() on the name anyway to find the + * filename length, and the above "soft error" worry means + * that it's probably better left alone until we have that + * issue clarified. + */ +static int verify_dirent_name(const char *name, int len) +{ + if (!len) + return -EIO; + if (memchr(name, '/', len)) + return -EIO; + return 0; +} + +/* * Traditional linux readdir() handling.. * * "count=1" is a special case, meaning that the buffer is one @@ -173,6 +221,9 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; @@ -182,28 +233,31 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, return -EOVERFLOW; } dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + if (dirent && signal_pending(current)) + return -EINTR; + + /* + * Note! This range-checks 'previous' (which may be NULL). + * The real range was checked in getdents + */ + if (!user_access_begin(dirent, sizeof(*dirent))) goto efault; + if (dirent) + unsafe_put_user(offset, &dirent->d_off, efault_end); + dirent = buf->current_dir; + unsafe_put_user(d_ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_access_end(); + buf->previous = dirent; dirent = (void __user *)dirent + reclen; buf->current_dir = dirent; buf->count -= reclen; return 0; +efault_end: + user_access_end(); efault: buf->error = -EFAULT; return -EFAULT; @@ -259,34 +313,38 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); + buf->error = verify_dirent_name(name, namlen); + if (unlikely(buf->error)) + return buf->error; buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) return -EINVAL; dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user(ino, &dirent->d_ino)) - goto efault; - if (__put_user(0, &dirent->d_off)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (__put_user(d_type, &dirent->d_type)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) + if (dirent && signal_pending(current)) + return -EINTR; + + /* + * Note! This range-checks 'previous' (which may be NULL). + * The real range was checked in getdents + */ + if (!user_access_begin(dirent, sizeof(*dirent))) goto efault; + if (dirent) + unsafe_put_user(offset, &dirent->d_off, efault_end); + dirent = buf->current_dir; + unsafe_put_user(ino, &dirent->d_ino, efault_end); + unsafe_put_user(reclen, &dirent->d_reclen, efault_end); + unsafe_put_user(d_type, &dirent->d_type, efault_end); + unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end); + user_access_end(); + buf->previous = dirent; dirent = (void __user *)dirent + reclen; buf->current_dir = dirent; buf->count -= reclen; return 0; +efault_end: + user_access_end(); efault: buf->error = -EFAULT; return -EFAULT; diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 9c02d96d3a42..4075e41408b4 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -239,10 +239,8 @@ static int balance_leaf_when_delete_left(struct tree_balance *tb) static int balance_leaf_when_delete(struct tree_balance *tb, int flag) { struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); - int item_pos = PATH_LAST_POSITION(tb->tb_path); struct buffer_info bi; int n; - struct item_head *ih; RFALSE(tb->FR[0] && B_LEVEL(tb->FR[0]) != DISK_LEAF_NODE_LEVEL + 1, "vs- 12000: level: wrong FR %z", tb->FR[0]); @@ -251,7 +249,6 @@ static int balance_leaf_when_delete(struct tree_balance *tb, int flag) RFALSE(!tb->blknum[0] && !PATH_H_PPARENT(tb->tb_path, 0), "PAP-12010: tree can not be empty"); - ih = item_head(tbS0, item_pos); buffer_info_init_tbS0(tb, &bi); /* Delete or truncate the item */ @@ -298,7 +295,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, if (tb->item_pos == tb->lnum[0] - 1 && tb->lbytes != -1) { /* part of new item falls into L[0] */ int new_item_len, shift; - int version; ret = leaf_shift_left(tb, tb->lnum[0] - 1, -1); @@ -317,8 +313,6 @@ static unsigned int balance_leaf_insert_left(struct tree_balance *tb, leaf_insert_into_buf(&bi, n + tb->item_pos - ret, ih, body, min_t(int, tb->zeroes_num, ih_item_len(ih))); - version = ih_version(ih); - /* * Calculate key component, item length and body to * insert into S[0] @@ -632,7 +626,6 @@ static void balance_leaf_insert_right(struct tree_balance *tb, struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path); int n = B_NR_ITEMS(tbS0); struct buffer_info bi; - int ret; /* new item or part of it doesn't fall into R[0] */ if (n - tb->rnum[0] >= tb->item_pos) { @@ -646,13 +639,11 @@ static void balance_leaf_insert_right(struct tree_balance *tb, if (tb->item_pos == n - tb->rnum[0] + 1 && tb->rbytes != -1) { loff_t old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version, shift; + int shift; loff_t offset; leaf_shift_right(tb, tb->rnum[0] - 1, -1); - version = ih_version(ih); - /* Remember key component and item length */ old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); @@ -698,7 +689,7 @@ static void balance_leaf_insert_right(struct tree_balance *tb, /* whole new item falls into R[0] */ /* Shift rnum[0]-1 items to R[0] */ - ret = leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); + leaf_shift_right(tb, tb->rnum[0] - 1, tb->rbytes); /* Insert new item into R[0] */ buffer_info_init_right(tb, &bi); @@ -950,14 +941,12 @@ static void balance_leaf_new_nodes_insert(struct tree_balance *tb, if (tb->item_pos == n - tb->snum[i] + 1 && tb->sbytes[i] != -1) { int old_key_comp, old_len, r_zeroes_number; const char *r_body; - int version; /* Move snum[i]-1 items from S[0] to S_new[i] */ leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, tb->snum[i] - 1, -1, tb->S_new[i]); /* Remember key component and item length */ - version = ih_version(ih); old_key_comp = le_ih_k_offset(ih); old_len = ih_item_len(ih); diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 6b0ddb2a9091..117092224111 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -376,7 +376,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, int to, int to_bytes, short *snum012, int flow) { int i; - int cur_free; int units; struct virtual_node *vn = tb->tb_vn; int total_node_size, max_node_size, current_item_size; @@ -438,7 +437,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h, /* leaf level */ needed_nodes = 1; total_node_size = 0; - cur_free = max_node_size; /* start from 'from'-th item */ start_item = from; @@ -1734,14 +1732,12 @@ static int dc_check_balance_internal(struct tree_balance *tb, int h) * and Fh is its father. */ struct buffer_head *Sh, *Fh; - int maxsize, ret; + int ret; int lfree, rfree /* free space in L and R */ ; Sh = PATH_H_PBUFFER(tb->tb_path, h); Fh = PATH_H_PPARENT(tb->tb_path, h); - maxsize = MAX_CHILD_SIZE(Sh); - /* * using tb->insert_size[h], which is negative in this case, * create_virtual_node calculates: diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 4517a1394c6f..4b3e3e73b512 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -891,7 +891,6 @@ static int flush_older_commits(struct super_block *s, struct list_head *entry; unsigned int trans_id = jl->j_trans_id; unsigned int other_trans_id; - unsigned int first_trans_id; find_first: /* @@ -914,8 +913,6 @@ find_first: return 0; } - first_trans_id = first_jl->j_trans_id; - entry = &first_jl->j_list; while (1) { other_jl = JOURNAL_LIST_ENTRY(entry); @@ -1351,7 +1348,7 @@ static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { struct reiserfs_journal_list *pjl; - struct reiserfs_journal_cnode *cn, *last; + struct reiserfs_journal_cnode *cn; int count; int was_jwait = 0; int was_dirty = 0; @@ -1509,7 +1506,6 @@ static int flush_journal_list(struct super_block *s, b_blocknr, __func__); } free_cnode: - last = cn; cn = cn->next; if (saved_bh) { /* @@ -1792,7 +1788,6 @@ static int flush_used_journal_lists(struct super_block *s, { unsigned long len = 0; unsigned long cur_len; - int ret; int i; int limit = 256; struct reiserfs_journal_list *tjl; @@ -1829,9 +1824,9 @@ static int flush_used_journal_lists(struct super_block *s, * transactions, but only bother if we've actually spanned * across multiple lists */ - if (flush_jl != jl) { - ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i); - } + if (flush_jl != jl) + kupdate_transactions(s, jl, &tjl, &trans_id, len, i); + flush_journal_list(s, flush_jl, 1); put_journal_list(s, flush_jl); put_journal_list(s, jl); @@ -1911,7 +1906,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, struct super_block *sb, int error) { struct reiserfs_transaction_handle myth; - int flushed = 0; struct reiserfs_journal *journal = SB_JOURNAL(sb); /* @@ -1933,7 +1927,6 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, 1); journal_mark_dirty(&myth, SB_BUFFER_WITH_SB(sb)); do_journal_end(&myth, FLUSH_ALL); - flushed = 1; } } @@ -3444,9 +3437,8 @@ static int remove_from_transaction(struct super_block *sb, if (cn == journal->j_last) { journal->j_last = cn->prev; } - if (bh) - remove_journal_hash(sb, journal->j_hash_table, NULL, - bh->b_blocknr, 0); + remove_journal_hash(sb, journal->j_hash_table, NULL, + bh->b_blocknr, 0); clear_buffer_journaled(bh); /* don't log this one */ if (!already_cleaned) { @@ -3988,7 +3980,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) struct buffer_head *c_bh; /* commit bh */ struct buffer_head *d_bh; /* desc bh */ int cur_write_start = 0; /* start index of current log write */ - int old_start; int i; int flush; int wait_on_commit; @@ -4245,7 +4236,6 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) journal->j_num_work_lists++; /* reset journal values for the next transaction */ - old_start = journal->j_start; journal->j_start = (journal->j_start + journal->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(sb); diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index f5cebd70d903..7f868569d4d0 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -1322,7 +1322,7 @@ void leaf_paste_entries(struct buffer_info *bi, char *item; struct reiserfs_de_head *deh; char *insert_point; - int i, old_entry_num; + int i; struct buffer_head *bh = bi->bi_bh; if (new_entry_count == 0) @@ -1362,7 +1362,6 @@ void leaf_paste_entries(struct buffer_info *bi, put_deh_location(&deh[i], deh_location(&deh[i]) + paste_size); - old_entry_num = ih_entry_count(ih); put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); /* prepare space for pasted records */ diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 415d66ca87d1..34baf5c0f265 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -183,13 +183,12 @@ int reiserfs_convert_objectid_map_v1(struct super_block *s) int new_size = (s->s_blocksize - SB_SIZE) / sizeof(__u32) / 2 * 2; int old_max = sb_oid_maxsize(disk_sb); struct reiserfs_super_block_v1 *disk_sb_v1; - __le32 *objectid_map, *new_objectid_map; + __le32 *objectid_map; int i; disk_sb_v1 = (struct reiserfs_super_block_v1 *)(SB_BUFFER_WITH_SB(s)->b_data); objectid_map = (__le32 *) (disk_sb_v1 + 1); - new_objectid_map = (__le32 *) (disk_sb + 1); if (cur_size > new_size) { /* diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 9fed1c05f1f4..500f2000eb41 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -746,9 +746,6 @@ static void check_leaf_block_head(struct buffer_head *bh) static void check_internal_block_head(struct buffer_head *bh) { - struct block_head *blkh; - - blkh = B_BLK_HEAD(bh); if (!(B_LEVEL(bh) > DISK_LEAF_NODE_LEVEL && B_LEVEL(bh) <= MAX_HEIGHT)) reiserfs_panic(NULL, "vs-6025", "invalid level %z", bh); diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 0037aea97d39..da9ebe33882b 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -593,7 +593,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, struct buffer_head *bh; struct path_element *last_element; int node_level, retval; - int right_neighbor_of_leaf_node; int fs_gen; struct buffer_head *reada_bh[SEARCH_BY_KEY_READA]; b_blocknr_t reada_blocks[SEARCH_BY_KEY_READA]; @@ -614,8 +613,6 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, pathrelse(search_path); - right_neighbor_of_leaf_node = 0; - /* * With each iteration of this loop we search through the items in the * current node, and calculate the next current node(next path element) @@ -701,7 +698,6 @@ io_error: */ block_number = SB_ROOT_BLOCK(sb); expected_level = -1; - right_neighbor_of_leaf_node = 0; /* repeat search from the root */ continue; diff --git a/fs/statfs.c b/fs/statfs.c index eea7af6f2f22..2616424012ea 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -318,19 +318,10 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { struct compat_statfs64 buf; - if (sizeof(ubuf->f_bsize) == 4) { - if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | - kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) - return -EOVERFLOW; - /* f_files and f_ffree may be -1; it's okay - * to stuff that into 32 bits */ - if (kbuf->f_files != 0xffffffffffffffffULL - && (kbuf->f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (kbuf->f_ffree != 0xffffffffffffffffULL - && (kbuf->f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } + + if ((kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + memset(&buf, 0, sizeof(struct compat_statfs64)); buf.f_type = kbuf->f_type; buf.f_bsize = kbuf->f_bsize; diff --git a/fs/super.c b/fs/super.c index 8020974b2a68..cfadab2cbf35 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1300,6 +1300,7 @@ int get_tree_bdev(struct fs_context *fc, mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); + blkdev_put(bdev, mode); warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); return -EBUSY; } @@ -1308,8 +1309,10 @@ int get_tree_bdev(struct fs_context *fc, fc->sget_key = bdev; s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); mutex_unlock(&bdev->bd_fsfreeze_mutex); - if (IS_ERR(s)) + if (IS_ERR(s)) { + blkdev_put(bdev, mode); return PTR_ERR(s); + } if (s->s_root) { /* Don't summarily change the RO/RW state. */ @@ -1555,11 +1558,6 @@ int vfs_get_tree(struct fs_context *fc) sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); - if (fc->subtype && !sb->s_subtype) { - sb->s_subtype = fc->subtype; - fc->subtype = NULL; - } - /* * Write barrier is for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is the diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index eeeae0475da9..0caa151cae4e 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -16,6 +16,7 @@ #include <linux/namei.h> #include <linux/tracefs.h> #include <linux/fsnotify.h> +#include <linux/security.h> #include <linux/seq_file.h> #include <linux/parser.h> #include <linux/magic.h> @@ -390,6 +391,9 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, struct dentry *dentry; struct inode *inode; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (!(mode & S_IFMT)) mode |= S_IFREG; BUG_ON(!S_ISREG(mode)); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index fe6d804a38dc..f9fd18670e22 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1272,21 +1272,23 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx, } static __always_inline int validate_range(struct mm_struct *mm, - __u64 start, __u64 len) + __u64 *start, __u64 len) { __u64 task_size = mm->task_size; - if (start & ~PAGE_MASK) + *start = untagged_addr(*start); + + if (*start & ~PAGE_MASK) return -EINVAL; if (len & ~PAGE_MASK) return -EINVAL; if (!len) return -EINVAL; - if (start < mmap_min_addr) + if (*start < mmap_min_addr) return -EINVAL; - if (start >= task_size) + if (*start >= task_size) return -EINVAL; - if (len > task_size - start) + if (len > task_size - *start) return -EINVAL; return 0; } @@ -1336,7 +1338,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; } - ret = validate_range(mm, uffdio_register.range.start, + ret = validate_range(mm, &uffdio_register.range.start, uffdio_register.range.len); if (ret) goto out; @@ -1525,7 +1527,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) goto out; - ret = validate_range(mm, uffdio_unregister.start, + ret = validate_range(mm, &uffdio_unregister.start, uffdio_unregister.len); if (ret) goto out; @@ -1676,7 +1678,7 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx, if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) goto out; - ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len); if (ret) goto out; @@ -1716,7 +1718,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, sizeof(uffdio_copy)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len); if (ret) goto out; /* @@ -1772,7 +1774,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, sizeof(uffdio_zeropage)-sizeof(__s64))) goto out; - ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + ret = validate_range(ctx->mm, &uffdio_zeropage.range.start, uffdio_zeropage.range.len); if (ret) goto out; diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 5de296b34ab1..14fbdf22b7e7 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -28,12 +28,11 @@ xfs_get_aghdr_buf( struct xfs_mount *mp, xfs_daddr_t blkno, size_t numblks, - int flags, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; - bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0); if (!bp) return NULL; @@ -345,7 +344,7 @@ xfs_ag_init_hdr( { struct xfs_buf *bp; - bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops); + bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, ops); if (!bp) return -ENOMEM; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 58fa85cec325..d6ed5d2c07c2 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -81,9 +81,10 @@ typedef struct xfs_alloc_arg { /* * Defines for datatype */ -#define XFS_ALLOC_INITIAL_USER_DATA (1 << 0)/* special case start of file */ -#define XFS_ALLOC_USERDATA_ZERO (1 << 1)/* zero extent on allocation */ -#define XFS_ALLOC_NOBUSY (1 << 2)/* Busy extents not allowed */ +#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/ +#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */ +#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */ +#define XFS_ALLOC_NOBUSY (1 << 3)/* Busy extents not allowed */ static inline bool xfs_alloc_is_userdata(int datatype) diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index b9f019603d0b..f0089e862216 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -826,32 +826,17 @@ xfs_attr_shortform_to_leaf( sf = (xfs_attr_shortform_t *)tmpbuffer; xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); - xfs_bmap_local_to_extents_empty(dp, XFS_ATTR_FORK); + xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK); bp = NULL; error = xfs_da_grow_inode(args, &blkno); - if (error) { - /* - * If we hit an IO error middle of the transaction inside - * grow_inode(), we may have inconsistent data. Bail out. - */ - if (error == -EIO) - goto out; - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ - memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + if (error) goto out; - } ASSERT(blkno == 0); error = xfs_attr3_leaf_create(args, blkno, &bp); - if (error) { - /* xfs_attr3_leaf_create may not have instantiated a block */ - if (bp && (xfs_da_shrink_inode(args, 0, bp) != 0)) - goto out; - xfs_idata_realloc(dp, size, XFS_ATTR_FORK); /* try to put */ - memcpy(ifp->if_u1.if_data, tmpbuffer, size); /* it back */ + if (error) goto out; - } memset((char *)&nargs, 0, sizeof(nargs)); nargs.dp = dp; diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 054b4ce30033..02469d59c787 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -792,6 +792,7 @@ out_root_realloc: */ void xfs_bmap_local_to_extents_empty( + struct xfs_trans *tp, struct xfs_inode *ip, int whichfork) { @@ -808,6 +809,7 @@ xfs_bmap_local_to_extents_empty( ifp->if_u1.if_root = NULL; ifp->if_height = 0; XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } @@ -840,7 +842,7 @@ xfs_bmap_local_to_extents( ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); if (!ifp->if_bytes) { - xfs_bmap_local_to_extents_empty(ip, whichfork); + xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags = XFS_ILOG_CORE; goto done; } @@ -887,7 +889,7 @@ xfs_bmap_local_to_extents( /* account for the change in fork size */ xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); - xfs_bmap_local_to_extents_empty(ip, whichfork); + xfs_bmap_local_to_extents_empty(tp, ip, whichfork); flags |= XFS_ILOG_CORE; ifp->if_u1.if_root = NULL; @@ -4042,8 +4044,12 @@ xfs_bmapi_allocate( */ if (!(bma->flags & XFS_BMAPI_METADATA)) { bma->datatype = XFS_ALLOC_NOBUSY; - if (whichfork == XFS_DATA_FORK && bma->offset == 0) - bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + if (whichfork == XFS_DATA_FORK) { + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; + else + bma->datatype |= XFS_ALLOC_USERDATA; + } if (bma->flags & XFS_BMAPI_ZERO) bma->datatype |= XFS_ALLOC_USERDATA_ZERO; } @@ -5621,6 +5627,11 @@ xfs_bmse_merge( if (error) return error; + /* change to extent format if required after extent removal */ + error = xfs_bmap_btree_to_extents(tp, ip, cur, logflags, whichfork); + if (error) + return error; + done: xfs_iext_remove(ip, icur, 0); xfs_iext_prev(XFS_IFORK_PTR(ip, whichfork), icur); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 5bb446d80542..e2798c6f3a5f 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -182,7 +182,8 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, xfs_filblks_t len); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version); -void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); +void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, + struct xfs_inode *ip, int whichfork); void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, xfs_filblks_t len, const struct xfs_owner_info *oinfo, bool skip_discard); diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9595ced393dc..49e4bc39e7bb 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -1096,7 +1096,7 @@ xfs_dir2_sf_to_block( memcpy(sfp, oldsfp, ifp->if_bytes); xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK); - xfs_bmap_local_to_extents_empty(dp, XFS_DATA_FORK); + xfs_bmap_local_to_extents_empty(tp, dp, XFS_DATA_FORK); dp->i_d.di_size = 0; /* diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 39dd2b908106..e9371a8e0e26 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -366,11 +366,11 @@ struct xfs_bulkstat { uint64_t bs_blocks; /* number of blocks */ uint64_t bs_xflags; /* extended flags */ - uint64_t bs_atime; /* access time, seconds */ - uint64_t bs_mtime; /* modify time, seconds */ + int64_t bs_atime; /* access time, seconds */ + int64_t bs_mtime; /* modify time, seconds */ - uint64_t bs_ctime; /* inode change time, seconds */ - uint64_t bs_btime; /* creation time, seconds */ + int64_t bs_ctime; /* inode change time, seconds */ + int64_t bs_btime; /* creation time, seconds */ uint32_t bs_gen; /* generation count */ uint32_t bs_uid; /* user id */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a08dd8f40346..ac6cdca63e15 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -928,7 +928,7 @@ xfs_log_sb( xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); - xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } /* diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index a43d1813c4ff..5533e48e605d 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -97,7 +97,6 @@ xchk_allocbt_rec( xfs_agnumber_t agno = bs->cur->bc_private.a.agno; xfs_agblock_t bno; xfs_extlen_t len; - int error = 0; bno = be32_to_cpu(rec->alloc.ar_startblock); len = be32_to_cpu(rec->alloc.ar_blockcount); @@ -109,7 +108,7 @@ xchk_allocbt_rec( xchk_allocbt_xref(bs->sc, bno, len); - return error; + return 0; } /* Scrub the freespace btrees for some AG. */ diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 93b3793bc5b3..0cab11a5d390 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -341,7 +341,6 @@ xchk_refcountbt_rec( xfs_extlen_t len; xfs_nlink_t refcount; bool has_cowflag; - int error = 0; bno = be32_to_cpu(rec->refc.rc_startblock); len = be32_to_cpu(rec->refc.rc_blockcount); @@ -366,7 +365,7 @@ xchk_refcountbt_rec( xchk_refcountbt_xref(bs->sc, bno, len, refcount); - return error; + return 0; } /* Make sure we have as many refc blocks as the rmap says. */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0910cb75b65d..4f443703065e 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -864,6 +864,7 @@ xfs_alloc_file_space( xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; + xfs_fileoff_t endoffset_fsb; int nimaps; int quota_flag; int rt; @@ -891,7 +892,8 @@ xfs_alloc_file_space( imapp = &imaps[0]; nimaps = 1; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); - allocatesize_fsb = XFS_B_TO_FSB(mp, count); + endoffset_fsb = XFS_B_TO_FSB(mp, offset + count); + allocatesize_fsb = endoffset_fsb - startoffset_fsb; /* * Allocate file space until done or until there is an error diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 120ef99d09e8..0abba171aa89 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -345,6 +345,15 @@ xfs_buf_allocate_memory( unsigned short page_count, i; xfs_off_t start, end; int error; + xfs_km_flags_t kmflag_mask = 0; + + /* + * assure zeroed buffer for non-read cases. + */ + if (!(flags & XBF_READ)) { + kmflag_mask |= KM_ZERO; + gfp_mask |= __GFP_ZERO; + } /* * for buffers that are contained within a single page, just allocate @@ -354,7 +363,8 @@ xfs_buf_allocate_memory( size = BBTOB(bp->b_length); if (size < PAGE_SIZE) { int align_mask = xfs_buftarg_dma_alignment(bp->b_target); - bp->b_addr = kmem_alloc_io(size, align_mask, KM_NOFS); + bp->b_addr = kmem_alloc_io(size, align_mask, + KM_NOFS | kmflag_mask); if (!bp->b_addr) { /* low memory - use alloc_page loop instead */ goto use_alloc_page; @@ -2097,7 +2107,7 @@ xfs_verify_magic( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx])) return false; return dmagic == bp->b_ops->magic[idx]; } @@ -2115,7 +2125,7 @@ xfs_verify_magic16( int idx; idx = xfs_sb_version_hascrc(&mp->m_sb); - if (unlikely(WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))) + if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx])) return false; return dmagic == bp->b_ops->magic16[idx]; } diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index d952d5962e93..1ffb179f35d2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -370,21 +370,23 @@ static int xfs_dio_write_end_io( struct kiocb *iocb, ssize_t size, + int error, unsigned flags) { struct inode *inode = file_inode(iocb->ki_filp); struct xfs_inode *ip = XFS_I(inode); loff_t offset = iocb->ki_pos; unsigned int nofs_flag; - int error = 0; trace_xfs_end_io_direct_write(ip, offset, size); if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (size <= 0) - return size; + if (error) + return error; + if (!size) + return 0; /* * Capture amount written on completion as we can't reliably account @@ -441,6 +443,10 @@ out: return error; } +static const struct iomap_dio_ops xfs_dio_write_ops = { + .end_io = xfs_dio_write_end_io, +}; + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -541,7 +547,7 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos); - ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, &xfs_dio_write_ops); /* * If unaligned, this is the only IO in-flight. If it has not yet diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index a2beee9f74da..641d07f30a27 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1443,7 +1443,7 @@ xlog_alloc_log( prev_iclog = iclog; iclog->ic_data = kmem_alloc_io(log->l_iclog_size, align_mask, - KM_MAYFAIL); + KM_MAYFAIL | KM_ZERO); if (!iclog->ic_data) goto out_free_iclog; #ifdef DEBUG diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 508319039dce..c1a514ffff55 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -127,7 +127,7 @@ xlog_alloc_buffer( if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL); + return kmem_alloc_io(BBTOB(nbblks), align_mask, KM_MAYFAIL | KM_ZERO); } /* diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index ddd0bf7a4740..f1bc88f4367c 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -63,19 +63,6 @@ static const struct sysfs_ops xfs_sysfs_ops = { .store = xfs_sysfs_object_store, }; -/* - * xfs_mount kobject. The mp kobject also serves as the per-mount parent object - * that is identified by the fsname under sysfs. - */ - -static inline struct xfs_mount * -to_mp(struct kobject *kobject) -{ - struct xfs_kobj *kobj = to_kobj(kobject); - - return container_of(kobj, struct xfs_mount, m_kobj); -} - static struct attribute *xfs_mp_attrs[] = { NULL, }; |