diff options
Diffstat (limited to 'fs')
86 files changed, 1131 insertions, 462 deletions
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 58c2bbd385ad..57a27c42b5ac 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -1,6 +1,6 @@ config BINFMT_ELF bool "Kernel support for ELF binaries" - depends on MMU && (BROKEN || !FRV) + depends on MMU select ELFCORE default y ---help--- @@ -35,7 +35,7 @@ config ARCH_BINFMT_ELF_STATE config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on (ARM || FRV || BLACKFIN || (SUPERH32 && !MMU) || C6X) + depends on (ARM || (SUPERH32 && !MMU) || C6X) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load @@ -90,7 +90,6 @@ config BINFMT_SCRIPT config BINFMT_FLAT bool "Kernel support for flat binaries" depends on !MMU || ARM || M68K - depends on !FRV || BROKEN help Support uClinux FLAT format binaries. diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 9bb921d120d0..3d2c5e0e854e 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -25,7 +25,7 @@ static void afs_manage_cell(struct work_struct *); static void afs_dec_cells_outstanding(struct afs_net *net) { if (atomic_dec_and_test(&net->cells_outstanding)) - wake_up_atomic_t(&net->cells_outstanding); + wake_up_var(&net->cells_outstanding); } /* @@ -764,7 +764,7 @@ void afs_cell_purge(struct afs_net *net) afs_queue_cell_manager(net); _debug("wait"); - wait_on_atomic_t(&net->cells_outstanding, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&net->cells_outstanding, + !atomic_read(&net->cells_outstanding)); _leave(""); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index f38d6a561a84..72217170b155 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -118,6 +118,7 @@ struct afs_call { bool ret_reply0; /* T if should return reply[0] on success */ bool upgrade; /* T to request service upgrade */ u16 service_id; /* Actual service ID (after upgrade) */ + unsigned int debug_id; /* Trace ID */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ __be32 tmp; /* place to extract temporary data */ diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index e1126659f043..f7ae54b6a393 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -103,8 +103,8 @@ void afs_close_socket(struct afs_net *net) } _debug("outstanding %u", atomic_read(&net->nr_outstanding_calls)); - wait_on_atomic_t(&net->nr_outstanding_calls, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&net->nr_outstanding_calls, + !atomic_read(&net->nr_outstanding_calls)); _debug("no outstanding calls"); kernel_sock_shutdown(net->socket, SHUT_RDWR); @@ -131,6 +131,7 @@ static struct afs_call *afs_alloc_call(struct afs_net *net, call->type = type; call->net = net; + call->debug_id = atomic_inc_return(&rxrpc_debug_id); atomic_set(&call->usage, 1); INIT_WORK(&call->async_work, afs_process_async_call); init_waitqueue_head(&call->waitq); @@ -169,13 +170,14 @@ void afs_put_call(struct afs_call *call) afs_put_server(call->net, call->cm_server); afs_put_cb_interest(call->net, call->cbi); kfree(call->request); - kfree(call); - o = atomic_dec_return(&net->nr_outstanding_calls); trace_afs_call(call, afs_call_trace_free, 0, o, __builtin_return_address(0)); + kfree(call); + + o = atomic_dec_return(&net->nr_outstanding_calls); if (o == 0) - wake_up_atomic_t(&net->nr_outstanding_calls); + wake_up_var(&net->nr_outstanding_calls); } } @@ -378,7 +380,8 @@ long afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, (async ? afs_wake_up_async_call : afs_wake_up_call_waiter), - call->upgrade); + call->upgrade, + call->debug_id); if (IS_ERR(rxcall)) { ret = PTR_ERR(rxcall); goto error_kill_call; @@ -727,7 +730,8 @@ void afs_charge_preallocation(struct work_struct *work) afs_wake_up_async_call, afs_rx_attach, (unsigned long)call, - GFP_KERNEL) < 0) + GFP_KERNEL, + call->debug_id) < 0) break; call = NULL; } diff --git a/fs/afs/server.c b/fs/afs/server.c index 1880f1b6a9f1..a43ef77dabae 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -25,7 +25,7 @@ static void afs_inc_servers_outstanding(struct afs_net *net) static void afs_dec_servers_outstanding(struct afs_net *net) { if (atomic_dec_and_test(&net->servers_outstanding)) - wake_up_atomic_t(&net->servers_outstanding); + wake_up_var(&net->servers_outstanding); } /* @@ -521,8 +521,8 @@ void afs_purge_servers(struct afs_net *net) afs_queue_server_manager(net); _debug("wait"); - wait_on_atomic_t(&net->servers_outstanding, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&net->servers_outstanding, + !atomic_read(&net->servers_outstanding)); _leave(""); } @@ -68,9 +68,9 @@ struct aio_ring { #define AIO_RING_PAGES 8 struct kioctx_table { - struct rcu_head rcu; - unsigned nr; - struct kioctx *table[]; + struct rcu_head rcu; + unsigned nr; + struct kioctx __rcu *table[]; }; struct kioctx_cpu { @@ -115,7 +115,7 @@ struct kioctx { struct page **ring_pages; long nr_pages; - struct work_struct free_work; + struct rcu_work free_rwork; /* see free_ioctx() */ /* * signals when all in-flight requests are done @@ -329,7 +329,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma) for (i = 0; i < table->nr; i++) { struct kioctx *ctx; - ctx = table->table[i]; + ctx = rcu_dereference(table->table[i]); if (ctx && ctx->aio_ring_file == file) { if (!atomic_read(&ctx->dead)) { ctx->user_id = ctx->mmap_base = vma->vm_start; @@ -588,10 +588,15 @@ static int kiocb_cancel(struct aio_kiocb *kiocb) return cancel(&kiocb->common); } +/* + * free_ioctx() should be RCU delayed to synchronize against the RCU + * protected lookup_ioctx() and also needs process context to call + * aio_free_ring(). Use rcu_work. + */ static void free_ioctx(struct work_struct *work) { - struct kioctx *ctx = container_of(work, struct kioctx, free_work); - + struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx, + free_rwork); pr_debug("freeing %p\n", ctx); aio_free_ring(ctx); @@ -609,8 +614,9 @@ static void free_ioctx_reqs(struct percpu_ref *ref) if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) complete(&ctx->rq_wait->comp); - INIT_WORK(&ctx->free_work, free_ioctx); - schedule_work(&ctx->free_work); + /* Synchronize against RCU protected table->table[] dereferences */ + INIT_RCU_WORK(&ctx->free_rwork, free_ioctx); + queue_rcu_work(system_wq, &ctx->free_rwork); } /* @@ -651,9 +657,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) while (1) { if (table) for (i = 0; i < table->nr; i++) - if (!table->table[i]) { + if (!rcu_access_pointer(table->table[i])) { ctx->id = i; - table->table[i] = ctx; + rcu_assign_pointer(table->table[i], ctx); spin_unlock(&mm->ioctx_lock); /* While kioctx setup is in progress, @@ -834,11 +840,11 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, } table = rcu_dereference_raw(mm->ioctx_table); - WARN_ON(ctx != table->table[ctx->id]); - table->table[ctx->id] = NULL; + WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id])); + RCU_INIT_POINTER(table->table[ctx->id], NULL); spin_unlock(&mm->ioctx_lock); - /* percpu_ref_kill() will do the necessary call_rcu() */ + /* free_ioctx_reqs() will do the necessary RCU synchronization */ wake_up_all(&ctx->wait); /* @@ -880,7 +886,8 @@ void exit_aio(struct mm_struct *mm) skipped = 0; for (i = 0; i < table->nr; ++i) { - struct kioctx *ctx = table->table[i]; + struct kioctx *ctx = + rcu_dereference_protected(table->table[i], true); if (!ctx) { skipped++; @@ -1069,7 +1076,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) if (!table || id >= table->nr) goto out; - ctx = table->table[id]; + ctx = rcu_dereference(table->table[id]); if (ctx && ctx->user_id == ctx_id) { percpu_ref_get(&ctx->users); ret = ctx; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index b7c816f39404..26f6b4f41ce6 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -310,7 +310,7 @@ static int autofs_dev_ioctl_closemount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - return sys_close(param->ioctlfd); + return ksys_close(param->ioctlfd); } /* diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a7c5a9861bef..a41b48f82a70 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -241,7 +241,7 @@ ret: return retval; error: if (fd_binary > 0) - sys_close(fd_binary); + ksys_close(fd_binary); bprm->interp_flags = 0; bprm->interp_data = 0; goto ret; diff --git a/fs/block_dev.c b/fs/block_dev.c index 4a181fcb5175..fe09ef9c21f3 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1058,6 +1058,27 @@ retry: return 0; } +static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) +{ + struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); + + if (!disk) + return NULL; + /* + * Now that we hold gendisk reference we make sure bdev we looked up is + * not stale. If it is, it means device got removed and created before + * we looked up gendisk and we fail open in such case. Associating + * unhashed bdev with newly created gendisk could lead to two bdevs + * (and thus two independent caches) being associated with one device + * which is bad. + */ + if (inode_unhashed(bdev->bd_inode)) { + put_disk_and_module(disk); + return NULL; + } + return disk; +} + /** * bd_start_claiming - start claiming a block device * @bdev: block device of interest @@ -1094,7 +1115,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, * @bdev might not have been initialized properly yet, look up * and grab the outer block device the hard way. */ - disk = get_gendisk(bdev->bd_dev, &partno); + disk = bdev_get_gendisk(bdev, &partno); if (!disk) return ERR_PTR(-ENXIO); @@ -1111,8 +1132,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev, else whole = bdgrab(bdev); - module_put(disk->fops->owner); - put_disk(disk); + put_disk_and_module(disk); if (!whole) return ERR_PTR(-ENOMEM); @@ -1407,10 +1427,10 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk; - struct module *owner; int ret; int partno; int perm = 0; + bool first_open = false; if (mode & FMODE_READ) perm |= MAY_READ; @@ -1430,14 +1450,14 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) restart: ret = -ENXIO; - disk = get_gendisk(bdev->bd_dev, &partno); + disk = bdev_get_gendisk(bdev, &partno); if (!disk) goto out; - owner = disk->fops->owner; disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { + first_open = true; bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; @@ -1463,8 +1483,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = NULL; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); goto restart; } } @@ -1524,15 +1543,15 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (ret) goto out_unlock_bdev; } - /* only one opener holds refs to the module and disk */ - put_disk(disk); - module_put(owner); } bdev->bd_openers++; if (for_part) bdev->bd_part_count++; mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); + /* only one opener holds refs to the module and disk */ + if (!first_open) + put_disk_and_module(disk); return 0; out_clear: @@ -1546,8 +1565,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); out: bdput(bdev); @@ -1770,8 +1788,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) disk->fops->release(disk, mode); } if (!bdev->bd_openers) { - struct module *owner = disk->fops->owner; - disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; @@ -1779,8 +1795,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) victim = bdev->bd_contains; bdev->bd_contains = NULL; - put_disk(disk); - module_put(owner); + put_disk_and_module(disk); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f94b2d8c744a..26484648d090 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1519,6 +1519,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) if (!node) break; bytenr = node->val; + shared.share_count = 0; cond_resched(); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1a462ab85c49..da308774b8a4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2974,7 +2974,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); security_free_mnt_opts(&fs_info->security_opts); - kfree(fs_info); + kvfree(fs_info); } /* tree mod log functions from ctree.c */ @@ -3095,7 +3095,10 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); -int btrfs_find_name_in_ext_backref(struct btrfs_path *path, +int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, + const char *name, + int name_len, struct btrfs_inode_ref **ref_ret); +int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, u64 ref_objectid, const char *name, int name_len, struct btrfs_inode_extref **extref_ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c1618ab9fecf..e0460d7b5622 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3990,7 +3990,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) bg = btrfs_lookup_block_group(fs_info, bytenr); ASSERT(bg); if (atomic_dec_and_test(&bg->nocow_writers)) - wake_up_atomic_t(&bg->nocow_writers); + wake_up_var(&bg->nocow_writers); /* * Once for our lookup and once for the lookup done by a previous call * to btrfs_inc_nocow_writers() @@ -4001,8 +4001,7 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg) { - wait_on_atomic_t(&bg->nocow_writers, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers)); } static const char *alloc_name(u64 flags) @@ -6526,7 +6525,7 @@ void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, bg = btrfs_lookup_block_group(fs_info, start); ASSERT(bg); if (atomic_dec_and_test(&bg->reservations)) - wake_up_atomic_t(&bg->reservations); + wake_up_var(&bg->reservations); btrfs_put_block_group(bg); } @@ -6552,8 +6551,7 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) down_write(&space_info->groups_sem); up_write(&space_info->groups_sem); - wait_on_atomic_t(&bg->reservations, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&bg->reservations, !atomic_read(&bg->reservations)); } /** @@ -11061,7 +11059,7 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) ret = btrfs_start_write_no_snapshotting(root); if (ret) break; - wait_on_atomic_t(&root->will_be_snapshotted, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + wait_var_event(&root->will_be_snapshotted, + !atomic_read(&root->will_be_snapshotted)); } } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 39c968f80157..65e1a76bf755 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -22,10 +22,10 @@ #include "transaction.h" #include "print-tree.h" -static int find_name_in_backref(struct btrfs_path *path, const char *name, - int name_len, struct btrfs_inode_ref **ref_ret) +int btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, + const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) { - struct extent_buffer *leaf; struct btrfs_inode_ref *ref; unsigned long ptr; unsigned long name_ptr; @@ -33,9 +33,8 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, u32 cur_offset = 0; int len; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { ref = (struct btrfs_inode_ref *)(ptr + cur_offset); len = btrfs_inode_ref_name_len(leaf, ref); @@ -44,18 +43,19 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name, if (len != name_len) continue; if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { - *ref_ret = ref; + if (ref_ret) + *ref_ret = ref; return 1; } } return 0; } -int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, +int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot, + u64 ref_objectid, const char *name, int name_len, struct btrfs_inode_extref **extref_ret) { - struct extent_buffer *leaf; struct btrfs_inode_extref *extref; unsigned long ptr; unsigned long name_ptr; @@ -63,9 +63,8 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid, u32 cur_offset = 0; int ref_name_len; - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + item_size = btrfs_item_size_nr(leaf, slot); + ptr = btrfs_item_ptr_offset(leaf, slot); /* * Search all extended backrefs in this item. We're only @@ -113,7 +112,9 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, return ERR_PTR(ret); if (ret > 0) return NULL; - if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref)) + if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len, + &extref)) return NULL; return extref; } @@ -155,7 +156,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * This should always succeed so error here will make the FS * readonly. */ - if (!btrfs_find_name_in_ext_backref(path, ref_objectid, + if (!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], + ref_objectid, name, name_len, &extref)) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; @@ -225,7 +227,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, } else if (ret < 0) { goto out; } - if (!find_name_in_backref(path, name, name_len, &ref)) { + if (!btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, name_len, &ref)) { ret = -ENOENT; search_ext_refs = 1; goto out; @@ -293,7 +296,9 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, ins_len); if (ret == -EEXIST) { - if (btrfs_find_name_in_ext_backref(path, ref_objectid, + if (btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, name, name_len, NULL)) goto out; @@ -351,7 +356,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EEXIST) { u32 old_size; - if (find_name_in_backref(path, name, name_len, &ref)) + if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], + name, name_len, &ref)) goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); @@ -365,7 +371,9 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ret = 0; } else if (ret < 0) { if (ret == -EOVERFLOW) { - if (find_name_in_backref(path, name, name_len, &ref)) + if (btrfs_find_name_in_backref(path->nodes[0], + path->slots[0], + name, name_len, &ref)) ret = -EEXIST; else ret = -EMLINK; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a79299a89b7d..f53470112670 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2043,12 +2043,15 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, struct inode *inode, struct list_head *list) { struct btrfs_ordered_sum *sum; + int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - btrfs_csum_file_blocks(trans, + ret = btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root->fs_info->csum_root, sum); trans->adding_csums = false; + if (ret) + return ret; } return 0; } @@ -3062,7 +3065,11 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, inode, &ordered_extent->list); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out; + } btrfs_ordered_update_i_size(inode, 0, ordered_extent); ret = btrfs_update_inode_fallback(trans, root, inode); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 111ee282b777..3278ae592a2c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -723,7 +723,7 @@ fail: btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); dec_and_free: if (atomic_dec_and_test(&root->will_be_snapshotted)) - wake_up_atomic_t(&root->will_be_snapshotted); + wake_up_var(&root->will_be_snapshotted); free_pending: kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index dec0907dfb8a..fcfc20de2df3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1370,6 +1370,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, stripe_start = stripe->physical; if (physical >= stripe_start && physical < stripe_start + rbio->stripe_len && + stripe->dev->bdev && bio->bi_disk == stripe->dev->bdev->bd_disk && bio->bi_partno == stripe->dev->bdev->bd_partno) { return i; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f0c3f00e97cb..cd2298d185dd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3268,8 +3268,22 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL, - 0); + ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, + NULL, 0); + if (ret) { + unlock_page(page); + put_page(page); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end, + EXTENT_LOCKED | EXTENT_BOUNDARY); + goto out; + + } set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f306c608dc28..484e2af793de 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5005,6 +5005,9 @@ static int send_hole(struct send_ctx *sctx, u64 end) u64 len; int ret = 0; + if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) + return send_update_extent(sctx, offset, end - offset); + p = fs_path_alloc(); if (!p) return -ENOMEM; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 6e71a2a78363..4b817947e00f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1545,7 +1545,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, * it for searching for existing supers, so this lets us do that and * then open_ctree will properly initialize everything later. */ - fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); + fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); if (!fs_info) { error = -ENOMEM; goto error_sec_opts; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4fd19b4d6675..434457794c27 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -967,7 +967,9 @@ static noinline int backref_in_log(struct btrfs_root *log, ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); if (key->type == BTRFS_INODE_EXTREF_KEY) { - if (btrfs_find_name_in_ext_backref(path, ref_objectid, + if (btrfs_find_name_in_ext_backref(path->nodes[0], + path->slots[0], + ref_objectid, name, namelen, NULL)) match = 1; @@ -1191,7 +1193,8 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, read_extent_buffer(eb, *name, (unsigned long)&extref->name, *namelen); - *index = btrfs_inode_extref_index(eb, extref); + if (index) + *index = btrfs_inode_extref_index(eb, extref); if (parent_objectid) *parent_objectid = btrfs_inode_extref_parent(eb, extref); @@ -1212,12 +1215,102 @@ static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); - *index = btrfs_inode_ref_index(eb, ref); + if (index) + *index = btrfs_inode_ref_index(eb, ref); return 0; } /* + * Take an inode reference item from the log tree and iterate all names from the + * inode reference item in the subvolume tree with the same key (if it exists). + * For any name that is not in the inode reference item from the log tree, do a + * proper unlink of that name (that is, remove its entry from the inode + * reference item and both dir index keys). + */ +static int unlink_old_inode_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_inode *inode, + struct extent_buffer *log_eb, + int log_slot, + struct btrfs_key *key) +{ + int ret; + unsigned long ref_ptr; + unsigned long ref_end; + struct extent_buffer *eb; + +again: + btrfs_release_path(path); + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret > 0) { + ret = 0; + goto out; + } + if (ret < 0) + goto out; + + eb = path->nodes[0]; + ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); + ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]); + while (ref_ptr < ref_end) { + char *name = NULL; + int namelen; + u64 parent_id; + + if (key->type == BTRFS_INODE_EXTREF_KEY) { + ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + NULL, &parent_id); + } else { + parent_id = key->offset; + ret = ref_get_fields(eb, ref_ptr, &namelen, &name, + NULL); + } + if (ret) + goto out; + + if (key->type == BTRFS_INODE_EXTREF_KEY) + ret = btrfs_find_name_in_ext_backref(log_eb, log_slot, + parent_id, name, + namelen, NULL); + else + ret = btrfs_find_name_in_backref(log_eb, log_slot, name, + namelen, NULL); + + if (!ret) { + struct inode *dir; + + btrfs_release_path(path); + dir = read_one_inode(root, parent_id); + if (!dir) { + ret = -ENOENT; + kfree(name); + goto out; + } + ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + inode, name, namelen); + kfree(name); + iput(dir); + if (ret) + goto out; + goto again; + } + + kfree(name); + ref_ptr += namelen; + if (key->type == BTRFS_INODE_EXTREF_KEY) + ref_ptr += sizeof(struct btrfs_inode_extref); + else + ref_ptr += sizeof(struct btrfs_inode_ref); + } + ret = 0; + out: + btrfs_release_path(path); + return ret; +} + +/* * replay one inode back reference item found in the log tree. * eb, slot and key refer to the buffer and key found in the log tree. * root is the destination we are replaying into, and path is for temp @@ -1345,6 +1438,19 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } } + /* + * Before we overwrite the inode reference item in the subvolume tree + * with the item from the log tree, we must unlink all names from the + * parent directory that are in the subvolume's tree inode reference + * item, otherwise we end up with an inconsistent subvolume tree where + * dir index entries exist for a name but there is no inode reference + * item with the same name. + */ + ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, + key); + if (ret) + goto out; + /* finally write the back reference in the inode */ ret = overwrite_item(trans, root, path, eb, slot, key); out: @@ -5853,7 +5959,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, * this will force the logging code to walk the dentry chain * up for the file */ - if (S_ISREG(inode->vfs_inode.i_mode)) + if (!S_ISDIR(inode->vfs_inode.i_mode)) inode->last_unlink_trans = trans->transid; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2ceb924ca0d6..b2d05c6b1c56 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4829,10 +4829,13 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ndevs = min(ndevs, devs_max); /* - * the primary goal is to maximize the number of stripes, so use as many - * devices as possible, even if the stripes are not maximum sized. + * The primary goal is to maximize the number of stripes, so use as + * many devices as possible, even if the stripes are not maximum sized. + * + * The DUP profile stores more than one stripe per device, the + * max_avail is the total size so we have to adjust. */ - stripe_size = devices_info[ndevs-1].max_avail; + stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes); num_stripes = ndevs * dev_stripes; /* @@ -4867,8 +4870,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; } - stripe_size = div_u64(stripe_size, dev_stripes); - /* align to BTRFS_STRIPE_LEN */ stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6582c4507e6c..0e5bd3e3344e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3965,6 +3965,32 @@ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode) } /* + * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it + * looks like the link count will hit 0, drop any other caps (other + * than PIN) we don't specifically want (due to the file still being + * open). + */ +int ceph_drop_caps_for_unlink(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + spin_lock(&ci->i_ceph_lock); + if (inode->i_nlink == 1) { + drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); + + ci->i_ceph_flags |= CEPH_I_NODELAY; + if (__ceph_caps_dirty(ci)) { + struct ceph_mds_client *mdsc = + ceph_inode_to_client(inode)->mdsc; + __cap_delay_requeue_front(mdsc, ci); + } + } + spin_unlock(&ci->i_ceph_lock); + return drop; +} + +/* * Helpers for embedding cap and dentry lease releases into mds * requests. * diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0c4346806e17..f1d9c6cc0491 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1003,26 +1003,6 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, } /* - * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it - * looks like the link count will hit 0, drop any other caps (other - * than PIN) we don't specifically want (due to the file still being - * open). - */ -static int drop_caps_for_unlink(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - - spin_lock(&ci->i_ceph_lock); - if (inode->i_nlink == 1) { - drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); - ci->i_ceph_flags |= CEPH_I_NODELAY; - } - spin_unlock(&ci->i_ceph_lock); - return drop; -} - -/* * rmdir and unlink are differ only by the metadata op code */ static int ceph_unlink(struct inode *dir, struct dentry *dentry) @@ -1056,7 +1036,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; - req->r_inode_drop = drop_caps_for_unlink(inode); + req->r_inode_drop = ceph_drop_caps_for_unlink(inode); err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) d_delete(dentry); @@ -1104,8 +1084,10 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_RDCACHE on source inode (mds will lock it) */ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; - if (d_really_is_positive(new_dentry)) - req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry)); + if (d_really_is_positive(new_dentry)) { + req->r_inode_drop = + ceph_drop_caps_for_unlink(d_inode(new_dentry)); + } err = ceph_mdsc_do_request(mdsc, old_dir, req); if (!err && !req->r_reply_info.head->is_dentry) { /* diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 6639926eed4e..b67eec3532a1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -640,7 +640,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, struct ceph_aio_request { struct kiocb *iocb; size_t total_len; - int write; + bool write; + bool should_dirty; int error; struct list_head osd_reqs; unsigned num_reqs; @@ -750,7 +751,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } } - ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write); + ceph_put_page_vector(osd_data->pages, num_pages, aio_req->should_dirty); ceph_osdc_put_request(req); if (rc < 0) @@ -847,6 +848,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; + bool should_dirty = !write && iter_is_iovec(iter); if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; @@ -914,6 +916,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (aio_req) { aio_req->iocb = iocb; aio_req->write = write; + aio_req->should_dirty = should_dirty; INIT_LIST_HEAD(&aio_req->osd_reqs); if (write) { aio_req->mtime = mtime; @@ -971,7 +974,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, len = ret; } - ceph_put_page_vector(pages, num_pages, !write); + ceph_put_page_vector(pages, num_pages, should_dirty); ceph_osdc_put_request(req); if (ret < 0) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a62d2a9841dc..fb2bc9c15a23 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -225,6 +225,7 @@ static int parse_fsopt_token(char *c, void *private) return -ENOMEM; break; case Opt_mds_namespace: + kfree(fsopt->mds_namespace); fsopt->mds_namespace = kstrndup(argstr[0].from, argstr[0].to-argstr[0].from, GFP_KERNEL); @@ -232,6 +233,7 @@ static int parse_fsopt_token(char *c, void *private) return -ENOMEM; break; case Opt_fscache_uniq: + kfree(fsopt->fscache_uniq); fsopt->fscache_uniq = kstrndup(argstr[0].from, argstr[0].to-argstr[0].from, GFP_KERNEL); @@ -711,14 +713,17 @@ static int __init init_caches(void) goto bad_dentry; ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); - if (!ceph_file_cachep) goto bad_file; - if ((error = ceph_fscache_register())) - goto bad_file; + error = ceph_fscache_register(); + if (error) + goto bad_fscache; return 0; + +bad_fscache: + kmem_cache_destroy(ceph_file_cachep); bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: @@ -836,7 +841,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) int err; unsigned long started = jiffies; /* note the start time */ struct dentry *root; - int first = 0; /* first vfsmount for this super_block */ dout("mount start %p\n", fsc); mutex_lock(&fsc->client->mount_mutex); @@ -861,17 +865,17 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) path = fsc->mount_options->server_path + 1; dout("mount opening path %s\n", path); } + + err = ceph_fs_debugfs_init(fsc); + if (err < 0) + goto out; + root = open_root_dentry(fsc, path, started); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; } fsc->sb->s_root = dget(root); - first = 1; - - err = ceph_fs_debugfs_init(fsc); - if (err < 0) - goto fail; } else { root = dget(fsc->sb->s_root); } @@ -881,11 +885,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) mutex_unlock(&fsc->client->mount_mutex); return root; -fail: - if (first) { - dput(fsc->sb->s_root); - fsc->sb->s_root = NULL; - } out: mutex_unlock(&fsc->client->mount_mutex); return ERR_PTR(err); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 21b2e5b004eb..1c2086e0fec2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -987,7 +987,7 @@ extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc); - +extern int ceph_drop_caps_for_unlink(struct inode *inode); extern int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force); extern int ceph_encode_dentry_release(void **p, struct dentry *dn, diff --git a/fs/dcookies.c b/fs/dcookies.c index 0d0461cf2431..57bc96435feb 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -146,7 +146,7 @@ out: /* And here is where the userspace process can look up the cookie value * to retrieve the path. */ -SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) +static int do_lookup_dcookie(u64 cookie64, char __user *buf, size_t len) { unsigned long cookie = (unsigned long)cookie64; int err = -EINVAL; @@ -203,13 +203,18 @@ out: return err; } +SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) +{ + return do_lookup_dcookie(cookie64, buf, len); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) { #ifdef __BIG_ENDIAN - return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); + return do_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); #else - return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); + return do_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); #endif } #endif diff --git a/fs/direct-io.c b/fs/direct-io.c index a0ca9e48e993..1357ef563893 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1274,8 +1274,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, */ if (dio->is_async && iov_iter_rw(iter) == WRITE) { retval = 0; - if ((iocb->ki_filp->f_flags & O_DSYNC) || - IS_SYNC(iocb->ki_filp->f_mapping->host)) + if (iocb->ki_flags & IOCB_DSYNC) retval = dio_set_defer_completion(dio); else if (!dio->inode->i_sb->s_dio_done_wq) { /* diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index cff79ea0c01d..5243989a60cc 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -482,7 +482,6 @@ static void lowcomms_error_report(struct sock *sk) { struct connection *con; struct sockaddr_storage saddr; - int buflen; void (*orig_report)(struct sock *) = NULL; read_lock_bh(&sk->sk_callback_lock); @@ -492,7 +491,7 @@ static void lowcomms_error_report(struct sock *sk) orig_report = listen_sock.sk_error_report; if (con->sock == NULL || - kernel_getpeername(con->sock, (struct sockaddr *)&saddr, &buflen)) { + kernel_getpeername(con->sock, (struct sockaddr *)&saddr) < 0) { printk_ratelimited(KERN_ERR "dlm: node %d: socket error " "sending to node %d, port %d, " "sk_err=%d/%d\n", dlm_our_nodeid(), @@ -757,8 +756,8 @@ static int tcp_accept_from_sock(struct connection *con) /* Get the connected socket's peer */ memset(&peeraddr, 0, sizeof(peeraddr)); - if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, - &len, 2)) { + len = newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr, 2); + if (len < 0) { result = -ECONNABORTED; goto accept_err; } diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 5f22e74bbade..8e568428c88b 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -8,6 +8,7 @@ */ #include <linux/efi.h> +#include <linux/delay.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/mount.h> @@ -74,6 +75,11 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, ssize_t size = 0; int err; + while (!__ratelimit(&file->f_cred->user->ratelimit)) { + if (!msleep_interruptible(50)) + return -EINTR; + } + err = efivar_entry_size(var, &datasize); /* diff --git a/fs/eventfd.c b/fs/eventfd.c index 012f5bd46dfa..08d3bd602f73 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -380,7 +380,7 @@ struct eventfd_ctx *eventfd_ctx_fileget(struct file *file) } EXPORT_SYMBOL_GPL(eventfd_ctx_fileget); -SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) +static int do_eventfd(unsigned int count, int flags) { struct eventfd_ctx *ctx; int fd; @@ -409,8 +409,13 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) return fd; } +SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags) +{ + return do_eventfd(count, flags); +} + SYSCALL_DEFINE1(eventfd, unsigned int, count) { - return sys_eventfd2(count, 0); + return do_eventfd(count, 0); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 0f3494ed3ed0..602ca4285b2e 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1936,7 +1936,7 @@ static void clear_tfile_check_list(void) /* * Open an eventpoll file descriptor. */ -SYSCALL_DEFINE1(epoll_create1, int, flags) +static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; @@ -1979,12 +1979,17 @@ out_free_ep: return error; } +SYSCALL_DEFINE1(epoll_create1, int, flags) +{ + return do_epoll_create(flags); +} + SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; - return sys_epoll_create1(0); + return do_epoll_create(0); } /* @@ -2148,8 +2153,8 @@ error_return: * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ -SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, - int, maxevents, int, timeout) +static int do_epoll_wait(int epfd, struct epoll_event __user *events, + int maxevents, int timeout) { int error; struct fd f; @@ -2190,6 +2195,12 @@ error_fput: return error; } +SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, + int, maxevents, int, timeout) +{ + return do_epoll_wait(epfd, events, maxevents, timeout); +} + /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). @@ -2214,7 +2225,7 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, set_current_blocked(&ksigmask); } - error = sys_epoll_wait(epfd, events, maxevents, timeout); + error = do_epoll_wait(epfd, events, maxevents, timeout); /* * If we changed the signal mask, we need to restore the original one. @@ -2257,7 +2268,7 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, set_current_blocked(&ksigmask); } - err = sys_epoll_wait(epfd, events, maxevents, timeout); + err = do_epoll_wait(epfd, events, maxevents, timeout); /* * If we changed the signal mask, we need to restore the original one. diff --git a/fs/fcntl.c b/fs/fcntl.c index 1e97f1fda90c..d737ff082472 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -607,8 +607,8 @@ static int fixup_compat_flock(struct flock *flock) return 0; } -COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg) +static long do_compat_fcntl64(unsigned int fd, unsigned int cmd, + compat_ulong_t arg) { struct fd f = fdget_raw(fd); struct flock flock; @@ -672,6 +672,12 @@ out_put: return err; } +COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + return do_compat_fcntl64(fd, cmd, arg); +} + COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { @@ -684,7 +690,7 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, case F_OFD_SETLKW: return -EINVAL; } - return compat_sys_fcntl64(fd, cmd, arg); + return do_compat_fcntl64(fd, cmd, arg); } #endif diff --git a/fs/file.c b/fs/file.c index 42f0db4bd0fb..7ffd6e9d103d 100644 --- a/fs/file.c +++ b/fs/file.c @@ -638,6 +638,7 @@ out_unlock: spin_unlock(&files->file_lock); return -EBADF; } +EXPORT_SYMBOL(__close_fd); /* for ksys_close() */ void do_close_on_exec(struct files_struct *files) { @@ -870,7 +871,7 @@ out_unlock: return err; } -SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags) { int err = -EBADF; struct file *file; @@ -904,6 +905,11 @@ out_unlock: return err; } +SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags) +{ + return ksys_dup3(oldfd, newfd, flags); +} + SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ @@ -916,10 +922,10 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) rcu_read_unlock(); return retval; } - return sys_dup3(oldfd, newfd, 0); + return ksys_dup3(oldfd, newfd, 0); } -SYSCALL_DEFINE1(dup, unsigned int, fildes) +int ksys_dup(unsigned int fildes) { int ret = -EBADF; struct file *file = fget_raw(fildes); @@ -934,6 +940,11 @@ SYSCALL_DEFINE1(dup, unsigned int, fildes) return ret; } +SYSCALL_DEFINE1(dup, unsigned int, fildes) +{ + return ksys_dup(fildes); +} + int f_dupfd(unsigned int from, struct file *file, unsigned flags) { int err; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index ff84258132bb..d705125665f0 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -557,9 +557,10 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) * n_active reaches 0). This makes sure outstanding reads and writes * have completed. */ - if (!atomic_dec_and_test(&cookie->n_active)) - wait_on_atomic_t(&cookie->n_active, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + if (!atomic_dec_and_test(&cookie->n_active)) { + wait_var_event(&cookie->n_active, + !atomic_read(&cookie->n_active)); + } /* Make sure any pending writes are cancelled. */ if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 86d6a4435c87..51f940e76c5e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -807,9 +807,6 @@ do_alloc: iomap->length = hole_size(inode, lblock, &mp); else iomap->length = size - pos; - } else { - if (height <= ip->i_height) - iomap->length = hole_size(inode, lblock, &mp); } goto out_release; } diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index ffaec2e7526c..cb8374af08a6 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -84,7 +84,7 @@ extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd); extern int make_symlink(const char *from, const char *to); extern int unlink_file(const char *file); extern int do_mkdir(const char *file, int mode); -extern int do_rmdir(const char *file); +extern int hostfs_do_rmdir(const char *file); extern int do_mknod(const char *file, int mode, unsigned int major, unsigned int minor); extern int link_file(const char *from, const char *to); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index c148e7f4f451..3cd85eb5bbb1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -706,7 +706,7 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) if ((file = dentry_name(dentry)) == NULL) return -ENOMEM; - err = do_rmdir(file); + err = hostfs_do_rmdir(file); __putname(file); return err; } diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 9c1e0f019880..5ecc4706172b 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -304,7 +304,7 @@ int do_mkdir(const char *file, int mode) return 0; } -int do_rmdir(const char *file) +int hostfs_do_rmdir(const char *file) { int err; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8fe1b0aa2896..b9a254dcc0e7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } +/* + * Mask used when checking the page offset value passed in via system + * calls. This value will be converted to a loff_t which is signed. + * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the + * value. The extra bit (- 1 in the shift value) is to take the sign + * bit into account. + */ +#define PGOFF_LOFFT_MAX \ + (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); @@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &hugetlb_vm_ops; /* - * Offset passed to mmap (before page shift) could have been - * negative when represented as a (l)off_t. + * page based offset in vm_pgoff could be sufficiently large to + * overflow a (l)off_t when converted to byte offset. */ - if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; + /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; diff --git a/fs/internal.h b/fs/internal.h index df262f41a0ef..980d005b21b4 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,7 +55,15 @@ extern void __init chrdev_init(void); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); +long do_mknodat(int dfd, const char __user *filename, umode_t mode, + unsigned int dev); +long do_mkdirat(int dfd, const char __user *pathname, umode_t mode); +long do_rmdir(int dfd, const char __user *pathname); long do_unlinkat(int dfd, struct filename *name); +long do_symlinkat(const char __user *oldname, int newdfd, + const char __user *newname); +int do_linkat(int olddfd, const char __user *oldname, int newdfd, + const char __user *newname, int flags); /* * namespace.c @@ -111,6 +119,12 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname, extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); +long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +long do_faccessat(int dfd, const char __user *filename, int mode); +int do_fchmodat(int dfd, const char __user *filename, umode_t mode); +int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, + int flag); + extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file *filp_clone_open(struct file *); diff --git a/fs/ioctl.c b/fs/ioctl.c index 5ace7efb0d04..4823431d1c9d 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -689,7 +689,7 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, return error; } -SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg) { int error; struct fd f = fdget(fd); @@ -702,3 +702,8 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) fdput(f); return error; } + +SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) +{ + return ksys_ioctl(fd, cmd, arg); +} diff --git a/fs/minix/Kconfig b/fs/minix/Kconfig index f2a0cfcef11d..bcd53a79156f 100644 --- a/fs/minix/Kconfig +++ b/fs/minix/Kconfig @@ -18,7 +18,7 @@ config MINIX_FS config MINIX_FS_NATIVE_ENDIAN def_bool MINIX_FS - depends on M32R || MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) + depends on MICROBLAZE || MIPS || S390 || SUPERH || SPARC || XTENSA || (M68K && !MMU) config MINIX_FS_BIG_ENDIAN_16BIT_INDEXED def_bool MINIX_FS diff --git a/fs/namei.c b/fs/namei.c index a3cd028e8a9b..a09419379f5d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -559,9 +559,10 @@ static int __nd_alloc_stack(struct nameidata *nd) static bool path_connected(const struct path *path) { struct vfsmount *mnt = path->mnt; + struct super_block *sb = mnt->mnt_sb; - /* Only bind mounts can have disconnected paths */ - if (mnt->mnt_root == mnt->mnt_sb->s_root) + /* Bind mounts and multi-root filesystems can have disconnected paths */ + if (!(sb->s_iflags & SB_I_MULTIROOT) && (mnt->mnt_root == sb->s_root)) return true; return is_subdir(path->dentry, mnt->mnt_root); @@ -3721,8 +3722,8 @@ static int may_mknod(umode_t mode) } } -SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, - unsigned, dev) +long do_mknodat(int dfd, const char __user *filename, umode_t mode, + unsigned int dev) { struct dentry *dentry; struct path path; @@ -3765,9 +3766,15 @@ out: return error; } +SYSCALL_DEFINE4(mknodat, int, dfd, const char __user *, filename, umode_t, mode, + unsigned int, dev) +{ + return do_mknodat(dfd, filename, mode, dev); +} + SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, dev) { - return sys_mknodat(AT_FDCWD, filename, mode, dev); + return do_mknodat(AT_FDCWD, filename, mode, dev); } int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -3796,7 +3803,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } EXPORT_SYMBOL(vfs_mkdir); -SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) +long do_mkdirat(int dfd, const char __user *pathname, umode_t mode) { struct dentry *dentry; struct path path; @@ -3821,9 +3828,14 @@ retry: return error; } +SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode) +{ + return do_mkdirat(dfd, pathname, mode); +} + SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) { - return sys_mkdirat(AT_FDCWD, pathname, mode); + return do_mkdirat(AT_FDCWD, pathname, mode); } int vfs_rmdir(struct inode *dir, struct dentry *dentry) @@ -3865,7 +3877,7 @@ out: } EXPORT_SYMBOL(vfs_rmdir); -static long do_rmdir(int dfd, const char __user *pathname) +long do_rmdir(int dfd, const char __user *pathname) { int error = 0; struct filename *name; @@ -4101,8 +4113,8 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) } EXPORT_SYMBOL(vfs_symlink); -SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, - int, newdfd, const char __user *, newname) +long do_symlinkat(const char __user *oldname, int newdfd, + const char __user *newname) { int error; struct filename *from; @@ -4132,9 +4144,15 @@ out_putname: return error; } +SYSCALL_DEFINE3(symlinkat, const char __user *, oldname, + int, newdfd, const char __user *, newname) +{ + return do_symlinkat(oldname, newdfd, newname); +} + SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newname) { - return sys_symlinkat(oldname, AT_FDCWD, newname); + return do_symlinkat(oldname, AT_FDCWD, newname); } /** @@ -4226,8 +4244,8 @@ EXPORT_SYMBOL(vfs_link); * with linux 2.0, and to avoid hard-linking to directories * and other special files. --ADM */ -SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname, int, flags) +int do_linkat(int olddfd, const char __user *oldname, int newdfd, + const char __user *newname, int flags) { struct dentry *new_dentry; struct path old_path, new_path; @@ -4291,9 +4309,15 @@ out: return error; } +SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, int, flags) +{ + return do_linkat(olddfd, oldname, newdfd, newname, flags); +} + SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname) { - return sys_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } /** @@ -4471,8 +4495,8 @@ out: } EXPORT_SYMBOL(vfs_rename); -SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, - int, newdfd, const char __user *, newname, unsigned int, flags) +static int do_renameat2(int olddfd, const char __user *oldname, int newdfd, + const char __user *newname, unsigned int flags) { struct dentry *old_dentry, *new_dentry; struct dentry *trap; @@ -4614,15 +4638,21 @@ exit: return error; } +SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, + int, newdfd, const char __user *, newname, unsigned int, flags) +{ + return do_renameat2(olddfd, oldname, newdfd, newname, flags); +} + SYSCALL_DEFINE4(renameat, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname) { - return sys_renameat2(olddfd, oldname, newdfd, newname, 0); + return do_renameat2(olddfd, oldname, newdfd, newname, 0); } SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newname) { - return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); + return do_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } int vfs_whiteout(struct inode *dir, struct dentry *dentry) diff --git a/fs/namespace.c b/fs/namespace.c index 9d1374ab6e06..e398f32d7541 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1680,7 +1680,7 @@ static inline bool may_mandlock(void) * unixes. Our API is identical to OSF/1 to avoid making a mess of AMD */ -SYSCALL_DEFINE2(umount, char __user *, name, int, flags) +int ksys_umount(char __user *name, int flags) { struct path path; struct mount *mnt; @@ -1720,6 +1720,11 @@ out: return retval; } +SYSCALL_DEFINE2(umount, char __user *, name, int, flags) +{ + return ksys_umount(name, flags); +} + #ifdef __ARCH_WANT_SYS_OLDUMOUNT /* @@ -1727,7 +1732,7 @@ out: */ SYSCALL_DEFINE1(oldumount, char __user *, name) { - return sys_umount(name, 0); + return ksys_umount(name, 0); } #endif @@ -3032,8 +3037,8 @@ struct dentry *mount_subtree(struct vfsmount *mnt, const char *name) } EXPORT_SYMBOL(mount_subtree); -SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, - char __user *, type, unsigned long, flags, void __user *, data) +int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type, + unsigned long flags, void __user *data) { int ret; char *kernel_type; @@ -3066,6 +3071,12 @@ out_type: return ret; } +SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, + char __user *, type, unsigned long, flags, void __user *, data) +{ + return ksys_mount(dev_name, dir_name, type, flags, data); +} + /* * Return true if path is reachable from root * diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 2435af56b87e..a50d7813e3ea 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -572,7 +572,7 @@ out: } static bool -validate_bitmap_values(unsigned long mask) +validate_bitmap_values(unsigned int mask) { return (mask & ~RCA4_TYPE_MASK_ALL) == 0; } @@ -596,17 +596,15 @@ __be32 nfs4_callback_recallany(void *argp, void *resp, goto out; status = cpu_to_be32(NFS4_OK); - if (test_bit(RCA4_TYPE_MASK_RDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_RDATA_DLG)) flags = FMODE_READ; - if (test_bit(RCA4_TYPE_MASK_WDATA_DLG, (const unsigned long *) - &args->craa_type_mask)) + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_WDATA_DLG)) flags |= FMODE_WRITE; - if (test_bit(RCA4_TYPE_MASK_FILE_LAYOUT, (const unsigned long *) - &args->craa_type_mask)) - pnfs_recall_all_layouts(cps->clp); if (flags) nfs_expire_unused_delegation_types(cps->clp, flags); + + if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT)) + pnfs_recall_all_layouts(cps->clp); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 8c10b0562e75..621c517b325c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -86,10 +86,10 @@ struct nfs_direct_req { struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX]; int mirror_count; + loff_t io_start; /* Start offset for I/O */ ssize_t count, /* bytes actually processed */ max_count, /* max expected count */ bytes_left, /* bytes left to be sent */ - io_start, /* start of IO */ error; /* any reported error */ struct completion completion; /* wait for i/o completion */ diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 7d893543cf3b..d17a90c4fa37 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -85,11 +85,6 @@ int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); -int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode) -{ - return nfs_wait_killable(mode); -} - /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 49f848fd1f04..7327930ad970 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -873,7 +873,7 @@ static void nfs3_nlm_release_call(void *data) } } -const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { +static const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { .nlmclnt_alloc_call = nfs3_nlm_alloc_call, .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare, .nlmclnt_release_call = nfs3_nlm_release_call, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 04612c24d394..979631411a0e 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -868,8 +868,10 @@ static int nfs4_set_client(struct nfs_server *server, if (IS_ERR(clp)) return PTR_ERR(clp); - if (server->nfs_client == clp) + if (server->nfs_client == clp) { + nfs_put_client(clp); return -ELOOP; + } /* * Query for the lease time on clientid setup or renewal @@ -1244,11 +1246,11 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, clp->cl_proto, clnt->cl_timeout, clp->cl_minorversion, net); clear_bit(NFS_MIG_TSM_POSSIBLE, &server->mig_status); - nfs_put_client(clp); if (error != 0) { nfs_server_insert_lists(server); return error; } + nfs_put_client(clp); if (server->nfs_client->cl_hostname == NULL) server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 18a7626ac638..67d19cd92e44 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -98,8 +98,8 @@ nfs_page_free(struct nfs_page *p) int nfs_iocounter_wait(struct nfs_lock_context *l_ctx) { - return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable, - TASK_KILLABLE); + return wait_var_event_killable(&l_ctx->io_count, + !atomic_read(&l_ctx->io_count)); } /** @@ -395,7 +395,7 @@ static void nfs_clear_request(struct nfs_page *req) } if (l_ctx != NULL) { if (atomic_dec_and_test(&l_ctx->io_count)) { - wake_up_atomic_t(&l_ctx->io_count); + wake_up_var(&l_ctx->io_count); if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c13e826614b5..ee723aa153a3 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -292,8 +292,11 @@ pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo) void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { - struct inode *inode = lo->plh_inode; + struct inode *inode; + if (!lo) + return; + inode = lo->plh_inode; pnfs_layoutreturn_before_put_layout_hdr(lo); if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { @@ -1241,10 +1244,12 @@ retry: spin_lock(&ino->i_lock); lo = nfsi->layout; if (!lo || !pnfs_layout_is_valid(lo) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + lo = NULL; goto out_noroc; + } + pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { - pnfs_get_layout_hdr(lo); spin_unlock(&ino->i_lock); wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE); @@ -1312,10 +1317,12 @@ out_noroc: struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; if (ld->prepare_layoutreturn) ld->prepare_layoutreturn(args); + pnfs_put_layout_hdr(lo); return true; } if (layoutreturn) pnfs_send_layoutreturn(lo, &stateid, iomode, true); + pnfs_put_layout_hdr(lo); return false; } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 03aaa60c7768..32ba2d471853 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -245,7 +245,7 @@ pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages, { if (list_empty(pages)) { if (atomic_dec_and_test(&cinfo->mds->rpcs_out)) - wake_up_atomic_t(&cinfo->mds->rpcs_out); + wake_up_var(&cinfo->mds->rpcs_out); /* don't call nfs_commitdata_release - it tries to put * the open_context which is not acquired until nfs_init_commit * which has not been called on @data */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 29bacdc56f6a..5e470e233c83 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2631,6 +2631,8 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* initial superblock/root creation */ mount_info->fill_super(s, mount_info); nfs_get_cache_cookie(s, mount_info->parsed, mount_info->cloned); + if (!(server->flags & NFS_MOUNT_UNSHARED)) + s->s_iflags |= SB_I_MULTIROOT; } mntroot = nfs_get_root(s, mount_info->mntfh, dev_name); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7428a669d7a7..6579f3b367bd 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1620,8 +1620,8 @@ static void nfs_writeback_result(struct rpc_task *task, static int wait_on_commit(struct nfs_mds_commit_info *cinfo) { - return wait_on_atomic_t(&cinfo->rpcs_out, - nfs_wait_atomic_killable, TASK_KILLABLE); + return wait_var_event_killable(&cinfo->rpcs_out, + !atomic_read(&cinfo->rpcs_out)); } static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) @@ -1632,7 +1632,7 @@ static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) static void nfs_commit_end(struct nfs_mds_commit_info *cinfo) { if (atomic_dec_and_test(&cinfo->rpcs_out)) - wake_up_atomic_t(&cinfo->rpcs_out); + wake_up_var(&cinfo->rpcs_out); } void nfs_commitdata_release(struct nfs_commit_data *data) @@ -1876,40 +1876,43 @@ int nfs_generic_commit_list(struct inode *inode, struct list_head *head, return status; } -int nfs_commit_inode(struct inode *inode, int how) +static int __nfs_commit_inode(struct inode *inode, int how, + struct writeback_control *wbc) { LIST_HEAD(head); struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; - int error = 0; - int res; + int ret, nscan; nfs_init_cinfo_from_inode(&cinfo, inode); nfs_commit_begin(cinfo.mds); - res = nfs_scan_commit(inode, &head, &cinfo); - if (res) - error = nfs_generic_commit_list(inode, &head, how, &cinfo); + for (;;) { + ret = nscan = nfs_scan_commit(inode, &head, &cinfo); + if (ret <= 0) + break; + ret = nfs_generic_commit_list(inode, &head, how, &cinfo); + if (ret < 0) + break; + ret = 0; + if (wbc && wbc->sync_mode == WB_SYNC_NONE) { + if (nscan < wbc->nr_to_write) + wbc->nr_to_write -= nscan; + else + wbc->nr_to_write = 0; + } + if (nscan < INT_MAX) + break; + cond_resched(); + } nfs_commit_end(cinfo.mds); - if (res == 0) - return res; - if (error < 0) - goto out_error; - if (!may_wait) - goto out_mark_dirty; - error = wait_on_commit(cinfo.mds); - if (error < 0) - return error; - return res; -out_error: - res = error; - /* Note: If we exit without ensuring that the commit is complete, - * we must mark the inode as dirty. Otherwise, future calls to - * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure - * that the data is on the disk. - */ -out_mark_dirty: - __mark_inode_dirty(inode, I_DIRTY_DATASYNC); - return res; + if (ret || !may_wait) + return ret; + return wait_on_commit(cinfo.mds); +} + +int nfs_commit_inode(struct inode *inode, int how) +{ + return __nfs_commit_inode(inode, how, NULL); } EXPORT_SYMBOL_GPL(nfs_commit_inode); @@ -1919,11 +1922,11 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int flags = FLUSH_SYNC; int ret = 0; - /* no commits means nothing needs to be done */ - if (!atomic_long_read(&nfsi->commit_info.ncommit)) - return ret; - if (wbc->sync_mode == WB_SYNC_NONE) { + /* no commits means nothing needs to be done */ + if (!atomic_long_read(&nfsi->commit_info.ncommit)) + goto check_requests_outstanding; + /* Don't commit yet if this is a non-blocking flush and there * are a lot of outstanding writes for this mapping. */ @@ -1934,16 +1937,16 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) flags = 0; } - ret = nfs_commit_inode(inode, flags); - if (ret >= 0) { - if (wbc->sync_mode == WB_SYNC_NONE) { - if (ret < wbc->nr_to_write) - wbc->nr_to_write -= ret; - else - wbc->nr_to_write = 0; - } - return 0; - } + ret = __nfs_commit_inode(inode, flags, wbc); + if (!ret) { + if (flags & FLUSH_SYNC) + return 0; + } else if (atomic_long_read(&nfsi->commit_info.ncommit)) + goto out_mark_dirty; + +check_requests_outstanding: + if (!atomic_read(&nfsi->commit_info.rpcs_out)) + return ret; out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 150521c9671b..61b770e39809 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -268,6 +268,35 @@ free_blocked_lock(struct nfsd4_blocked_lock *nbl) kfree(nbl); } +static void +remove_blocked_locks(struct nfs4_lockowner *lo) +{ + struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + struct nfsd4_blocked_lock *nbl; + LIST_HEAD(reaplist); + + /* Dequeue all blocked locks */ + spin_lock(&nn->blocked_locks_lock); + while (!list_empty(&lo->lo_blocked)) { + nbl = list_first_entry(&lo->lo_blocked, + struct nfsd4_blocked_lock, + nbl_list); + list_del_init(&nbl->nbl_list); + list_move(&nbl->nbl_lru, &reaplist); + } + spin_unlock(&nn->blocked_locks_lock); + + /* Now free them */ + while (!list_empty(&reaplist)) { + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, + nbl_lru); + list_del_init(&nbl->nbl_lru); + posix_unblock_lock(&nbl->nbl_lock); + free_blocked_lock(nbl); + } +} + static int nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task) { @@ -1866,6 +1895,7 @@ static __be32 mark_client_expired_locked(struct nfs4_client *clp) static void __destroy_client(struct nfs4_client *clp) { + int i; struct nfs4_openowner *oo; struct nfs4_delegation *dp; struct list_head reaplist; @@ -1895,6 +1925,16 @@ __destroy_client(struct nfs4_client *clp) nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } + for (i = 0; i < OWNER_HASH_SIZE; i++) { + struct nfs4_stateowner *so, *tmp; + + list_for_each_entry_safe(so, tmp, &clp->cl_ownerstr_hashtbl[i], + so_strhash) { + /* Should be no openowners at this point */ + WARN_ON_ONCE(so->so_is_open_owner); + remove_blocked_locks(lockowner(so)); + } + } nfsd4_return_all_client_layouts(clp); nfsd4_shutdown_callback(clp); if (clp->cl_cb_conn.cb_xprt) @@ -6355,6 +6395,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, } spin_unlock(&clp->cl_lock); free_ol_stateid_reaplist(&reaplist); + remove_blocked_locks(lo); nfs4_put_stateowner(&lo->lo_owner); return status; @@ -7140,6 +7181,8 @@ nfs4_state_destroy_net(struct net *net) } } + WARN_ON(!list_empty(&nn->blocked_locks_lru)); + for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&nn->unconf_id_hashtbl[i])) { clp = list_entry(nn->unconf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); @@ -7206,7 +7249,6 @@ nfs4_state_shutdown_net(struct net *net) struct nfs4_delegation *dp = NULL; struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct nfsd4_blocked_lock *nbl; cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); @@ -7227,24 +7269,6 @@ nfs4_state_shutdown_net(struct net *net) nfs4_put_stid(&dp->dl_stid); } - BUG_ON(!list_empty(&reaplist)); - spin_lock(&nn->blocked_locks_lock); - while (!list_empty(&nn->blocked_locks_lru)) { - nbl = list_first_entry(&nn->blocked_locks_lru, - struct nfsd4_blocked_lock, nbl_lru); - list_move(&nbl->nbl_lru, &reaplist); - list_del_init(&nbl->nbl_list); - } - spin_unlock(&nn->blocked_locks_lock); - - while (!list_empty(&reaplist)) { - nbl = list_first_entry(&reaplist, - struct nfsd4_blocked_lock, nbl_lru); - list_del_init(&nbl->nbl_lru); - posix_unblock_lock(&nbl->nbl_lock); - free_blocked_lock(nbl); - } - nfsd4_client_tracking_exit(net); nfs4_state_destroy_net(net); } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c07eb3d655ea..fa803a58a605 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -820,9 +820,8 @@ out_destroy_group: return fd; } -SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, - __u64, mask, int, dfd, - const char __user *, pathname) +static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, + int dfd, const char __user *pathname) { struct inode *inode = NULL; struct vfsmount *mnt = NULL; @@ -928,13 +927,20 @@ fput_and_out: return ret; } +SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, + __u64, mask, int, dfd, + const char __user *, pathname) +{ + return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE6(fanotify_mark, int, fanotify_fd, unsigned int, flags, __u32, mask0, __u32, mask1, int, dfd, const char __user *, pathname) { - return sys_fanotify_mark(fanotify_fd, flags, + return do_fanotify_mark(fanotify_fd, flags, #ifdef __BIG_ENDIAN ((__u64)mask0 << 32) | mask1, #else diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 2c908b31d6c9..43c23653ce2e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -635,7 +635,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) /* inotify syscalls */ -SYSCALL_DEFINE1(inotify_init1, int, flags) +static int do_inotify_init(int flags) { struct fsnotify_group *group; int ret; @@ -660,9 +660,14 @@ SYSCALL_DEFINE1(inotify_init1, int, flags) return ret; } +SYSCALL_DEFINE1(inotify_init1, int, flags) +{ + return do_inotify_init(flags); +} + SYSCALL_DEFINE0(inotify_init) { - return sys_inotify_init1(0); + return do_inotify_init(0); } SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname, diff --git a/fs/nsfs.c b/fs/nsfs.c index 36b0772701a0..60702d677bd4 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -184,6 +184,7 @@ int open_related_ns(struct ns_common *ns, return fd; } +EXPORT_SYMBOL_GPL(open_related_ns); static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index eac5140aac47..e5076185cc1e 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1819,7 +1819,7 @@ int o2net_register_hb_callbacks(void) static int o2net_accept_one(struct socket *sock, int *more) { - int ret, slen; + int ret; struct sockaddr_in sin; struct socket *new_sock = NULL; struct o2nm_node *node = NULL; @@ -1864,9 +1864,7 @@ static int o2net_accept_one(struct socket *sock, int *more) goto out; } - slen = sizeof(sin); - ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, - &slen, 1); + ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, 1); if (ret < 0) goto out; diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index e87279e49ba3..6b92cb241138 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -134,9 +134,10 @@ ocfs2_filecheck_sysfs_free(struct ocfs2_filecheck_sysfs_entry *entry) { struct ocfs2_filecheck_entry *p; - if (!atomic_dec_and_test(&entry->fs_count)) - wait_on_atomic_t(&entry->fs_count, atomic_t_wait, - TASK_UNINTERRUPTIBLE); + if (!atomic_dec_and_test(&entry->fs_count)) { + wait_var_event(&entry->fs_count, + !atomic_read(&entry->fs_count)); + } spin_lock(&entry->fs_fcheck->fc_lock); while (!list_empty(&entry->fs_fcheck->fc_head)) { @@ -183,7 +184,7 @@ static void ocfs2_filecheck_sysfs_put(struct ocfs2_filecheck_sysfs_entry *entry) { if (atomic_dec_and_test(&entry->fs_count)) - wake_up_atomic_t(&entry->fs_count); + wake_up_var(&entry->fs_count); } static struct ocfs2_filecheck_sysfs_entry * diff --git a/fs/open.c b/fs/open.c index 7ea118471dce..d0e955b558ad 100644 --- a/fs/open.c +++ b/fs/open.c @@ -128,7 +128,7 @@ out: } EXPORT_SYMBOL_GPL(vfs_truncate); -static long do_sys_truncate(const char __user *pathname, loff_t length) +long do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; @@ -162,7 +162,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +long do_sys_ftruncate(unsigned int fd, loff_t length, int small) { struct inode *inode; struct dentry *dentry; @@ -333,7 +333,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) } EXPORT_SYMBOL_GPL(vfs_fallocate); -SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) +int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len) { struct fd f = fdget(fd); int error = -EBADF; @@ -345,12 +345,17 @@ SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) return error; } +SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) +{ + return ksys_fallocate(fd, mode, offset, len); +} + /* * access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and * switching the fsuid/fsgid around to the real ones. */ -SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) +long do_faccessat(int dfd, const char __user *filename, int mode) { const struct cred *old_cred; struct cred *override_cred; @@ -426,12 +431,17 @@ out: return res; } +SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode) +{ + return do_faccessat(dfd, filename, mode); +} + SYSCALL_DEFINE2(access, const char __user *, filename, int, mode) { - return sys_faccessat(AT_FDCWD, filename, mode); + return do_faccessat(AT_FDCWD, filename, mode); } -SYSCALL_DEFINE1(chdir, const char __user *, filename) +int ksys_chdir(const char __user *filename) { struct path path; int error; @@ -457,6 +467,11 @@ out: return error; } +SYSCALL_DEFINE1(chdir, const char __user *, filename) +{ + return ksys_chdir(filename); +} + SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); @@ -479,7 +494,7 @@ out: return error; } -SYSCALL_DEFINE1(chroot, const char __user *, filename) +int ksys_chroot(const char __user *filename) { struct path path; int error; @@ -512,6 +527,11 @@ out: return error; } +SYSCALL_DEFINE1(chroot, const char __user *, filename) +{ + return ksys_chroot(filename); +} + static int chmod_common(const struct path *path, umode_t mode) { struct inode *inode = path->dentry->d_inode; @@ -541,7 +561,7 @@ out_unlock: return error; } -SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) +int ksys_fchmod(unsigned int fd, umode_t mode) { struct fd f = fdget(fd); int err = -EBADF; @@ -554,7 +574,12 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) return err; } -SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) +SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) +{ + return ksys_fchmod(fd, mode); +} + +int do_fchmodat(int dfd, const char __user *filename, umode_t mode) { struct path path; int error; @@ -572,9 +597,15 @@ retry: return error; } +SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, + umode_t, mode) +{ + return do_fchmodat(dfd, filename, mode); +} + SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { - return sys_fchmodat(AT_FDCWD, filename, mode); + return do_fchmodat(AT_FDCWD, filename, mode); } static int chown_common(const struct path *path, uid_t user, gid_t group) @@ -619,8 +650,8 @@ retry_deleg: return error; } -SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, - gid_t, group, int, flag) +int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, + int flag) { struct path path; int error = -EINVAL; @@ -651,18 +682,24 @@ out: return error; } +SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user, + gid_t, group, int, flag) +{ + return do_fchownat(dfd, filename, user, group, flag); +} + SYSCALL_DEFINE3(chown, const char __user *, filename, uid_t, user, gid_t, group) { - return sys_fchownat(AT_FDCWD, filename, user, group, 0); + return do_fchownat(AT_FDCWD, filename, user, group, 0); } SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group) { - return sys_fchownat(AT_FDCWD, filename, user, group, - AT_SYMLINK_NOFOLLOW); + return do_fchownat(AT_FDCWD, filename, user, group, + AT_SYMLINK_NOFOLLOW); } -SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) +int ksys_fchown(unsigned int fd, uid_t user, gid_t group) { struct fd f = fdget(fd); int error = -EBADF; @@ -682,6 +719,11 @@ out: return error; } +SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) +{ + return ksys_fchown(fd, user, group); +} + int open_check_o_direct(struct file *f) { /* NB: we're sure to have correct a_ops only after f_op->open */ @@ -1114,7 +1156,7 @@ COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, fla */ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) { - return sys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); + return ksys_open(pathname, O_CREAT | O_WRONLY | O_TRUNC, mode); } #endif @@ -1163,7 +1205,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) return retval; } -EXPORT_SYMBOL(sys_close); /* * This routine simulates a hangup on the tty, to arrange that users diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 406e72de88f6..ce6ff5a0a6e4 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -24,6 +24,8 @@ config OVERLAY_FS_REDIRECT_DIR an overlay which has redirects on a kernel that doesn't support this feature will have unexpected results. + If unsure, say N. + config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW bool "Overlayfs: follow redirects even if redirects are turned off" default y @@ -32,8 +34,13 @@ config OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW Disable this to get a possibly more secure configuration, but that might not be backward compatible with previous kernels. + If backward compatibility is not an issue, then it is safe and + recommended to say N here. + For more information, see Documentation/filesystems/overlayfs.txt + If unsure, say Y. + config OVERLAY_FS_INDEX bool "Overlayfs: turn on inodes index feature by default" depends on OVERLAY_FS @@ -51,6 +58,8 @@ config OVERLAY_FS_INDEX That is, mounting an overlay which has an inodes index on a kernel that doesn't support this feature will have unexpected results. + If unsure, say N. + config OVERLAY_FS_NFS_EXPORT bool "Overlayfs: turn on NFS export feature by default" depends on OVERLAY_FS @@ -72,3 +81,8 @@ config OVERLAY_FS_NFS_EXPORT Note, that the NFS export feature is not backward compatible. That is, mounting an overlay which has a full index on a kernel that doesn't support this feature will have unexpected results. + + Most users should say N here and enable this feature on a case-by- + case basis with the "nfs_export=on" mount option. + + Say N unless you fully understand the consequences. diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index bb94ce9da5c8..87bd4148f4fb 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -19,6 +19,142 @@ #include <linux/ratelimit.h> #include "overlayfs.h" +static int ovl_encode_maybe_copy_up(struct dentry *dentry) +{ + int err; + + if (ovl_dentry_upper(dentry)) + return 0; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_copy_up(dentry); + ovl_drop_write(dentry); + } + + if (err) { + pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n", + dentry, err); + } + + return err; +} + +/* + * Before encoding a non-upper directory file handle from real layer N, we need + * to check if it will be possible to reconnect an overlay dentry from the real + * lower decoded dentry. This is done by following the overlay ancestry up to a + * "layer N connected" ancestor and verifying that all parents along the way are + * "layer N connectable". If an ancestor that is NOT "layer N connectable" is + * found, we need to copy up an ancestor, which is "layer N connectable", thus + * making that ancestor "layer N connected". For example: + * + * layer 1: /a + * layer 2: /a/b/c + * + * The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is + * copied up and renamed, upper dir /a will be indexed by lower dir /a from + * layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*) + * in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay + * dentry from the connected lower dentry /a/b/c. + * + * To avoid this problem on decode time, we need to copy up an ancestor of + * /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is + * /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected" + * and when the time comes to decode the file handle from lower dentry /a/b/c, + * ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding + * a connected overlay dentry will be accomplished. + * + * (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an + * entry /a in the lower layers above layer N and find the indexed dir /a from + * layer 1. If that improvement is made, then the check for "layer N connected" + * will need to verify there are no redirects in lower layers above N. In the + * example above, /a will be "layer 2 connectable". However, if layer 2 dir /a + * is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable": + * + * layer 1: /A (redirect = /a) + * layer 2: /a/b/c + */ + +/* Return the lowest layer for encoding a connectable file handle */ +static int ovl_connectable_layer(struct dentry *dentry) +{ + struct ovl_entry *oe = OVL_E(dentry); + + /* We can get overlay root from root of any layer */ + if (dentry == dentry->d_sb->s_root) + return oe->numlower; + + /* + * If it's an unindexed merge dir, then it's not connectable with any + * lower layer + */ + if (ovl_dentry_upper(dentry) && + !ovl_test_flag(OVL_INDEX, d_inode(dentry))) + return 0; + + /* We can get upper/overlay path from indexed/lower dentry */ + return oe->lowerstack[0].layer->idx; +} + +/* + * @dentry is "connected" if all ancestors up to root or a "connected" ancestor + * have the same uppermost lower layer as the origin's layer. We may need to + * copy up a "connectable" ancestor to make it "connected". A "connected" dentry + * cannot become non "connected", so cache positive result in dentry flags. + * + * Return the connected origin layer or < 0 on error. + */ +static int ovl_connect_layer(struct dentry *dentry) +{ + struct dentry *next, *parent = NULL; + int origin_layer; + int err = 0; + + if (WARN_ON(dentry == dentry->d_sb->s_root) || + WARN_ON(!ovl_dentry_lower(dentry))) + return -EIO; + + origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx; + if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry)) + return origin_layer; + + /* Find the topmost origin layer connectable ancestor of @dentry */ + next = dget(dentry); + for (;;) { + parent = dget_parent(next); + if (WARN_ON(parent == next)) { + err = -EIO; + break; + } + + /* + * If @parent is not origin layer connectable, then copy up + * @next which is origin layer connectable and we are done. + */ + if (ovl_connectable_layer(parent) < origin_layer) { + err = ovl_encode_maybe_copy_up(next); + break; + } + + /* If @parent is connected or indexed we are done */ + if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) || + ovl_test_flag(OVL_INDEX, d_inode(parent))) + break; + + dput(next); + next = parent; + } + + dput(parent); + dput(next); + + if (!err) + ovl_dentry_set_flag(OVL_E_CONNECTED, dentry); + + return err ?: origin_layer; +} + /* * We only need to encode origin if there is a chance that the same object was * encoded pre copy up and then we need to stay consistent with the same @@ -41,73 +177,59 @@ * L = lower file handle * * (*) Connecting an overlay dir from real lower dentry is not always - * possible when there are redirects in lower layers. To mitigate this case, - * we copy up the lower dir first and then encode an upper dir file handle. + * possible when there are redirects in lower layers and non-indexed merge dirs. + * To mitigate those case, we may copy up the lower dir ancestor before encode + * a lower dir file handle. + * + * Return 0 for upper file handle, > 0 for lower file handle or < 0 on error. */ -static bool ovl_should_encode_origin(struct dentry *dentry) +static int ovl_check_encode_origin(struct dentry *dentry) { struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + /* Upper file handle for pure upper */ if (!ovl_dentry_lower(dentry)) - return false; + return 0; /* - * Decoding a merge dir, whose origin's parent is under a redirected - * lower dir is not always possible. As a simple aproximation, we do - * not encode lower dir file handles when overlay has multiple lower - * layers and origin is below the topmost lower layer. + * Upper file handle for non-indexed upper. * - * TODO: copy up only the parent that is under redirected lower. + * Root is never indexed, so if there's an upper layer, encode upper for + * root. */ - if (d_is_dir(dentry) && ofs->upper_mnt && - OVL_E(dentry)->lowerstack[0].layer->idx > 1) - return false; - - /* Decoding a non-indexed upper from origin is not implemented */ if (ovl_dentry_upper(dentry) && !ovl_test_flag(OVL_INDEX, d_inode(dentry))) - return false; - - return true; -} - -static int ovl_encode_maybe_copy_up(struct dentry *dentry) -{ - int err; - - if (ovl_dentry_upper(dentry)) return 0; - err = ovl_want_write(dentry); - if (err) - return err; - - err = ovl_copy_up(dentry); + /* + * Decoding a merge dir, whose origin's ancestor is under a redirected + * lower dir or under a non-indexed upper is not always possible. + * ovl_connect_layer() will try to make origin's layer "connected" by + * copying up a "connectable" ancestor. + */ + if (d_is_dir(dentry) && ofs->upper_mnt) + return ovl_connect_layer(dentry); - ovl_drop_write(dentry); - return err; + /* Lower file handle for indexed and non-upper dir/non-dir */ + return 1; } static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen) { - struct dentry *origin = ovl_dentry_lower(dentry); struct ovl_fh *fh = NULL; - int err; + int err, enc_lower; /* - * If we should not encode a lower dir file handle, copy up and encode - * an upper dir file handle. + * Check if we should encode a lower or upper file handle and maybe + * copy up an ancestor to make lower file handle connectable. */ - if (!ovl_should_encode_origin(dentry)) { - err = ovl_encode_maybe_copy_up(dentry); - if (err) - goto fail; - - origin = NULL; - } + err = enc_lower = ovl_check_encode_origin(dentry); + if (enc_lower < 0) + goto fail; - /* Encode an upper or origin file handle */ - fh = ovl_encode_fh(origin ?: ovl_dentry_upper(dentry), !origin); + /* Encode an upper or lower file handle */ + fh = ovl_encode_fh(enc_lower ? ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); err = PTR_ERR(fh); if (IS_ERR(fh)) goto fail; @@ -355,8 +477,8 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb, dput(upper); } - if (!this) - return NULL; + if (IS_ERR_OR_NULL(this)) + return this; if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) { dput(this); @@ -498,7 +620,7 @@ static struct dentry *ovl_lookup_real(struct super_block *sb, if (err == -ECHILD) { this = ovl_lookup_real_ancestor(sb, real, layer); - err = IS_ERR(this) ? PTR_ERR(this) : 0; + err = PTR_ERR_OR_ZERO(this); } if (!err) { dput(connected); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index fcd97b783fa1..3b1bd469accd 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -669,38 +669,59 @@ struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, return inode; } +/* + * Does overlay inode need to be hashed by lower inode? + */ +static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, + struct dentry *lower, struct dentry *index) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + /* No, if pure upper */ + if (!lower) + return false; + + /* Yes, if already indexed */ + if (index) + return true; + + /* Yes, if won't be copied up */ + if (!ofs->upper_mnt) + return true; + + /* No, if lower hardlink is or will be broken on copy up */ + if ((upper || !ovl_indexdir(sb)) && + !d_is_dir(lower) && d_inode(lower)->i_nlink > 1) + return false; + + /* No, if non-indexed upper with NFS export */ + if (sb->s_export_op && upper) + return false; + + /* Otherwise, hash by lower inode for fsnotify */ + return true; +} + struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, struct dentry *lowerdentry, struct dentry *index, unsigned int numlower) { - struct ovl_fs *ofs = sb->s_fs_info; struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; struct inode *inode; - /* Already indexed or could be indexed on copy up? */ - bool indexed = (index || (ovl_indexdir(sb) && !upperdentry)); - struct dentry *origin = indexed ? lowerdentry : NULL; + bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, index); bool is_dir; - if (WARN_ON(upperdentry && indexed && !lowerdentry)) - return ERR_PTR(-EIO); - if (!realinode) realinode = d_inode(lowerdentry); /* - * Copy up origin (lower) may exist for non-indexed non-dir upper, but - * we must not use lower as hash key in that case. - * Hash non-dir that is or could be indexed by origin inode. - * Hash dir that is or could be merged by origin inode. - * Hash pure upper and non-indexed non-dir by upper inode. - * Hash non-indexed dir by upper inode for NFS export. + * Copy up origin (lower) may exist for non-indexed upper, but we must + * not use lower as hash key if this is a broken hardlink. */ is_dir = S_ISDIR(realinode->i_mode); - if (is_dir && (indexed || !sb->s_export_op || !ofs->upper_mnt)) - origin = lowerdentry; - - if (upperdentry || origin) { - struct inode *key = d_inode(origin ?: upperdentry); + if (upperdentry || bylower) { + struct inode *key = d_inode(bylower ? lowerdentry : + upperdentry); unsigned int nlink = is_dir ? 1 : realinode->i_nlink; inode = iget5_locked(sb, (unsigned long) key, @@ -728,6 +749,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct dentry *upperdentry, nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink); set_nlink(inode, nlink); } else { + /* Lower hardlink that will be broken on copy up */ inode = new_inode(sb); if (!inode) goto out_nomem; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index de3e6da1d5a5..70fcfcc684cc 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -913,9 +913,6 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, stack[ctr].layer = lower.layer; ctr++; - if (d.stop) - break; - /* * Following redirects can have security consequences: it's like * a symlink into the lower layer without the permission checks. @@ -933,6 +930,9 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_put; } + if (d.stop) + break; + if (d.redirect && d.redirect[0] == '/' && poe != roe) { poe = roe; /* Find the current layer on the root dentry */ diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0df25a9c94bd..225ff1171147 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -40,6 +40,7 @@ enum ovl_inode_flag { enum ovl_entry_flag { OVL_E_UPPER_ALIAS, OVL_E_OPAQUE, + OVL_E_CONNECTED, }; /* diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 9ee37c76091d..7c24619ae7fc 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1359,6 +1359,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) /* Root is always merge -> can have whiteouts */ ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); + ovl_dentry_set_flag(OVL_E_CONNECTED, root_dentry); ovl_inode_init(d_inode(root_dentry), upperpath.dentry, ovl_dentry_lower(root_dentry)); diff --git a/fs/pipe.c b/fs/pipe.c index 7b1954caf388..39d6f431da83 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -841,7 +841,7 @@ int do_pipe_flags(int *fd, int flags) * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ -SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) +static int do_pipe2(int __user *fildes, int flags) { struct file *files[2]; int fd[2]; @@ -863,9 +863,14 @@ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) return error; } +SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) +{ + return do_pipe2(fildes, flags); +} + SYSCALL_DEFINE1(pipe, int __user *, fildes) { - return sys_pipe2(fildes, 0); + return do_pipe2(fildes, 0); } static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) diff --git a/fs/proc/base.c b/fs/proc/base.c index 9298324325ed..d53246863cfb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,9 +94,6 @@ #include <linux/sched/stat.h> #include <linux/flex_array.h> #include <linux/posix-timers.h> -#ifdef CONFIG_HARDWALL -#include <asm/hardwall.h> -#endif #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -3002,9 +2999,6 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tgid_io_accounting), #endif -#ifdef CONFIG_HARDWALL - ONE("hardwall", S_IRUGO, proc_pid_hardwall), -#endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), @@ -3393,9 +3387,6 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_TASK_IO_ACCOUNTING ONE("io", S_IRUSR, proc_tid_io_accounting), #endif -#ifdef CONFIG_HARDWALL - ONE("hardwall", S_IRUGO, proc_pid_hardwall), -#endif #ifdef CONFIG_USER_NS REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), diff --git a/fs/quota/compat.c b/fs/quota/compat.c index 779caed4f078..c30572857619 100644 --- a/fs/quota/compat.c +++ b/fs/quota/compat.c @@ -41,8 +41,9 @@ struct compat_fs_quota_stat { __u16 qs_iwarnlimit; }; -asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, - qid_t id, void __user *addr) +COMPAT_SYSCALL_DEFINE4(quotactl32, unsigned int, cmd, + const char __user *, special, qid_t, id, + void __user *, addr) { unsigned int cmds; struct if_dqblk __user *dqblk; @@ -59,7 +60,7 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, case Q_GETQUOTA: dqblk = compat_alloc_user_space(sizeof(struct if_dqblk)); compat_dqblk = addr; - ret = sys_quotactl(cmd, special, id, dqblk); + ret = kernel_quotactl(cmd, special, id, dqblk); if (ret) break; if (copy_in_user(compat_dqblk, dqblk, sizeof(*compat_dqblk)) || @@ -75,12 +76,12 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, get_user(data, &compat_dqblk->dqb_valid) || put_user(data, &dqblk->dqb_valid)) break; - ret = sys_quotactl(cmd, special, id, dqblk); + ret = kernel_quotactl(cmd, special, id, dqblk); break; case Q_XGETQSTAT: fsqstat = compat_alloc_user_space(sizeof(struct fs_quota_stat)); compat_fsqstat = addr; - ret = sys_quotactl(cmd, special, id, fsqstat); + ret = kernel_quotactl(cmd, special, id, fsqstat); if (ret) break; ret = -EFAULT; @@ -113,7 +114,7 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, ret = 0; break; default: - ret = sys_quotactl(cmd, special, id, addr); + ret = kernel_quotactl(cmd, special, id, addr); } return ret; } diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 43612e2a73af..860bfbe7a07a 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -833,8 +833,8 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) * calls. Maybe we need to add the process quotas etc. in the future, * but we probably should use rlimits for that. */ -SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, - qid_t, id, void __user *, addr) +int kernel_quotactl(unsigned int cmd, const char __user *special, + qid_t id, void __user *addr) { uint cmds, type; struct super_block *sb = NULL; @@ -885,3 +885,9 @@ out: path_put(pathp); return ret; } + +SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, + qid_t, id, void __user *, addr) +{ + return kernel_quotactl(cmd, special, id, addr); +} diff --git a/fs/read_write.c b/fs/read_write.c index f8547b82dfb3..c4eabbfc90df 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -301,7 +301,7 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(vfs_llseek); -SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) +off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence) { off_t retval; struct fd f = fdget_pos(fd); @@ -319,10 +319,15 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) return retval; } +SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) +{ + return ksys_lseek(fd, offset, whence); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(lseek, unsigned int, fd, compat_off_t, offset, unsigned int, whence) { - return sys_lseek(fd, offset, whence); + return ksys_lseek(fd, offset, whence); } #endif @@ -563,7 +568,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) file->f_pos = pos; } -SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) +ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -578,8 +583,12 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) return ret; } -SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, - size_t, count) +SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) +{ + return ksys_read(fd, buf, count); +} + +ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -595,8 +604,14 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, return ret; } -SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, - size_t, count, loff_t, pos) +SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, + size_t, count) +{ + return ksys_write(fd, buf, count); +} + +ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count, + loff_t pos) { struct fd f; ssize_t ret = -EBADF; @@ -615,8 +630,14 @@ SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, return ret; } -SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, - size_t, count, loff_t, pos) +SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, + size_t, count, loff_t, pos) +{ + return ksys_pread64(fd, buf, count, pos); +} + +ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf, + size_t count, loff_t pos) { struct fd f; ssize_t ret = -EBADF; @@ -635,6 +656,12 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, return ret; } +SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, + size_t, count, loff_t, pos) +{ + return ksys_pwrite64(fd, buf, count, pos); +} + static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { diff --git a/fs/readdir.c b/fs/readdir.c index 1b83b0ad183b..d97f548e6323 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -292,8 +292,8 @@ efault: return -EFAULT; } -SYSCALL_DEFINE3(getdents64, unsigned int, fd, - struct linux_dirent64 __user *, dirent, unsigned int, count) +int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent, + unsigned int count) { struct fd f; struct linux_dirent64 __user * lastdirent; @@ -326,6 +326,13 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, return error; } + +SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) +{ + return ksys_getdents64(fd, dirent, count); +} + #ifdef CONFIG_COMPAT struct compat_old_linux_dirent { compat_ulong_t d_ino; diff --git a/fs/select.c b/fs/select.c index b6c36254028a..ba879c51288f 100644 --- a/fs/select.c +++ b/fs/select.c @@ -675,8 +675,8 @@ out_nofds: return ret; } -SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, - fd_set __user *, exp, struct timeval __user *, tvp) +static int kern_select(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct timeval tv; @@ -699,6 +699,12 @@ SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, return ret; } +SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timeval __user *, tvp) +{ + return kern_select(n, inp, outp, exp, tvp); +} + static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize) @@ -784,7 +790,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp); + return kern_select(a.n, a.inp, a.outp, a.exp, a.tvp); } #endif @@ -1259,9 +1265,9 @@ out_nofds: return ret; } -COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timeval __user *, tvp) +static int do_compat_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timeval __user *tvp) { struct timespec64 end_time, *to = NULL; struct compat_timeval tv; @@ -1284,6 +1290,13 @@ COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, return ret; } +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) +{ + return do_compat_select(n, inp, outp, exp, tvp); +} + struct compat_sel_arg_struct { compat_ulong_t n; compat_uptr_t inp; @@ -1298,8 +1311,8 @@ COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; - return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), - compat_ptr(a.exp), compat_ptr(a.tvp)); + return do_compat_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); } static long do_compat_pselect(int n, compat_ulong_t __user *inp, diff --git a/fs/signalfd.c b/fs/signalfd.c index 9990957264e3..d2187a813376 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -118,13 +118,22 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); #endif #ifdef BUS_MCEERR_AO - /* + /* * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ if (kinfo->si_signo == SIGBUS && - (kinfo->si_code == BUS_MCEERR_AR || - kinfo->si_code == BUS_MCEERR_AO)) + kinfo->si_code == BUS_MCEERR_AO) + err |= __put_user((short) kinfo->si_addr_lsb, + &uinfo->ssi_addr_lsb); +#endif +#ifdef BUS_MCEERR_AR + /* + * Other callers might not initialize the si_lsb field, + * so check explicitly for the right codes here. + */ + if (kinfo->si_signo == SIGBUS && + kinfo->si_code == BUS_MCEERR_AR) err |= __put_user((short) kinfo->si_addr_lsb, &uinfo->ssi_addr_lsb); #endif @@ -247,8 +256,8 @@ static const struct file_operations signalfd_fops = { .llseek = noop_llseek, }; -SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, - size_t, sizemask, int, flags) +static int do_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, + int flags) { sigset_t sigmask; struct signalfd_ctx *ctx; @@ -301,17 +310,22 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, return ufd; } +SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, + size_t, sizemask, int, flags) +{ + return do_signalfd4(ufd, user_mask, sizemask, flags); +} + SYSCALL_DEFINE3(signalfd, int, ufd, sigset_t __user *, user_mask, size_t, sizemask) { - return sys_signalfd4(ufd, user_mask, sizemask, 0); + return do_signalfd4(ufd, user_mask, sizemask, 0); } #ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, - const compat_sigset_t __user *,sigmask, - compat_size_t, sigsetsize, - int, flags) +static long do_compat_signalfd4(int ufd, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize, int flags) { sigset_t tmp; sigset_t __user *ksigmask; @@ -324,13 +338,21 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) return -EFAULT; - return sys_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); + return do_signalfd4(ufd, ksigmask, sizeof(sigset_t), flags); +} + +COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, + const compat_sigset_t __user *, sigmask, + compat_size_t, sigsetsize, + int, flags) +{ + return do_compat_signalfd4(ufd, sigmask, sigsetsize, flags); } COMPAT_SYSCALL_DEFINE3(signalfd, int, ufd, const compat_sigset_t __user *,sigmask, compat_size_t, sigsetsize) { - return compat_sys_signalfd4(ufd, sigmask, sigsetsize, 0); + return do_compat_signalfd4(ufd, sigmask, sigsetsize, 0); } #endif diff --git a/fs/splice.c b/fs/splice.c index 39e2dc01ac12..005d09cf3fa8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1331,8 +1331,8 @@ static long vmsplice_to_pipe(struct file *file, const struct iovec __user *uiov, * Currently we punt and implement it as a normal copy, see pipe_to_user(). * */ -SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, - unsigned long, nr_segs, unsigned int, flags) +static long do_vmsplice(int fd, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) { struct fd f; long error; @@ -1358,6 +1358,12 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, return error; } +SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, + unsigned long, nr_segs, unsigned int, flags) +{ + return do_vmsplice(fd, iov, nr_segs, flags); +} + #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, iov32, unsigned int, nr_segs, unsigned int, flags) @@ -1375,7 +1381,7 @@ COMPAT_SYSCALL_DEFINE4(vmsplice, int, fd, const struct compat_iovec __user *, io put_user(v.iov_len, &iov[i].iov_len)) return -EFAULT; } - return sys_vmsplice(fd, iov, nr_segs, flags); + return do_vmsplice(fd, iov, nr_segs, flags); } #endif diff --git a/fs/stat.c b/fs/stat.c index 873785dae022..f8e6fb2c3657 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -379,8 +379,8 @@ SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) return error; } -SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, - char __user *, buf, int, bufsiz) +static int do_readlinkat(int dfd, const char __user *pathname, + char __user *buf, int bufsiz) { struct path path; int error; @@ -415,10 +415,16 @@ retry: return error; } +SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, + char __user *, buf, int, bufsiz) +{ + return do_readlinkat(dfd, pathname, buf, bufsiz); +} + SYSCALL_DEFINE3(readlink, const char __user *, path, char __user *, buf, int, bufsiz) { - return sys_readlinkat(AT_FDCWD, path, buf, bufsiz); + return do_readlinkat(AT_FDCWD, path, buf, bufsiz); } diff --git a/fs/sync.c b/fs/sync.c index 6e0a2cbaf6de..9908a114d506 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -105,7 +105,7 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg) * just write metadata (such as inodes or bitmaps) to block device page cache * and do not sync it on their own in ->sync_fs(). */ -SYSCALL_DEFINE0(sync) +void ksys_sync(void) { int nowait = 0, wait = 1; @@ -117,6 +117,11 @@ SYSCALL_DEFINE0(sync) iterate_bdevs(fdatawait_one_bdev, NULL); if (unlikely(laptop_mode)) laptop_sync_completion(); +} + +SYSCALL_DEFINE0(sync) +{ + ksys_sync(); return 0; } @@ -280,8 +285,8 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) * already-instantiated disk blocks, there are no guarantees here that the data * will be available after a crash. */ -SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, - unsigned int, flags) +int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes, + unsigned int flags) { int ret; struct fd f; @@ -359,10 +364,16 @@ out: return ret; } +SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, + unsigned int, flags) +{ + return ksys_sync_file_range(fd, offset, nbytes, flags); +} + /* It would be nice if people remember that not all the world's an i386 when they introduce new system calls */ SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, loff_t, offset, loff_t, nbytes) { - return sys_sync_file_range(fd, offset, nbytes, flags); + return ksys_sync_file_range(fd, offset, nbytes, flags); } diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 8664db25a9a6..215c225b2ca1 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -106,6 +106,7 @@ int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, { return sysfs_do_create_link(kobj, target, name, 0); } +EXPORT_SYMBOL_GPL(sysfs_create_link_nowarn); /** * sysfs_delete_link - remove symlink in object's directory. diff --git a/fs/utimes.c b/fs/utimes.c index e4b3d7c2c9f5..69d4b6ba1bfb 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -184,8 +184,8 @@ SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, return do_utimes(dfd, filename, utimes ? tstimes : NULL, flags); } -SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, - struct timeval __user *, utimes) +static long do_futimesat(int dfd, const char __user *filename, + struct timeval __user *utimes) { struct timeval times[2]; struct timespec64 tstimes[2]; @@ -212,10 +212,17 @@ SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, return do_utimes(dfd, filename, utimes ? tstimes : NULL, 0); } + +SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, + struct timeval __user *, utimes) +{ + return do_futimesat(dfd, filename, utimes); +} + SYSCALL_DEFINE2(utimes, char __user *, filename, struct timeval __user *, utimes) { - return sys_futimesat(AT_FDCWD, filename, utimes); + return do_futimesat(AT_FDCWD, filename, utimes); } #ifdef CONFIG_COMPAT @@ -253,7 +260,8 @@ COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filena return do_utimes(dfd, filename, t ? tv : NULL, flags); } -COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) +static long do_compat_futimesat(unsigned int dfd, const char __user *filename, + struct compat_timeval __user *t) { struct timespec64 tv[2]; @@ -272,8 +280,15 @@ COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filena return do_utimes(dfd, filename, t ? tv : NULL, 0); } +COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, + const char __user *, filename, + struct compat_timeval __user *, t) +{ + return do_compat_futimesat(dfd, filename, t); +} + COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) { - return compat_sys_futimesat(AT_FDCWD, filename, t); + return do_compat_futimesat(AT_FDCWD, filename, t); } #endif diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index fd975524f460..05c66e05ae20 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -767,7 +767,7 @@ int xfs_scrub_agfl( struct xfs_scrub_context *sc) { - struct xfs_scrub_agfl_info sai = { 0 }; + struct xfs_scrub_agfl_info sai; struct xfs_agf *agf; xfs_agnumber_t agno; unsigned int agflcount; @@ -795,6 +795,7 @@ xfs_scrub_agfl( xfs_scrub_block_set_corrupt(sc, sc->sa.agf_bp); goto out; } + memset(&sai, 0, sizeof(sai)); sai.sz_entries = agflcount; sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS); if (!sai.entries) { diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 66e1edbfb2b2..046469fcc1b8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -955,15 +955,29 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); } +static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps) +{ + return nimaps && + imap->br_startblock != HOLESTARTBLOCK && + imap->br_state != XFS_EXT_UNWRITTEN; +} + static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) { /* - * COW writes will allocate delalloc space, so we need to make sure - * to take the lock exclusively here. + * COW writes may allocate delalloc space or convert unwritten COW + * extents, so we need to make sure to take the lock exclusively here. */ if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) return true; - if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) + + /* + * Extents not yet cached requires exclusive access, don't block. + * This is an opencoded xfs_ilock_data_map_shared() to cater for the + * non-blocking behaviour. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + !(ip->i_df.if_flags & XFS_IFEXTENTS)) return true; return false; } @@ -993,16 +1007,18 @@ xfs_file_iomap_begin( return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } - if (need_excl_ilock(ip, flags)) { + if (need_excl_ilock(ip, flags)) lockmode = XFS_ILOCK_EXCL; - xfs_ilock(ip, XFS_ILOCK_EXCL); - } else { - lockmode = xfs_ilock_data_map_shared(ip); - } + else + lockmode = XFS_ILOCK_SHARED; - if ((flags & IOMAP_NOWAIT) && !(ip->i_df.if_flags & XFS_IFEXTENTS)) { - error = -EAGAIN; - goto out_unlock; + if (flags & IOMAP_NOWAIT) { + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) + return -EAGAIN; + if (!xfs_ilock_nowait(ip, lockmode)) + return -EAGAIN; + } else { + xfs_ilock(ip, lockmode); } ASSERT(offset <= mp->m_super->s_maxbytes); @@ -1024,7 +1040,9 @@ xfs_file_iomap_begin( goto out_unlock; } - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + if (xfs_is_reflink_inode(ip) && + ((flags & IOMAP_WRITE) || + ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) { if (flags & IOMAP_DIRECT) { /* * A reflinked inode will result in CoW alloc. diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 3a55d6fc271b..7a39f40645f7 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -23,6 +23,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_shared.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" @@ -456,10 +457,12 @@ xfs_cui_recover( * transaction. Normally, any work that needs to be deferred * gets attached to the same defer_ops that scheduled the * refcount update. However, we're in log recovery here, so we - * we create our own defer_ops and use that to finish up any - * work that doesn't fit. + * we use the passed in defer_ops and to finish up any work that + * doesn't fit. We need to reserve enough blocks to handle a + * full btree split on either end of the refcount range. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; cudp = xfs_trans_get_cud(tp, cuip); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index f3b139c9aa16..49d3124863a8 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -23,6 +23,7 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" +#include "xfs_shared.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_trans.h" @@ -470,7 +471,8 @@ xfs_rui_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp); if (error) return error; rudp = xfs_trans_get_rud(tp, ruip); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 7aba628dc527..93588ea3d3d2 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -250,6 +250,7 @@ xfs_parseargs( return -EINVAL; break; case Opt_logdev: + kfree(mp->m_logname); mp->m_logname = match_strdup(args); if (!mp->m_logname) return -ENOMEM; @@ -258,6 +259,7 @@ xfs_parseargs( xfs_warn(mp, "%s option not allowed on this system", p); return -EINVAL; case Opt_rtdev: + kfree(mp->m_rtname); mp->m_rtname = match_strdup(args); if (!mp->m_rtname) return -ENOMEM; |