diff options
Diffstat (limited to 'fs')
68 files changed, 866 insertions, 519 deletions
diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 52233fa6195f..887b673f6223 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -589,7 +589,7 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason) */ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason) { - unsigned int debug_id = cell->debug_id; + unsigned int debug_id; time64_t now, expire_delay; int u, a; @@ -604,6 +604,7 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr if (cell->vl_servers->nr_servers) expire_delay = afs_cell_gc_delay; + debug_id = cell->debug_id; u = atomic_read(&cell->ref); a = atomic_dec_return(&cell->active); trace_afs_cell(debug_id, u, a, reason); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 1d2e61e0ab04..1bb5b9d7f0a2 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -281,8 +281,7 @@ retry: if (ret < 0) goto error; - set_page_private(req->pages[i], 1); - SetPagePrivate(req->pages[i]); + attach_page_private(req->pages[i], (void *)1); unlock_page(req->pages[i]); i++; } else { @@ -1975,8 +1974,7 @@ static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags) _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); - set_page_private(page, 0); - ClearPagePrivate(page); + detach_page_private(page); /* The directory will need reloading. */ if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) @@ -2003,8 +2001,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset, afs_stat_v(dvnode, n_inval); /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_SIZE) { - set_page_private(page, 0); - ClearPagePrivate(page); - } + if (offset == 0 && length == PAGE_SIZE) + detach_page_private(page); } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index b108528bf010..2ffe09abae7f 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -243,10 +243,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode, index, gfp); if (!page) goto error; - if (!PagePrivate(page)) { - set_page_private(page, 1); - SetPagePrivate(page); - } + if (!PagePrivate(page)) + attach_page_private(page, (void *)1); dir_page = kmap(page); } diff --git a/fs/afs/file.c b/fs/afs/file.c index 371d1488cc54..85f5adf21aa0 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -33,6 +33,7 @@ const struct file_operations afs_file_operations = { .write_iter = afs_file_write, .mmap = afs_file_mmap, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, .fsync = afs_fsync, .lock = afs_lock, .flock = afs_flock, @@ -601,6 +602,63 @@ static int afs_readpages(struct file *file, struct address_space *mapping, } /* + * Adjust the dirty region of the page on truncation or full invalidation, + * getting rid of the markers altogether if the region is entirely invalidated. + */ +static void afs_invalidate_dirty(struct page *page, unsigned int offset, + unsigned int length) +{ + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + unsigned long priv; + unsigned int f, t, end = offset + length; + + priv = page_private(page); + + /* we clean up only if the entire page is being invalidated */ + if (offset == 0 && length == thp_size(page)) + goto full_invalidate; + + /* If the page was dirtied by page_mkwrite(), the PTE stays writable + * and we don't get another notification to tell us to expand it + * again. + */ + if (afs_is_page_dirty_mmapped(priv)) + return; + + /* We may need to shorten the dirty region */ + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); + + if (t <= offset || f >= end) + return; /* Doesn't overlap */ + + if (f < offset && t > end) + return; /* Splits the dirty region - just absorb it */ + + if (f >= offset && t <= end) + goto undirty; + + if (f < offset) + t = offset; + else + f = end; + if (f == t) + goto undirty; + + priv = afs_page_dirty(f, t); + set_page_private(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv); + return; + +undirty: + trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv); + clear_page_dirty_for_io(page); +full_invalidate: + priv = (unsigned long)detach_page_private(page); + trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv); +} + +/* * invalidate part or all of a page * - release a page and clean up its private data if offset is 0 (indicating * the entire page) @@ -608,31 +666,23 @@ static int afs_readpages(struct file *file, struct address_space *mapping, static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - unsigned long priv; - _enter("{%lu},%u,%u", page->index, offset, length); BUG_ON(!PageLocked(page)); +#ifdef CONFIG_AFS_FSCACHE /* we clean up only if the entire page is being invalidated */ if (offset == 0 && length == PAGE_SIZE) { -#ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); fscache_wait_on_page_write(vnode->cache, page); fscache_uncache_page(vnode->cache, page); } + } #endif - if (PagePrivate(page)) { - priv = page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("inval"), - page->index, priv); - set_page_private(page, 0); - ClearPagePrivate(page); - } - } + if (PagePrivate(page)) + afs_invalidate_dirty(page, offset, length); _leave(""); } @@ -660,11 +710,9 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) #endif if (PagePrivate(page)) { - priv = page_private(page); + priv = (unsigned long)detach_page_private(page); trace_afs_page_dirty(vnode, tracepoint_string("rel"), page->index, priv); - set_page_private(page, 0); - ClearPagePrivate(page); } /* indicate that the page can be released */ diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 81b0485fd22a..14d5d75f4b6e 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -812,6 +812,7 @@ struct afs_operation { pgoff_t last; /* last page in mapping to deal with */ unsigned first_offset; /* offset into mapping[first] */ unsigned last_to; /* amount of mapping[last] */ + bool laundering; /* Laundering page, PG_writeback not set */ } store; struct { struct iattr *attr; @@ -857,6 +858,62 @@ struct afs_vnode_cache_aux { u64 data_version; } __packed; +/* + * We use page->private to hold the amount of the page that we've written to, + * splitting the field into two parts. However, we need to represent a range + * 0...PAGE_SIZE, so we reduce the resolution if the size of the page + * exceeds what we can encode. + */ +#ifdef CONFIG_64BIT +#define __AFS_PAGE_PRIV_MASK 0x7fffffffUL +#define __AFS_PAGE_PRIV_SHIFT 32 +#define __AFS_PAGE_PRIV_MMAPPED 0x80000000UL +#else +#define __AFS_PAGE_PRIV_MASK 0x7fffUL +#define __AFS_PAGE_PRIV_SHIFT 16 +#define __AFS_PAGE_PRIV_MMAPPED 0x8000UL +#endif + +static inline unsigned int afs_page_dirty_resolution(void) +{ + int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); + return (shift > 0) ? shift : 0; +} + +static inline size_t afs_page_dirty_from(unsigned long priv) +{ + unsigned long x = priv & __AFS_PAGE_PRIV_MASK; + + /* The lower bound is inclusive */ + return x << afs_page_dirty_resolution(); +} + +static inline size_t afs_page_dirty_to(unsigned long priv) +{ + unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; + + /* The upper bound is immediately beyond the region */ + return (x + 1) << afs_page_dirty_resolution(); +} + +static inline unsigned long afs_page_dirty(size_t from, size_t to) +{ + unsigned int res = afs_page_dirty_resolution(); + from >>= res; + to = (to - 1) >> res; + return (to << __AFS_PAGE_PRIV_SHIFT) | from; +} + +static inline unsigned long afs_page_dirty_mmapped(unsigned long priv) +{ + return priv | __AFS_PAGE_PRIV_MMAPPED; +} + +static inline bool afs_is_page_dirty_mmapped(unsigned long priv) +{ + return priv & __AFS_PAGE_PRIV_MMAPPED; +} + #include <trace/events/afs.h> /*****************************************************************************/ diff --git a/fs/afs/write.c b/fs/afs/write.c index da12abd6db21..50371207f327 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -76,7 +76,7 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, */ int afs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) + struct page **_page, void **fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; @@ -90,11 +90,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping, _enter("{%llx:%llu},{%lx},%u,%u", vnode->fid.vid, vnode->fid.vnode, index, from, to); - /* We want to store information about how much of a page is altered in - * page->private. - */ - BUILD_BUG_ON(PAGE_SIZE > 32768 && sizeof(page->private) < 8); - page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -110,9 +105,6 @@ int afs_write_begin(struct file *file, struct address_space *mapping, SetPageUptodate(page); } - /* page won't leak in error case: it eventually gets cleaned off LRU */ - *pagep = page; - try_again: /* See if this page is already partially written in a way that we can * merge the new write with. @@ -120,8 +112,8 @@ try_again: t = f = 0; if (PagePrivate(page)) { priv = page_private(page); - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); ASSERTCMP(f, <=, t); } @@ -138,21 +130,9 @@ try_again: if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) && (to < f || from > t)) goto flush_conflicting_write; - if (from < f) - f = from; - if (to > t) - t = to; - } else { - f = from; - t = to; } - priv = (unsigned long)t << AFS_PRIV_SHIFT; - priv |= f; - trace_afs_page_dirty(vnode, tracepoint_string("begin"), - page->index, priv); - SetPagePrivate(page); - set_page_private(page, priv); + *_page = page; _leave(" = 0"); return 0; @@ -162,17 +142,18 @@ try_again: flush_conflicting_write: _debug("flush conflict"); ret = write_one_page(page); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } + if (ret < 0) + goto error; ret = lock_page_killable(page); - if (ret < 0) { - _leave(" = %d", ret); - return ret; - } + if (ret < 0) + goto error; goto try_again; + +error: + put_page(page); + _leave(" = %d", ret); + return ret; } /* @@ -184,6 +165,9 @@ int afs_write_end(struct file *file, struct address_space *mapping, { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct key *key = afs_file_key(file); + unsigned long priv; + unsigned int f, from = pos & (PAGE_SIZE - 1); + unsigned int t, to = from + copied; loff_t i_size, maybe_i_size; int ret; @@ -215,6 +199,25 @@ int afs_write_end(struct file *file, struct address_space *mapping, SetPageUptodate(page); } + if (PagePrivate(page)) { + priv = page_private(page); + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); + if (from < f) + f = from; + if (to > t) + t = to; + priv = afs_page_dirty(f, t); + set_page_private(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), + page->index, priv); + } else { + priv = afs_page_dirty(from, to); + attach_page_private(page, (void *)priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty"), + page->index, priv); + } + set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); @@ -334,10 +337,9 @@ static void afs_pages_written_back(struct afs_vnode *vnode, ASSERTCMP(pv.nr, ==, count); for (loop = 0; loop < count; loop++) { - priv = page_private(pv.pages[loop]); + priv = (unsigned long)detach_page_private(pv.pages[loop]); trace_afs_page_dirty(vnode, tracepoint_string("clear"), pv.pages[loop]->index, priv); - set_page_private(pv.pages[loop], 0); end_page_writeback(pv.pages[loop]); } first += count; @@ -396,7 +398,8 @@ static void afs_store_data_success(struct afs_operation *op) op->ctime = op->file[0].scb.status.mtime_client; afs_vnode_commit_status(op, &op->file[0]); if (op->error == 0) { - afs_pages_written_back(vnode, op->store.first, op->store.last); + if (!op->store.laundering) + afs_pages_written_back(vnode, op->store.first, op->store.last); afs_stat_v(vnode, n_stores); atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) - (op->store.first * PAGE_SIZE + op->store.first_offset), @@ -415,7 +418,7 @@ static const struct afs_operation_ops afs_store_data_operation = { */ static int afs_store_data(struct address_space *mapping, pgoff_t first, pgoff_t last, - unsigned offset, unsigned to) + unsigned offset, unsigned to, bool laundering) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct afs_operation *op; @@ -448,6 +451,7 @@ static int afs_store_data(struct address_space *mapping, op->store.last = last; op->store.first_offset = offset; op->store.last_to = to; + op->store.laundering = laundering; op->mtime = vnode->vfs_inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; op->ops = &afs_store_data_operation; @@ -509,8 +513,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, */ start = primary_page->index; priv = page_private(primary_page); - offset = priv & AFS_PRIV_MAX; - to = priv >> AFS_PRIV_SHIFT; + offset = afs_page_dirty_from(priv); + to = afs_page_dirty_to(priv); trace_afs_page_dirty(vnode, tracepoint_string("store"), primary_page->index, priv); @@ -555,8 +559,8 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, } priv = page_private(page); - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); if (f != 0 && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) { unlock_page(page); @@ -601,7 +605,7 @@ no_more: if (end > i_size) to = i_size & ~PAGE_MASK; - ret = afs_store_data(mapping, first, last, offset, to); + ret = afs_store_data(mapping, first, last, offset, to, false); switch (ret) { case 0: ret = count; @@ -857,12 +861,14 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) */ wait_on_page_writeback(vmf->page); - priv = (unsigned long)PAGE_SIZE << AFS_PRIV_SHIFT; /* To */ - priv |= 0; /* From */ + priv = afs_page_dirty(0, PAGE_SIZE); + priv = afs_page_dirty_mmapped(priv); trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), vmf->page->index, priv); - SetPagePrivate(vmf->page); - set_page_private(vmf->page, priv); + if (PagePrivate(vmf->page)) + set_page_private(vmf->page, priv); + else + attach_page_private(vmf->page, (void *)priv); file_update_time(file); sb_end_pagefault(inode->i_sb); @@ -915,19 +921,18 @@ int afs_launder_page(struct page *page) f = 0; t = PAGE_SIZE; if (PagePrivate(page)) { - f = priv & AFS_PRIV_MAX; - t = priv >> AFS_PRIV_SHIFT; + f = afs_page_dirty_from(priv); + t = afs_page_dirty_to(priv); } trace_afs_page_dirty(vnode, tracepoint_string("launder"), page->index, priv); - ret = afs_store_data(mapping, page->index, page->index, t, f); + ret = afs_store_data(mapping, page->index, page->index, t, f, true); } + priv = (unsigned long)detach_page_private(page); trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page->index, priv); - set_page_private(page, 0); - ClearPagePrivate(page); #ifdef CONFIG_AFS_FSCACHE if (PageFsCache(page)) { diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 84f3c4f57531..95c573dcda11 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -85,7 +85,7 @@ static int afs_xattr_get_acl(const struct xattr_handler *handler, if (acl->size <= size) memcpy(buffer, acl->data, acl->size); else - op->error = -ERANGE; + ret = -ERANGE; } } @@ -148,11 +148,6 @@ static const struct xattr_handler afs_xattr_afs_acl_handler = { .set = afs_xattr_set_acl, }; -static void yfs_acl_put(struct afs_operation *op) -{ - yfs_free_opaque_acl(op->yacl); -} - static const struct afs_operation_ops yfs_fetch_opaque_acl_operation = { .issue_yfs_rpc = yfs_fs_fetch_opaque_acl, .success = afs_acl_success, @@ -246,7 +241,7 @@ error: static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { .issue_yfs_rpc = yfs_fs_store_opaque_acl2, .success = afs_acl_success, - .put = yfs_acl_put, + .put = afs_acl_put, }; /* diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 3b1239b7e90d..bd787e71a657 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -1990,6 +1990,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op) memcpy(bp, acl->data, acl->size); if (acl->size != size) memset((void *)bp + acl->size, 0, size - acl->size); + bp += size / sizeof(__be32); yfs_check_req(call, bp); trace_afs_make_fs_call(call, &vp->fid); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b6b3d052ca86..fa50e8936f5f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1690,7 +1690,7 @@ struct elf_thread_core_info { struct elf_thread_core_info *next; struct task_struct *task; struct elf_prstatus prstatus; - struct memelfnote notes[0]; + struct memelfnote notes[]; }; struct elf_note_info { diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b3268f4ea5f3..771a036867dc 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -544,7 +544,18 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, int level = ref->level; struct btrfs_key search_key = ref->key_for_search; - root = btrfs_get_fs_root(fs_info, ref->root_id, false); + /* + * If we're search_commit_root we could possibly be holding locks on + * other tree nodes. This happens when qgroups does backref walks when + * adding new delayed refs. To deal with this we need to look in cache + * for the root, and if we don't find it then we need to search the + * tree_root's commit root, thus the btrfs_get_fs_root_commit_root usage + * here. + */ + if (path->search_commit_root) + root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id); + else + root = btrfs_get_fs_root(fs_info, ref->root_id, false); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c0f1d6818df7..3ba6f3839d39 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2024,6 +2024,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) key.offset = 0; btrfs_release_path(path); } + btrfs_release_path(path); list_for_each_entry(space_info, &info->space_info, list) { int i; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index aac3d6f4e35b..0378933d163c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3564,6 +3564,8 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct extent_buffer *eb, int err); +void btrfs_reada_remove_dev(struct btrfs_device *dev); +void btrfs_reada_undo_remove_dev(struct btrfs_device *dev); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 4a0243cb9d97..5b9e3f3ace22 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -688,6 +688,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + if (!scrub_ret) + btrfs_reada_remove_dev(src_device); + /* * We have to use this loop approach because at this point src_device * has to be available for transaction commit to complete, yet new @@ -696,6 +699,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, while (1) { trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { + btrfs_reada_undo_remove_dev(src_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return PTR_ERR(trans); } @@ -746,6 +750,7 @@ error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); + btrfs_reada_undo_remove_dev(src_device); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 8e3438672a82..af97ddcc6b3e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1281,32 +1281,26 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, return 0; } -struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, - struct btrfs_key *key) +static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, + struct btrfs_path *path, + struct btrfs_key *key) { struct btrfs_root *root; struct btrfs_fs_info *fs_info = tree_root->fs_info; - struct btrfs_path *path; u64 generation; int ret; int level; - path = btrfs_alloc_path(); - if (!path) - return ERR_PTR(-ENOMEM); - root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS); - if (!root) { - ret = -ENOMEM; - goto alloc_fail; - } + if (!root) + return ERR_PTR(-ENOMEM); ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); if (ret) { if (ret > 0) ret = -ENOENT; - goto find_fail; + goto fail; } generation = btrfs_root_generation(&root->root_item); @@ -1317,21 +1311,31 @@ struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; - goto find_fail; + goto fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - goto find_fail; + goto fail; } root->commit_root = btrfs_root_node(root); -out: - btrfs_free_path(path); return root; - -find_fail: +fail: btrfs_put_root(root); -alloc_fail: - root = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); +} + +struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, + struct btrfs_key *key) +{ + struct btrfs_root *root; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + root = read_tree_root_path(tree_root, path, key); + btrfs_free_path(path); + + return root; } /* @@ -1419,6 +1423,31 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, return root; } +static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, + u64 objectid) +{ + if (objectid == BTRFS_ROOT_TREE_OBJECTID) + return btrfs_grab_root(fs_info->tree_root); + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + return btrfs_grab_root(fs_info->extent_root); + if (objectid == BTRFS_CHUNK_TREE_OBJECTID) + return btrfs_grab_root(fs_info->chunk_root); + if (objectid == BTRFS_DEV_TREE_OBJECTID) + return btrfs_grab_root(fs_info->dev_root); + if (objectid == BTRFS_CSUM_TREE_OBJECTID) + return btrfs_grab_root(fs_info->csum_root); + if (objectid == BTRFS_QUOTA_TREE_OBJECTID) + return btrfs_grab_root(fs_info->quota_root) ? + fs_info->quota_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_UUID_TREE_OBJECTID) + return btrfs_grab_root(fs_info->uuid_root) ? + fs_info->uuid_root : ERR_PTR(-ENOENT); + if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + return btrfs_grab_root(fs_info->free_space_root) ? + fs_info->free_space_root : ERR_PTR(-ENOENT); + return NULL; +} + int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) { @@ -1518,25 +1547,9 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info, struct btrfs_key key; int ret; - if (objectid == BTRFS_ROOT_TREE_OBJECTID) - return btrfs_grab_root(fs_info->tree_root); - if (objectid == BTRFS_EXTENT_TREE_OBJECTID) - return btrfs_grab_root(fs_info->extent_root); - if (objectid == BTRFS_CHUNK_TREE_OBJECTID) - return btrfs_grab_root(fs_info->chunk_root); - if (objectid == BTRFS_DEV_TREE_OBJECTID) - return btrfs_grab_root(fs_info->dev_root); - if (objectid == BTRFS_CSUM_TREE_OBJECTID) - return btrfs_grab_root(fs_info->csum_root); - if (objectid == BTRFS_QUOTA_TREE_OBJECTID) - return btrfs_grab_root(fs_info->quota_root) ? - fs_info->quota_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_UUID_TREE_OBJECTID) - return btrfs_grab_root(fs_info->uuid_root) ? - fs_info->uuid_root : ERR_PTR(-ENOENT); - if (objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) - return btrfs_grab_root(fs_info->free_space_root) ? - fs_info->free_space_root : ERR_PTR(-ENOENT); + root = btrfs_get_global_root(fs_info, objectid); + if (root) + return root; again: root = btrfs_lookup_fs_root(fs_info, objectid); if (root) { @@ -1622,6 +1635,52 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, } /* + * btrfs_get_fs_root_commit_root - return a root for the given objectid + * @fs_info: the fs_info + * @objectid: the objectid we need to lookup + * + * This is exclusively used for backref walking, and exists specifically because + * of how qgroups does lookups. Qgroups will do a backref lookup at delayed ref + * creation time, which means we may have to read the tree_root in order to look + * up a fs root that is not in memory. If the root is not in memory we will + * read the tree root commit root and look up the fs root from there. This is a + * temporary root, it will not be inserted into the radix tree as it doesn't + * have the most uptodate information, it'll simply be discarded once the + * backref code is finished using the root. + */ +struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_root *root; + struct btrfs_key key; + + ASSERT(path->search_commit_root && path->skip_locking); + + /* + * This can return -ENOENT if we ask for a root that doesn't exist, but + * since this is called via the backref walking code we won't be looking + * up a root that doesn't exist, unless there's corruption. So if root + * != NULL just return it. + */ + root = btrfs_get_global_root(fs_info, objectid); + if (root) + return root; + + root = btrfs_lookup_fs_root(fs_info, objectid); + if (root) + return root; + + key.objectid = objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + root = read_tree_root_path(fs_info->tree_root, path, &key); + btrfs_release_path(path); + + return root; +} + +/* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens */ diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index fee69ced58b4..182540bdcea0 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -69,6 +69,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, bool check_ref); struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, u64 objectid, dev_t anon_dev); +struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + u64 objectid); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3b21fee13e77..5fd60b13f4f8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3185,7 +3185,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_tree_block_info *bi; if (item_size < sizeof(*ei) + sizeof(*bi)) { btrfs_crit(info, -"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu", +"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %zu", key.objectid, key.type, key.offset, owner_objectid, item_size, sizeof(*ei) + sizeof(*bi)); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0ff659455b1e..87355a38a654 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3628,7 +3628,8 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) inode_lock_shared(inode); ret = btrfs_direct_IO(iocb, to); inode_unlock_shared(inode); - if (ret < 0) + if (ret < 0 || !iov_iter_count(to) || + iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) return ret; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 936c3137c646..da58c58ef9aa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9672,10 +9672,16 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, * clear_offset by our extent size. */ clear_offset += ins.offset; - btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + /* + * Now that we inserted the prealloc extent we can finally + * decrement the number of reservations in the block group. + * If we did it before, we could race with relocation and have + * relocation miss the reserved extent, making it fail later. + */ + btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 580899bdb991..c54ea6586632 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1026,6 +1026,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_ROOT_REF_KEY) { + + /* Release locks on tree_root before we access quota_root */ + btrfs_release_path(path); + ret = add_qgroup_item(trans, quota_root, found_key.offset); if (ret) { @@ -1044,6 +1048,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) btrfs_abort_transaction(trans, ret); goto out_free_path; } + ret = btrfs_search_slot_for_read(tree_root, &found_key, + path, 1, 0); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + if (ret > 0) { + /* + * Shouldn't happen, but in case it does we + * don't need to do the btrfs_next_item, just + * continue. + */ + continue; + } } ret = btrfs_next_item(tree_root, path); if (ret < 0) { diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 9d4f5316a7e8..d9a166eb344e 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -421,6 +421,9 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!dev->bdev) continue; + if (test_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state)) + continue; + if (dev_replace_is_ongoing && dev == fs_info->dev_replace.tgtdev) { /* @@ -445,6 +448,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, } have_zone = 1; } + if (!have_zone) + radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); up_read(&fs_info->dev_replace.rwsem); @@ -1020,3 +1025,45 @@ void btrfs_reada_detach(void *handle) kref_put(&rc->refcnt, reada_control_release); } + +/* + * Before removing a device (device replace or device remove ioctls), call this + * function to wait for all existing readahead requests on the device and to + * make sure no one queues more readahead requests for the device. + * + * Must be called without holding neither the device list mutex nor the device + * replace semaphore, otherwise it will deadlock. + */ +void btrfs_reada_remove_dev(struct btrfs_device *dev) +{ + struct btrfs_fs_info *fs_info = dev->fs_info; + + /* Serialize with readahead extent creation at reada_find_extent(). */ + spin_lock(&fs_info->reada_lock); + set_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); + spin_unlock(&fs_info->reada_lock); + + /* + * There might be readahead requests added to the radix trees which + * were not yet added to the readahead work queue. We need to start + * them and wait for their completion, otherwise we can end up with + * use-after-free problems when dropping the last reference on the + * readahead extents and their zones, as they need to access the + * device structure. + */ + reada_start_machine(fs_info); + btrfs_flush_workqueue(fs_info->readahead_workers); +} + +/* + * If when removing a device (device replace or device remove ioctls) an error + * happens after calling btrfs_reada_remove_dev(), call this to undo what that + * function did. This is safe to call even if btrfs_reada_remove_dev() was not + * called before. + */ +void btrfs_reada_undo_remove_dev(struct btrfs_device *dev) +{ + spin_lock(&dev->fs_info->reada_lock); + clear_bit(BTRFS_DEV_STATE_NO_READA, &dev->dev_state); + spin_unlock(&dev->fs_info->reada_lock); +} diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index f0ffd5ee77bd..8784b74f5232 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -760,18 +760,36 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, u64 type; u64 features; bool mixed = false; + int raid_index; + int nparity; + int ncopies; length = btrfs_chunk_length(leaf, chunk); stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); type = btrfs_chunk_type(leaf, chunk); + raid_index = btrfs_bg_flags_to_raid_index(type); + ncopies = btrfs_raid_array[raid_index].ncopies; + nparity = btrfs_raid_array[raid_index].nparity; if (!num_stripes) { chunk_err(leaf, chunk, logical, "invalid chunk num_stripes, have %u", num_stripes); return -EUCLEAN; } + if (num_stripes < ncopies) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes < ncopies, have %u < %d", + num_stripes, ncopies); + return -EUCLEAN; + } + if (nparity && num_stripes == nparity) { + chunk_err(leaf, chunk, logical, + "invalid chunk num_stripes == nparity, have %u == %d", + num_stripes, nparity); + return -EUCLEAN; + } if (!IS_ALIGNED(logical, fs_info->sectorsize)) { chunk_err(leaf, chunk, logical, "invalid chunk logical, have %llu should aligned to %u", diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 58b9c419a2b6..b1e48078c318 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -431,7 +431,7 @@ static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); - btrfs_device_data_ordered_init(dev); + btrfs_device_data_ordered_init(dev, fs_info); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); extent_io_tree_init(fs_info, &dev->alloc_state, @@ -2099,6 +2099,8 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); + if (!ret) + btrfs_reada_remove_dev(device); mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@ -2179,6 +2181,7 @@ out: return ret; error_undo: + btrfs_reada_undo_remove_dev(device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index bf27ac07d315..232f02bd214f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -39,10 +39,10 @@ struct btrfs_io_geometry { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __BTRFS_NEED_DEVICE_DATA_ORDERED -#define btrfs_device_data_ordered_init(device) \ - seqcount_init(&device->data_seqcount) +#define btrfs_device_data_ordered_init(device, info) \ + seqcount_mutex_init(&device->data_seqcount, &info->chunk_mutex) #else -#define btrfs_device_data_ordered_init(device) do { } while (0) +#define btrfs_device_data_ordered_init(device, info) do { } while (0) #endif #define BTRFS_DEV_STATE_WRITEABLE (0) @@ -50,6 +50,7 @@ struct btrfs_io_geometry { #define BTRFS_DEV_STATE_MISSING (2) #define BTRFS_DEV_STATE_REPLACE_TGT (3) #define BTRFS_DEV_STATE_FLUSH_SENT (4) +#define BTRFS_DEV_STATE_NO_READA (5) struct btrfs_device { struct list_head dev_list; /* device_list_mutex */ @@ -71,7 +72,8 @@ struct btrfs_device { blk_status_t last_flush_error; #ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED - seqcount_t data_seqcount; + /* A seqcount_t with associated chunk_mutex (for lockdep) */ + seqcount_mutex_t data_seqcount; #endif /* the internal btrfs device id */ @@ -162,11 +164,9 @@ btrfs_device_get_##name(const struct btrfs_device *dev) \ static inline void \ btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ { \ - preempt_disable(); \ write_seqcount_begin(&dev->data_seqcount); \ dev->name = size; \ write_seqcount_end(&dev->data_seqcount); \ - preempt_enable(); \ } #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) #define BTRFS_DEVICE_GETSET_FUNCS(name) \ diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 3080cda9e824..8bda092e60c5 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -121,7 +121,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, _debug("reissue read"); ret = bmapping->a_ops->readpage(NULL, backpage); if (ret < 0) - goto unlock_discard; + goto discard; } /* but the page may have been read before the monitor was installed, so @@ -138,6 +138,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, unlock_discard: unlock_page(backpage); +discard: spin_lock_irq(&object->work_lock); list_del(&monitor->op_link); spin_unlock_irq(&object->work_lock); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 5027bbdca419..ded4229c314a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4074,7 +4074,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, vino.snap, inode); mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 08f1c0c31dc2..8f1d7500a7ec 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4231,7 +4231,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, dname.len, dname.name); mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); if (!inode) { dout("handle_lease no inode %llx\n", vino.ino); @@ -4385,29 +4385,49 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) bool check_session_state(struct ceph_mds_session *s) { - if (s->s_state == CEPH_MDS_SESSION_CLOSING) { - dout("resending session close request for mds%d\n", - s->s_mds); - request_close_session(s); - return false; - } - if (s->s_ttl && time_after(jiffies, s->s_ttl)) { - if (s->s_state == CEPH_MDS_SESSION_OPEN) { + switch (s->s_state) { + case CEPH_MDS_SESSION_OPEN: + if (s->s_ttl && time_after(jiffies, s->s_ttl)) { s->s_state = CEPH_MDS_SESSION_HUNG; pr_info("mds%d hung\n", s->s_mds); } - } - if (s->s_state == CEPH_MDS_SESSION_NEW || - s->s_state == CEPH_MDS_SESSION_RESTARTING || - s->s_state == CEPH_MDS_SESSION_CLOSED || - s->s_state == CEPH_MDS_SESSION_REJECTED) - /* this mds is failed or recovering, just wait */ + break; + case CEPH_MDS_SESSION_CLOSING: + /* Should never reach this when we're unmounting */ + WARN_ON_ONCE(true); + fallthrough; + case CEPH_MDS_SESSION_NEW: + case CEPH_MDS_SESSION_RESTARTING: + case CEPH_MDS_SESSION_CLOSED: + case CEPH_MDS_SESSION_REJECTED: return false; + } return true; } /* + * If the sequence is incremented while we're waiting on a REQUEST_CLOSE reply, + * then we need to retransmit that request. + */ +void inc_session_sequence(struct ceph_mds_session *s) +{ + lockdep_assert_held(&s->s_mutex); + + s->s_seq++; + + if (s->s_state == CEPH_MDS_SESSION_CLOSING) { + int ret; + + dout("resending session close request for mds%d\n", s->s_mds); + ret = request_close_session(s); + if (ret < 0) + pr_err("unable to close session to mds%d: %d\n", + s->s_mds, ret); + } +} + +/* * delayed work -- periodically trim expired leases, renew caps with mds */ static void schedule_delayed(struct ceph_mds_client *mdsc) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index cbf8af437140..f5adbebcb38e 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -480,6 +480,7 @@ struct ceph_mds_client { extern const char *ceph_mds_op_name(int op); extern bool check_session_state(struct ceph_mds_session *s); +void inc_session_sequence(struct ceph_mds_session *s); extern struct ceph_mds_session * __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 83cb4f26b689..9b785f11e95a 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -53,7 +53,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, /* increment msg sequence number */ mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); mutex_unlock(&session->s_mutex); /* lookup inode */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0da39c16dab4..b611f829cb61 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -873,7 +873,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, ceph_snap_op_name(op), split, trace_len); mutex_lock(&session->s_mutex); - session->s_seq++; + inc_session_sequence(session); mutex_unlock(&session->s_mutex); down_write(&mdsc->snap_rwsem); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index a768a09430c3..686e0ad28788 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -1127,24 +1127,23 @@ static const struct file_operations debugfs_devm_entry_ops = { * file will be created in the root of the debugfs filesystem. * @read_fn: function pointer called to print the seq_file content. */ -struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, - struct dentry *parent, - int (*read_fn)(struct seq_file *s, - void *data)) +void debugfs_create_devm_seqfile(struct device *dev, const char *name, + struct dentry *parent, + int (*read_fn)(struct seq_file *s, void *data)) { struct debugfs_devm_entry *entry; if (IS_ERR(parent)) - return ERR_PTR(-ENOENT); + return; entry = devm_kzalloc(dev, sizeof(*entry), GFP_KERNEL); if (!entry) - return ERR_PTR(-ENOMEM); + return; entry->read = read_fn; entry->dev = dev; - return debugfs_create_file(name, S_IRUGO, parent, entry, - &debugfs_devm_entry_ops); + debugfs_create_file(name, S_IRUGO, parent, entry, + &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5b81f3b080ee..ca50c90adc4c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -669,68 +669,8 @@ const struct file_operations ext4_dir_operations = { }; #ifdef CONFIG_UNICODE -static int ext4_d_compare(const struct dentry *dentry, unsigned int len, - const char *str, const struct qstr *name) -{ - struct qstr qstr = {.name = str, .len = len }; - const struct dentry *parent = READ_ONCE(dentry->d_parent); - const struct inode *inode = d_inode_rcu(parent); - char strbuf[DNAME_INLINE_LEN]; - - if (!inode || !IS_CASEFOLDED(inode) || - !EXT4_SB(inode->i_sb)->s_encoding) { - if (len != name->len) - return -1; - return memcmp(str, name->name, len); - } - - /* - * If the dentry name is stored in-line, then it may be concurrently - * modified by a rename. If this happens, the VFS will eventually retry - * the lookup, so it doesn't matter what ->d_compare() returns. - * However, it's unsafe to call utf8_strncasecmp() with an unstable - * string. Therefore, we have to copy the name into a temporary buffer. - */ - if (len <= DNAME_INLINE_LEN - 1) { - memcpy(strbuf, str, len); - strbuf[len] = 0; - qstr.name = strbuf; - /* prevent compiler from optimizing out the temporary buffer */ - barrier(); - } - - return ext4_ci_compare(inode, name, &qstr, false); -} - -static int ext4_d_hash(const struct dentry *dentry, struct qstr *str) -{ - const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb); - const struct unicode_map *um = sbi->s_encoding; - const struct inode *inode = d_inode_rcu(dentry); - unsigned char *norm; - int len, ret = 0; - - if (!inode || !IS_CASEFOLDED(inode) || !um) - return 0; - - norm = kmalloc(PATH_MAX, GFP_ATOMIC); - if (!norm) - return -ENOMEM; - - len = utf8_casefold(um, str, norm, PATH_MAX); - if (len < 0) { - if (ext4_has_strict_mode(sbi)) - ret = -EINVAL; - goto out; - } - str->hash = full_name_hash(dentry, norm, len); -out: - kfree(norm); - return ret; -} - const struct dentry_operations ext4_dentry_ops = { - .d_hash = ext4_d_hash, - .d_compare = ext4_d_compare, + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, }; #endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 254d1c26bea8..45fcdbf538d1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1166,10 +1166,6 @@ struct ext4_inode_info { #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT4_ERROR_FS 0x0002 /* Errors detected */ #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ -#define EXT4_FC_INELIGIBLE 0x0008 /* Fast commit ineligible */ -#define EXT4_FC_COMMITTING 0x0010 /* File system underoing a fast - * commit. - */ #define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ /* @@ -1431,6 +1427,10 @@ struct ext4_super_block { */ #define EXT4_MF_MNTDIR_SAMPLED 0x0001 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_FC_INELIGIBLE 0x0004 /* Fast commit ineligible */ +#define EXT4_MF_FC_COMMITTING 0x0008 /* File system underoing a fast + * commit. + */ #ifdef CONFIG_FS_ENCRYPTION #define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL) @@ -1444,14 +1444,6 @@ struct ext4_super_block { #define EXT4_ENC_UTF8_12_1 1 /* - * Flags for ext4_sb_info.s_encoding_flags. - */ -#define EXT4_ENC_STRICT_MODE_FL (1 << 0) - -#define ext4_has_strict_mode(sbi) \ - (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL) - -/* * fourth extended-fs super-block data in memory */ struct ext4_sb_info { @@ -1500,10 +1492,6 @@ struct ext4_sb_info { struct kobject s_kobj; struct completion s_kobj_unregister; struct super_block *s_sb; -#ifdef CONFIG_UNICODE - struct unicode_map *s_encoding; - __u16 s_encoding_flags; -#endif /* Journaling */ struct journal_s *s_journal; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 559100f3e23c..57cfa28919a9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1471,16 +1471,16 @@ static int ext4_ext_search_left(struct inode *inode, } /* - * search the closest allocated block to the right for *logical - * and returns it at @logical + it's physical address at @phys - * if *logical is the largest allocated block, the function - * returns 0 at @phys - * return value contains 0 (success) or error code + * Search the closest allocated block to the right for *logical + * and returns it at @logical + it's physical address at @phys. + * If not exists, return 0 and @phys is set to 0. We will return + * 1 which means we found an allocated block and ret_ex is valid. + * Or return a (< 0) error code. */ static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, - struct ext4_extent **ret_ex) + struct ext4_extent *ret_ex) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1574,10 +1574,11 @@ got_index: found_extent: *logical = le32_to_cpu(ex->ee_block); *phys = ext4_ext_pblock(ex); - *ret_ex = ex; + if (ret_ex) + *ret_ex = *ex; if (bh) put_bh(bh); - return 0; + return 1; } /* @@ -2868,8 +2869,8 @@ again: */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, - &ex); - if (err) + NULL); + if (err < 0) goto out; if (pblk) { partial.pclu = EXT4_B2C(sbi, pblk); @@ -4039,7 +4040,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct ext4_ext_path *path = NULL; - struct ext4_extent newex, *ex, *ex2; + struct ext4_extent newex, *ex, ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); ext4_fsblk_t newblock = 0, pblk; int err = 0, depth, ret; @@ -4175,15 +4176,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out; ar.lright = map->m_lblk; - ex2 = NULL; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); - if (err) + if (err < 0) goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ - if ((sbi->s_cluster_ratio > 1) && ex2 && - get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { + if ((sbi->s_cluster_ratio > 1) && err && + get_implied_cluster_alloc(inode->i_sb, map, &ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; goto got_allocated_blocks; diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 447c8d93f480..8d43058386c3 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -269,7 +269,7 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; - sbi->s_mount_state |= EXT4_FC_INELIGIBLE; + sbi->s_mount_flags |= EXT4_MF_FC_INELIGIBLE; WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -292,7 +292,7 @@ void ext4_fc_start_ineligible(struct super_block *sb, int reason) } /* - * Stop a fast commit ineligible update. We set EXT4_FC_INELIGIBLE flag here + * Stop a fast commit ineligible update. We set EXT4_MF_FC_INELIGIBLE flag here * to ensure that after stopping the ineligible update, at least one full * commit takes place. */ @@ -302,13 +302,13 @@ void ext4_fc_stop_ineligible(struct super_block *sb) (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)) return; - EXT4_SB(sb)->s_mount_state |= EXT4_FC_INELIGIBLE; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FC_INELIGIBLE; atomic_dec(&EXT4_SB(sb)->s_fc_ineligible_updates); } static inline int ext4_fc_is_ineligible(struct super_block *sb) { - return (EXT4_SB(sb)->s_mount_state & EXT4_FC_INELIGIBLE) || + return (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FC_INELIGIBLE) || atomic_read(&EXT4_SB(sb)->s_fc_ineligible_updates); } @@ -358,7 +358,7 @@ static int ext4_fc_track_template( spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, - (sbi->s_mount_state & EXT4_FC_COMMITTING) ? + (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); @@ -411,7 +411,7 @@ static int __track_dentry_update(struct inode *inode, void *arg, bool update) node->fcd_name.len = dentry->d_name.len; spin_lock(&sbi->s_fc_lock); - if (sbi->s_mount_state & EXT4_FC_COMMITTING) + if (sbi->s_mount_flags & EXT4_MF_FC_COMMITTING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else @@ -846,7 +846,7 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) int ret = 0; spin_lock(&sbi->s_fc_lock); - sbi->s_mount_state |= EXT4_FC_COMMITTING; + sbi->s_mount_flags |= EXT4_MF_FC_COMMITTING; list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { ei = list_entry(pos, struct ext4_inode_info, i_fc_list); ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); @@ -964,7 +964,6 @@ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) fc_dentry->fcd_parent, fc_dentry->fcd_ino, fc_dentry->fcd_name.len, fc_dentry->fcd_name.name, crc)) { - spin_lock(&sbi->s_fc_lock); ret = -ENOSPC; goto lock_and_exit; } @@ -1191,8 +1190,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_STAGING]); - sbi->s_mount_state &= ~EXT4_FC_COMMITTING; - sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE; + sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING; + sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE; if (full) sbi->s_fc_bytes = 0; @@ -1617,8 +1616,10 @@ static int ext4_fc_replay_add_range(struct super_block *sb, if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, NULL, 0); - if (!path) - continue; + if (IS_ERR(path)) { + iput(inode); + return 0; + } memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( @@ -2078,6 +2079,8 @@ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, void ext4_fc_init(struct super_block *sb, journal_t *journal) { + int num_fc_blocks; + /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if @@ -2087,7 +2090,15 @@ void ext4_fc_init(struct super_block *sb, journal_t *journal) if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; - if (jbd2_fc_init(journal, EXT4_NUM_FC_BLKS)) { + if (!buffer_uptodate(journal->j_sb_buffer) + && ext4_read_bh_lock(journal->j_sb_buffer, REQ_META | REQ_PRIO, + true)) { + ext4_msg(sb, KERN_ERR, "I/O error on journal"); + return; + } + num_fc_blocks = be32_to_cpu(journal->j_superblock->s_num_fc_blks); + if (jbd2_fc_init(journal, num_fc_blocks ? num_fc_blocks : + EXT4_NUM_FC_BLKS)) { pr_warn("Error while enabling fast commits, turning off."); ext4_clear_feature_fast_commit(sb); } diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 2924261226e0..a92eb79de0cc 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -275,7 +275,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo) { #ifdef CONFIG_UNICODE - const struct unicode_map *um = EXT4_SB(dir->i_sb)->s_encoding; + const struct unicode_map *um = dir->i_sb->s_encoding; int r, dlen; unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 03c2253005f0..b96a18679a27 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1918,7 +1918,7 @@ static int __ext4_journalled_writepage(struct page *page, } if (ret == 0) ret = err; - err = ext4_jbd2_inode_add_write(handle, inode, 0, len); + err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; @@ -3307,10 +3307,12 @@ static bool ext4_inode_datasync_dirty(struct inode *inode) if (journal) { if (jbd2_transaction_committed(journal, - EXT4_I(inode)->i_datasync_tid)) - return true; - return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) >= - EXT4_I(inode)->i_fc_committed_subtid; + EXT4_I(inode)->i_datasync_tid)) + return false; + if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) + return atomic_read(&EXT4_SB(inode->i_sb)->s_fc_subtid) < + EXT4_I(inode)->i_fc_committed_subtid; + return true; } /* Any metadata buffers to write? */ @@ -6157,7 +6159,8 @@ retry_alloc: if (ext4_walk_page_buffers(handle, page_buffers(page), 0, len, NULL, write_end_fn)) goto out_error; - if (ext4_jbd2_inode_add_write(handle, inode, 0, len)) + if (ext4_jbd2_inode_add_write(handle, inode, + page_offset(page), len)) goto out_error; ext4_set_inode_state(inode, EXT4_STATE_JDATA); } else { diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 5159830dacb8..f458d1d81d96 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1285,8 +1285,8 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) int ext4_ci_compare(const struct inode *parent, const struct qstr *name, const struct qstr *entry, bool quick) { - const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb); - const struct unicode_map *um = sbi->s_encoding; + const struct super_block *sb = parent->i_sb; + const struct unicode_map *um = sb->s_encoding; int ret; if (quick) @@ -1298,7 +1298,7 @@ int ext4_ci_compare(const struct inode *parent, const struct qstr *name, /* Handle invalid character sequence as either an error * or as an opaque byte sequence. */ - if (ext4_has_strict_mode(sbi)) + if (sb_has_strict_encoding(sb)) return -EINVAL; if (name->len != entry->len) @@ -1315,7 +1315,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, { int len; - if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) { + if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) { cf_name->name = NULL; return; } @@ -1324,7 +1324,7 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, if (!cf_name->name) return; - len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding, + len = utf8_casefold(dir->i_sb->s_encoding, iname, cf_name->name, EXT4_NAME_LEN); if (len <= 0) { @@ -1361,7 +1361,7 @@ static inline bool ext4_match(const struct inode *parent, #endif #ifdef CONFIG_UNICODE - if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) { + if (parent->i_sb->s_encoding && IS_CASEFOLDED(parent)) { if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, .len = fname->cf_name.len}; @@ -2180,9 +2180,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct super_block *sb; -#ifdef CONFIG_UNICODE - struct ext4_sb_info *sbi; -#endif struct ext4_filename fname; int retval; int dx_fallback=0; @@ -2199,9 +2196,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return -EINVAL; #ifdef CONFIG_UNICODE - sbi = EXT4_SB(sb); - if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) && - sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name)) + if (sb_has_strict_encoding(sb) && IS_CASEFOLDED(dir) && + sb->s_encoding && utf8_validate(sb->s_encoding, &dentry->d_name)) return -EINVAL; #endif diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2fe141ff3c7e..ef4734b40e2a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1288,7 +1288,7 @@ static void ext4_put_super(struct super_block *sb) fs_put_dax(sbi->s_daxdev); fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif kfree(sbi); } @@ -4303,7 +4303,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; #ifdef CONFIG_UNICODE - if (ext4_has_feature_casefold(sb) && !sbi->s_encoding) { + if (ext4_has_feature_casefold(sb) && !sb->s_encoding) { const struct ext4_sb_encodings *encoding_info; struct unicode_map *encoding; __u16 encoding_flags; @@ -4334,8 +4334,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "%s-%s with flags 0x%hx", encoding_info->name, encoding_info->version?:"\b", encoding_flags); - sbi->s_encoding = encoding; - sbi->s_encoding_flags = encoding_flags; + sb->s_encoding = encoding; + sb->s_encoding_flags = encoding_flags; } #endif @@ -4777,8 +4777,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_MAIN]); INIT_LIST_HEAD(&sbi->s_fc_dentry_q[FC_Q_STAGING]); sbi->s_fc_bytes = 0; - sbi->s_mount_state &= ~EXT4_FC_INELIGIBLE; - sbi->s_mount_state &= ~EXT4_FC_COMMITTING; + sbi->s_mount_flags &= ~EXT4_MF_FC_INELIGIBLE; + sbi->s_mount_flags &= ~EXT4_MF_FC_COMMITTING; spin_lock_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; @@ -4975,7 +4975,7 @@ no_journal: } #ifdef CONFIG_UNICODE - if (sbi->s_encoding) + if (sb->s_encoding) sb->s_d_op = &ext4_dentry_ops; #endif @@ -5184,7 +5184,7 @@ failed_mount: crypto_free_shash(sbi->s_chksum_driver); #ifdef CONFIG_UNICODE - utf8_unload(sbi->s_encoding); + utf8_unload(sb->s_encoding); #endif #ifdef CONFIG_QUOTA diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 5ff33d18996a..4e27fe6ed3ae 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -315,6 +315,7 @@ EXT4_ATTR_FEATURE(casefold); EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); +EXT4_ATTR_FEATURE(fast_commit); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), @@ -331,6 +332,7 @@ static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(verity), #endif ATTR_LIST(metadata_csum_seed), + ATTR_LIST(fast_commit), NULL, }; ATTRIBUTE_GROUPS(ext4_feat); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 5441c17562c5..d98a2e5dab9f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1078,7 +1078,8 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, out_free: kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(cachep, gl); - atomic_dec(&sdp->sd_glock_disposal); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); out: return ret; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index aa3f5236befb..6c1432d78dce 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -165,6 +165,31 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) } /** + * gfs2_rgrp_metasync - sync out the metadata of a resource group + * @gl: the glock protecting the resource group + * + */ + +static int gfs2_rgrp_metasync(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct address_space *metamapping = &sdp->sd_aspace; + struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); + const unsigned bsize = sdp->sd_sb.sb_bsize; + loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; + loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; + int error; + + filemap_fdatawrite_range(metamapping, start, end); + error = filemap_fdatawait_range(metamapping, start, end); + WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); + mapping_set_error(metamapping, error); + if (error) + gfs2_io_error(sdp); + return error; +} + +/** * rgrp_go_sync - sync out the metadata for this glock * @gl: the glock * @@ -176,11 +201,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) static int rgrp_go_sync(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct address_space *mapping = &sdp->sd_aspace; struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); - const unsigned bsize = sdp->sd_sb.sb_bsize; - loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; - loff_t end = PAGE_ALIGN((rgd->rd_addr + rgd->rd_length) * bsize) - 1; int error; if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) @@ -189,10 +210,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl) gfs2_log_flush(sdp, gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_RGRP_GO_SYNC); - filemap_fdatawrite_range(mapping, start, end); - error = filemap_fdatawait_range(mapping, start, end); - WARN_ON_ONCE(error && !gfs2_withdrawn(sdp)); - mapping_set_error(mapping, error); + error = gfs2_rgrp_metasync(gl); if (!error) error = gfs2_ail_empty_gl(gl); gfs2_free_clones(rgd); @@ -266,7 +284,24 @@ static void gfs2_clear_glop_pending(struct gfs2_inode *ip) } /** - * inode_go_sync - Sync the dirty data and/or metadata for an inode glock + * gfs2_inode_metasync - sync out the metadata of an inode + * @gl: the glock protecting the inode + * + */ +int gfs2_inode_metasync(struct gfs2_glock *gl) +{ + struct address_space *metamapping = gfs2_glock2aspace(gl); + int error; + + filemap_fdatawrite(metamapping); + error = filemap_fdatawait(metamapping); + if (error) + gfs2_io_error(gl->gl_name.ln_sbd); + return error; +} + +/** + * inode_go_sync - Sync the dirty metadata of an inode * @gl: the glock protecting the inode * */ @@ -297,8 +332,7 @@ static int inode_go_sync(struct gfs2_glock *gl) error = filemap_fdatawait(mapping); mapping_set_error(mapping, error); } - ret = filemap_fdatawait(metamapping); - mapping_set_error(metamapping, ret); + ret = gfs2_inode_metasync(gl); if (!error) error = ret; gfs2_ail_empty_gl(gl); diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h index 2dd192e85618..695898afcaf1 100644 --- a/fs/gfs2/glops.h +++ b/fs/gfs2/glops.h @@ -22,6 +22,7 @@ extern const struct gfs2_glock_operations gfs2_quota_glops; extern const struct gfs2_glock_operations gfs2_journal_glops; extern const struct gfs2_glock_operations *gfs2_glops_list[]; +extern int gfs2_inode_metasync(struct gfs2_glock *gl); extern void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync); #endif /* __GLOPS_DOT_H__ */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6774865f5b5b..077ccb1b3ccc 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -180,7 +180,8 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail; - gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); + if (blktype != GFS2_BLKST_UNLINKED) + gfs2_cancel_delete_work(ip->i_iopen_gh.gh_gl); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index ed69298dd824..3922b26264f5 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -22,6 +22,7 @@ #include "incore.h" #include "inode.h" #include "glock.h" +#include "glops.h" #include "log.h" #include "lops.h" #include "meta_io.h" @@ -817,41 +818,19 @@ static int buf_lo_scan_elements(struct gfs2_jdesc *jd, u32 start, return error; } -/** - * gfs2_meta_sync - Sync all buffers associated with a glock - * @gl: The glock - * - */ - -void gfs2_meta_sync(struct gfs2_glock *gl) -{ - struct address_space *mapping = gfs2_glock2aspace(gl); - struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - int error; - - if (mapping == NULL) - mapping = &sdp->sd_aspace; - - filemap_fdatawrite(mapping); - error = filemap_fdatawait(mapping); - - if (error) - gfs2_io_error(gl->gl_name.ln_sbd); -} - static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); if (error) { - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); return; } if (pass != 1) return; - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u blocks\n", jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); @@ -1060,14 +1039,14 @@ static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); if (error) { - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); return; } if (pass != 1) return; /* data sync? */ - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n", jd->jd_jid, jd->jd_replayed_blocks, jd->jd_found_blocks); diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 4a3d8aecdf82..fbdbb08dcec6 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -27,8 +27,6 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); -extern void gfs2_meta_sync(struct gfs2_glock *gl); - static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { unsigned int limit; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7a7e3c10a9a9..61fce59cb4d3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -633,8 +633,10 @@ static int init_statfs(struct gfs2_sbd *sdp) if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); - goto fail; + goto out; } + if (sdp->sd_args.ar_spectator) + goto out; pn = gfs2_lookup_simple(master, "per_node"); if (IS_ERR(pn)) { @@ -682,15 +684,17 @@ free_local: iput(pn); put_statfs: iput(sdp->sd_statfs_inode); -fail: +out: return error; } /* Uninitialize and free up memory used by the list of statfs inodes */ static void uninit_statfs(struct gfs2_sbd *sdp) { - gfs2_glock_dq_uninit(&sdp->sd_sc_gh); - free_local_statfs_inodes(sdp); + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + free_local_statfs_inodes(sdp); + } iput(sdp->sd_statfs_inode); } @@ -704,7 +708,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (undo) { jindex = 0; - goto fail_jinode_gh; + goto fail_statfs; } sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index b5cbe21efdfb..c26c68ebd29d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -349,7 +349,7 @@ static int update_statfs_inode(struct gfs2_jdesc *jd, mark_buffer_dirty(bh); brelse(bh); - gfs2_meta_sync(ip->i_gl); + gfs2_inode_metasync(ip->i_gl); out: return error; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ee491bb9c1cc..92d799a193b8 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -719,9 +719,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) } gfs2_free_clones(rgd); + return_all_reservations(rgd); kfree(rgd->rd_bits); rgd->rd_bits = NULL; - return_all_reservations(rgd); kmem_cache_free(gfs2_rgrpd_cachep, rgd); } } @@ -1370,6 +1370,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + if (!blk_queue_discard(q)) return -EOPNOTSUPP; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b285192bd6b3..b3d951ab8068 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -738,6 +738,7 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); + truncate_inode_pages_final(&sdp->sd_aspace); gfs2_delete_debugfs_file(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h index dcc2aab1b2c4..4ba45caf5939 100644 --- a/fs/hfs/btree.h +++ b/fs/hfs/btree.h @@ -60,7 +60,7 @@ struct hfs_bnode { wait_queue_head_t lock_wq; atomic_t refcnt; unsigned int page_offset; - struct page *page[0]; + struct page *page[]; }; #define HFS_BNODE_ERROR 0 diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index 3b03fff68543..a92de5199ec3 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -117,7 +117,7 @@ struct hfs_bnode { wait_queue_head_t lock_wq; atomic_t refcnt; unsigned int page_offset; - struct page *page[0]; + struct page *page[]; }; #define HFS_BNODE_LOCK 0 diff --git a/fs/io-wq.c b/fs/io-wq.c index 02894df7656d..b53c055bea6a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -482,6 +482,10 @@ static void io_impersonate_work(struct io_worker *worker, current->files = work->identity->files; current->nsproxy = work->identity->nsproxy; task_unlock(current); + if (!work->identity->files) { + /* failed grabbing files, ensure work gets cancelled */ + work->flags |= IO_WQ_WORK_CANCEL; + } } if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) current->fs = work->identity->fs; diff --git a/fs/io_uring.c b/fs/io_uring.c index b42dfa0243bf..8018c7076b25 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -995,20 +995,33 @@ static void io_sq_thread_drop_mm(void) if (mm) { kthread_unuse_mm(mm); mmput(mm); + current->mm = NULL; } } static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { - if (!current->mm) { - if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL) || - !ctx->sqo_task->mm || - !mmget_not_zero(ctx->sqo_task->mm))) - return -EFAULT; - kthread_use_mm(ctx->sqo_task->mm); + struct mm_struct *mm; + + if (current->mm) + return 0; + + /* Should never happen */ + if (unlikely(!(ctx->flags & IORING_SETUP_SQPOLL))) + return -EFAULT; + + task_lock(ctx->sqo_task); + mm = ctx->sqo_task->mm; + if (unlikely(!mm || !mmget_not_zero(mm))) + mm = NULL; + task_unlock(ctx->sqo_task); + + if (mm) { + kthread_use_mm(mm); + return 0; } - return 0; + return -EFAULT; } static int io_sq_thread_acquire_mm(struct io_ring_ctx *ctx, @@ -1274,9 +1287,12 @@ static bool io_identity_cow(struct io_kiocb *req) /* add one for this request */ refcount_inc(&id->count); - /* drop old identity, assign new one. one ref for req, one for tctx */ - if (req->work.identity != tctx->identity && - refcount_sub_and_test(2, &req->work.identity->count)) + /* drop tctx and req identity references, if needed */ + if (tctx->identity != &tctx->__identity && + refcount_dec_and_test(&tctx->identity->count)) + kfree(tctx->identity); + if (req->work.identity != &tctx->__identity && + refcount_dec_and_test(&req->work.identity->count)) kfree(req->work.identity); req->work.identity = id; @@ -1365,6 +1381,9 @@ static void io_prep_async_work(struct io_kiocb *req) io_req_init_async(req); id = req->work.identity; + if (req->flags & REQ_F_FORCE_ASYNC) + req->work.flags |= IO_WQ_WORK_CONCURRENT; + if (req->flags & REQ_F_ISREG) { if (def->hash_reg_file || (ctx->flags & IORING_SETUP_IOPOLL)) io_wq_hash_work(&req->work, file_inode(req->file)); @@ -1574,14 +1593,29 @@ static void io_cqring_mark_overflow(struct io_ring_ctx *ctx) } } -static inline bool io_match_files(struct io_kiocb *req, - struct files_struct *files) +static inline bool __io_match_files(struct io_kiocb *req, + struct files_struct *files) { + return ((req->flags & REQ_F_WORK_INITIALIZED) && + (req->work.flags & IO_WQ_WORK_FILES)) && + req->work.identity->files == files; +} + +static bool io_match_files(struct io_kiocb *req, + struct files_struct *files) +{ + struct io_kiocb *link; + if (!files) return true; - if ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_FILES)) - return req->work.identity->files == files; + if (__io_match_files(req, files)) + return true; + if (req->flags & REQ_F_LINK_HEAD) { + list_for_each_entry(link, &req->link_list, link_list) { + if (__io_match_files(link, files)) + return true; + } + } return false; } @@ -1665,7 +1699,8 @@ static void __io_cqring_fill_event(struct io_kiocb *req, long res, long cflags) WRITE_ONCE(cqe->user_data, req->user_data); WRITE_ONCE(cqe->res, res); WRITE_ONCE(cqe->flags, cflags); - } else if (ctx->cq_overflow_flushed || req->task->io_uring->in_idle) { + } else if (ctx->cq_overflow_flushed || + atomic_read(&req->task->io_uring->in_idle)) { /* * If we're in ring overflow flush mode, or in task cancel mode, * then we cannot store the request for later flushing, we need @@ -1835,7 +1870,7 @@ static void __io_free_req(struct io_kiocb *req) io_dismantle_req(req); percpu_counter_dec(&tctx->inflight); - if (tctx->in_idle) + if (atomic_read(&tctx->in_idle)) wake_up(&tctx->wait); put_task_struct(req->task); @@ -1846,59 +1881,39 @@ static void __io_free_req(struct io_kiocb *req) percpu_ref_put(&ctx->refs); } -static bool io_link_cancel_timeout(struct io_kiocb *req) +static void io_kill_linked_timeout(struct io_kiocb *req) { - struct io_timeout_data *io = req->async_data; struct io_ring_ctx *ctx = req->ctx; - int ret; - - ret = hrtimer_try_to_cancel(&io->timer); - if (ret != -1) { - io_cqring_fill_event(req, -ECANCELED); - io_commit_cqring(ctx); - req->flags &= ~REQ_F_LINK_HEAD; - io_put_req_deferred(req, 1); - return true; - } - - return false; -} - -static bool __io_kill_linked_timeout(struct io_kiocb *req) -{ struct io_kiocb *link; - bool wake_ev; + bool cancelled = false; + unsigned long flags; - if (list_empty(&req->link_list)) - return false; - link = list_first_entry(&req->link_list, struct io_kiocb, link_list); - if (link->opcode != IORING_OP_LINK_TIMEOUT) - return false; + spin_lock_irqsave(&ctx->completion_lock, flags); + link = list_first_entry_or_null(&req->link_list, struct io_kiocb, + link_list); /* * Can happen if a linked timeout fired and link had been like * req -> link t-out -> link t-out [-> ...] */ - if (!(link->flags & REQ_F_LTIMEOUT_ACTIVE)) - return false; + if (link && (link->flags & REQ_F_LTIMEOUT_ACTIVE)) { + struct io_timeout_data *io = link->async_data; + int ret; - list_del_init(&link->link_list); - wake_ev = io_link_cancel_timeout(link); + list_del_init(&link->link_list); + ret = hrtimer_try_to_cancel(&io->timer); + if (ret != -1) { + io_cqring_fill_event(link, -ECANCELED); + io_commit_cqring(ctx); + cancelled = true; + } + } req->flags &= ~REQ_F_LINK_TIMEOUT; - return wake_ev; -} - -static void io_kill_linked_timeout(struct io_kiocb *req) -{ - struct io_ring_ctx *ctx = req->ctx; - unsigned long flags; - bool wake_ev; - - spin_lock_irqsave(&ctx->completion_lock, flags); - wake_ev = __io_kill_linked_timeout(req); spin_unlock_irqrestore(&ctx->completion_lock, flags); - if (wake_ev) + if (cancelled) { io_cqring_ev_posted(ctx); + io_put_req(link); + } } static struct io_kiocb *io_req_link_next(struct io_kiocb *req) @@ -4977,8 +4992,10 @@ static int io_poll_double_wake(struct wait_queue_entry *wait, unsigned mode, /* make sure double remove sees this as being gone */ wait->private = NULL; spin_unlock(&poll->head->lock); - if (!done) - __io_async_wake(req, poll, mask, io_poll_task_func); + if (!done) { + /* use wait func handler, so it matches the rq type */ + poll->wait.func(&poll->wait, mode, sync, key); + } } refcount_dec(&req->refs); return 1; @@ -6180,7 +6197,6 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) static void __io_queue_sqe(struct io_kiocb *req, struct io_comp_state *cs) { struct io_kiocb *linked_timeout; - struct io_kiocb *nxt; const struct cred *old_creds = NULL; int ret; @@ -6206,7 +6222,6 @@ again: */ if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { if (!io_arm_poll_handler(req)) { -punt: /* * Queued up for async execution, worker will release * submit reference when the iocb is actually submitted. @@ -6216,33 +6231,25 @@ punt: if (linked_timeout) io_queue_linked_timeout(linked_timeout); - goto exit; - } + } else if (likely(!ret)) { + /* drop submission reference */ + req = io_put_req_find_next(req); + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); - if (unlikely(ret)) { + if (req) { + if (!(req->flags & REQ_F_FORCE_ASYNC)) + goto again; + io_queue_async_work(req); + } + } else { /* un-prep timeout, so it'll be killed as any other linked */ req->flags &= ~REQ_F_LINK_TIMEOUT; req_set_fail_links(req); io_put_req(req); io_req_complete(req, ret); - goto exit; } - /* drop submission reference */ - nxt = io_put_req_find_next(req); - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); - - if (nxt) { - req = nxt; - - if (req->flags & REQ_F_FORCE_ASYNC) { - linked_timeout = NULL; - goto punt; - } - goto again; - } -exit: if (old_creds) revert_creds(old_creds); } @@ -6266,13 +6273,6 @@ fail_req: if (unlikely(ret)) goto fail_req; } - - /* - * Never try inline submit of IOSQE_ASYNC is set, go straight - * to async execution. - */ - io_req_init_async(req); - req->work.flags |= IO_WQ_WORK_CONCURRENT; io_queue_async_work(req); } else { if (sqe) { @@ -7727,7 +7727,8 @@ static int io_uring_alloc_task_context(struct task_struct *task) xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; - tctx->in_idle = 0; + atomic_set(&tctx->in_idle, 0); + tctx->sqpoll = false; io_init_identity(&tctx->__identity); tctx->identity = &tctx->__identity; task->io_uring = tctx; @@ -8420,22 +8421,6 @@ static bool io_match_link(struct io_kiocb *preq, struct io_kiocb *req) return false; } -static bool io_match_link_files(struct io_kiocb *req, - struct files_struct *files) -{ - struct io_kiocb *link; - - if (io_match_files(req, files)) - return true; - if (req->flags & REQ_F_LINK_HEAD) { - list_for_each_entry(link, &req->link_list, link_list) { - if (io_match_files(link, files)) - return true; - } - } - return false; -} - /* * We're looking to cancel 'req' because it's holding on to our files, but * 'req' could be a link to another request. See if it is, and cancel that @@ -8485,7 +8470,21 @@ static bool io_timeout_remove_link(struct io_ring_ctx *ctx, static bool io_cancel_link_cb(struct io_wq_work *work, void *data) { - return io_match_link(container_of(work, struct io_kiocb, work), data); + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + bool ret; + + if (req->flags & REQ_F_LINK_TIMEOUT) { + unsigned long flags; + struct io_ring_ctx *ctx = req->ctx; + + /* protect against races with linked timeouts */ + spin_lock_irqsave(&ctx->completion_lock, flags); + ret = io_match_link(req, data); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + } else { + ret = io_match_link(req, data); + } + return ret; } static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) @@ -8511,6 +8510,7 @@ static void io_attempt_cancel(struct io_ring_ctx *ctx, struct io_kiocb *req) } static void io_cancel_defer_files(struct io_ring_ctx *ctx, + struct task_struct *task, struct files_struct *files) { struct io_defer_entry *de = NULL; @@ -8518,7 +8518,8 @@ static void io_cancel_defer_files(struct io_ring_ctx *ctx, spin_lock_irq(&ctx->completion_lock); list_for_each_entry_reverse(de, &ctx->defer_list, list) { - if (io_match_link_files(de->req, files)) { + if (io_task_match(de->req, task) && + io_match_files(de->req, files)) { list_cut_position(&list, &ctx->defer_list, &de->list); break; } @@ -8544,7 +8545,6 @@ static bool io_uring_cancel_files(struct io_ring_ctx *ctx, if (list_empty_careful(&ctx->inflight_list)) return false; - io_cancel_defer_files(ctx, files); /* cancel all at once, should be faster than doing it one by one*/ io_wq_cancel_cb(ctx->io_wq, io_wq_files_match, files, true); @@ -8630,8 +8630,16 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, { struct task_struct *task = current; - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { task = ctx->sq_data->thread; + atomic_inc(&task->io_uring->in_idle); + io_sq_thread_park(ctx->sq_data); + } + + if (files) + io_cancel_defer_files(ctx, NULL, files); + else + io_cancel_defer_files(ctx, task, NULL); io_cqring_overflow_flush(ctx, true, task, files); @@ -8639,12 +8647,23 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, io_run_task_work(); cond_resched(); } + + if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + atomic_dec(&task->io_uring->in_idle); + /* + * If the files that are going away are the ones in the thread + * identity, clear them out. + */ + if (task->io_uring->identity->files == files) + task->io_uring->identity->files = NULL; + io_sq_thread_unpark(ctx->sq_data); + } } /* * Note that this task has used io_uring. We use it for cancelation purposes. */ -static int io_uring_add_task_file(struct file *file) +static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) { struct io_uring_task *tctx = current->io_uring; @@ -8666,6 +8685,14 @@ static int io_uring_add_task_file(struct file *file) tctx->last = file; } + /* + * This is race safe in that the task itself is doing this, hence it + * cannot be going through the exit/cancel paths at the same time. + * This cannot be modified while exit/cancel is running. + */ + if (!tctx->sqpoll && (ctx->flags & IORING_SETUP_SQPOLL)) + tctx->sqpoll = true; + return 0; } @@ -8707,7 +8734,7 @@ void __io_uring_files_cancel(struct files_struct *files) unsigned long index; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); xa_for_each(&tctx->xa, index, file) { struct io_ring_ctx *ctx = file->private_data; @@ -8716,6 +8743,35 @@ void __io_uring_files_cancel(struct files_struct *files) if (files) io_uring_del_task_file(file); } + + atomic_dec(&tctx->in_idle); +} + +static s64 tctx_inflight(struct io_uring_task *tctx) +{ + unsigned long index; + struct file *file; + s64 inflight; + + inflight = percpu_counter_sum(&tctx->inflight); + if (!tctx->sqpoll) + return inflight; + + /* + * If we have SQPOLL rings, then we need to iterate and find them, and + * add the pending count for those. + */ + xa_for_each(&tctx->xa, index, file) { + struct io_ring_ctx *ctx = file->private_data; + + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct io_uring_task *__tctx = ctx->sqo_task->io_uring; + + inflight += percpu_counter_sum(&__tctx->inflight); + } + } + + return inflight; } /* @@ -8729,11 +8785,11 @@ void __io_uring_task_cancel(void) s64 inflight; /* make sure overflow events are dropped */ - tctx->in_idle = true; + atomic_inc(&tctx->in_idle); do { /* read completions before cancelations */ - inflight = percpu_counter_sum(&tctx->inflight); + inflight = tctx_inflight(tctx); if (!inflight) break; __io_uring_files_cancel(NULL); @@ -8744,13 +8800,13 @@ void __io_uring_task_cancel(void) * If we've seen completions, retry. This avoids a race where * a completion comes in before we did prepare_to_wait(). */ - if (inflight != percpu_counter_sum(&tctx->inflight)) + if (inflight != tctx_inflight(tctx)) continue; schedule(); } while (1); finish_wait(&tctx->wait, &wait); - tctx->in_idle = false; + atomic_dec(&tctx->in_idle); } static int io_uring_flush(struct file *file, void *data) @@ -8895,7 +8951,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, io_sqpoll_wait_sq(ctx); submitted = to_submit; } else if (to_submit) { - ret = io_uring_add_task_file(f.file); + ret = io_uring_add_task_file(ctx, f.file); if (unlikely(ret)) goto out; mutex_lock(&ctx->uring_lock); @@ -8932,7 +8988,8 @@ out_fput: #ifdef CONFIG_PROC_FS static int io_uring_show_cred(int id, void *p, void *data) { - const struct cred *cred = p; + struct io_identity *iod = p; + const struct cred *cred = iod->creds; struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; @@ -9124,7 +9181,7 @@ err_fd: #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; #endif - if (unlikely(io_uring_add_task_file(file))) { + if (unlikely(io_uring_add_task_file(ctx, file))) { file = ERR_PTR(-ENOMEM); goto err_fd; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 8180061b9e16..10cc7979ce38 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1374,6 +1374,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list)); WARN_ON_ONCE(!PageLocked(page)); WARN_ON_ONCE(PageWriteback(page)); + WARN_ON_ONCE(PageDirty(page)); /* * We cannot cancel the ioend directly here on error. We may have @@ -1382,33 +1383,22 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * appropriately. */ if (unlikely(error)) { + /* + * Let the filesystem know what portion of the current page + * failed to map. If the page wasn't been added to ioend, it + * won't be affected by I/O completion and we must unlock it + * now. + */ + if (wpc->ops->discard_page) + wpc->ops->discard_page(page, file_offset); if (!count) { - /* - * If the current page hasn't been added to ioend, it - * won't be affected by I/O completions and we must - * discard and unlock it right here. - */ - if (wpc->ops->discard_page) - wpc->ops->discard_page(page); ClearPageUptodate(page); unlock_page(page); goto done; } - - /* - * If the page was not fully cleaned, we need to ensure that the - * higher layers come back to it correctly. That means we need - * to keep the page dirty, and for WB_SYNC_ALL writeback we need - * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed - * so another attempt to write this page in this writeback sweep - * will be made. - */ - set_page_writeback_keepwrite(page); - } else { - clear_page_dirty_for_io(page); - set_page_writeback(page); } + set_page_writeback(page); unlock_page(page); /* diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h index 1558cf22ef8a..ee9660e9671c 100644 --- a/fs/isofs/rock.h +++ b/fs/isofs/rock.h @@ -22,7 +22,7 @@ struct SU_ER_s { __u8 len_des; __u8 len_src; __u8 ext_ver; - __u8 data[0]; + __u8 data[]; } __attribute__ ((packed)); struct RR_RR_s { @@ -44,7 +44,7 @@ struct RR_PN_s { struct SL_component { __u8 flags; __u8 len; - __u8 text[0]; + __u8 text[]; } __attribute__ ((packed)); struct RR_SL_s { @@ -54,7 +54,7 @@ struct RR_SL_s { struct RR_NM_s { __u8 flags; - char name[0]; + char name[]; } __attribute__ ((packed)); struct RR_CL_s { @@ -71,7 +71,7 @@ struct stamp { struct RR_TF_s { __u8 flags; - struct stamp times[0]; /* Variable number of these beasts */ + struct stamp times[]; /* Variable number of these beasts */ } __attribute__ ((packed)); /* Linux-specific extension for transparent decompression */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 0f707003dda5..b362523a9829 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1049,6 +1049,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX; put_task_struct(task); + if (oom_adj > OOM_ADJUST_MAX) + oom_adj = OOM_ADJUST_MAX; len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); return simple_read_from_buffer(buf, count, ppos, buffer, len); } diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index d0989a443c77..419760fd77bd 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -19,7 +19,7 @@ static int cpuinfo_open(struct inode *inode, struct file *file) static const struct proc_ops cpuinfo_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = cpuinfo_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = seq_release, }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 2f9fa179194d..b84663252add 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -590,7 +590,7 @@ static int proc_seq_release(struct inode *inode, struct file *file) static const struct proc_ops proc_seq_ops = { /* not permanent -- can call into arbitrary seq_operations */ .proc_open = proc_seq_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = proc_seq_release, }; @@ -621,7 +621,7 @@ static int proc_single_open(struct inode *inode, struct file *file) static const struct proc_ops proc_single_ops = { /* not permanent -- can call into arbitrary ->single_show */ .proc_open = proc_single_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 58c075e2a452..bde6b6f69852 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -597,6 +597,7 @@ static const struct file_operations proc_iter_file_ops = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, .write = proc_reg_write, + .splice_read = generic_file_splice_read, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, .mmap = proc_reg_mmap, @@ -622,6 +623,7 @@ static const struct file_operations proc_reg_file_ops_compat = { static const struct file_operations proc_iter_file_ops_compat = { .llseek = proc_reg_llseek, .read_iter = proc_reg_read_iter, + .splice_read = generic_file_splice_read, .write = proc_reg_write, .poll = proc_reg_poll, .unlocked_ioctl = proc_reg_unlocked_ioctl, diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 46b3293015fe..4695b6de3151 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -226,7 +226,7 @@ static int stat_open(struct inode *inode, struct file *file) static const struct proc_ops stat_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, - .proc_read = seq_read, + .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; diff --git a/fs/select.c b/fs/select.c index 7aef49552d4c..ebfebdfe5c69 100644 --- a/fs/select.c +++ b/fs/select.c @@ -97,7 +97,7 @@ u64 select_estimate_accuracy(struct timespec64 *tv) struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; - struct poll_table_entry entries[0]; + struct poll_table_entry entries[]; }; #define POLL_TABLE_FULL(table) \ @@ -836,7 +836,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg) struct poll_list { struct poll_list *next; int len; - struct pollfd entries[0]; + struct pollfd entries[]; }; #define POLLFD_PER_PAGE ((PAGE_SIZE-sizeof(struct poll_list)) / sizeof(struct pollfd)) diff --git a/fs/seq_file.c b/fs/seq_file.c index 31219c1db17d..3b20e21604e7 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/printk.h> #include <linux/string_helpers.h> +#include <linux/uio.h> #include <linux/uaccess.h> #include <asm/page.h> @@ -146,7 +147,28 @@ Eoverflow: */ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { - struct seq_file *m = file->private_data; + struct iovec iov = { .iov_base = buf, .iov_len = size}; + struct kiocb kiocb; + struct iov_iter iter; + ssize_t ret; + + init_sync_kiocb(&kiocb, file); + iov_iter_init(&iter, READ, &iov, 1, size); + + kiocb.ki_pos = *ppos; + ret = seq_read_iter(&kiocb, &iter); + *ppos = kiocb.ki_pos; + return ret; +} +EXPORT_SYMBOL(seq_read); + +/* + * Ready-made ->f_op->read_iter() + */ +ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct seq_file *m = iocb->ki_filp->private_data; + size_t size = iov_iter_count(iter); size_t copied = 0; size_t n; void *p; @@ -158,14 +180,14 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) * if request is to read from zero offset, reset iterator to first * record as it might have been already advanced by previous requests */ - if (*ppos == 0) { + if (iocb->ki_pos == 0) { m->index = 0; m->count = 0; } - /* Don't assume *ppos is where we left it */ - if (unlikely(*ppos != m->read_pos)) { - while ((err = traverse(m, *ppos)) == -EAGAIN) + /* Don't assume ki_pos is where we left it */ + if (unlikely(iocb->ki_pos != m->read_pos)) { + while ((err = traverse(m, iocb->ki_pos)) == -EAGAIN) ; if (err) { /* With prejudice... */ @@ -174,7 +196,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) m->count = 0; goto Done; } else { - m->read_pos = *ppos; + m->read_pos = iocb->ki_pos; } } @@ -187,13 +209,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* if not empty - flush it first */ if (m->count) { n = min(m->count, size); - err = copy_to_user(buf, m->buf + m->from, n); - if (err) + if (copy_to_iter(m->buf + m->from, n, iter) != n) goto Efault; m->count -= n; m->from += n; size -= n; - buf += n; copied += n; if (!size) goto Done; @@ -254,8 +274,7 @@ Fill: } m->op->stop(m, p); n = min(m->count, size); - err = copy_to_user(buf, m->buf, n); - if (err) + if (copy_to_iter(m->buf, n, iter) != n) goto Efault; copied += n; m->count -= n; @@ -264,7 +283,7 @@ Done: if (!copied) copied = err; else { - *ppos += copied; + iocb->ki_pos += copied; m->read_pos += copied; } mutex_unlock(&m->lock); @@ -276,7 +295,7 @@ Efault: err = -EFAULT; goto Done; } -EXPORT_SYMBOL(seq_read); +EXPORT_SYMBOL(seq_read_iter); /** * seq_lseek - ->llseek() method for sequential files. diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 852b536551b5..15640015be9d 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2467,6 +2467,7 @@ xfs_defer_agfl_block( new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); new->xefi_blockcount = 1; new->xefi_oinfo = *oinfo; + new->xefi_skip_discard = false; trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index e1bd484e5548..6747e97a7949 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -52,9 +52,9 @@ struct xfs_extent_free_item { xfs_fsblock_t xefi_startblock;/* starting fs block number */ xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ + bool xefi_skip_discard; struct list_head xefi_list; struct xfs_owner_info xefi_oinfo; /* extent owner */ - bool xefi_skip_discard; }; #define XFS_BMAP_MAX_NMAP 4 diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index 3aa85b64de36..bb25ff1b770d 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -121,8 +121,7 @@ xchk_inode_flags( goto bad; /* rt flags require rt device */ - if ((flags & (XFS_DIFLAG_REALTIME | XFS_DIFLAG_RTINHERIT)) && - !mp->m_rtdev_targp) + if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) goto bad; /* new rt bitmap flag only valid for rbmino */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 55d126d4e096..4304c6416fbb 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -346,8 +346,8 @@ xfs_map_blocks( ssize_t count = i_blocksize(inode); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); - xfs_fileoff_t cow_fsb = NULLFILEOFF; - int whichfork = XFS_DATA_FORK; + xfs_fileoff_t cow_fsb; + int whichfork; struct xfs_bmbt_irec imap; struct xfs_iext_cursor icur; int retries = 0; @@ -381,6 +381,8 @@ xfs_map_blocks( * landed in a hole and we skip the block. */ retry: + cow_fsb = NULLFILEOFF; + whichfork = XFS_DATA_FORK; xfs_ilock(ip, XFS_ILOCK_SHARED); ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE || (ip->i_df.if_flags & XFS_IFEXTENTS)); @@ -527,13 +529,15 @@ xfs_prepare_ioend( */ static void xfs_discard_page( - struct page *page) + struct page *page, + loff_t fileoff) { struct inode *inode = page->mapping->host; struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; - loff_t offset = page_offset(page); - xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); + unsigned int pageoff = offset_in_page(fileoff); + xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff); + xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff); int error; if (XFS_FORCED_SHUTDOWN(mp)) @@ -541,14 +545,14 @@ xfs_discard_page( xfs_alert_ratelimited(mp, "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.", - page, ip->i_ino, offset); + page, ip->i_ino, fileoff); error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - i_blocks_per_page(inode, page)); + i_blocks_per_page(inode, page) - pageoff_fsb); if (error && !XFS_FORCED_SHUTDOWN(mp)) xfs_alert(mp, "page discard unable to remove delalloc mapping."); out_invalidate: - iomap_invalidatepage(page, 0, PAGE_SIZE); + iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff); } static const struct iomap_writeback_ops xfs_writeback_ops = { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 5e165456da68..1414ab79eacf 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -911,6 +911,16 @@ xfs_setattr_size( error = iomap_zero_range(inode, oldsize, newsize - oldsize, &did_zeroing, &xfs_buffered_write_iomap_ops); } else { + /* + * iomap won't detect a dirty page over an unwritten block (or a + * cow block over a hole) and subsequently skips zeroing the + * newly post-EOF portion of the page. Flush the new EOF to + * convert the block before the pagecache truncate. + */ + error = filemap_write_and_wait_range(inode->i_mapping, newsize, + newsize); + if (error) + return error; error = iomap_truncate_page(inode, newsize, &did_zeroing, &xfs_buffered_write_iomap_ops); } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 16098dc42add..6fa05fb78189 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1502,7 +1502,8 @@ xfs_reflink_unshare( &xfs_buffered_write_iomap_ops); if (error) goto out; - error = filemap_write_and_wait(inode->i_mapping); + + error = filemap_write_and_wait_range(inode->i_mapping, offset, len); if (error) goto out; |