From f21c601a2bb319ec19eb4562eadc7797d90fd90e Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 15 Jun 2018 09:35:33 -0400 Subject: dm: use bio_split() when splitting out the already processed bio Use of bio_clone_bioset() is inefficient if there is no need to clone the original bio's bio_vec array. Best to use the bio_clone_fast() variant. Also, just using bio_advance() is only part of what is needed to properly setup the clone -- it doesn't account for the various bio_integrity() related work that also needs to be performed (see bio_split). Address both of these issues by switching from bio_clone_bioset() to bio_split(). Fixes: 18a25da8 ("dm: ensure bio submission follows a depth-first tree walk") Cc: stable@vger.kernel.org # 4.15+, requires removal of '&' before md->queue->bio_split Reported-by: Christoph Hellwig Reviewed-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index e65429a29c06..a3b103e8e3ce 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1606,10 +1606,9 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, * the usage of io->orig_bio in dm_remap_zone_report() * won't be affected by this reassignment. */ - struct bio *b = bio_clone_bioset(bio, GFP_NOIO, - &md->queue->bio_split); + struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count, + GFP_NOIO, &md->queue->bio_split); ci.io->orig_bio = b; - bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9); bio_chain(b, bio); ret = generic_make_request(bio); break; -- cgit v1.2.3 From 7ccdbf85d3b2237cdbdafe6a47673dbdfab4b7a2 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Mon, 18 Jun 2018 11:57:51 -0400 Subject: dm thin metadata: remove needless work from __commit_transaction Commit 5a32083d03fb5 ("dm: take care to copy the space map roots before locking the superblock") properly removed the calls to dm_sm_root_size() from __write_initial_superblock(). But the dm_sm_root_size() calls were left dangling in __commit_transaction(). Fixes: 5a32083d03fb5 ("dm: take care to copy the space map roots before locking the superblock") Signed-off-by: Mike Snitzer --- drivers/md/dm-thin-metadata.c | 9 --------- 1 file changed, 9 deletions(-) diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 36ef284ad086..72142021b5c9 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -776,7 +776,6 @@ static int __write_changed_details(struct dm_pool_metadata *pmd) static int __commit_transaction(struct dm_pool_metadata *pmd) { int r; - size_t metadata_len, data_len; struct thin_disk_superblock *disk_super; struct dm_block *sblock; @@ -797,14 +796,6 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) if (r < 0) return r; - r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); - if (r < 0) - return r; - - r = dm_sm_root_size(pmd->data_sm, &data_len); - if (r < 0) - return r; - r = save_sm_roots(pmd); if (r < 0) return r; -- cgit v1.2.3 From 50a7d3ba7c9ac5e0b7e03fc7f420180989361dbf Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 18 Jun 2018 10:50:33 -0700 Subject: dm writecache: use 2-factor allocator arguments This adjusts the allocator calls to use the 2-factor argument style, as already done treewide for better defense against allocator overflows. Signed-off-by: Kees Cook [snitzer: tweaked code to leave assignment in a test alone] Signed-off-by: Mike Snitzer --- drivers/md/dm-writecache.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-writecache.c b/drivers/md/dm-writecache.c index 5961c7794ef3..07ea6a48aac6 100644 --- a/drivers/md/dm-writecache.c +++ b/drivers/md/dm-writecache.c @@ -259,7 +259,7 @@ static int persistent_memory_claim(struct dm_writecache *wc) if (da != p) { long i; wc->memory_map = NULL; - pages = kvmalloc(p * sizeof(struct page *), GFP_KERNEL); + pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL); if (!pages) { r = -ENOMEM; goto err2; @@ -859,7 +859,7 @@ static int writecache_alloc_entries(struct dm_writecache *wc) if (wc->entries) return 0; - wc->entries = vmalloc(sizeof(struct wc_entry) * wc->n_blocks); + wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks)); if (!wc->entries) return -ENOMEM; for (b = 0; b < wc->n_blocks; b++) { @@ -1481,9 +1481,9 @@ static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeba wb->bio.bi_iter.bi_sector = read_original_sector(wc, e); wb->page_offset = PAGE_SIZE; if (max_pages <= WB_LIST_INLINE || - unlikely(!(wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *), - GFP_NOIO | __GFP_NORETRY | - __GFP_NOMEMALLOC | __GFP_NOWARN)))) { + unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *), + GFP_NOIO | __GFP_NORETRY | + __GFP_NOMEMALLOC | __GFP_NOWARN)))) { wb->wc_list = wb->wc_list_inline; max_pages = WB_LIST_INLINE; } -- cgit v1.2.3 From 2d0b2d64d325e22939d9db3ba784f1236459ed98 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Fri, 22 Jun 2018 08:09:11 -0700 Subject: dm zoned: avoid triggering reclaim from inside dmz_map() This patch avoids that lockdep reports the following: ====================================================== WARNING: possible circular locking dependency detected 4.18.0-rc1 #62 Not tainted ------------------------------------------------------ kswapd0/84 is trying to acquire lock: 00000000c313516d (&xfs_nondir_ilock_class){++++}, at: xfs_free_eofblocks+0xa2/0x1e0 but task is already holding lock: 00000000591c83ae (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (fs_reclaim){+.+.}: kmem_cache_alloc+0x2c/0x2b0 radix_tree_node_alloc.constprop.19+0x3d/0xc0 __radix_tree_create+0x161/0x1c0 __radix_tree_insert+0x45/0x210 dmz_map+0x245/0x2d0 [dm_zoned] __map_bio+0x40/0x260 __split_and_process_non_flush+0x116/0x220 __split_and_process_bio+0x81/0x180 __dm_make_request.isra.32+0x5a/0x100 generic_make_request+0x36e/0x690 submit_bio+0x6c/0x140 mpage_readpages+0x19e/0x1f0 read_pages+0x6d/0x1b0 __do_page_cache_readahead+0x21b/0x2d0 force_page_cache_readahead+0xc4/0x100 generic_file_read_iter+0x7c6/0xd20 __vfs_read+0x102/0x180 vfs_read+0x9b/0x140 ksys_read+0x55/0xc0 do_syscall_64+0x5a/0x1f0 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #1 (&dmz->chunk_lock){+.+.}: dmz_map+0x133/0x2d0 [dm_zoned] __map_bio+0x40/0x260 __split_and_process_non_flush+0x116/0x220 __split_and_process_bio+0x81/0x180 __dm_make_request.isra.32+0x5a/0x100 generic_make_request+0x36e/0x690 submit_bio+0x6c/0x140 _xfs_buf_ioapply+0x31c/0x590 xfs_buf_submit_wait+0x73/0x520 xfs_buf_read_map+0x134/0x2f0 xfs_trans_read_buf_map+0xc3/0x580 xfs_read_agf+0xa5/0x1e0 xfs_alloc_read_agf+0x59/0x2b0 xfs_alloc_pagf_init+0x27/0x60 xfs_bmap_longest_free_extent+0x43/0xb0 xfs_bmap_btalloc_nullfb+0x7f/0xf0 xfs_bmap_btalloc+0x428/0x7c0 xfs_bmapi_write+0x598/0xcc0 xfs_iomap_write_allocate+0x15a/0x330 xfs_map_blocks+0x1cf/0x3f0 xfs_do_writepage+0x15f/0x7b0 write_cache_pages+0x1ca/0x540 xfs_vm_writepages+0x65/0xa0 do_writepages+0x48/0xf0 __writeback_single_inode+0x58/0x730 writeback_sb_inodes+0x249/0x5c0 wb_writeback+0x11e/0x550 wb_workfn+0xa3/0x670 process_one_work+0x228/0x670 worker_thread+0x3c/0x390 kthread+0x11c/0x140 ret_from_fork+0x3a/0x50 -> #0 (&xfs_nondir_ilock_class){++++}: down_read_nested+0x43/0x70 xfs_free_eofblocks+0xa2/0x1e0 xfs_fs_destroy_inode+0xac/0x270 dispose_list+0x51/0x80 prune_icache_sb+0x52/0x70 super_cache_scan+0x127/0x1a0 shrink_slab.part.47+0x1bd/0x590 shrink_node+0x3b5/0x470 balance_pgdat+0x158/0x3b0 kswapd+0x1ba/0x600 kthread+0x11c/0x140 ret_from_fork+0x3a/0x50 other info that might help us debug this: Chain exists of: &xfs_nondir_ilock_class --> &dmz->chunk_lock --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(&dmz->chunk_lock); lock(fs_reclaim); lock(&xfs_nondir_ilock_class); *** DEADLOCK *** 3 locks held by kswapd0/84: #0: 00000000591c83ae (fs_reclaim){+.+.}, at: __fs_reclaim_acquire+0x5/0x30 #1: 000000000f8208f5 (shrinker_rwsem){++++}, at: shrink_slab.part.47+0x3f/0x590 #2: 00000000cacefa54 (&type->s_umount_key#43){.+.+}, at: trylock_super+0x16/0x50 stack backtrace: CPU: 7 PID: 84 Comm: kswapd0 Not tainted 4.18.0-rc1 #62 Hardware name: Supermicro Super Server/X10SRL-F, BIOS 2.0 12/17/2015 Call Trace: dump_stack+0x85/0xcb print_circular_bug.isra.36+0x1ce/0x1db __lock_acquire+0x124e/0x1310 lock_acquire+0x9f/0x1f0 down_read_nested+0x43/0x70 xfs_free_eofblocks+0xa2/0x1e0 xfs_fs_destroy_inode+0xac/0x270 dispose_list+0x51/0x80 prune_icache_sb+0x52/0x70 super_cache_scan+0x127/0x1a0 shrink_slab.part.47+0x1bd/0x590 shrink_node+0x3b5/0x470 balance_pgdat+0x158/0x3b0 kswapd+0x1ba/0x600 kthread+0x11c/0x140 ret_from_fork+0x3a/0x50 Reported-by: Masato Suzuki Fixes: 4218a9554653 ("dm zoned: use GFP_NOIO in I/O path") Cc: Signed-off-by: Bart Van Assche Signed-off-by: Mike Snitzer --- drivers/md/dm-zoned-target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 3c0e45f4dcf5..a44183ff4be0 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -787,7 +787,7 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) /* Chunk BIO work */ mutex_init(&dmz->chunk_lock); - INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_KERNEL); + INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO); dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND, 0, dev->name); if (!dmz->chunk_wq) { -- cgit v1.2.3 From f2ccaa5904666f84d4e6958d6db9d1e65ead5085 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Jun 2018 10:01:19 +0200 Subject: dm raid: don't use 'const' in function return A newly introduced function has 'const int' as the return type, but as "make W=1" reports, that has no meaning: drivers/md/dm-raid.c:510:18: error: type qualifiers ignored on function return type [-Werror=ignored-qualifiers] This changes the return type to plain 'int'. Signed-off-by: Arnd Bergmann Fixes: 33e53f06850f ("dm raid: introduce extended superblock and new raid types to support takeover/reshaping") Signed-off-by: Mike Snitzer Fixes: 552aa679f2657431 ("dm raid: use rs_is_raid*()") Signed-off-by: Geert Uytterhoeven Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index ab13fcec3fca..75df4c9d8b54 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -588,7 +588,7 @@ static const char *raid10_md_layout_to_format(int layout) } /* Return md raid10 algorithm for @name */ -static const int raid10_name_to_format(const char *name) +static int raid10_name_to_format(const char *name) { if (!strcasecmp(name, "near")) return ALGORITHM_RAID10_NEAR; -- cgit v1.2.3 From a685557fbbc3122ed11e8ad3fa63a11ebc5de8c3 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 26 Jun 2018 12:04:23 -0400 Subject: dm thin: handle running out of data space vs concurrent discard Discards issued to a DM thin device can complete to userspace (via fstrim) _before_ the metadata changes associated with the discards is reflected in the thinp superblock (e.g. free blocks). As such, if a user constructs a test that loops repeatedly over these steps, block allocation can fail due to discards not having completed yet: 1) fill thin device via filesystem file 2) remove file 3) fstrim From initial report, here: https://www.redhat.com/archives/dm-devel/2018-April/msg00022.html "The root cause of this issue is that dm-thin will first remove mapping and increase corresponding blocks' reference count to prevent them from being reused before DISCARD bios get processed by the underlying layers. However. increasing blocks' reference count could also increase the nr_allocated_this_transaction in struct sm_disk which makes smd->old_ll.nr_allocated + smd->nr_allocated_this_transaction bigger than smd->old_ll.nr_blocks. In this case, alloc_data_block() will never commit metadata to reset the begin pointer of struct sm_disk, because sm_disk_get_nr_free() always return an underflow value." While there is room for improvement to the space-map accounting that thinp is making use of: the reality is this test is inherently racey and will result in the previous iteration's fstrim's discard(s) completing vs concurrent block allocation, via dd, in the next iteration of the loop. No amount of space map accounting improvements will be able to allow user's to use a block before a discard of that block has completed. So the best we can really do is allow DM thinp to gracefully handle such aggressive use of all the pool's data by degrading the pool into out-of-data-space (OODS) mode. We _should_ get that behaviour already (if space map accounting didn't falsely cause alloc_data_block() to believe free space was available).. but short of that we handle the current reality that dm_pool_alloc_data_block() can return -ENOSPC. Reported-by: Dennis Yang Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-thin.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 7945238df1c0..b900723bbd0f 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -1386,6 +1386,8 @@ static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); +static void requeue_bios(struct pool *pool); + static void check_for_space(struct pool *pool) { int r; @@ -1398,8 +1400,10 @@ static void check_for_space(struct pool *pool) if (r) return; - if (nr_free) + if (nr_free) { set_pool_mode(pool, PM_WRITE); + requeue_bios(pool); + } } /* @@ -1476,7 +1480,10 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) r = dm_pool_alloc_data_block(pool->pmd, result); if (r) { - metadata_operation_failed(pool, "dm_pool_alloc_data_block", r); + if (r == -ENOSPC) + set_pool_mode(pool, PM_OUT_OF_DATA_SPACE); + else + metadata_operation_failed(pool, "dm_pool_alloc_data_block", r); return r; } -- cgit v1.2.3 From 4557641b4c7046625c026fb809c47ef0d43ae595 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 26 Jun 2018 16:30:39 -0600 Subject: pmem: only set QUEUE_FLAG_DAX for fsdax mode QUEUE_FLAG_DAX is an indication that a given block device supports filesystem DAX and should not be set for PMEM namespaces which are in "raw" mode. These namespaces lack struct page and are prevented from participating in filesystem DAX as of commit 569d0365f571 ("dax: require 'struct page' by default for filesystem dax"). Signed-off-by: Ross Zwisler Suggested-by: Mike Snitzer Fixes: 569d0365f571 ("dax: require 'struct page' by default for filesystem dax") Cc: stable@vger.kernel.org Acked-by: Dan Williams Reviewed-by: Toshi Kani Signed-off-by: Mike Snitzer --- drivers/nvdimm/pmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 68940356cad3..8b1fd7f1a224 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -414,7 +414,8 @@ static int pmem_attach_disk(struct device *dev, blk_queue_logical_block_size(q, pmem_sector_size(ndns)); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - blk_queue_flag_set(QUEUE_FLAG_DAX, q); + if (pmem->pfn_flags & PFN_MAP) + blk_queue_flag_set(QUEUE_FLAG_DAX, q); q->queuedata = pmem; disk = alloc_disk_node(0, nid); -- cgit v1.2.3 From 15256f6cc4b44f2e70503758150267fd2a53c0d6 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 26 Jun 2018 16:30:40 -0600 Subject: dax: check for QUEUE_FLAG_DAX in bdev_dax_supported() Add an explicit check for QUEUE_FLAG_DAX to __bdev_dax_supported(). This is needed for DM configurations where the first element in the dm-linear or dm-stripe target supports DAX, but other elements do not. Without this check __bdev_dax_supported() will pass for such devices, letting a filesystem on that device mount with the DAX option. Signed-off-by: Ross Zwisler Suggested-by: Mike Snitzer Fixes: commit 545ed20e6df6 ("dm: add infrastructure for DAX support") Cc: stable@vger.kernel.org Acked-by: Dan Williams Reviewed-by: Toshi Kani Signed-off-by: Mike Snitzer --- drivers/dax/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 903d9c473749..45276abf03aa 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -86,6 +86,7 @@ bool __bdev_dax_supported(struct block_device *bdev, int blocksize) { struct dax_device *dax_dev; bool dax_enabled = false; + struct request_queue *q; pgoff_t pgoff; int err, id; void *kaddr; @@ -99,6 +100,13 @@ bool __bdev_dax_supported(struct block_device *bdev, int blocksize) return false; } + q = bdev_get_queue(bdev); + if (!q || !blk_queue_dax(q)) { + pr_debug("%s: error: request queue doesn't support dax\n", + bdevname(bdev, buf)); + return false; + } + err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); if (err) { pr_debug("%s: error: unaligned partition for dax\n", -- cgit v1.2.3 From dbc626597c39b24cefce09fbd8e9dea85869a801 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 26 Jun 2018 16:30:41 -0600 Subject: dm: prevent DAX mounts if not supported Currently device_supports_dax() just checks to see if the QUEUE_FLAG_DAX flag is set on the device's request queue to decide whether or not the device supports filesystem DAX. Really we should be using bdev_dax_supported() like filesystems do at mount time. This performs other tests like checking to make sure the dax_direct_access() path works. We also explicitly clear QUEUE_FLAG_DAX on the DM device's request queue if any of the underlying devices do not support DAX. This makes the handling of QUEUE_FLAG_DAX consistent with the setting/clearing of most other flags in dm_table_set_restrictions(). Now that bdev_dax_supported() explicitly checks for QUEUE_FLAG_DAX, this will ensure that filesystems built upon DM devices will only be able to mount with DAX if all underlying devices also support DAX. Signed-off-by: Ross Zwisler Fixes: commit 545ed20e6df6 ("dm: add infrastructure for DAX support") Cc: stable@vger.kernel.org Acked-by: Dan Williams Reviewed-by: Toshi Kani Signed-off-by: Mike Snitzer --- drivers/md/dm-table.c | 7 ++++--- drivers/md/dm.c | 3 +-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 938766794c2e..3d0e2c198f06 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -885,9 +885,7 @@ EXPORT_SYMBOL_GPL(dm_table_set_type); static int device_supports_dax(struct dm_target *ti, struct dm_dev *dev, sector_t start, sector_t len, void *data) { - struct request_queue *q = bdev_get_queue(dev->bdev); - - return q && blk_queue_dax(q); + return bdev_dax_supported(dev->bdev, PAGE_SIZE); } static bool dm_table_supports_dax(struct dm_table *t) @@ -1907,6 +1905,9 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, if (dm_table_supports_dax(t)) blk_queue_flag_set(QUEUE_FLAG_DAX, q); + else + blk_queue_flag_clear(QUEUE_FLAG_DAX, q); + if (dm_table_supports_dax_write_cache(t)) dax_write_cache(t->md->dax_dev, true); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index a3b103e8e3ce..b0dd7027848b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1056,8 +1056,7 @@ static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (len < 1) goto out; nr_pages = min(len, nr_pages); - if (ti->type->direct_access) - ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); + ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn); out: dm_put_live_table(md, srcu_idx); -- cgit v1.2.3